From 86ab4355bb22a3a44d3c800fac0cf6fe1c9cac67 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Wed, 27 May 2026 05:56:17 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.09 Source: Original Platform --- .gitattributes | 36 + README.md | 59 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 + train_results.json | 8 + trainer_state.json | 163951 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 166615 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..737ffb6 --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/Replay_0.05.MoT_science.wildguardmix_reasoning.Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_sciencev00.09 +tags: +- generated_from_trainer +- trl +- open-r1 +- sft +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_sciencev00.09 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/Replay_0.05.MoT_science.wildguardmix_reasoning.Llama3_4096toks](https://huggingface.co/datasets/Neelectric/Replay_0.05.MoT_science.wildguardmix_reasoning.Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.09", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/nowgnpye) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..4c4fb69 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.0464297170303975e+19, + "train_loss": 0.9477986063134819, + "train_runtime": 21240.259, + "train_samples": 145693, + "train_samples_per_second": 13.719, + "train_steps_per_second": 0.857 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..7649397 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff99398f37f2a2ab11a1dd50f4d785c608520f3ead4d5f8f00dd15eeaacaf2e +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..30bcdd3 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5c050bb1bd7d156dc6fe68a01acd33e9ed2229f079029e96d4296e279d666a0 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..af5a04e --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460da88a6d2d50ab51a3e3b3a301394362be3bc6cc11624a48df1a2a558b2c9a +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..544714c --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab5cb9da3eeb0dd4656c50f90790bb95dcd827e009fb01fd62547112c653255 +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..4c4fb69 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.0464297170303975e+19, + "train_loss": 0.9477986063134819, + "train_runtime": 21240.259, + "train_samples": 145693, + "train_samples_per_second": 13.719, + "train_steps_per_second": 0.857 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..d2166ca --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,163951 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 18212, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010981770261366133, + "grad_norm": 7.3007402420043945, + "learning_rate": 0.0, + "loss": 1.4841, + "mean_token_accuracy": 0.6132287979125977, + "num_tokens": 25099.0, + "step": 1 + }, + { + "epoch": 0.00021963540522732265, + "grad_norm": 6.589829444885254, + "learning_rate": 5.48847420417124e-10, + "loss": 1.4857, + "mean_token_accuracy": 0.6123285293579102, + "num_tokens": 54691.0, + "step": 2 + }, + { + "epoch": 0.000329453107840984, + "grad_norm": 7.001859188079834, + "learning_rate": 1.097694840834248e-09, + "loss": 1.4769, + "mean_token_accuracy": 0.6073166131973267, + "num_tokens": 79633.0, + "step": 3 + }, + { + "epoch": 0.0004392708104546453, + "grad_norm": 7.589627265930176, + "learning_rate": 1.646542261251372e-09, + "loss": 1.5602, + "mean_token_accuracy": 0.6010137796401978, + "num_tokens": 104748.0, + "step": 4 + }, + { + "epoch": 0.0005490885130683066, + "grad_norm": 7.9542317390441895, + "learning_rate": 2.195389681668496e-09, + "loss": 1.5339, + "mean_token_accuracy": 0.6186473965644836, + "num_tokens": 124837.0, + "step": 5 + }, + { + "epoch": 0.000658906215681968, + "grad_norm": 7.75429630279541, + "learning_rate": 2.7442371020856205e-09, + "loss": 1.4745, + "mean_token_accuracy": 0.6186634302139282, + "num_tokens": 147361.0, + "step": 6 + }, + { + "epoch": 0.0007687239182956292, + "grad_norm": 6.962357521057129, + "learning_rate": 3.293084522502744e-09, + "loss": 1.4552, + "mean_token_accuracy": 0.6184208393096924, + "num_tokens": 173736.0, + "step": 7 + }, + { + "epoch": 0.0008785416209092906, + "grad_norm": 7.96127986907959, + "learning_rate": 3.841931942919868e-09, + "loss": 1.5505, + "mean_token_accuracy": 0.5959036350250244, + "num_tokens": 196732.0, + "step": 8 + }, + { + "epoch": 0.000988359323522952, + "grad_norm": 6.649240016937256, + "learning_rate": 4.390779363336992e-09, + "loss": 1.3833, + "mean_token_accuracy": 0.6350213885307312, + "num_tokens": 223925.0, + "step": 9 + }, + { + "epoch": 0.0010981770261366132, + "grad_norm": 8.135478973388672, + "learning_rate": 4.939626783754116e-09, + "loss": 1.6276, + "mean_token_accuracy": 0.5855638384819031, + "num_tokens": 245790.0, + "step": 10 + }, + { + "epoch": 0.0012079947287502745, + "grad_norm": 6.200130462646484, + "learning_rate": 5.488474204171241e-09, + "loss": 1.5234, + "mean_token_accuracy": 0.6064732074737549, + "num_tokens": 281533.0, + "step": 11 + }, + { + "epoch": 0.001317812431363936, + "grad_norm": 7.386454105377197, + "learning_rate": 6.0373216245883644e-09, + "loss": 1.5267, + "mean_token_accuracy": 0.610355019569397, + "num_tokens": 305166.0, + "step": 12 + }, + { + "epoch": 0.0014276301339775973, + "grad_norm": 6.7723002433776855, + "learning_rate": 6.586169045005488e-09, + "loss": 1.4827, + "mean_token_accuracy": 0.6181972622871399, + "num_tokens": 332719.0, + "step": 13 + }, + { + "epoch": 0.0015374478365912585, + "grad_norm": 7.308136940002441, + "learning_rate": 7.135016465422612e-09, + "loss": 1.526, + "mean_token_accuracy": 0.6071454286575317, + "num_tokens": 358321.0, + "step": 14 + }, + { + "epoch": 0.0016472655392049198, + "grad_norm": 6.2947916984558105, + "learning_rate": 7.683863885839736e-09, + "loss": 1.5067, + "mean_token_accuracy": 0.6069461107254028, + "num_tokens": 393397.0, + "step": 15 + }, + { + "epoch": 0.0017570832418185812, + "grad_norm": 7.498538017272949, + "learning_rate": 8.23271130625686e-09, + "loss": 1.4103, + "mean_token_accuracy": 0.6233857870101929, + "num_tokens": 417449.0, + "step": 16 + }, + { + "epoch": 0.0018669009444322424, + "grad_norm": 6.981041431427002, + "learning_rate": 8.781558726673984e-09, + "loss": 1.4598, + "mean_token_accuracy": 0.6129097938537598, + "num_tokens": 442454.0, + "step": 17 + }, + { + "epoch": 0.001976718647045904, + "grad_norm": 8.632234573364258, + "learning_rate": 9.330406147091108e-09, + "loss": 1.5177, + "mean_token_accuracy": 0.6089996099472046, + "num_tokens": 462589.0, + "step": 18 + }, + { + "epoch": 0.002086536349659565, + "grad_norm": 7.434192180633545, + "learning_rate": 9.879253567508231e-09, + "loss": 1.4542, + "mean_token_accuracy": 0.6180649995803833, + "num_tokens": 483388.0, + "step": 19 + }, + { + "epoch": 0.0021963540522732263, + "grad_norm": 7.816112518310547, + "learning_rate": 1.0428100987925357e-08, + "loss": 1.5372, + "mean_token_accuracy": 0.6070811152458191, + "num_tokens": 505871.0, + "step": 20 + }, + { + "epoch": 0.002306171754886888, + "grad_norm": 7.700085639953613, + "learning_rate": 1.0976948408342482e-08, + "loss": 1.5917, + "mean_token_accuracy": 0.5927896499633789, + "num_tokens": 529112.0, + "step": 21 + }, + { + "epoch": 0.002415989457500549, + "grad_norm": 7.041522026062012, + "learning_rate": 1.1525795828759604e-08, + "loss": 1.4034, + "mean_token_accuracy": 0.6363484263420105, + "num_tokens": 554235.0, + "step": 22 + }, + { + "epoch": 0.0025258071601142102, + "grad_norm": 6.622668266296387, + "learning_rate": 1.2074643249176729e-08, + "loss": 1.5044, + "mean_token_accuracy": 0.6120204925537109, + "num_tokens": 582850.0, + "step": 23 + }, + { + "epoch": 0.002635624862727872, + "grad_norm": 7.42185640335083, + "learning_rate": 1.2623490669593852e-08, + "loss": 1.5237, + "mean_token_accuracy": 0.6059737205505371, + "num_tokens": 605400.0, + "step": 24 + }, + { + "epoch": 0.002745442565341533, + "grad_norm": 6.3387064933776855, + "learning_rate": 1.3172338090010976e-08, + "loss": 1.3596, + "mean_token_accuracy": 0.6446506381034851, + "num_tokens": 633962.0, + "step": 25 + }, + { + "epoch": 0.0028552602679551946, + "grad_norm": 6.315981864929199, + "learning_rate": 1.3721185510428101e-08, + "loss": 1.5064, + "mean_token_accuracy": 0.607424795627594, + "num_tokens": 664148.0, + "step": 26 + }, + { + "epoch": 0.0029650779705688557, + "grad_norm": 6.1447577476501465, + "learning_rate": 1.4270032930845225e-08, + "loss": 1.5112, + "mean_token_accuracy": 0.605770468711853, + "num_tokens": 695218.0, + "step": 27 + }, + { + "epoch": 0.003074895673182517, + "grad_norm": 7.027062892913818, + "learning_rate": 1.4818880351262348e-08, + "loss": 1.5011, + "mean_token_accuracy": 0.6116410493850708, + "num_tokens": 720984.0, + "step": 28 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 7.532924175262451, + "learning_rate": 1.536772777167947e-08, + "loss": 1.4969, + "mean_token_accuracy": 0.6148400902748108, + "num_tokens": 739905.0, + "step": 29 + }, + { + "epoch": 0.0032945310784098397, + "grad_norm": 7.411428451538086, + "learning_rate": 1.5916575192096597e-08, + "loss": 1.5085, + "mean_token_accuracy": 0.6185125112533569, + "num_tokens": 763433.0, + "step": 30 + }, + { + "epoch": 0.003404348781023501, + "grad_norm": 6.972318649291992, + "learning_rate": 1.646542261251372e-08, + "loss": 1.5112, + "mean_token_accuracy": 0.6122591495513916, + "num_tokens": 791451.0, + "step": 31 + }, + { + "epoch": 0.0035141664836371624, + "grad_norm": 6.889634609222412, + "learning_rate": 1.7014270032930844e-08, + "loss": 1.4984, + "mean_token_accuracy": 0.6076732277870178, + "num_tokens": 820451.0, + "step": 32 + }, + { + "epoch": 0.0036239841862508236, + "grad_norm": 7.338368892669678, + "learning_rate": 1.756311745334797e-08, + "loss": 1.4515, + "mean_token_accuracy": 0.6220721006393433, + "num_tokens": 845408.0, + "step": 33 + }, + { + "epoch": 0.0037338018888644848, + "grad_norm": 6.97843599319458, + "learning_rate": 1.811196487376509e-08, + "loss": 1.4034, + "mean_token_accuracy": 0.6325271129608154, + "num_tokens": 867223.0, + "step": 34 + }, + { + "epoch": 0.0038436195914781464, + "grad_norm": 6.1522908210754395, + "learning_rate": 1.8660812294182216e-08, + "loss": 1.3991, + "mean_token_accuracy": 0.6308186054229736, + "num_tokens": 897187.0, + "step": 35 + }, + { + "epoch": 0.003953437294091808, + "grad_norm": 6.4405622482299805, + "learning_rate": 1.920965971459934e-08, + "loss": 1.4558, + "mean_token_accuracy": 0.624022901058197, + "num_tokens": 922659.0, + "step": 36 + }, + { + "epoch": 0.004063254996705469, + "grad_norm": 6.828300952911377, + "learning_rate": 1.9758507135016463e-08, + "loss": 1.5075, + "mean_token_accuracy": 0.6085585355758667, + "num_tokens": 950733.0, + "step": 37 + }, + { + "epoch": 0.00417307269931913, + "grad_norm": 8.31611156463623, + "learning_rate": 2.030735455543359e-08, + "loss": 1.5399, + "mean_token_accuracy": 0.5995083451271057, + "num_tokens": 971960.0, + "step": 38 + }, + { + "epoch": 0.004282890401932792, + "grad_norm": 7.829165935516357, + "learning_rate": 2.0856201975850713e-08, + "loss": 1.4907, + "mean_token_accuracy": 0.620111346244812, + "num_tokens": 993635.0, + "step": 39 + }, + { + "epoch": 0.004392708104546453, + "grad_norm": 7.270971775054932, + "learning_rate": 2.1405049396267835e-08, + "loss": 1.477, + "mean_token_accuracy": 0.6158565878868103, + "num_tokens": 1018021.0, + "step": 40 + }, + { + "epoch": 0.004502525807160114, + "grad_norm": 6.465977668762207, + "learning_rate": 2.1953896816684964e-08, + "loss": 1.5879, + "mean_token_accuracy": 0.5886329412460327, + "num_tokens": 1050221.0, + "step": 41 + }, + { + "epoch": 0.004612343509773776, + "grad_norm": 7.993412017822266, + "learning_rate": 2.2502744237102085e-08, + "loss": 1.4994, + "mean_token_accuracy": 0.620638906955719, + "num_tokens": 1072122.0, + "step": 42 + }, + { + "epoch": 0.0047221612123874365, + "grad_norm": 7.143760681152344, + "learning_rate": 2.3051591657519207e-08, + "loss": 1.3972, + "mean_token_accuracy": 0.640123188495636, + "num_tokens": 1096396.0, + "step": 43 + }, + { + "epoch": 0.004831978915001098, + "grad_norm": 7.772852897644043, + "learning_rate": 2.3600439077936336e-08, + "loss": 1.466, + "mean_token_accuracy": 0.6249079704284668, + "num_tokens": 1117958.0, + "step": 44 + }, + { + "epoch": 0.00494179661761476, + "grad_norm": 8.439007759094238, + "learning_rate": 2.4149286498353458e-08, + "loss": 1.5979, + "mean_token_accuracy": 0.5888959169387817, + "num_tokens": 1137829.0, + "step": 45 + }, + { + "epoch": 0.0050516143202284204, + "grad_norm": 7.0045013427734375, + "learning_rate": 2.469813391877058e-08, + "loss": 1.4905, + "mean_token_accuracy": 0.607435941696167, + "num_tokens": 1165048.0, + "step": 46 + }, + { + "epoch": 0.005161432022842082, + "grad_norm": 7.486037254333496, + "learning_rate": 2.5246981339187705e-08, + "loss": 1.3667, + "mean_token_accuracy": 0.6386758089065552, + "num_tokens": 1186203.0, + "step": 47 + }, + { + "epoch": 0.005271249725455744, + "grad_norm": 7.05202579498291, + "learning_rate": 2.579582875960483e-08, + "loss": 1.5163, + "mean_token_accuracy": 0.6045367121696472, + "num_tokens": 1214013.0, + "step": 48 + }, + { + "epoch": 0.005381067428069404, + "grad_norm": 6.294488430023193, + "learning_rate": 2.6344676180021952e-08, + "loss": 1.4032, + "mean_token_accuracy": 0.6296184062957764, + "num_tokens": 1244209.0, + "step": 49 + }, + { + "epoch": 0.005490885130683066, + "grad_norm": 7.002264976501465, + "learning_rate": 2.6893523600439077e-08, + "loss": 1.4184, + "mean_token_accuracy": 0.6315704584121704, + "num_tokens": 1271352.0, + "step": 50 + }, + { + "epoch": 0.0056007028332967276, + "grad_norm": 8.117701530456543, + "learning_rate": 2.7442371020856202e-08, + "loss": 1.4424, + "mean_token_accuracy": 0.6226906776428223, + "num_tokens": 1291262.0, + "step": 51 + }, + { + "epoch": 0.005710520535910389, + "grad_norm": 7.040408134460449, + "learning_rate": 2.7991218441273324e-08, + "loss": 1.4102, + "mean_token_accuracy": 0.6271786689758301, + "num_tokens": 1314456.0, + "step": 52 + }, + { + "epoch": 0.00582033823852405, + "grad_norm": 7.499876022338867, + "learning_rate": 2.854006586169045e-08, + "loss": 1.4701, + "mean_token_accuracy": 0.6221399307250977, + "num_tokens": 1337041.0, + "step": 53 + }, + { + "epoch": 0.0059301559411377115, + "grad_norm": 7.501381874084473, + "learning_rate": 2.9088913282107574e-08, + "loss": 1.5255, + "mean_token_accuracy": 0.6071066856384277, + "num_tokens": 1359693.0, + "step": 54 + }, + { + "epoch": 0.006039973643751373, + "grad_norm": 6.573636531829834, + "learning_rate": 2.9637760702524696e-08, + "loss": 1.4723, + "mean_token_accuracy": 0.6129328012466431, + "num_tokens": 1388885.0, + "step": 55 + }, + { + "epoch": 0.006149791346365034, + "grad_norm": 5.980090141296387, + "learning_rate": 3.018660812294182e-08, + "loss": 1.5387, + "mean_token_accuracy": 0.6039096117019653, + "num_tokens": 1424618.0, + "step": 56 + }, + { + "epoch": 0.006259609048978695, + "grad_norm": 7.871952056884766, + "learning_rate": 3.073545554335894e-08, + "loss": 1.5339, + "mean_token_accuracy": 0.5999553203582764, + "num_tokens": 1446508.0, + "step": 57 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 7.982438564300537, + "learning_rate": 3.128430296377607e-08, + "loss": 1.4884, + "mean_token_accuracy": 0.6124846339225769, + "num_tokens": 1468229.0, + "step": 58 + }, + { + "epoch": 0.006479244454206018, + "grad_norm": 7.512792587280273, + "learning_rate": 3.1833150384193193e-08, + "loss": 1.4891, + "mean_token_accuracy": 0.6167940497398376, + "num_tokens": 1492628.0, + "step": 59 + }, + { + "epoch": 0.006589062156819679, + "grad_norm": 6.567518711090088, + "learning_rate": 3.2381997804610315e-08, + "loss": 1.4591, + "mean_token_accuracy": 0.622778594493866, + "num_tokens": 1521752.0, + "step": 60 + }, + { + "epoch": 0.006698879859433341, + "grad_norm": 6.773961544036865, + "learning_rate": 3.293084522502744e-08, + "loss": 1.4189, + "mean_token_accuracy": 0.6260044574737549, + "num_tokens": 1547820.0, + "step": 61 + }, + { + "epoch": 0.006808697562047002, + "grad_norm": 6.898876667022705, + "learning_rate": 3.3479692645444566e-08, + "loss": 1.5109, + "mean_token_accuracy": 0.617505669593811, + "num_tokens": 1575562.0, + "step": 62 + }, + { + "epoch": 0.006918515264660663, + "grad_norm": 8.722769737243652, + "learning_rate": 3.402854006586169e-08, + "loss": 1.4633, + "mean_token_accuracy": 0.6232997179031372, + "num_tokens": 1595090.0, + "step": 63 + }, + { + "epoch": 0.007028332967274325, + "grad_norm": 7.860049724578857, + "learning_rate": 3.4577387486278816e-08, + "loss": 1.5114, + "mean_token_accuracy": 0.6112312078475952, + "num_tokens": 1616327.0, + "step": 64 + }, + { + "epoch": 0.007138150669887986, + "grad_norm": 7.5092854499816895, + "learning_rate": 3.512623490669594e-08, + "loss": 1.5882, + "mean_token_accuracy": 0.6025569438934326, + "num_tokens": 1640453.0, + "step": 65 + }, + { + "epoch": 0.007247968372501647, + "grad_norm": 6.770665168762207, + "learning_rate": 3.567508232711306e-08, + "loss": 1.4575, + "mean_token_accuracy": 0.6141598224639893, + "num_tokens": 1663864.0, + "step": 66 + }, + { + "epoch": 0.007357786075115309, + "grad_norm": 6.757351398468018, + "learning_rate": 3.622392974753018e-08, + "loss": 1.4927, + "mean_token_accuracy": 0.6107399463653564, + "num_tokens": 1690093.0, + "step": 67 + }, + { + "epoch": 0.0074676037777289695, + "grad_norm": 7.696837425231934, + "learning_rate": 3.677277716794731e-08, + "loss": 1.5559, + "mean_token_accuracy": 0.6027103662490845, + "num_tokens": 1713394.0, + "step": 68 + }, + { + "epoch": 0.007577421480342631, + "grad_norm": 6.835667610168457, + "learning_rate": 3.732162458836443e-08, + "loss": 1.5121, + "mean_token_accuracy": 0.6034413576126099, + "num_tokens": 1740709.0, + "step": 69 + }, + { + "epoch": 0.007687239182956293, + "grad_norm": 5.931532382965088, + "learning_rate": 3.787047200878156e-08, + "loss": 1.374, + "mean_token_accuracy": 0.6416874527931213, + "num_tokens": 1772103.0, + "step": 70 + }, + { + "epoch": 0.007797056885569954, + "grad_norm": 7.165716171264648, + "learning_rate": 3.841931942919868e-08, + "loss": 1.5623, + "mean_token_accuracy": 0.6083561182022095, + "num_tokens": 1795314.0, + "step": 71 + }, + { + "epoch": 0.007906874588183616, + "grad_norm": 7.169013023376465, + "learning_rate": 3.8968166849615804e-08, + "loss": 1.5972, + "mean_token_accuracy": 0.5887801647186279, + "num_tokens": 1821345.0, + "step": 72 + }, + { + "epoch": 0.008016692290797276, + "grad_norm": 7.234011173248291, + "learning_rate": 3.9517014270032926e-08, + "loss": 1.5597, + "mean_token_accuracy": 0.5945669412612915, + "num_tokens": 1845652.0, + "step": 73 + }, + { + "epoch": 0.008126509993410937, + "grad_norm": 7.248448371887207, + "learning_rate": 4.006586169045005e-08, + "loss": 1.4356, + "mean_token_accuracy": 0.6178821325302124, + "num_tokens": 1869249.0, + "step": 74 + }, + { + "epoch": 0.008236327696024599, + "grad_norm": 7.5012359619140625, + "learning_rate": 4.061470911086718e-08, + "loss": 1.5376, + "mean_token_accuracy": 0.5998717546463013, + "num_tokens": 1892143.0, + "step": 75 + }, + { + "epoch": 0.00834614539863826, + "grad_norm": 6.740009784698486, + "learning_rate": 4.1163556531284305e-08, + "loss": 1.4624, + "mean_token_accuracy": 0.6149723529815674, + "num_tokens": 1919911.0, + "step": 76 + }, + { + "epoch": 0.008455963101251922, + "grad_norm": 6.307705879211426, + "learning_rate": 4.1712403951701427e-08, + "loss": 1.4623, + "mean_token_accuracy": 0.6265270709991455, + "num_tokens": 1948473.0, + "step": 77 + }, + { + "epoch": 0.008565780803865584, + "grad_norm": 6.788443088531494, + "learning_rate": 4.226125137211855e-08, + "loss": 1.4568, + "mean_token_accuracy": 0.6229809522628784, + "num_tokens": 1973011.0, + "step": 78 + }, + { + "epoch": 0.008675598506479244, + "grad_norm": 6.373062610626221, + "learning_rate": 4.281009879253567e-08, + "loss": 1.5002, + "mean_token_accuracy": 0.6027998328208923, + "num_tokens": 2003086.0, + "step": 79 + }, + { + "epoch": 0.008785416209092905, + "grad_norm": 7.058175086975098, + "learning_rate": 4.335894621295279e-08, + "loss": 1.4295, + "mean_token_accuracy": 0.6263693571090698, + "num_tokens": 2026821.0, + "step": 80 + }, + { + "epoch": 0.008895233911706567, + "grad_norm": 6.273486137390137, + "learning_rate": 4.390779363336993e-08, + "loss": 1.5081, + "mean_token_accuracy": 0.6083534955978394, + "num_tokens": 2057977.0, + "step": 81 + }, + { + "epoch": 0.009005051614320228, + "grad_norm": 7.040412902832031, + "learning_rate": 4.445664105378705e-08, + "loss": 1.5528, + "mean_token_accuracy": 0.591522216796875, + "num_tokens": 2082919.0, + "step": 82 + }, + { + "epoch": 0.00911486931693389, + "grad_norm": 7.889280796051025, + "learning_rate": 4.500548847420417e-08, + "loss": 1.4533, + "mean_token_accuracy": 0.6240402460098267, + "num_tokens": 2103490.0, + "step": 83 + }, + { + "epoch": 0.009224687019547552, + "grad_norm": 6.279137134552002, + "learning_rate": 4.555433589462129e-08, + "loss": 1.417, + "mean_token_accuracy": 0.6489643454551697, + "num_tokens": 2128348.0, + "step": 84 + }, + { + "epoch": 0.009334504722161213, + "grad_norm": 7.138681411743164, + "learning_rate": 4.6103183315038415e-08, + "loss": 1.5835, + "mean_token_accuracy": 0.5870269536972046, + "num_tokens": 2153511.0, + "step": 85 + }, + { + "epoch": 0.009444322424774873, + "grad_norm": 7.571696758270264, + "learning_rate": 4.6652030735455537e-08, + "loss": 1.4576, + "mean_token_accuracy": 0.6219869256019592, + "num_tokens": 2174626.0, + "step": 86 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 6.562620639801025, + "learning_rate": 4.720087815587267e-08, + "loss": 1.4779, + "mean_token_accuracy": 0.6096662878990173, + "num_tokens": 2202358.0, + "step": 87 + }, + { + "epoch": 0.009663957830002196, + "grad_norm": 6.463425159454346, + "learning_rate": 4.7749725576289793e-08, + "loss": 1.4354, + "mean_token_accuracy": 0.6248286366462708, + "num_tokens": 2225849.0, + "step": 88 + }, + { + "epoch": 0.009773775532615858, + "grad_norm": 7.753304481506348, + "learning_rate": 4.8298572996706915e-08, + "loss": 1.4192, + "mean_token_accuracy": 0.6247750520706177, + "num_tokens": 2246305.0, + "step": 89 + }, + { + "epoch": 0.00988359323522952, + "grad_norm": 5.915476322174072, + "learning_rate": 4.884742041712404e-08, + "loss": 1.3859, + "mean_token_accuracy": 0.6341253519058228, + "num_tokens": 2274359.0, + "step": 90 + }, + { + "epoch": 0.009993410937843181, + "grad_norm": 6.269830226898193, + "learning_rate": 4.939626783754116e-08, + "loss": 1.4437, + "mean_token_accuracy": 0.6236516833305359, + "num_tokens": 2300391.0, + "step": 91 + }, + { + "epoch": 0.010103228640456841, + "grad_norm": 5.6357340812683105, + "learning_rate": 4.994511525795828e-08, + "loss": 1.389, + "mean_token_accuracy": 0.6331000924110413, + "num_tokens": 2328140.0, + "step": 92 + }, + { + "epoch": 0.010213046343070502, + "grad_norm": 5.845223426818848, + "learning_rate": 5.049396267837541e-08, + "loss": 1.3792, + "mean_token_accuracy": 0.6359796524047852, + "num_tokens": 2357038.0, + "step": 93 + }, + { + "epoch": 0.010322864045684164, + "grad_norm": 6.309408664703369, + "learning_rate": 5.104281009879254e-08, + "loss": 1.4378, + "mean_token_accuracy": 0.6149685978889465, + "num_tokens": 2381995.0, + "step": 94 + }, + { + "epoch": 0.010432681748297826, + "grad_norm": 6.17453670501709, + "learning_rate": 5.159165751920966e-08, + "loss": 1.3921, + "mean_token_accuracy": 0.6280044317245483, + "num_tokens": 2408629.0, + "step": 95 + }, + { + "epoch": 0.010542499450911487, + "grad_norm": 6.9587483406066895, + "learning_rate": 5.214050493962678e-08, + "loss": 1.4491, + "mean_token_accuracy": 0.6178081631660461, + "num_tokens": 2431863.0, + "step": 96 + }, + { + "epoch": 0.010652317153525149, + "grad_norm": 6.0525221824646, + "learning_rate": 5.2689352360043903e-08, + "loss": 1.4638, + "mean_token_accuracy": 0.6104514598846436, + "num_tokens": 2461297.0, + "step": 97 + }, + { + "epoch": 0.010762134856138809, + "grad_norm": 7.247806549072266, + "learning_rate": 5.3238199780461025e-08, + "loss": 1.4948, + "mean_token_accuracy": 0.627984881401062, + "num_tokens": 2480297.0, + "step": 98 + }, + { + "epoch": 0.01087195255875247, + "grad_norm": 6.950901508331299, + "learning_rate": 5.3787047200878154e-08, + "loss": 1.5023, + "mean_token_accuracy": 0.6074822545051575, + "num_tokens": 2504118.0, + "step": 99 + }, + { + "epoch": 0.010981770261366132, + "grad_norm": 6.4474992752075195, + "learning_rate": 5.433589462129528e-08, + "loss": 1.4877, + "mean_token_accuracy": 0.6115164756774902, + "num_tokens": 2528086.0, + "step": 100 + }, + { + "epoch": 0.011091587963979794, + "grad_norm": 6.206024169921875, + "learning_rate": 5.4884742041712404e-08, + "loss": 1.428, + "mean_token_accuracy": 0.6295562386512756, + "num_tokens": 2553486.0, + "step": 101 + }, + { + "epoch": 0.011201405666593455, + "grad_norm": 6.637907028198242, + "learning_rate": 5.5433589462129526e-08, + "loss": 1.4751, + "mean_token_accuracy": 0.6146992444992065, + "num_tokens": 2578664.0, + "step": 102 + }, + { + "epoch": 0.011311223369207117, + "grad_norm": 7.163166522979736, + "learning_rate": 5.598243688254665e-08, + "loss": 1.5806, + "mean_token_accuracy": 0.5960236191749573, + "num_tokens": 2601508.0, + "step": 103 + }, + { + "epoch": 0.011421041071820778, + "grad_norm": 8.087705612182617, + "learning_rate": 5.653128430296377e-08, + "loss": 1.5074, + "mean_token_accuracy": 0.6071646213531494, + "num_tokens": 2620659.0, + "step": 104 + }, + { + "epoch": 0.011530858774434438, + "grad_norm": 6.997490882873535, + "learning_rate": 5.70801317233809e-08, + "loss": 1.526, + "mean_token_accuracy": 0.6032649278640747, + "num_tokens": 2645067.0, + "step": 105 + }, + { + "epoch": 0.0116406764770481, + "grad_norm": 6.638916492462158, + "learning_rate": 5.762897914379802e-08, + "loss": 1.3867, + "mean_token_accuracy": 0.6285203695297241, + "num_tokens": 2669425.0, + "step": 106 + }, + { + "epoch": 0.011750494179661761, + "grad_norm": 7.619155406951904, + "learning_rate": 5.817782656421515e-08, + "loss": 1.38, + "mean_token_accuracy": 0.6349570155143738, + "num_tokens": 2689160.0, + "step": 107 + }, + { + "epoch": 0.011860311882275423, + "grad_norm": 8.149062156677246, + "learning_rate": 5.872667398463227e-08, + "loss": 1.4228, + "mean_token_accuracy": 0.6309859752655029, + "num_tokens": 2707923.0, + "step": 108 + }, + { + "epoch": 0.011970129584889085, + "grad_norm": 6.87787389755249, + "learning_rate": 5.927552140504939e-08, + "loss": 1.4303, + "mean_token_accuracy": 0.62212073802948, + "num_tokens": 2732891.0, + "step": 109 + }, + { + "epoch": 0.012079947287502746, + "grad_norm": 7.5543212890625, + "learning_rate": 5.982436882546651e-08, + "loss": 1.4833, + "mean_token_accuracy": 0.6150633096694946, + "num_tokens": 2754996.0, + "step": 110 + }, + { + "epoch": 0.012189764990116406, + "grad_norm": 6.50202751159668, + "learning_rate": 6.037321624588364e-08, + "loss": 1.5073, + "mean_token_accuracy": 0.6082611083984375, + "num_tokens": 2783881.0, + "step": 111 + }, + { + "epoch": 0.012299582692730068, + "grad_norm": 6.443778038024902, + "learning_rate": 6.092206366630077e-08, + "loss": 1.4383, + "mean_token_accuracy": 0.6146246194839478, + "num_tokens": 2811294.0, + "step": 112 + }, + { + "epoch": 0.01240940039534373, + "grad_norm": 6.748902797698975, + "learning_rate": 6.147091108671789e-08, + "loss": 1.3802, + "mean_token_accuracy": 0.6329888105392456, + "num_tokens": 2833565.0, + "step": 113 + }, + { + "epoch": 0.01251921809795739, + "grad_norm": 6.903865814208984, + "learning_rate": 6.201975850713501e-08, + "loss": 1.3627, + "mean_token_accuracy": 0.6401166915893555, + "num_tokens": 2856557.0, + "step": 114 + }, + { + "epoch": 0.012629035800571052, + "grad_norm": 6.116842269897461, + "learning_rate": 6.256860592755214e-08, + "loss": 1.4116, + "mean_token_accuracy": 0.6253392696380615, + "num_tokens": 2882440.0, + "step": 115 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 6.086690425872803, + "learning_rate": 6.311745334796927e-08, + "loss": 1.3775, + "mean_token_accuracy": 0.6288226842880249, + "num_tokens": 2911246.0, + "step": 116 + }, + { + "epoch": 0.012848671205798374, + "grad_norm": 7.457494735717773, + "learning_rate": 6.366630076838639e-08, + "loss": 1.4282, + "mean_token_accuracy": 0.6172651052474976, + "num_tokens": 2932948.0, + "step": 117 + }, + { + "epoch": 0.012958488908412035, + "grad_norm": 7.08140230178833, + "learning_rate": 6.421514818880352e-08, + "loss": 1.4533, + "mean_token_accuracy": 0.6085711717605591, + "num_tokens": 2955087.0, + "step": 118 + }, + { + "epoch": 0.013068306611025697, + "grad_norm": 6.994325160980225, + "learning_rate": 6.476399560922063e-08, + "loss": 1.4029, + "mean_token_accuracy": 0.6318500638008118, + "num_tokens": 2977999.0, + "step": 119 + }, + { + "epoch": 0.013178124313639359, + "grad_norm": 6.275083065032959, + "learning_rate": 6.531284302963776e-08, + "loss": 1.4449, + "mean_token_accuracy": 0.6135345697402954, + "num_tokens": 3005403.0, + "step": 120 + }, + { + "epoch": 0.01328794201625302, + "grad_norm": 5.475348949432373, + "learning_rate": 6.586169045005487e-08, + "loss": 1.262, + "mean_token_accuracy": 0.6556241512298584, + "num_tokens": 3033027.0, + "step": 121 + }, + { + "epoch": 0.013397759718866682, + "grad_norm": 5.8613057136535645, + "learning_rate": 6.6410537870472e-08, + "loss": 1.5076, + "mean_token_accuracy": 0.6015868782997131, + "num_tokens": 3059292.0, + "step": 122 + }, + { + "epoch": 0.013507577421480343, + "grad_norm": 5.903887748718262, + "learning_rate": 6.695938529088913e-08, + "loss": 1.393, + "mean_token_accuracy": 0.6286495923995972, + "num_tokens": 3084239.0, + "step": 123 + }, + { + "epoch": 0.013617395124094003, + "grad_norm": 6.065432548522949, + "learning_rate": 6.750823271130625e-08, + "loss": 1.3774, + "mean_token_accuracy": 0.6297359466552734, + "num_tokens": 3107956.0, + "step": 124 + }, + { + "epoch": 0.013727212826707665, + "grad_norm": 5.490118503570557, + "learning_rate": 6.805708013172338e-08, + "loss": 1.408, + "mean_token_accuracy": 0.6166077852249146, + "num_tokens": 3136403.0, + "step": 125 + }, + { + "epoch": 0.013837030529321327, + "grad_norm": 5.179423809051514, + "learning_rate": 6.86059275521405e-08, + "loss": 1.3679, + "mean_token_accuracy": 0.629126787185669, + "num_tokens": 3162556.0, + "step": 126 + }, + { + "epoch": 0.013946848231934988, + "grad_norm": 5.615283966064453, + "learning_rate": 6.915477497255763e-08, + "loss": 1.2855, + "mean_token_accuracy": 0.6525399684906006, + "num_tokens": 3184722.0, + "step": 127 + }, + { + "epoch": 0.01405666593454865, + "grad_norm": 5.599029064178467, + "learning_rate": 6.970362239297475e-08, + "loss": 1.4025, + "mean_token_accuracy": 0.6259146332740784, + "num_tokens": 3208070.0, + "step": 128 + }, + { + "epoch": 0.014166483637162311, + "grad_norm": 5.678755283355713, + "learning_rate": 7.025246981339188e-08, + "loss": 1.4675, + "mean_token_accuracy": 0.6107256412506104, + "num_tokens": 3232456.0, + "step": 129 + }, + { + "epoch": 0.014276301339775971, + "grad_norm": 5.4064860343933105, + "learning_rate": 7.0801317233809e-08, + "loss": 1.3396, + "mean_token_accuracy": 0.6464923620223999, + "num_tokens": 3258860.0, + "step": 130 + }, + { + "epoch": 0.014386119042389633, + "grad_norm": 4.852438449859619, + "learning_rate": 7.135016465422612e-08, + "loss": 1.3949, + "mean_token_accuracy": 0.6333706378936768, + "num_tokens": 3289054.0, + "step": 131 + }, + { + "epoch": 0.014495936745003294, + "grad_norm": 5.281308650970459, + "learning_rate": 7.189901207464325e-08, + "loss": 1.4658, + "mean_token_accuracy": 0.6093812584877014, + "num_tokens": 3315642.0, + "step": 132 + }, + { + "epoch": 0.014605754447616956, + "grad_norm": 5.316889762878418, + "learning_rate": 7.244785949506036e-08, + "loss": 1.4364, + "mean_token_accuracy": 0.6028910875320435, + "num_tokens": 3339714.0, + "step": 133 + }, + { + "epoch": 0.014715572150230618, + "grad_norm": 4.456579685211182, + "learning_rate": 7.299670691547749e-08, + "loss": 1.4203, + "mean_token_accuracy": 0.6197012662887573, + "num_tokens": 3372857.0, + "step": 134 + }, + { + "epoch": 0.01482538985284428, + "grad_norm": 5.125270366668701, + "learning_rate": 7.354555433589462e-08, + "loss": 1.3733, + "mean_token_accuracy": 0.6252163052558899, + "num_tokens": 3399496.0, + "step": 135 + }, + { + "epoch": 0.014935207555457939, + "grad_norm": 5.647879123687744, + "learning_rate": 7.409440175631174e-08, + "loss": 1.426, + "mean_token_accuracy": 0.6099004745483398, + "num_tokens": 3422639.0, + "step": 136 + }, + { + "epoch": 0.0150450252580716, + "grad_norm": 5.531869888305664, + "learning_rate": 7.464324917672886e-08, + "loss": 1.4518, + "mean_token_accuracy": 0.6105077862739563, + "num_tokens": 3446379.0, + "step": 137 + }, + { + "epoch": 0.015154842960685262, + "grad_norm": 5.062038421630859, + "learning_rate": 7.519209659714599e-08, + "loss": 1.4172, + "mean_token_accuracy": 0.6107718348503113, + "num_tokens": 3473872.0, + "step": 138 + }, + { + "epoch": 0.015264660663298924, + "grad_norm": 4.627781391143799, + "learning_rate": 7.574094401756312e-08, + "loss": 1.4201, + "mean_token_accuracy": 0.6220059394836426, + "num_tokens": 3504220.0, + "step": 139 + }, + { + "epoch": 0.015374478365912585, + "grad_norm": 4.901545524597168, + "learning_rate": 7.628979143798024e-08, + "loss": 1.2756, + "mean_token_accuracy": 0.6505841016769409, + "num_tokens": 3526440.0, + "step": 140 + }, + { + "epoch": 0.015484296068526247, + "grad_norm": 5.302318096160889, + "learning_rate": 7.683863885839736e-08, + "loss": 1.304, + "mean_token_accuracy": 0.6431175470352173, + "num_tokens": 3547523.0, + "step": 141 + }, + { + "epoch": 0.015594113771139909, + "grad_norm": 5.305556774139404, + "learning_rate": 7.738748627881449e-08, + "loss": 1.3291, + "mean_token_accuracy": 0.6309767365455627, + "num_tokens": 3570441.0, + "step": 142 + }, + { + "epoch": 0.01570393147375357, + "grad_norm": 5.277497291564941, + "learning_rate": 7.793633369923161e-08, + "loss": 1.361, + "mean_token_accuracy": 0.6416422724723816, + "num_tokens": 3593915.0, + "step": 143 + }, + { + "epoch": 0.015813749176367232, + "grad_norm": 5.115762233734131, + "learning_rate": 7.848518111964874e-08, + "loss": 1.3022, + "mean_token_accuracy": 0.6345009803771973, + "num_tokens": 3617422.0, + "step": 144 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 5.243889808654785, + "learning_rate": 7.903402854006585e-08, + "loss": 1.3681, + "mean_token_accuracy": 0.6282845139503479, + "num_tokens": 3638524.0, + "step": 145 + }, + { + "epoch": 0.01603338458159455, + "grad_norm": 5.173805236816406, + "learning_rate": 7.958287596048298e-08, + "loss": 1.4134, + "mean_token_accuracy": 0.613021969795227, + "num_tokens": 3658569.0, + "step": 146 + }, + { + "epoch": 0.016143202284208215, + "grad_norm": 5.573866367340088, + "learning_rate": 8.01317233809001e-08, + "loss": 1.3458, + "mean_token_accuracy": 0.6362459659576416, + "num_tokens": 3678862.0, + "step": 147 + }, + { + "epoch": 0.016253019986821875, + "grad_norm": 4.206975936889648, + "learning_rate": 8.068057080131722e-08, + "loss": 1.3648, + "mean_token_accuracy": 0.6230988502502441, + "num_tokens": 3709194.0, + "step": 148 + }, + { + "epoch": 0.016362837689435538, + "grad_norm": 5.222945690155029, + "learning_rate": 8.122941822173437e-08, + "loss": 1.3564, + "mean_token_accuracy": 0.6337326765060425, + "num_tokens": 3730221.0, + "step": 149 + }, + { + "epoch": 0.016472655392049198, + "grad_norm": 4.513251781463623, + "learning_rate": 8.177826564215148e-08, + "loss": 1.3656, + "mean_token_accuracy": 0.6328703165054321, + "num_tokens": 3757609.0, + "step": 150 + }, + { + "epoch": 0.01658247309466286, + "grad_norm": 4.579383373260498, + "learning_rate": 8.232711306256861e-08, + "loss": 1.4308, + "mean_token_accuracy": 0.6067019104957581, + "num_tokens": 3786349.0, + "step": 151 + }, + { + "epoch": 0.01669229079727652, + "grad_norm": 4.8106207847595215, + "learning_rate": 8.287596048298572e-08, + "loss": 1.289, + "mean_token_accuracy": 0.6492617130279541, + "num_tokens": 3810687.0, + "step": 152 + }, + { + "epoch": 0.01680210849989018, + "grad_norm": 4.604519844055176, + "learning_rate": 8.342480790340285e-08, + "loss": 1.3119, + "mean_token_accuracy": 0.6488734483718872, + "num_tokens": 3835709.0, + "step": 153 + }, + { + "epoch": 0.016911926202503844, + "grad_norm": 4.522201061248779, + "learning_rate": 8.397365532381998e-08, + "loss": 1.3547, + "mean_token_accuracy": 0.6320141553878784, + "num_tokens": 3863035.0, + "step": 154 + }, + { + "epoch": 0.017021743905117504, + "grad_norm": 4.3953986167907715, + "learning_rate": 8.45225027442371e-08, + "loss": 1.3934, + "mean_token_accuracy": 0.6206352710723877, + "num_tokens": 3894426.0, + "step": 155 + }, + { + "epoch": 0.017131561607731167, + "grad_norm": 4.595627784729004, + "learning_rate": 8.507135016465423e-08, + "loss": 1.2667, + "mean_token_accuracy": 0.6531518697738647, + "num_tokens": 3917852.0, + "step": 156 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 4.625207901000977, + "learning_rate": 8.562019758507134e-08, + "loss": 1.3909, + "mean_token_accuracy": 0.6255594491958618, + "num_tokens": 3945651.0, + "step": 157 + }, + { + "epoch": 0.017351197012958487, + "grad_norm": 4.71535587310791, + "learning_rate": 8.616904500548847e-08, + "loss": 1.3252, + "mean_token_accuracy": 0.6380660533905029, + "num_tokens": 3967753.0, + "step": 158 + }, + { + "epoch": 0.01746101471557215, + "grad_norm": 4.766262054443359, + "learning_rate": 8.671789242590558e-08, + "loss": 1.3811, + "mean_token_accuracy": 0.6230779886245728, + "num_tokens": 3992733.0, + "step": 159 + }, + { + "epoch": 0.01757083241818581, + "grad_norm": 5.358283042907715, + "learning_rate": 8.726673984632271e-08, + "loss": 1.4062, + "mean_token_accuracy": 0.6199740171432495, + "num_tokens": 4015940.0, + "step": 160 + }, + { + "epoch": 0.017680650120799474, + "grad_norm": 4.369855880737305, + "learning_rate": 8.781558726673985e-08, + "loss": 1.2935, + "mean_token_accuracy": 0.649287760257721, + "num_tokens": 4044575.0, + "step": 161 + }, + { + "epoch": 0.017790467823413134, + "grad_norm": 4.304438591003418, + "learning_rate": 8.836443468715697e-08, + "loss": 1.3677, + "mean_token_accuracy": 0.6290391087532043, + "num_tokens": 4071329.0, + "step": 162 + }, + { + "epoch": 0.017900285526026797, + "grad_norm": 4.542998313903809, + "learning_rate": 8.89132821075741e-08, + "loss": 1.3757, + "mean_token_accuracy": 0.6220839023590088, + "num_tokens": 4095862.0, + "step": 163 + }, + { + "epoch": 0.018010103228640457, + "grad_norm": 4.446436882019043, + "learning_rate": 8.946212952799121e-08, + "loss": 1.3158, + "mean_token_accuracy": 0.6491161584854126, + "num_tokens": 4122957.0, + "step": 164 + }, + { + "epoch": 0.018119920931254117, + "grad_norm": 4.296895503997803, + "learning_rate": 9.001097694840834e-08, + "loss": 1.323, + "mean_token_accuracy": 0.6403034925460815, + "num_tokens": 4152547.0, + "step": 165 + }, + { + "epoch": 0.01822973863386778, + "grad_norm": 4.061207294464111, + "learning_rate": 9.055982436882546e-08, + "loss": 1.4046, + "mean_token_accuracy": 0.6189900636672974, + "num_tokens": 4181502.0, + "step": 166 + }, + { + "epoch": 0.01833955633648144, + "grad_norm": 4.634115219116211, + "learning_rate": 9.110867178924259e-08, + "loss": 1.4276, + "mean_token_accuracy": 0.6123782396316528, + "num_tokens": 4209452.0, + "step": 167 + }, + { + "epoch": 0.018449374039095103, + "grad_norm": 5.494330883026123, + "learning_rate": 9.165751920965971e-08, + "loss": 1.283, + "mean_token_accuracy": 0.6388599872589111, + "num_tokens": 4229250.0, + "step": 168 + }, + { + "epoch": 0.018559191741708763, + "grad_norm": 4.770991802215576, + "learning_rate": 9.220636663007683e-08, + "loss": 1.3786, + "mean_token_accuracy": 0.617233157157898, + "num_tokens": 4250923.0, + "step": 169 + }, + { + "epoch": 0.018669009444322426, + "grad_norm": 5.574711799621582, + "learning_rate": 9.275521405049396e-08, + "loss": 1.2419, + "mean_token_accuracy": 0.6618347764015198, + "num_tokens": 4267278.0, + "step": 170 + }, + { + "epoch": 0.018778827146936086, + "grad_norm": 5.327880382537842, + "learning_rate": 9.330406147091107e-08, + "loss": 1.3362, + "mean_token_accuracy": 0.635615348815918, + "num_tokens": 4285984.0, + "step": 171 + }, + { + "epoch": 0.018888644849549746, + "grad_norm": 4.421748638153076, + "learning_rate": 9.38529088913282e-08, + "loss": 1.3735, + "mean_token_accuracy": 0.6183093190193176, + "num_tokens": 4315041.0, + "step": 172 + }, + { + "epoch": 0.01899846255216341, + "grad_norm": 4.207748889923096, + "learning_rate": 9.440175631174534e-08, + "loss": 1.3557, + "mean_token_accuracy": 0.6276798248291016, + "num_tokens": 4338592.0, + "step": 173 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 4.029463768005371, + "learning_rate": 9.495060373216246e-08, + "loss": 1.2287, + "mean_token_accuracy": 0.6564114093780518, + "num_tokens": 4364189.0, + "step": 174 + }, + { + "epoch": 0.019218097957390733, + "grad_norm": 4.710605621337891, + "learning_rate": 9.549945115257959e-08, + "loss": 1.3004, + "mean_token_accuracy": 0.643635630607605, + "num_tokens": 4384563.0, + "step": 175 + }, + { + "epoch": 0.019327915660004392, + "grad_norm": 4.0110063552856445, + "learning_rate": 9.60482985729967e-08, + "loss": 1.334, + "mean_token_accuracy": 0.6402037739753723, + "num_tokens": 4412563.0, + "step": 176 + }, + { + "epoch": 0.019437733362618052, + "grad_norm": 4.329104423522949, + "learning_rate": 9.659714599341383e-08, + "loss": 1.332, + "mean_token_accuracy": 0.6369059085845947, + "num_tokens": 4434835.0, + "step": 177 + }, + { + "epoch": 0.019547551065231716, + "grad_norm": 3.8363037109375, + "learning_rate": 9.714599341383095e-08, + "loss": 1.3574, + "mean_token_accuracy": 0.6282293200492859, + "num_tokens": 4463604.0, + "step": 178 + }, + { + "epoch": 0.019657368767845376, + "grad_norm": 3.7040507793426514, + "learning_rate": 9.769484083424807e-08, + "loss": 1.2924, + "mean_token_accuracy": 0.6484848260879517, + "num_tokens": 4491337.0, + "step": 179 + }, + { + "epoch": 0.01976718647045904, + "grad_norm": 4.3339056968688965, + "learning_rate": 9.82436882546652e-08, + "loss": 1.3607, + "mean_token_accuracy": 0.6251245737075806, + "num_tokens": 4513783.0, + "step": 180 + }, + { + "epoch": 0.0198770041730727, + "grad_norm": 3.7627763748168945, + "learning_rate": 9.879253567508232e-08, + "loss": 1.3004, + "mean_token_accuracy": 0.6489370465278625, + "num_tokens": 4543452.0, + "step": 181 + }, + { + "epoch": 0.019986821875686362, + "grad_norm": 4.027946949005127, + "learning_rate": 9.934138309549945e-08, + "loss": 1.2874, + "mean_token_accuracy": 0.6431794166564941, + "num_tokens": 4574392.0, + "step": 182 + }, + { + "epoch": 0.020096639578300022, + "grad_norm": 3.976728677749634, + "learning_rate": 9.989023051591656e-08, + "loss": 1.3181, + "mean_token_accuracy": 0.6343864798545837, + "num_tokens": 4603971.0, + "step": 183 + }, + { + "epoch": 0.020206457280913682, + "grad_norm": 3.9191699028015137, + "learning_rate": 1.0043907793633369e-07, + "loss": 1.3257, + "mean_token_accuracy": 0.6358782649040222, + "num_tokens": 4634768.0, + "step": 184 + }, + { + "epoch": 0.020316274983527345, + "grad_norm": 4.783876419067383, + "learning_rate": 1.0098792535675082e-07, + "loss": 1.2891, + "mean_token_accuracy": 0.6473667621612549, + "num_tokens": 4657230.0, + "step": 185 + }, + { + "epoch": 0.020426092686141005, + "grad_norm": 5.707295894622803, + "learning_rate": 1.0153677277716795e-07, + "loss": 1.2956, + "mean_token_accuracy": 0.6455444097518921, + "num_tokens": 4674380.0, + "step": 186 + }, + { + "epoch": 0.02053591038875467, + "grad_norm": 4.204931259155273, + "learning_rate": 1.0208562019758508e-07, + "loss": 1.2855, + "mean_token_accuracy": 0.6430768966674805, + "num_tokens": 4697452.0, + "step": 187 + }, + { + "epoch": 0.020645728091368328, + "grad_norm": 4.3520121574401855, + "learning_rate": 1.0263446761800219e-07, + "loss": 1.3149, + "mean_token_accuracy": 0.639366626739502, + "num_tokens": 4721904.0, + "step": 188 + }, + { + "epoch": 0.02075554579398199, + "grad_norm": 4.06973934173584, + "learning_rate": 1.0318331503841932e-07, + "loss": 1.3084, + "mean_token_accuracy": 0.640625536441803, + "num_tokens": 4748101.0, + "step": 189 + }, + { + "epoch": 0.02086536349659565, + "grad_norm": 5.341164588928223, + "learning_rate": 1.0373216245883643e-07, + "loss": 1.4645, + "mean_token_accuracy": 0.6180014610290527, + "num_tokens": 4770179.0, + "step": 190 + }, + { + "epoch": 0.02097518119920931, + "grad_norm": 4.7698845863342285, + "learning_rate": 1.0428100987925356e-07, + "loss": 1.2922, + "mean_token_accuracy": 0.6438108086585999, + "num_tokens": 4796964.0, + "step": 191 + }, + { + "epoch": 0.021084998901822975, + "grad_norm": 4.403766632080078, + "learning_rate": 1.0482985729967068e-07, + "loss": 1.3328, + "mean_token_accuracy": 0.6321154832839966, + "num_tokens": 4825274.0, + "step": 192 + }, + { + "epoch": 0.021194816604436634, + "grad_norm": 3.625188112258911, + "learning_rate": 1.0537870472008781e-07, + "loss": 1.347, + "mean_token_accuracy": 0.6316443085670471, + "num_tokens": 4857586.0, + "step": 193 + }, + { + "epoch": 0.021304634307050298, + "grad_norm": 4.827327251434326, + "learning_rate": 1.0592755214050494e-07, + "loss": 1.4174, + "mean_token_accuracy": 0.614277184009552, + "num_tokens": 4880996.0, + "step": 194 + }, + { + "epoch": 0.021414452009663958, + "grad_norm": 4.4447808265686035, + "learning_rate": 1.0647639956092205e-07, + "loss": 1.2832, + "mean_token_accuracy": 0.6444212794303894, + "num_tokens": 4902727.0, + "step": 195 + }, + { + "epoch": 0.021524269712277617, + "grad_norm": 4.409107685089111, + "learning_rate": 1.0702524698133918e-07, + "loss": 1.3106, + "mean_token_accuracy": 0.6436893939971924, + "num_tokens": 4925696.0, + "step": 196 + }, + { + "epoch": 0.02163408741489128, + "grad_norm": 4.1312408447265625, + "learning_rate": 1.0757409440175631e-07, + "loss": 1.2877, + "mean_token_accuracy": 0.6438153982162476, + "num_tokens": 4953004.0, + "step": 197 + }, + { + "epoch": 0.02174390511750494, + "grad_norm": 4.932766914367676, + "learning_rate": 1.0812294182217344e-07, + "loss": 1.2931, + "mean_token_accuracy": 0.6434406042098999, + "num_tokens": 4977360.0, + "step": 198 + }, + { + "epoch": 0.021853722820118604, + "grad_norm": 7.592950820922852, + "learning_rate": 1.0867178924259056e-07, + "loss": 1.2138, + "mean_token_accuracy": 0.6545286178588867, + "num_tokens": 4993457.0, + "step": 199 + }, + { + "epoch": 0.021963540522732264, + "grad_norm": 4.2755303382873535, + "learning_rate": 1.0922063666300768e-07, + "loss": 1.3319, + "mean_token_accuracy": 0.6288120746612549, + "num_tokens": 5019910.0, + "step": 200 + }, + { + "epoch": 0.022073358225345927, + "grad_norm": 4.182487964630127, + "learning_rate": 1.0976948408342481e-07, + "loss": 1.3303, + "mean_token_accuracy": 0.6255499124526978, + "num_tokens": 5046396.0, + "step": 201 + }, + { + "epoch": 0.022183175927959587, + "grad_norm": 4.508297920227051, + "learning_rate": 1.1031833150384192e-07, + "loss": 1.3602, + "mean_token_accuracy": 0.6210637092590332, + "num_tokens": 5073671.0, + "step": 202 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 4.1902008056640625, + "learning_rate": 1.1086717892425905e-07, + "loss": 1.1471, + "mean_token_accuracy": 0.6689696311950684, + "num_tokens": 5097947.0, + "step": 203 + }, + { + "epoch": 0.02240281133318691, + "grad_norm": 5.037697792053223, + "learning_rate": 1.1141602634467617e-07, + "loss": 1.2787, + "mean_token_accuracy": 0.642744779586792, + "num_tokens": 5117643.0, + "step": 204 + }, + { + "epoch": 0.02251262903580057, + "grad_norm": 3.8362958431243896, + "learning_rate": 1.119648737650933e-07, + "loss": 1.3391, + "mean_token_accuracy": 0.6328462362289429, + "num_tokens": 5145133.0, + "step": 205 + }, + { + "epoch": 0.022622446738414233, + "grad_norm": 4.749488830566406, + "learning_rate": 1.1251372118551042e-07, + "loss": 1.3117, + "mean_token_accuracy": 0.6341179609298706, + "num_tokens": 5172065.0, + "step": 206 + }, + { + "epoch": 0.022732264441027893, + "grad_norm": 4.55946683883667, + "learning_rate": 1.1306256860592754e-07, + "loss": 1.327, + "mean_token_accuracy": 0.6360080242156982, + "num_tokens": 5198021.0, + "step": 207 + }, + { + "epoch": 0.022842082143641557, + "grad_norm": 4.376400470733643, + "learning_rate": 1.1361141602634467e-07, + "loss": 1.3287, + "mean_token_accuracy": 0.637261152267456, + "num_tokens": 5223375.0, + "step": 208 + }, + { + "epoch": 0.022951899846255217, + "grad_norm": 4.690371036529541, + "learning_rate": 1.141602634467618e-07, + "loss": 1.3157, + "mean_token_accuracy": 0.6370708346366882, + "num_tokens": 5251623.0, + "step": 209 + }, + { + "epoch": 0.023061717548868876, + "grad_norm": 4.438716411590576, + "learning_rate": 1.1470911086717892e-07, + "loss": 1.3481, + "mean_token_accuracy": 0.631139874458313, + "num_tokens": 5278640.0, + "step": 210 + }, + { + "epoch": 0.02317153525148254, + "grad_norm": 4.738504886627197, + "learning_rate": 1.1525795828759604e-07, + "loss": 1.2389, + "mean_token_accuracy": 0.6536725759506226, + "num_tokens": 5304850.0, + "step": 211 + }, + { + "epoch": 0.0232813529540962, + "grad_norm": 4.956092357635498, + "learning_rate": 1.1580680570801317e-07, + "loss": 1.3218, + "mean_token_accuracy": 0.6314120292663574, + "num_tokens": 5334523.0, + "step": 212 + }, + { + "epoch": 0.023391170656709863, + "grad_norm": 5.150985240936279, + "learning_rate": 1.163556531284303e-07, + "loss": 1.3705, + "mean_token_accuracy": 0.6374960541725159, + "num_tokens": 5361416.0, + "step": 213 + }, + { + "epoch": 0.023500988359323523, + "grad_norm": 6.7626214027404785, + "learning_rate": 1.1690450054884741e-07, + "loss": 1.245, + "mean_token_accuracy": 0.6542019844055176, + "num_tokens": 5383620.0, + "step": 214 + }, + { + "epoch": 0.023610806061937183, + "grad_norm": 4.786448955535889, + "learning_rate": 1.1745334796926454e-07, + "loss": 1.2855, + "mean_token_accuracy": 0.6434435844421387, + "num_tokens": 5414290.0, + "step": 215 + }, + { + "epoch": 0.023720623764550846, + "grad_norm": 5.765920639038086, + "learning_rate": 1.1800219538968166e-07, + "loss": 1.233, + "mean_token_accuracy": 0.6527726054191589, + "num_tokens": 5434133.0, + "step": 216 + }, + { + "epoch": 0.023830441467164506, + "grad_norm": 5.286128044128418, + "learning_rate": 1.1855104281009878e-07, + "loss": 1.3513, + "mean_token_accuracy": 0.6311407685279846, + "num_tokens": 5465986.0, + "step": 217 + }, + { + "epoch": 0.02394025916977817, + "grad_norm": 4.600885391235352, + "learning_rate": 1.1909989023051591e-07, + "loss": 1.2532, + "mean_token_accuracy": 0.6518765687942505, + "num_tokens": 5492258.0, + "step": 218 + }, + { + "epoch": 0.02405007687239183, + "grad_norm": 5.613969802856445, + "learning_rate": 1.1964873765093303e-07, + "loss": 1.2871, + "mean_token_accuracy": 0.643208384513855, + "num_tokens": 5520234.0, + "step": 219 + }, + { + "epoch": 0.024159894575005492, + "grad_norm": 6.820932865142822, + "learning_rate": 1.2019758507135017e-07, + "loss": 1.2958, + "mean_token_accuracy": 0.6391181945800781, + "num_tokens": 5540608.0, + "step": 220 + }, + { + "epoch": 0.024269712277619152, + "grad_norm": 4.330109119415283, + "learning_rate": 1.2074643249176729e-07, + "loss": 1.2483, + "mean_token_accuracy": 0.6538245677947998, + "num_tokens": 5571987.0, + "step": 221 + }, + { + "epoch": 0.024379529980232812, + "grad_norm": 5.116732597351074, + "learning_rate": 1.212952799121844e-07, + "loss": 1.2873, + "mean_token_accuracy": 0.6501505374908447, + "num_tokens": 5596986.0, + "step": 222 + }, + { + "epoch": 0.024489347682846475, + "grad_norm": 6.206184387207031, + "learning_rate": 1.2184412733260154e-07, + "loss": 1.2054, + "mean_token_accuracy": 0.6714794635772705, + "num_tokens": 5616453.0, + "step": 223 + }, + { + "epoch": 0.024599165385460135, + "grad_norm": 6.674288749694824, + "learning_rate": 1.2239297475301866e-07, + "loss": 1.1611, + "mean_token_accuracy": 0.6654064059257507, + "num_tokens": 5638313.0, + "step": 224 + }, + { + "epoch": 0.0247089830880738, + "grad_norm": 5.596703052520752, + "learning_rate": 1.2294182217343577e-07, + "loss": 1.2304, + "mean_token_accuracy": 0.6464753150939941, + "num_tokens": 5657936.0, + "step": 225 + }, + { + "epoch": 0.02481880079068746, + "grad_norm": 5.9812164306640625, + "learning_rate": 1.2349066959385291e-07, + "loss": 1.235, + "mean_token_accuracy": 0.6520105600357056, + "num_tokens": 5685761.0, + "step": 226 + }, + { + "epoch": 0.024928618493301122, + "grad_norm": 5.022336959838867, + "learning_rate": 1.2403951701427003e-07, + "loss": 1.3169, + "mean_token_accuracy": 0.6200309991836548, + "num_tokens": 5714291.0, + "step": 227 + }, + { + "epoch": 0.02503843619591478, + "grad_norm": 6.815615653991699, + "learning_rate": 1.2458836443468714e-07, + "loss": 1.2159, + "mean_token_accuracy": 0.6591601371765137, + "num_tokens": 5736082.0, + "step": 228 + }, + { + "epoch": 0.02514825389852844, + "grad_norm": 4.524533271789551, + "learning_rate": 1.2513721185510429e-07, + "loss": 1.1757, + "mean_token_accuracy": 0.6709659099578857, + "num_tokens": 5762583.0, + "step": 229 + }, + { + "epoch": 0.025258071601142105, + "grad_norm": 4.4044294357299805, + "learning_rate": 1.256860592755214e-07, + "loss": 1.2184, + "mean_token_accuracy": 0.662255048751831, + "num_tokens": 5791237.0, + "step": 230 + }, + { + "epoch": 0.025367889303755765, + "grad_norm": 4.544266700744629, + "learning_rate": 1.2623490669593854e-07, + "loss": 1.2461, + "mean_token_accuracy": 0.6586649417877197, + "num_tokens": 5819365.0, + "step": 231 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 7.368698596954346, + "learning_rate": 1.2678375411635563e-07, + "loss": 1.3391, + "mean_token_accuracy": 0.642717719078064, + "num_tokens": 5841192.0, + "step": 232 + }, + { + "epoch": 0.025587524708983088, + "grad_norm": 7.408446788787842, + "learning_rate": 1.2733260153677277e-07, + "loss": 1.3097, + "mean_token_accuracy": 0.6291868686676025, + "num_tokens": 5863592.0, + "step": 233 + }, + { + "epoch": 0.025697342411596748, + "grad_norm": 6.151417255401611, + "learning_rate": 1.278814489571899e-07, + "loss": 1.2967, + "mean_token_accuracy": 0.6351355314254761, + "num_tokens": 5886589.0, + "step": 234 + }, + { + "epoch": 0.02580716011421041, + "grad_norm": 4.357510089874268, + "learning_rate": 1.2843029637760703e-07, + "loss": 1.1695, + "mean_token_accuracy": 0.6799167394638062, + "num_tokens": 5916866.0, + "step": 235 + }, + { + "epoch": 0.02591697781682407, + "grad_norm": 7.317298889160156, + "learning_rate": 1.2897914379802412e-07, + "loss": 1.2234, + "mean_token_accuracy": 0.6575740575790405, + "num_tokens": 5942207.0, + "step": 236 + }, + { + "epoch": 0.026026795519437734, + "grad_norm": 5.652281761169434, + "learning_rate": 1.2952799121844126e-07, + "loss": 1.1498, + "mean_token_accuracy": 0.6763193607330322, + "num_tokens": 5967843.0, + "step": 237 + }, + { + "epoch": 0.026136613222051394, + "grad_norm": 4.957432270050049, + "learning_rate": 1.300768386388584e-07, + "loss": 1.1981, + "mean_token_accuracy": 0.6655565500259399, + "num_tokens": 5993502.0, + "step": 238 + }, + { + "epoch": 0.026246430924665057, + "grad_norm": 5.6925835609436035, + "learning_rate": 1.3062568605927552e-07, + "loss": 1.2872, + "mean_token_accuracy": 0.6431940197944641, + "num_tokens": 6018768.0, + "step": 239 + }, + { + "epoch": 0.026356248627278717, + "grad_norm": 6.66880464553833, + "learning_rate": 1.3117453347969266e-07, + "loss": 1.2454, + "mean_token_accuracy": 0.650638997554779, + "num_tokens": 6040629.0, + "step": 240 + }, + { + "epoch": 0.026466066329892377, + "grad_norm": 5.321034908294678, + "learning_rate": 1.3172338090010975e-07, + "loss": 1.2244, + "mean_token_accuracy": 0.6579973101615906, + "num_tokens": 6061735.0, + "step": 241 + }, + { + "epoch": 0.02657588403250604, + "grad_norm": 5.6708197593688965, + "learning_rate": 1.322722283205269e-07, + "loss": 1.3246, + "mean_token_accuracy": 0.6281349658966064, + "num_tokens": 6087488.0, + "step": 242 + }, + { + "epoch": 0.0266857017351197, + "grad_norm": 5.025697231292725, + "learning_rate": 1.32821075740944e-07, + "loss": 1.2413, + "mean_token_accuracy": 0.65772545337677, + "num_tokens": 6116783.0, + "step": 243 + }, + { + "epoch": 0.026795519437733364, + "grad_norm": 5.411428928375244, + "learning_rate": 1.3336992316136115e-07, + "loss": 1.3839, + "mean_token_accuracy": 0.6136792898178101, + "num_tokens": 6143117.0, + "step": 244 + }, + { + "epoch": 0.026905337140347024, + "grad_norm": 5.23464822769165, + "learning_rate": 1.3391877058177826e-07, + "loss": 1.2007, + "mean_token_accuracy": 0.6833933591842651, + "num_tokens": 6167585.0, + "step": 245 + }, + { + "epoch": 0.027015154842960687, + "grad_norm": 5.8835368156433105, + "learning_rate": 1.3446761800219538e-07, + "loss": 1.3295, + "mean_token_accuracy": 0.6370112895965576, + "num_tokens": 6192559.0, + "step": 246 + }, + { + "epoch": 0.027124972545574347, + "grad_norm": 5.145666599273682, + "learning_rate": 1.350164654226125e-07, + "loss": 1.2231, + "mean_token_accuracy": 0.6512213945388794, + "num_tokens": 6217766.0, + "step": 247 + }, + { + "epoch": 0.027234790248188007, + "grad_norm": 7.372541427612305, + "learning_rate": 1.3556531284302963e-07, + "loss": 1.1973, + "mean_token_accuracy": 0.6614512205123901, + "num_tokens": 6238552.0, + "step": 248 + }, + { + "epoch": 0.02734460795080167, + "grad_norm": 5.995645523071289, + "learning_rate": 1.3611416026344675e-07, + "loss": 1.2055, + "mean_token_accuracy": 0.6598985195159912, + "num_tokens": 6259449.0, + "step": 249 + }, + { + "epoch": 0.02745442565341533, + "grad_norm": 6.555222988128662, + "learning_rate": 1.366630076838639e-07, + "loss": 1.3016, + "mean_token_accuracy": 0.6406447887420654, + "num_tokens": 6287676.0, + "step": 250 + }, + { + "epoch": 0.027564243356028993, + "grad_norm": 6.730244159698486, + "learning_rate": 1.37211855104281e-07, + "loss": 1.2027, + "mean_token_accuracy": 0.6566339731216431, + "num_tokens": 6311201.0, + "step": 251 + }, + { + "epoch": 0.027674061058642653, + "grad_norm": 4.824339866638184, + "learning_rate": 1.3776070252469812e-07, + "loss": 1.2251, + "mean_token_accuracy": 0.6458152532577515, + "num_tokens": 6334163.0, + "step": 252 + }, + { + "epoch": 0.027783878761256313, + "grad_norm": 7.177898406982422, + "learning_rate": 1.3830954994511526e-07, + "loss": 1.2602, + "mean_token_accuracy": 0.64267498254776, + "num_tokens": 6355546.0, + "step": 253 + }, + { + "epoch": 0.027893696463869976, + "grad_norm": 7.565207481384277, + "learning_rate": 1.3885839736553238e-07, + "loss": 1.1062, + "mean_token_accuracy": 0.6858924031257629, + "num_tokens": 6376917.0, + "step": 254 + }, + { + "epoch": 0.028003514166483636, + "grad_norm": 4.849475860595703, + "learning_rate": 1.394072447859495e-07, + "loss": 1.2064, + "mean_token_accuracy": 0.6564304828643799, + "num_tokens": 6403575.0, + "step": 255 + }, + { + "epoch": 0.0281133318690973, + "grad_norm": 5.6385111808776855, + "learning_rate": 1.399560922063666e-07, + "loss": 1.1946, + "mean_token_accuracy": 0.6631309986114502, + "num_tokens": 6426230.0, + "step": 256 + }, + { + "epoch": 0.02822314957171096, + "grad_norm": 5.383663177490234, + "learning_rate": 1.4050493962678375e-07, + "loss": 1.3005, + "mean_token_accuracy": 0.6363176107406616, + "num_tokens": 6452371.0, + "step": 257 + }, + { + "epoch": 0.028332967274324623, + "grad_norm": 4.18161153793335, + "learning_rate": 1.4105378704720087e-07, + "loss": 1.1988, + "mean_token_accuracy": 0.6691715717315674, + "num_tokens": 6477204.0, + "step": 258 + }, + { + "epoch": 0.028442784976938282, + "grad_norm": 6.083310127258301, + "learning_rate": 1.41602634467618e-07, + "loss": 1.2602, + "mean_token_accuracy": 0.6649537682533264, + "num_tokens": 6498956.0, + "step": 259 + }, + { + "epoch": 0.028552602679551942, + "grad_norm": 5.772022724151611, + "learning_rate": 1.421514818880351e-07, + "loss": 1.2566, + "mean_token_accuracy": 0.652179479598999, + "num_tokens": 6519820.0, + "step": 260 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 4.705920219421387, + "learning_rate": 1.4270032930845224e-07, + "loss": 1.2104, + "mean_token_accuracy": 0.6587014198303223, + "num_tokens": 6550168.0, + "step": 261 + }, + { + "epoch": 0.028772238084779266, + "grad_norm": 5.354385852813721, + "learning_rate": 1.4324917672886938e-07, + "loss": 1.28, + "mean_token_accuracy": 0.6385823488235474, + "num_tokens": 6575296.0, + "step": 262 + }, + { + "epoch": 0.02888205578739293, + "grad_norm": 5.240947723388672, + "learning_rate": 1.437980241492865e-07, + "loss": 1.1896, + "mean_token_accuracy": 0.6627821922302246, + "num_tokens": 6600401.0, + "step": 263 + }, + { + "epoch": 0.02899187349000659, + "grad_norm": 4.877986431121826, + "learning_rate": 1.4434687156970364e-07, + "loss": 1.2697, + "mean_token_accuracy": 0.6425244808197021, + "num_tokens": 6625621.0, + "step": 264 + }, + { + "epoch": 0.029101691192620252, + "grad_norm": 5.443619728088379, + "learning_rate": 1.4489571899012073e-07, + "loss": 1.3084, + "mean_token_accuracy": 0.6367912888526917, + "num_tokens": 6647081.0, + "step": 265 + }, + { + "epoch": 0.029211508895233912, + "grad_norm": 4.627569675445557, + "learning_rate": 1.4544456641053787e-07, + "loss": 1.1849, + "mean_token_accuracy": 0.6672314405441284, + "num_tokens": 6673373.0, + "step": 266 + }, + { + "epoch": 0.029321326597847572, + "grad_norm": 5.227025985717773, + "learning_rate": 1.4599341383095498e-07, + "loss": 1.3268, + "mean_token_accuracy": 0.6388777494430542, + "num_tokens": 6698783.0, + "step": 267 + }, + { + "epoch": 0.029431144300461235, + "grad_norm": 5.097924709320068, + "learning_rate": 1.4654226125137212e-07, + "loss": 1.2159, + "mean_token_accuracy": 0.665357768535614, + "num_tokens": 6726013.0, + "step": 268 + }, + { + "epoch": 0.029540962003074895, + "grad_norm": 5.528903007507324, + "learning_rate": 1.4709110867178924e-07, + "loss": 1.2263, + "mean_token_accuracy": 0.6548078060150146, + "num_tokens": 6752686.0, + "step": 269 + }, + { + "epoch": 0.02965077970568856, + "grad_norm": 4.746731758117676, + "learning_rate": 1.4763995609220636e-07, + "loss": 1.2669, + "mean_token_accuracy": 0.6469071507453918, + "num_tokens": 6782904.0, + "step": 270 + }, + { + "epoch": 0.029760597408302218, + "grad_norm": 5.385994911193848, + "learning_rate": 1.4818880351262347e-07, + "loss": 1.283, + "mean_token_accuracy": 0.6455955505371094, + "num_tokens": 6809110.0, + "step": 271 + }, + { + "epoch": 0.029870415110915878, + "grad_norm": 5.26521110534668, + "learning_rate": 1.487376509330406e-07, + "loss": 1.175, + "mean_token_accuracy": 0.6710473299026489, + "num_tokens": 6833332.0, + "step": 272 + }, + { + "epoch": 0.02998023281352954, + "grad_norm": 6.522159576416016, + "learning_rate": 1.4928649835345773e-07, + "loss": 1.2938, + "mean_token_accuracy": 0.6595526933670044, + "num_tokens": 6858250.0, + "step": 273 + }, + { + "epoch": 0.0300900505161432, + "grad_norm": 4.291301727294922, + "learning_rate": 1.4983534577387484e-07, + "loss": 1.1883, + "mean_token_accuracy": 0.667352557182312, + "num_tokens": 6884987.0, + "step": 274 + }, + { + "epoch": 0.030199868218756865, + "grad_norm": 5.082127571105957, + "learning_rate": 1.5038419319429198e-07, + "loss": 1.2161, + "mean_token_accuracy": 0.6467617750167847, + "num_tokens": 6910698.0, + "step": 275 + }, + { + "epoch": 0.030309685921370524, + "grad_norm": 5.197915554046631, + "learning_rate": 1.509330406147091e-07, + "loss": 1.1088, + "mean_token_accuracy": 0.6779587268829346, + "num_tokens": 6934512.0, + "step": 276 + }, + { + "epoch": 0.030419503623984188, + "grad_norm": 5.501759052276611, + "learning_rate": 1.5148188803512624e-07, + "loss": 1.2656, + "mean_token_accuracy": 0.6448374390602112, + "num_tokens": 6959523.0, + "step": 277 + }, + { + "epoch": 0.030529321326597848, + "grad_norm": 4.196843147277832, + "learning_rate": 1.5203073545554336e-07, + "loss": 1.2289, + "mean_token_accuracy": 0.6578442454338074, + "num_tokens": 6984240.0, + "step": 278 + }, + { + "epoch": 0.030639139029211507, + "grad_norm": 6.128082275390625, + "learning_rate": 1.5257958287596047e-07, + "loss": 1.1971, + "mean_token_accuracy": 0.6618665456771851, + "num_tokens": 7007892.0, + "step": 279 + }, + { + "epoch": 0.03074895673182517, + "grad_norm": 5.8914923667907715, + "learning_rate": 1.531284302963776e-07, + "loss": 1.1676, + "mean_token_accuracy": 0.6750379204750061, + "num_tokens": 7032667.0, + "step": 280 + }, + { + "epoch": 0.03085877443443883, + "grad_norm": 4.785383224487305, + "learning_rate": 1.5367727771679473e-07, + "loss": 1.3193, + "mean_token_accuracy": 0.629509449005127, + "num_tokens": 7063582.0, + "step": 281 + }, + { + "epoch": 0.030968592137052494, + "grad_norm": 4.90354585647583, + "learning_rate": 1.5422612513721184e-07, + "loss": 1.2643, + "mean_token_accuracy": 0.6474126577377319, + "num_tokens": 7093215.0, + "step": 282 + }, + { + "epoch": 0.031078409839666154, + "grad_norm": 4.583083152770996, + "learning_rate": 1.5477497255762899e-07, + "loss": 1.2191, + "mean_token_accuracy": 0.6544786691665649, + "num_tokens": 7121462.0, + "step": 283 + }, + { + "epoch": 0.031188227542279817, + "grad_norm": 4.5312395095825195, + "learning_rate": 1.5532381997804607e-07, + "loss": 1.2502, + "mean_token_accuracy": 0.649340033531189, + "num_tokens": 7144710.0, + "step": 284 + }, + { + "epoch": 0.03129804524489348, + "grad_norm": 6.7868971824646, + "learning_rate": 1.5587266739846322e-07, + "loss": 1.2011, + "mean_token_accuracy": 0.6708967685699463, + "num_tokens": 7164692.0, + "step": 285 + }, + { + "epoch": 0.03140786294750714, + "grad_norm": 6.447272777557373, + "learning_rate": 1.5642151481888036e-07, + "loss": 1.1355, + "mean_token_accuracy": 0.672548770904541, + "num_tokens": 7184525.0, + "step": 286 + }, + { + "epoch": 0.0315176806501208, + "grad_norm": 5.471215724945068, + "learning_rate": 1.5697036223929747e-07, + "loss": 1.1891, + "mean_token_accuracy": 0.664596438407898, + "num_tokens": 7204983.0, + "step": 287 + }, + { + "epoch": 0.031627498352734464, + "grad_norm": 4.345337390899658, + "learning_rate": 1.5751920965971461e-07, + "loss": 1.2461, + "mean_token_accuracy": 0.6521209478378296, + "num_tokens": 7238038.0, + "step": 288 + }, + { + "epoch": 0.03173731605534812, + "grad_norm": 5.051107406616211, + "learning_rate": 1.580680570801317e-07, + "loss": 1.192, + "mean_token_accuracy": 0.6492318511009216, + "num_tokens": 7261453.0, + "step": 289 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 4.64546012878418, + "learning_rate": 1.5861690450054885e-07, + "loss": 1.1863, + "mean_token_accuracy": 0.6624681949615479, + "num_tokens": 7290691.0, + "step": 290 + }, + { + "epoch": 0.03195695146057544, + "grad_norm": 6.447824001312256, + "learning_rate": 1.5916575192096596e-07, + "loss": 1.2606, + "mean_token_accuracy": 0.6476118564605713, + "num_tokens": 7311912.0, + "step": 291 + }, + { + "epoch": 0.0320667691631891, + "grad_norm": 4.198253154754639, + "learning_rate": 1.597145993413831e-07, + "loss": 1.3346, + "mean_token_accuracy": 0.6263102293014526, + "num_tokens": 7344414.0, + "step": 292 + }, + { + "epoch": 0.03217658686580277, + "grad_norm": 4.975472450256348, + "learning_rate": 1.602634467618002e-07, + "loss": 1.2342, + "mean_token_accuracy": 0.6615923643112183, + "num_tokens": 7370546.0, + "step": 293 + }, + { + "epoch": 0.03228640456841643, + "grad_norm": 4.1100969314575195, + "learning_rate": 1.6081229418221733e-07, + "loss": 1.2606, + "mean_token_accuracy": 0.6452193856239319, + "num_tokens": 7396574.0, + "step": 294 + }, + { + "epoch": 0.03239622227103009, + "grad_norm": 4.680819988250732, + "learning_rate": 1.6136114160263445e-07, + "loss": 1.1532, + "mean_token_accuracy": 0.6670925617218018, + "num_tokens": 7422967.0, + "step": 295 + }, + { + "epoch": 0.03250603997364375, + "grad_norm": 4.144844055175781, + "learning_rate": 1.619099890230516e-07, + "loss": 1.165, + "mean_token_accuracy": 0.6615139245986938, + "num_tokens": 7451510.0, + "step": 296 + }, + { + "epoch": 0.03261585767625741, + "grad_norm": 5.0326104164123535, + "learning_rate": 1.6245883644346873e-07, + "loss": 1.2667, + "mean_token_accuracy": 0.6371589303016663, + "num_tokens": 7474567.0, + "step": 297 + }, + { + "epoch": 0.032725675378871076, + "grad_norm": 5.170776844024658, + "learning_rate": 1.6300768386388582e-07, + "loss": 1.1062, + "mean_token_accuracy": 0.6821861863136292, + "num_tokens": 7498125.0, + "step": 298 + }, + { + "epoch": 0.032835493081484736, + "grad_norm": 4.986992359161377, + "learning_rate": 1.6355653128430296e-07, + "loss": 1.2858, + "mean_token_accuracy": 0.6449693441390991, + "num_tokens": 7520438.0, + "step": 299 + }, + { + "epoch": 0.032945310784098396, + "grad_norm": 3.9638819694519043, + "learning_rate": 1.6410537870472008e-07, + "loss": 1.1723, + "mean_token_accuracy": 0.6665599942207336, + "num_tokens": 7549817.0, + "step": 300 + }, + { + "epoch": 0.033055128486712056, + "grad_norm": 4.720790386199951, + "learning_rate": 1.6465422612513722e-07, + "loss": 1.1486, + "mean_token_accuracy": 0.6696271896362305, + "num_tokens": 7571621.0, + "step": 301 + }, + { + "epoch": 0.03316494618932572, + "grad_norm": 5.6290740966796875, + "learning_rate": 1.6520307354555433e-07, + "loss": 1.1793, + "mean_token_accuracy": 0.6594653725624084, + "num_tokens": 7591537.0, + "step": 302 + }, + { + "epoch": 0.03327476389193938, + "grad_norm": 4.2256975173950195, + "learning_rate": 1.6575192096597145e-07, + "loss": 1.2234, + "mean_token_accuracy": 0.6611863374710083, + "num_tokens": 7615299.0, + "step": 303 + }, + { + "epoch": 0.03338458159455304, + "grad_norm": 4.668864727020264, + "learning_rate": 1.6630076838638856e-07, + "loss": 1.2318, + "mean_token_accuracy": 0.6527267098426819, + "num_tokens": 7648301.0, + "step": 304 + }, + { + "epoch": 0.0334943992971667, + "grad_norm": 4.794196128845215, + "learning_rate": 1.668496158068057e-07, + "loss": 1.2127, + "mean_token_accuracy": 0.6537520289421082, + "num_tokens": 7674556.0, + "step": 305 + }, + { + "epoch": 0.03360421699978036, + "grad_norm": 4.4398932456970215, + "learning_rate": 1.6739846322722282e-07, + "loss": 1.1496, + "mean_token_accuracy": 0.6725006103515625, + "num_tokens": 7704087.0, + "step": 306 + }, + { + "epoch": 0.03371403470239403, + "grad_norm": 5.26851749420166, + "learning_rate": 1.6794731064763996e-07, + "loss": 1.2428, + "mean_token_accuracy": 0.6515628099441528, + "num_tokens": 7729186.0, + "step": 307 + }, + { + "epoch": 0.03382385240500769, + "grad_norm": 5.163531303405762, + "learning_rate": 1.6849615806805705e-07, + "loss": 1.2096, + "mean_token_accuracy": 0.6558130979537964, + "num_tokens": 7752867.0, + "step": 308 + }, + { + "epoch": 0.03393367010762135, + "grad_norm": 4.7719340324401855, + "learning_rate": 1.690450054884742e-07, + "loss": 1.1918, + "mean_token_accuracy": 0.6633815765380859, + "num_tokens": 7776588.0, + "step": 309 + }, + { + "epoch": 0.03404348781023501, + "grad_norm": 4.6261444091796875, + "learning_rate": 1.6959385290889134e-07, + "loss": 1.2407, + "mean_token_accuracy": 0.6534309387207031, + "num_tokens": 7806830.0, + "step": 310 + }, + { + "epoch": 0.03415330551284867, + "grad_norm": 4.383133888244629, + "learning_rate": 1.7014270032930845e-07, + "loss": 1.2192, + "mean_token_accuracy": 0.6567720174789429, + "num_tokens": 7840316.0, + "step": 311 + }, + { + "epoch": 0.034263123215462335, + "grad_norm": 4.799578666687012, + "learning_rate": 1.7069154774972557e-07, + "loss": 1.2522, + "mean_token_accuracy": 0.6436808109283447, + "num_tokens": 7868210.0, + "step": 312 + }, + { + "epoch": 0.034372940918075995, + "grad_norm": 5.3193359375, + "learning_rate": 1.7124039517014268e-07, + "loss": 1.1478, + "mean_token_accuracy": 0.6786339282989502, + "num_tokens": 7892318.0, + "step": 313 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 4.089936256408691, + "learning_rate": 1.7178924259055982e-07, + "loss": 1.2439, + "mean_token_accuracy": 0.649398922920227, + "num_tokens": 7920807.0, + "step": 314 + }, + { + "epoch": 0.034592576323303315, + "grad_norm": 4.7243499755859375, + "learning_rate": 1.7233809001097694e-07, + "loss": 1.1391, + "mean_token_accuracy": 0.6763508915901184, + "num_tokens": 7944902.0, + "step": 315 + }, + { + "epoch": 0.034702394025916974, + "grad_norm": 4.589184284210205, + "learning_rate": 1.7288693743139408e-07, + "loss": 1.1133, + "mean_token_accuracy": 0.6758476495742798, + "num_tokens": 7969090.0, + "step": 316 + }, + { + "epoch": 0.03481221172853064, + "grad_norm": 4.3234639167785645, + "learning_rate": 1.7343578485181117e-07, + "loss": 1.1611, + "mean_token_accuracy": 0.6645885705947876, + "num_tokens": 7990772.0, + "step": 317 + }, + { + "epoch": 0.0349220294311443, + "grad_norm": 5.598495960235596, + "learning_rate": 1.739846322722283e-07, + "loss": 1.1715, + "mean_token_accuracy": 0.6751824617385864, + "num_tokens": 8018758.0, + "step": 318 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 5.128063678741455, + "learning_rate": 1.7453347969264543e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6755867004394531, + "num_tokens": 8044250.0, + "step": 319 + }, + { + "epoch": 0.03514166483637162, + "grad_norm": 4.945514678955078, + "learning_rate": 1.7508232711306257e-07, + "loss": 1.1738, + "mean_token_accuracy": 0.669885516166687, + "num_tokens": 8065189.0, + "step": 320 + }, + { + "epoch": 0.03525148253898529, + "grad_norm": 5.828752517700195, + "learning_rate": 1.756311745334797e-07, + "loss": 1.1936, + "mean_token_accuracy": 0.6695883274078369, + "num_tokens": 8084430.0, + "step": 321 + }, + { + "epoch": 0.03536130024159895, + "grad_norm": 3.8580005168914795, + "learning_rate": 1.761800219538968e-07, + "loss": 1.2044, + "mean_token_accuracy": 0.6641748547554016, + "num_tokens": 8112323.0, + "step": 322 + }, + { + "epoch": 0.03547111794421261, + "grad_norm": 5.936093807220459, + "learning_rate": 1.7672886937431394e-07, + "loss": 1.1254, + "mean_token_accuracy": 0.6664762496948242, + "num_tokens": 8130025.0, + "step": 323 + }, + { + "epoch": 0.03558093564682627, + "grad_norm": 4.810251712799072, + "learning_rate": 1.7727771679473105e-07, + "loss": 1.2782, + "mean_token_accuracy": 0.6336172819137573, + "num_tokens": 8156988.0, + "step": 324 + }, + { + "epoch": 0.03569075334943993, + "grad_norm": 4.872331142425537, + "learning_rate": 1.778265642151482e-07, + "loss": 1.2896, + "mean_token_accuracy": 0.6426844596862793, + "num_tokens": 8181601.0, + "step": 325 + }, + { + "epoch": 0.035800571052053594, + "grad_norm": 4.904670238494873, + "learning_rate": 1.7837541163556529e-07, + "loss": 1.1173, + "mean_token_accuracy": 0.6756529808044434, + "num_tokens": 8203545.0, + "step": 326 + }, + { + "epoch": 0.035910388754667254, + "grad_norm": 4.067117691040039, + "learning_rate": 1.7892425905598243e-07, + "loss": 1.2656, + "mean_token_accuracy": 0.6451557874679565, + "num_tokens": 8230941.0, + "step": 327 + }, + { + "epoch": 0.036020206457280914, + "grad_norm": 5.9998602867126465, + "learning_rate": 1.7947310647639954e-07, + "loss": 1.1459, + "mean_token_accuracy": 0.6764713525772095, + "num_tokens": 8257388.0, + "step": 328 + }, + { + "epoch": 0.03613002415989457, + "grad_norm": 4.286131381988525, + "learning_rate": 1.8002195389681668e-07, + "loss": 1.2528, + "mean_token_accuracy": 0.6431605219841003, + "num_tokens": 8284180.0, + "step": 329 + }, + { + "epoch": 0.03623984186250823, + "grad_norm": 4.618105411529541, + "learning_rate": 1.805708013172338e-07, + "loss": 1.3128, + "mean_token_accuracy": 0.6409908533096313, + "num_tokens": 8311543.0, + "step": 330 + }, + { + "epoch": 0.0363496595651219, + "grad_norm": 4.710247039794922, + "learning_rate": 1.8111964873765091e-07, + "loss": 1.3071, + "mean_token_accuracy": 0.6326075792312622, + "num_tokens": 8336352.0, + "step": 331 + }, + { + "epoch": 0.03645947726773556, + "grad_norm": 5.32722806930542, + "learning_rate": 1.8166849615806803e-07, + "loss": 1.2277, + "mean_token_accuracy": 0.6474761366844177, + "num_tokens": 8361598.0, + "step": 332 + }, + { + "epoch": 0.03656929497034922, + "grad_norm": 4.680039405822754, + "learning_rate": 1.8221734357848517e-07, + "loss": 1.1455, + "mean_token_accuracy": 0.6743332147598267, + "num_tokens": 8385034.0, + "step": 333 + }, + { + "epoch": 0.03667911267296288, + "grad_norm": 3.5686020851135254, + "learning_rate": 1.827661909989023e-07, + "loss": 1.2144, + "mean_token_accuracy": 0.6573251485824585, + "num_tokens": 8416498.0, + "step": 334 + }, + { + "epoch": 0.03678893037557654, + "grad_norm": 5.030928134918213, + "learning_rate": 1.8331503841931943e-07, + "loss": 1.1278, + "mean_token_accuracy": 0.6793110370635986, + "num_tokens": 8437096.0, + "step": 335 + }, + { + "epoch": 0.036898748078190206, + "grad_norm": 4.744215488433838, + "learning_rate": 1.8386388583973654e-07, + "loss": 1.1251, + "mean_token_accuracy": 0.6720342636108398, + "num_tokens": 8454938.0, + "step": 336 + }, + { + "epoch": 0.037008565780803866, + "grad_norm": 4.780300140380859, + "learning_rate": 1.8441273326015366e-07, + "loss": 1.158, + "mean_token_accuracy": 0.671971321105957, + "num_tokens": 8478466.0, + "step": 337 + }, + { + "epoch": 0.037118383483417526, + "grad_norm": 4.3432464599609375, + "learning_rate": 1.849615806805708e-07, + "loss": 1.1716, + "mean_token_accuracy": 0.658909797668457, + "num_tokens": 8503570.0, + "step": 338 + }, + { + "epoch": 0.037228201186031186, + "grad_norm": 5.702790260314941, + "learning_rate": 1.8551042810098792e-07, + "loss": 1.149, + "mean_token_accuracy": 0.670754075050354, + "num_tokens": 8528086.0, + "step": 339 + }, + { + "epoch": 0.03733801888864485, + "grad_norm": 5.362975120544434, + "learning_rate": 1.8605927552140506e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6903218030929565, + "num_tokens": 8544497.0, + "step": 340 + }, + { + "epoch": 0.03744783659125851, + "grad_norm": 4.275668144226074, + "learning_rate": 1.8660812294182215e-07, + "loss": 1.2409, + "mean_token_accuracy": 0.6549261808395386, + "num_tokens": 8569704.0, + "step": 341 + }, + { + "epoch": 0.03755765429387217, + "grad_norm": 3.6996867656707764, + "learning_rate": 1.871569703622393e-07, + "loss": 1.2511, + "mean_token_accuracy": 0.6422382593154907, + "num_tokens": 8599974.0, + "step": 342 + }, + { + "epoch": 0.03766747199648583, + "grad_norm": 4.683351993560791, + "learning_rate": 1.877058177826564e-07, + "loss": 1.1831, + "mean_token_accuracy": 0.6626964807510376, + "num_tokens": 8623061.0, + "step": 343 + }, + { + "epoch": 0.03777728969909949, + "grad_norm": 4.473572731018066, + "learning_rate": 1.8825466520307354e-07, + "loss": 1.2437, + "mean_token_accuracy": 0.6478390693664551, + "num_tokens": 8647371.0, + "step": 344 + }, + { + "epoch": 0.03788710740171316, + "grad_norm": 5.966235637664795, + "learning_rate": 1.8880351262349069e-07, + "loss": 1.0969, + "mean_token_accuracy": 0.6811678409576416, + "num_tokens": 8664529.0, + "step": 345 + }, + { + "epoch": 0.03799692510432682, + "grad_norm": 3.952026128768921, + "learning_rate": 1.8935236004390778e-07, + "loss": 1.2596, + "mean_token_accuracy": 0.6443883180618286, + "num_tokens": 8693307.0, + "step": 346 + }, + { + "epoch": 0.03810674280694048, + "grad_norm": 4.700806140899658, + "learning_rate": 1.8990120746432492e-07, + "loss": 1.2627, + "mean_token_accuracy": 0.6388077735900879, + "num_tokens": 8722858.0, + "step": 347 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 4.186601638793945, + "learning_rate": 1.9045005488474203e-07, + "loss": 1.2321, + "mean_token_accuracy": 0.6535555124282837, + "num_tokens": 8754909.0, + "step": 348 + }, + { + "epoch": 0.0383263782121678, + "grad_norm": 4.425135612487793, + "learning_rate": 1.9099890230515917e-07, + "loss": 1.1721, + "mean_token_accuracy": 0.6602531671524048, + "num_tokens": 8777528.0, + "step": 349 + }, + { + "epoch": 0.038436195914781465, + "grad_norm": 4.558811187744141, + "learning_rate": 1.9154774972557626e-07, + "loss": 1.091, + "mean_token_accuracy": 0.6773300170898438, + "num_tokens": 8796496.0, + "step": 350 + }, + { + "epoch": 0.038546013617395125, + "grad_norm": 3.911237955093384, + "learning_rate": 1.920965971459934e-07, + "loss": 1.2141, + "mean_token_accuracy": 0.6687044501304626, + "num_tokens": 8821184.0, + "step": 351 + }, + { + "epoch": 0.038655831320008785, + "grad_norm": 4.969822883605957, + "learning_rate": 1.9264544456641052e-07, + "loss": 1.1161, + "mean_token_accuracy": 0.6765338778495789, + "num_tokens": 8844430.0, + "step": 352 + }, + { + "epoch": 0.038765649022622445, + "grad_norm": 4.545711040496826, + "learning_rate": 1.9319429198682766e-07, + "loss": 1.2973, + "mean_token_accuracy": 0.6383607387542725, + "num_tokens": 8873822.0, + "step": 353 + }, + { + "epoch": 0.038875466725236105, + "grad_norm": 5.024028778076172, + "learning_rate": 1.9374313940724478e-07, + "loss": 1.1743, + "mean_token_accuracy": 0.6645085215568542, + "num_tokens": 8896186.0, + "step": 354 + }, + { + "epoch": 0.03898528442784977, + "grad_norm": 4.600150108337402, + "learning_rate": 1.942919868276619e-07, + "loss": 1.225, + "mean_token_accuracy": 0.6524146199226379, + "num_tokens": 8918535.0, + "step": 355 + }, + { + "epoch": 0.03909510213046343, + "grad_norm": 4.5758538246154785, + "learning_rate": 1.9484083424807903e-07, + "loss": 1.1684, + "mean_token_accuracy": 0.6677610278129578, + "num_tokens": 8945549.0, + "step": 356 + }, + { + "epoch": 0.03920491983307709, + "grad_norm": 4.84603214263916, + "learning_rate": 1.9538968166849615e-07, + "loss": 1.2638, + "mean_token_accuracy": 0.6516224145889282, + "num_tokens": 8971579.0, + "step": 357 + }, + { + "epoch": 0.03931473753569075, + "grad_norm": 5.051676273345947, + "learning_rate": 1.959385290889133e-07, + "loss": 1.1758, + "mean_token_accuracy": 0.6663045883178711, + "num_tokens": 8993842.0, + "step": 358 + }, + { + "epoch": 0.03942455523830442, + "grad_norm": 4.282263278961182, + "learning_rate": 1.964873765093304e-07, + "loss": 1.1236, + "mean_token_accuracy": 0.6742977499961853, + "num_tokens": 9017809.0, + "step": 359 + }, + { + "epoch": 0.03953437294091808, + "grad_norm": 5.061391830444336, + "learning_rate": 1.9703622392974752e-07, + "loss": 1.1545, + "mean_token_accuracy": 0.6847081780433655, + "num_tokens": 9040299.0, + "step": 360 + }, + { + "epoch": 0.03964419064353174, + "grad_norm": 4.038498878479004, + "learning_rate": 1.9758507135016464e-07, + "loss": 1.1889, + "mean_token_accuracy": 0.6561180353164673, + "num_tokens": 9065126.0, + "step": 361 + }, + { + "epoch": 0.0397540083461454, + "grad_norm": 3.999471426010132, + "learning_rate": 1.9813391877058178e-07, + "loss": 1.2024, + "mean_token_accuracy": 0.652737021446228, + "num_tokens": 9093369.0, + "step": 362 + }, + { + "epoch": 0.03986382604875906, + "grad_norm": 3.8037617206573486, + "learning_rate": 1.986827661909989e-07, + "loss": 1.2166, + "mean_token_accuracy": 0.6539229154586792, + "num_tokens": 9122461.0, + "step": 363 + }, + { + "epoch": 0.039973643751372724, + "grad_norm": 4.419576644897461, + "learning_rate": 1.9923161361141603e-07, + "loss": 1.2214, + "mean_token_accuracy": 0.6643878221511841, + "num_tokens": 9146570.0, + "step": 364 + }, + { + "epoch": 0.040083461453986384, + "grad_norm": 3.9505300521850586, + "learning_rate": 1.9978046103183312e-07, + "loss": 1.2561, + "mean_token_accuracy": 0.6441210508346558, + "num_tokens": 9174092.0, + "step": 365 + }, + { + "epoch": 0.040193279156600044, + "grad_norm": 4.929006576538086, + "learning_rate": 2.0032930845225027e-07, + "loss": 1.226, + "mean_token_accuracy": 0.6444753408432007, + "num_tokens": 9194999.0, + "step": 366 + }, + { + "epoch": 0.040303096859213704, + "grad_norm": 3.9224793910980225, + "learning_rate": 2.0087815587266738e-07, + "loss": 1.174, + "mean_token_accuracy": 0.6762328147888184, + "num_tokens": 9224304.0, + "step": 367 + }, + { + "epoch": 0.040412914561827364, + "grad_norm": 3.609927177429199, + "learning_rate": 2.0142700329308452e-07, + "loss": 1.1677, + "mean_token_accuracy": 0.6613124012947083, + "num_tokens": 9250308.0, + "step": 368 + }, + { + "epoch": 0.04052273226444103, + "grad_norm": 3.8748393058776855, + "learning_rate": 2.0197585071350164e-07, + "loss": 1.1825, + "mean_token_accuracy": 0.6828035116195679, + "num_tokens": 9273552.0, + "step": 369 + }, + { + "epoch": 0.04063254996705469, + "grad_norm": 3.7912180423736572, + "learning_rate": 2.0252469813391875e-07, + "loss": 1.0477, + "mean_token_accuracy": 0.6989374756813049, + "num_tokens": 9298427.0, + "step": 370 + }, + { + "epoch": 0.04074236766966835, + "grad_norm": 3.914924144744873, + "learning_rate": 2.030735455543359e-07, + "loss": 1.1241, + "mean_token_accuracy": 0.6924439072608948, + "num_tokens": 9322217.0, + "step": 371 + }, + { + "epoch": 0.04085218537228201, + "grad_norm": 3.418095350265503, + "learning_rate": 2.03622392974753e-07, + "loss": 1.1452, + "mean_token_accuracy": 0.6691194772720337, + "num_tokens": 9353784.0, + "step": 372 + }, + { + "epoch": 0.04096200307489567, + "grad_norm": 4.588866710662842, + "learning_rate": 2.0417124039517015e-07, + "loss": 1.1872, + "mean_token_accuracy": 0.668235719203949, + "num_tokens": 9377228.0, + "step": 373 + }, + { + "epoch": 0.04107182077750934, + "grad_norm": 2.973106861114502, + "learning_rate": 2.0472008781558724e-07, + "loss": 1.1853, + "mean_token_accuracy": 0.6615515351295471, + "num_tokens": 9412086.0, + "step": 374 + }, + { + "epoch": 0.041181638480122996, + "grad_norm": 3.5315239429473877, + "learning_rate": 2.0526893523600438e-07, + "loss": 1.2578, + "mean_token_accuracy": 0.6441173553466797, + "num_tokens": 9441642.0, + "step": 375 + }, + { + "epoch": 0.041291456182736656, + "grad_norm": 3.560837984085083, + "learning_rate": 2.058177826564215e-07, + "loss": 1.2107, + "mean_token_accuracy": 0.6588911414146423, + "num_tokens": 9471374.0, + "step": 376 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 5.056316375732422, + "learning_rate": 2.0636663007683864e-07, + "loss": 1.1966, + "mean_token_accuracy": 0.6556231379508972, + "num_tokens": 9490733.0, + "step": 377 + }, + { + "epoch": 0.04151109158796398, + "grad_norm": 4.695251941680908, + "learning_rate": 2.0691547749725575e-07, + "loss": 1.2313, + "mean_token_accuracy": 0.6463441848754883, + "num_tokens": 9515863.0, + "step": 378 + }, + { + "epoch": 0.04162090929057764, + "grad_norm": 4.007643222808838, + "learning_rate": 2.0746432491767287e-07, + "loss": 1.2475, + "mean_token_accuracy": 0.6362485289573669, + "num_tokens": 9541851.0, + "step": 379 + }, + { + "epoch": 0.0417307269931913, + "grad_norm": 3.7328057289123535, + "learning_rate": 2.0801317233809e-07, + "loss": 1.2443, + "mean_token_accuracy": 0.6485545635223389, + "num_tokens": 9568482.0, + "step": 380 + }, + { + "epoch": 0.04184054469580496, + "grad_norm": 4.707616329193115, + "learning_rate": 2.0856201975850713e-07, + "loss": 1.1919, + "mean_token_accuracy": 0.6666343212127686, + "num_tokens": 9600229.0, + "step": 381 + }, + { + "epoch": 0.04195036239841862, + "grad_norm": 3.8131158351898193, + "learning_rate": 2.0911086717892427e-07, + "loss": 1.1501, + "mean_token_accuracy": 0.664697527885437, + "num_tokens": 9625550.0, + "step": 382 + }, + { + "epoch": 0.04206018010103229, + "grad_norm": 4.053491115570068, + "learning_rate": 2.0965971459934136e-07, + "loss": 1.2538, + "mean_token_accuracy": 0.6479901075363159, + "num_tokens": 9653079.0, + "step": 383 + }, + { + "epoch": 0.04216999780364595, + "grad_norm": 3.8341727256774902, + "learning_rate": 2.102085620197585e-07, + "loss": 1.1568, + "mean_token_accuracy": 0.6678744554519653, + "num_tokens": 9679330.0, + "step": 384 + }, + { + "epoch": 0.04227981550625961, + "grad_norm": 3.7950501441955566, + "learning_rate": 2.1075740944017561e-07, + "loss": 1.1642, + "mean_token_accuracy": 0.6649331450462341, + "num_tokens": 9704300.0, + "step": 385 + }, + { + "epoch": 0.04238963320887327, + "grad_norm": 4.093043327331543, + "learning_rate": 2.1130625686059276e-07, + "loss": 1.2034, + "mean_token_accuracy": 0.653802752494812, + "num_tokens": 9728105.0, + "step": 386 + }, + { + "epoch": 0.04249945091148693, + "grad_norm": 3.721297025680542, + "learning_rate": 2.1185510428100987e-07, + "loss": 1.1948, + "mean_token_accuracy": 0.6599684953689575, + "num_tokens": 9755352.0, + "step": 387 + }, + { + "epoch": 0.042609268614100596, + "grad_norm": 4.409420967102051, + "learning_rate": 2.1240395170142699e-07, + "loss": 1.1674, + "mean_token_accuracy": 0.6649425625801086, + "num_tokens": 9781718.0, + "step": 388 + }, + { + "epoch": 0.042719086316714255, + "grad_norm": 3.5820086002349854, + "learning_rate": 2.129527991218441e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6803410053253174, + "num_tokens": 9803899.0, + "step": 389 + }, + { + "epoch": 0.042828904019327915, + "grad_norm": 4.832542896270752, + "learning_rate": 2.1350164654226124e-07, + "loss": 1.1506, + "mean_token_accuracy": 0.6643903255462646, + "num_tokens": 9822964.0, + "step": 390 + }, + { + "epoch": 0.042938721721941575, + "grad_norm": 3.8291125297546387, + "learning_rate": 2.1405049396267836e-07, + "loss": 1.1491, + "mean_token_accuracy": 0.6686587333679199, + "num_tokens": 9850295.0, + "step": 391 + }, + { + "epoch": 0.043048539424555235, + "grad_norm": 3.650696277618408, + "learning_rate": 2.145993413830955e-07, + "loss": 1.1214, + "mean_token_accuracy": 0.673653244972229, + "num_tokens": 9875644.0, + "step": 392 + }, + { + "epoch": 0.0431583571271689, + "grad_norm": 4.382246971130371, + "learning_rate": 2.1514818880351262e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6807498335838318, + "num_tokens": 9897940.0, + "step": 393 + }, + { + "epoch": 0.04326817482978256, + "grad_norm": 3.550863027572632, + "learning_rate": 2.1569703622392973e-07, + "loss": 1.2359, + "mean_token_accuracy": 0.6461352705955505, + "num_tokens": 9922953.0, + "step": 394 + }, + { + "epoch": 0.04337799253239622, + "grad_norm": 3.6379079818725586, + "learning_rate": 2.1624588364434687e-07, + "loss": 1.1991, + "mean_token_accuracy": 0.6732434630393982, + "num_tokens": 9951190.0, + "step": 395 + }, + { + "epoch": 0.04348781023500988, + "grad_norm": 3.6109490394592285, + "learning_rate": 2.16794731064764e-07, + "loss": 1.2322, + "mean_token_accuracy": 0.6529742479324341, + "num_tokens": 9974443.0, + "step": 396 + }, + { + "epoch": 0.04359762793762355, + "grad_norm": 4.623810291290283, + "learning_rate": 2.1734357848518113e-07, + "loss": 1.1358, + "mean_token_accuracy": 0.6712257862091064, + "num_tokens": 9991978.0, + "step": 397 + }, + { + "epoch": 0.04370744564023721, + "grad_norm": 3.3473894596099854, + "learning_rate": 2.1789242590559822e-07, + "loss": 1.2431, + "mean_token_accuracy": 0.6468929648399353, + "num_tokens": 10020160.0, + "step": 398 + }, + { + "epoch": 0.04381726334285087, + "grad_norm": 3.105154037475586, + "learning_rate": 2.1844127332601536e-07, + "loss": 1.2154, + "mean_token_accuracy": 0.6524828672409058, + "num_tokens": 10052700.0, + "step": 399 + }, + { + "epoch": 0.04392708104546453, + "grad_norm": 3.3164045810699463, + "learning_rate": 2.1899012074643247e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.6720834970474243, + "num_tokens": 10082086.0, + "step": 400 + }, + { + "epoch": 0.04403689874807819, + "grad_norm": 3.9238193035125732, + "learning_rate": 2.1953896816684962e-07, + "loss": 1.076, + "mean_token_accuracy": 0.6880336403846741, + "num_tokens": 10105073.0, + "step": 401 + }, + { + "epoch": 0.044146716450691854, + "grad_norm": 3.390698194503784, + "learning_rate": 2.200878155872667e-07, + "loss": 1.1983, + "mean_token_accuracy": 0.6605377197265625, + "num_tokens": 10136473.0, + "step": 402 + }, + { + "epoch": 0.044256534153305514, + "grad_norm": 3.6384503841400146, + "learning_rate": 2.2063666300768385e-07, + "loss": 1.2262, + "mean_token_accuracy": 0.6470347046852112, + "num_tokens": 10162081.0, + "step": 403 + }, + { + "epoch": 0.044366351855919174, + "grad_norm": 3.6825053691864014, + "learning_rate": 2.21185510428101e-07, + "loss": 1.2996, + "mean_token_accuracy": 0.6433191895484924, + "num_tokens": 10187144.0, + "step": 404 + }, + { + "epoch": 0.044476169558532834, + "grad_norm": 3.0361132621765137, + "learning_rate": 2.217343578485181e-07, + "loss": 1.2231, + "mean_token_accuracy": 0.6528632640838623, + "num_tokens": 10217154.0, + "step": 405 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 4.185213088989258, + "learning_rate": 2.2228320526893525e-07, + "loss": 1.2365, + "mean_token_accuracy": 0.646064281463623, + "num_tokens": 10241834.0, + "step": 406 + }, + { + "epoch": 0.04469580496376016, + "grad_norm": 3.1377105712890625, + "learning_rate": 2.2283205268935233e-07, + "loss": 1.1732, + "mean_token_accuracy": 0.671629786491394, + "num_tokens": 10269837.0, + "step": 407 + }, + { + "epoch": 0.04480562266637382, + "grad_norm": 3.3604798316955566, + "learning_rate": 2.2338090010976948e-07, + "loss": 1.0925, + "mean_token_accuracy": 0.6859331130981445, + "num_tokens": 10293749.0, + "step": 408 + }, + { + "epoch": 0.04491544036898748, + "grad_norm": 3.363783359527588, + "learning_rate": 2.239297475301866e-07, + "loss": 1.1291, + "mean_token_accuracy": 0.6714859008789062, + "num_tokens": 10318920.0, + "step": 409 + }, + { + "epoch": 0.04502525807160114, + "grad_norm": 3.477097988128662, + "learning_rate": 2.2447859495060373e-07, + "loss": 1.192, + "mean_token_accuracy": 0.6673116683959961, + "num_tokens": 10343933.0, + "step": 410 + }, + { + "epoch": 0.0451350757742148, + "grad_norm": 3.6548237800598145, + "learning_rate": 2.2502744237102085e-07, + "loss": 1.1868, + "mean_token_accuracy": 0.6663385629653931, + "num_tokens": 10372066.0, + "step": 411 + }, + { + "epoch": 0.04524489347682847, + "grad_norm": 3.4946768283843994, + "learning_rate": 2.2557628979143796e-07, + "loss": 1.1613, + "mean_token_accuracy": 0.6771502494812012, + "num_tokens": 10394367.0, + "step": 412 + }, + { + "epoch": 0.04535471117944213, + "grad_norm": 3.7518913745880127, + "learning_rate": 2.2612513721185508e-07, + "loss": 1.159, + "mean_token_accuracy": 0.6613901853561401, + "num_tokens": 10418631.0, + "step": 413 + }, + { + "epoch": 0.04546452888205579, + "grad_norm": 4.314685821533203, + "learning_rate": 2.2667398463227222e-07, + "loss": 1.1217, + "mean_token_accuracy": 0.6793466806411743, + "num_tokens": 10437430.0, + "step": 414 + }, + { + "epoch": 0.045574346584669446, + "grad_norm": 3.7095284461975098, + "learning_rate": 2.2722283205268934e-07, + "loss": 1.1885, + "mean_token_accuracy": 0.6562261581420898, + "num_tokens": 10463588.0, + "step": 415 + }, + { + "epoch": 0.04568416428728311, + "grad_norm": 3.4244861602783203, + "learning_rate": 2.2777167947310648e-07, + "loss": 1.1952, + "mean_token_accuracy": 0.6601738929748535, + "num_tokens": 10493842.0, + "step": 416 + }, + { + "epoch": 0.04579398198989677, + "grad_norm": 4.27124547958374, + "learning_rate": 2.283205268935236e-07, + "loss": 1.1743, + "mean_token_accuracy": 0.6693950891494751, + "num_tokens": 10512770.0, + "step": 417 + }, + { + "epoch": 0.04590379969251043, + "grad_norm": 3.4344122409820557, + "learning_rate": 2.288693743139407e-07, + "loss": 1.1794, + "mean_token_accuracy": 0.6695110201835632, + "num_tokens": 10543026.0, + "step": 418 + }, + { + "epoch": 0.04601361739512409, + "grad_norm": 3.537367105484009, + "learning_rate": 2.2941822173435785e-07, + "loss": 1.1571, + "mean_token_accuracy": 0.6678175926208496, + "num_tokens": 10570802.0, + "step": 419 + }, + { + "epoch": 0.04612343509773775, + "grad_norm": 3.5652360916137695, + "learning_rate": 2.2996706915477496e-07, + "loss": 1.1459, + "mean_token_accuracy": 0.6716563105583191, + "num_tokens": 10593231.0, + "step": 420 + }, + { + "epoch": 0.04623325280035142, + "grad_norm": 3.48106050491333, + "learning_rate": 2.3051591657519208e-07, + "loss": 1.1028, + "mean_token_accuracy": 0.67671138048172, + "num_tokens": 10617762.0, + "step": 421 + }, + { + "epoch": 0.04634307050296508, + "grad_norm": 3.8825907707214355, + "learning_rate": 2.310647639956092e-07, + "loss": 1.2895, + "mean_token_accuracy": 0.6335849165916443, + "num_tokens": 10641835.0, + "step": 422 + }, + { + "epoch": 0.04645288820557874, + "grad_norm": 3.540512800216675, + "learning_rate": 2.3161361141602634e-07, + "loss": 1.1943, + "mean_token_accuracy": 0.6654818058013916, + "num_tokens": 10665324.0, + "step": 423 + }, + { + "epoch": 0.0465627059081924, + "grad_norm": 3.950361967086792, + "learning_rate": 2.3216245883644345e-07, + "loss": 1.0592, + "mean_token_accuracy": 0.6951290369033813, + "num_tokens": 10683787.0, + "step": 424 + }, + { + "epoch": 0.04667252361080606, + "grad_norm": 3.6815521717071533, + "learning_rate": 2.327113062568606e-07, + "loss": 1.1888, + "mean_token_accuracy": 0.6576477289199829, + "num_tokens": 10705686.0, + "step": 425 + }, + { + "epoch": 0.046782341313419726, + "grad_norm": 3.192293405532837, + "learning_rate": 2.3326015367727768e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.6860541701316833, + "num_tokens": 10730247.0, + "step": 426 + }, + { + "epoch": 0.046892159016033386, + "grad_norm": 3.4363794326782227, + "learning_rate": 2.3380900109769482e-07, + "loss": 1.2403, + "mean_token_accuracy": 0.661648154258728, + "num_tokens": 10755062.0, + "step": 427 + }, + { + "epoch": 0.047001976718647046, + "grad_norm": 3.287040948867798, + "learning_rate": 2.3435784851811197e-07, + "loss": 1.128, + "mean_token_accuracy": 0.6690882444381714, + "num_tokens": 10781687.0, + "step": 428 + }, + { + "epoch": 0.047111794421260705, + "grad_norm": 2.9648876190185547, + "learning_rate": 2.3490669593852908e-07, + "loss": 1.144, + "mean_token_accuracy": 0.6683000922203064, + "num_tokens": 10809897.0, + "step": 429 + }, + { + "epoch": 0.047221612123874365, + "grad_norm": 3.373667001724243, + "learning_rate": 2.3545554335894622e-07, + "loss": 1.2322, + "mean_token_accuracy": 0.6441100835800171, + "num_tokens": 10838673.0, + "step": 430 + }, + { + "epoch": 0.04733142982648803, + "grad_norm": 3.445230484008789, + "learning_rate": 2.360043907793633e-07, + "loss": 1.1421, + "mean_token_accuracy": 0.6742135286331177, + "num_tokens": 10862982.0, + "step": 431 + }, + { + "epoch": 0.04744124752910169, + "grad_norm": 2.983011245727539, + "learning_rate": 2.3655323819978045e-07, + "loss": 1.1977, + "mean_token_accuracy": 0.6517496705055237, + "num_tokens": 10890805.0, + "step": 432 + }, + { + "epoch": 0.04755106523171535, + "grad_norm": 3.050687789916992, + "learning_rate": 2.3710208562019757e-07, + "loss": 1.138, + "mean_token_accuracy": 0.674632728099823, + "num_tokens": 10916204.0, + "step": 433 + }, + { + "epoch": 0.04766088293432901, + "grad_norm": 2.6572771072387695, + "learning_rate": 2.376509330406147e-07, + "loss": 1.2228, + "mean_token_accuracy": 0.647578239440918, + "num_tokens": 10947517.0, + "step": 434 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 3.7540040016174316, + "learning_rate": 2.3819978046103183e-07, + "loss": 1.16, + "mean_token_accuracy": 0.6633296012878418, + "num_tokens": 10968120.0, + "step": 435 + }, + { + "epoch": 0.04788051833955634, + "grad_norm": 3.5619757175445557, + "learning_rate": 2.3874862788144894e-07, + "loss": 1.1196, + "mean_token_accuracy": 0.6780523061752319, + "num_tokens": 10988431.0, + "step": 436 + }, + { + "epoch": 0.04799033604217, + "grad_norm": 3.3967723846435547, + "learning_rate": 2.3929747530186606e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6835830807685852, + "num_tokens": 11017309.0, + "step": 437 + }, + { + "epoch": 0.04810015374478366, + "grad_norm": 4.332333564758301, + "learning_rate": 2.3984632272228317e-07, + "loss": 1.209, + "mean_token_accuracy": 0.6522250771522522, + "num_tokens": 11037739.0, + "step": 438 + }, + { + "epoch": 0.04820997144739732, + "grad_norm": 2.9753599166870117, + "learning_rate": 2.4039517014270034e-07, + "loss": 1.169, + "mean_token_accuracy": 0.6581684350967407, + "num_tokens": 11066508.0, + "step": 439 + }, + { + "epoch": 0.048319789150010985, + "grad_norm": 4.106347560882568, + "learning_rate": 2.4094401756311745e-07, + "loss": 1.2081, + "mean_token_accuracy": 0.6536073684692383, + "num_tokens": 11086018.0, + "step": 440 + }, + { + "epoch": 0.048429606852624645, + "grad_norm": 3.216214895248413, + "learning_rate": 2.4149286498353457e-07, + "loss": 1.1826, + "mean_token_accuracy": 0.6562252044677734, + "num_tokens": 11115561.0, + "step": 441 + }, + { + "epoch": 0.048539424555238304, + "grad_norm": 3.5019373893737793, + "learning_rate": 2.420417124039517e-07, + "loss": 1.1212, + "mean_token_accuracy": 0.6736428737640381, + "num_tokens": 11137344.0, + "step": 442 + }, + { + "epoch": 0.048649242257851964, + "grad_norm": 3.094752073287964, + "learning_rate": 2.425905598243688e-07, + "loss": 1.226, + "mean_token_accuracy": 0.6498067378997803, + "num_tokens": 11167951.0, + "step": 443 + }, + { + "epoch": 0.048759059960465624, + "grad_norm": 2.926076889038086, + "learning_rate": 2.431394072447859e-07, + "loss": 1.1454, + "mean_token_accuracy": 0.674740195274353, + "num_tokens": 11195777.0, + "step": 444 + }, + { + "epoch": 0.04886887766307929, + "grad_norm": 3.252802848815918, + "learning_rate": 2.436882546652031e-07, + "loss": 1.1486, + "mean_token_accuracy": 0.6732160449028015, + "num_tokens": 11219214.0, + "step": 445 + }, + { + "epoch": 0.04897869536569295, + "grad_norm": 3.2753922939300537, + "learning_rate": 2.442371020856202e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6872657537460327, + "num_tokens": 11242287.0, + "step": 446 + }, + { + "epoch": 0.04908851306830661, + "grad_norm": 3.6781997680664062, + "learning_rate": 2.447859495060373e-07, + "loss": 1.1683, + "mean_token_accuracy": 0.6609359383583069, + "num_tokens": 11262035.0, + "step": 447 + }, + { + "epoch": 0.04919833077092027, + "grad_norm": 3.1829910278320312, + "learning_rate": 2.4533479692645443e-07, + "loss": 1.1614, + "mean_token_accuracy": 0.6603661775588989, + "num_tokens": 11283381.0, + "step": 448 + }, + { + "epoch": 0.04930814847353393, + "grad_norm": 3.1999590396881104, + "learning_rate": 2.4588364434687154e-07, + "loss": 1.1446, + "mean_token_accuracy": 0.6707285642623901, + "num_tokens": 11308124.0, + "step": 449 + }, + { + "epoch": 0.0494179661761476, + "grad_norm": 3.116117477416992, + "learning_rate": 2.4643249176728866e-07, + "loss": 1.1292, + "mean_token_accuracy": 0.6790860295295715, + "num_tokens": 11334948.0, + "step": 450 + }, + { + "epoch": 0.04952778387876126, + "grad_norm": 3.5400426387786865, + "learning_rate": 2.4698133918770583e-07, + "loss": 1.1999, + "mean_token_accuracy": 0.6556198596954346, + "num_tokens": 11357390.0, + "step": 451 + }, + { + "epoch": 0.04963760158137492, + "grad_norm": 3.4173827171325684, + "learning_rate": 2.4753018660812294e-07, + "loss": 1.134, + "mean_token_accuracy": 0.6696351766586304, + "num_tokens": 11381012.0, + "step": 452 + }, + { + "epoch": 0.04974741928398858, + "grad_norm": 3.193284749984741, + "learning_rate": 2.4807903402854006e-07, + "loss": 1.1835, + "mean_token_accuracy": 0.6553693413734436, + "num_tokens": 11405970.0, + "step": 453 + }, + { + "epoch": 0.049857236986602244, + "grad_norm": 3.7147274017333984, + "learning_rate": 2.486278814489572e-07, + "loss": 1.1469, + "mean_token_accuracy": 0.6760126948356628, + "num_tokens": 11424518.0, + "step": 454 + }, + { + "epoch": 0.0499670546892159, + "grad_norm": 3.3764281272888184, + "learning_rate": 2.491767288693743e-07, + "loss": 1.1058, + "mean_token_accuracy": 0.6754143238067627, + "num_tokens": 11444503.0, + "step": 455 + }, + { + "epoch": 0.05007687239182956, + "grad_norm": 3.1833486557006836, + "learning_rate": 2.4972557628979146e-07, + "loss": 1.2508, + "mean_token_accuracy": 0.6422919631004333, + "num_tokens": 11474508.0, + "step": 456 + }, + { + "epoch": 0.05018669009444322, + "grad_norm": 2.759197950363159, + "learning_rate": 2.5027442371020857e-07, + "loss": 1.1429, + "mean_token_accuracy": 0.6870157122612, + "num_tokens": 11501796.0, + "step": 457 + }, + { + "epoch": 0.05029650779705688, + "grad_norm": 3.181910276412964, + "learning_rate": 2.508232711306257e-07, + "loss": 1.1828, + "mean_token_accuracy": 0.6643320322036743, + "num_tokens": 11525034.0, + "step": 458 + }, + { + "epoch": 0.05040632549967055, + "grad_norm": 2.8639094829559326, + "learning_rate": 2.513721185510428e-07, + "loss": 1.1626, + "mean_token_accuracy": 0.6731176972389221, + "num_tokens": 11552632.0, + "step": 459 + }, + { + "epoch": 0.05051614320228421, + "grad_norm": 2.4497454166412354, + "learning_rate": 2.519209659714599e-07, + "loss": 1.2159, + "mean_token_accuracy": 0.6503839492797852, + "num_tokens": 11587271.0, + "step": 460 + }, + { + "epoch": 0.05062596090489787, + "grad_norm": 2.666222095489502, + "learning_rate": 2.524698133918771e-07, + "loss": 1.104, + "mean_token_accuracy": 0.6771174669265747, + "num_tokens": 11616436.0, + "step": 461 + }, + { + "epoch": 0.05073577860751153, + "grad_norm": 3.520413398742676, + "learning_rate": 2.5301866081229415e-07, + "loss": 1.1726, + "mean_token_accuracy": 0.6649978160858154, + "num_tokens": 11640386.0, + "step": 462 + }, + { + "epoch": 0.05084559631012519, + "grad_norm": 2.9557013511657715, + "learning_rate": 2.5356750823271126e-07, + "loss": 1.1384, + "mean_token_accuracy": 0.6724940538406372, + "num_tokens": 11667826.0, + "step": 463 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 3.17104172706604, + "learning_rate": 2.5411635565312843e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6654286980628967, + "num_tokens": 11692407.0, + "step": 464 + }, + { + "epoch": 0.051065231715352516, + "grad_norm": 2.78537654876709, + "learning_rate": 2.5466520307354555e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6812067031860352, + "num_tokens": 11723265.0, + "step": 465 + }, + { + "epoch": 0.051175049417966176, + "grad_norm": 3.492644786834717, + "learning_rate": 2.5521405049396266e-07, + "loss": 1.1016, + "mean_token_accuracy": 0.678255558013916, + "num_tokens": 11744666.0, + "step": 466 + }, + { + "epoch": 0.051284867120579836, + "grad_norm": 3.862097978591919, + "learning_rate": 2.557628979143798e-07, + "loss": 1.1313, + "mean_token_accuracy": 0.6715521812438965, + "num_tokens": 11766376.0, + "step": 467 + }, + { + "epoch": 0.051394684823193496, + "grad_norm": 3.1644558906555176, + "learning_rate": 2.5631174533479695e-07, + "loss": 1.0266, + "mean_token_accuracy": 0.6967303156852722, + "num_tokens": 11790042.0, + "step": 468 + }, + { + "epoch": 0.05150450252580716, + "grad_norm": 3.120225429534912, + "learning_rate": 2.5686059275521406e-07, + "loss": 1.1674, + "mean_token_accuracy": 0.66709965467453, + "num_tokens": 11814692.0, + "step": 469 + }, + { + "epoch": 0.05161432022842082, + "grad_norm": 2.8900654315948486, + "learning_rate": 2.574094401756312e-07, + "loss": 1.1609, + "mean_token_accuracy": 0.6699134111404419, + "num_tokens": 11839566.0, + "step": 470 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 3.02691912651062, + "learning_rate": 2.5795828759604824e-07, + "loss": 1.3133, + "mean_token_accuracy": 0.6286238431930542, + "num_tokens": 11870048.0, + "step": 471 + }, + { + "epoch": 0.05183395563364814, + "grad_norm": 3.115713596343994, + "learning_rate": 2.585071350164654e-07, + "loss": 1.1342, + "mean_token_accuracy": 0.6664977073669434, + "num_tokens": 11891950.0, + "step": 472 + }, + { + "epoch": 0.05194377333626181, + "grad_norm": 3.0895652770996094, + "learning_rate": 2.590559824368825e-07, + "loss": 1.0854, + "mean_token_accuracy": 0.684738278388977, + "num_tokens": 11916654.0, + "step": 473 + }, + { + "epoch": 0.05205359103887547, + "grad_norm": 3.161043882369995, + "learning_rate": 2.5960482985729964e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6791590452194214, + "num_tokens": 11938774.0, + "step": 474 + }, + { + "epoch": 0.05216340874148913, + "grad_norm": 2.8730859756469727, + "learning_rate": 2.601536772777168e-07, + "loss": 1.1606, + "mean_token_accuracy": 0.6738029718399048, + "num_tokens": 11964950.0, + "step": 475 + }, + { + "epoch": 0.05227322644410279, + "grad_norm": 3.2363743782043457, + "learning_rate": 2.607025246981339e-07, + "loss": 1.1619, + "mean_token_accuracy": 0.6671016216278076, + "num_tokens": 11992016.0, + "step": 476 + }, + { + "epoch": 0.05238304414671645, + "grad_norm": 3.110344171524048, + "learning_rate": 2.6125137211855104e-07, + "loss": 1.2204, + "mean_token_accuracy": 0.660880446434021, + "num_tokens": 12017580.0, + "step": 477 + }, + { + "epoch": 0.052492861849330115, + "grad_norm": 3.0819740295410156, + "learning_rate": 2.6180021953896815e-07, + "loss": 1.1012, + "mean_token_accuracy": 0.6823756098747253, + "num_tokens": 12039157.0, + "step": 478 + }, + { + "epoch": 0.052602679551943775, + "grad_norm": 2.9258973598480225, + "learning_rate": 2.623490669593853e-07, + "loss": 1.2351, + "mean_token_accuracy": 0.6468536853790283, + "num_tokens": 12063720.0, + "step": 479 + }, + { + "epoch": 0.052712497254557435, + "grad_norm": 2.8339898586273193, + "learning_rate": 2.6289791437980244e-07, + "loss": 1.065, + "mean_token_accuracy": 0.688743531703949, + "num_tokens": 12087560.0, + "step": 480 + }, + { + "epoch": 0.052822314957171095, + "grad_norm": 2.8716561794281006, + "learning_rate": 2.634467618002195e-07, + "loss": 1.0816, + "mean_token_accuracy": 0.6896064281463623, + "num_tokens": 12112165.0, + "step": 481 + }, + { + "epoch": 0.052932132659784754, + "grad_norm": 2.790623664855957, + "learning_rate": 2.639956092206366e-07, + "loss": 1.0206, + "mean_token_accuracy": 0.6995697021484375, + "num_tokens": 12138793.0, + "step": 482 + }, + { + "epoch": 0.05304195036239842, + "grad_norm": 2.65053653717041, + "learning_rate": 2.645444566410538e-07, + "loss": 1.0876, + "mean_token_accuracy": 0.686350405216217, + "num_tokens": 12166636.0, + "step": 483 + }, + { + "epoch": 0.05315176806501208, + "grad_norm": 2.608887195587158, + "learning_rate": 2.650933040614709e-07, + "loss": 1.27, + "mean_token_accuracy": 0.6442131996154785, + "num_tokens": 12196135.0, + "step": 484 + }, + { + "epoch": 0.05326158576762574, + "grad_norm": 3.4527857303619385, + "learning_rate": 2.65642151481888e-07, + "loss": 1.1655, + "mean_token_accuracy": 0.6638915538787842, + "num_tokens": 12216394.0, + "step": 485 + }, + { + "epoch": 0.0533714034702394, + "grad_norm": 3.2262301445007324, + "learning_rate": 2.6619099890230513e-07, + "loss": 1.2333, + "mean_token_accuracy": 0.6527896523475647, + "num_tokens": 12241018.0, + "step": 486 + }, + { + "epoch": 0.05348122117285306, + "grad_norm": 3.285857915878296, + "learning_rate": 2.667398463227223e-07, + "loss": 1.073, + "mean_token_accuracy": 0.6820858716964722, + "num_tokens": 12263600.0, + "step": 487 + }, + { + "epoch": 0.05359103887546673, + "grad_norm": 2.6872103214263916, + "learning_rate": 2.672886937431394e-07, + "loss": 1.1219, + "mean_token_accuracy": 0.6784105896949768, + "num_tokens": 12292699.0, + "step": 488 + }, + { + "epoch": 0.05370085657808039, + "grad_norm": 3.071965456008911, + "learning_rate": 2.678375411635565e-07, + "loss": 1.092, + "mean_token_accuracy": 0.6770820021629333, + "num_tokens": 12316657.0, + "step": 489 + }, + { + "epoch": 0.05381067428069405, + "grad_norm": 3.0414609909057617, + "learning_rate": 2.6838638858397364e-07, + "loss": 1.0915, + "mean_token_accuracy": 0.6795448064804077, + "num_tokens": 12339400.0, + "step": 490 + }, + { + "epoch": 0.05392049198330771, + "grad_norm": 2.9798316955566406, + "learning_rate": 2.6893523600439076e-07, + "loss": 1.1457, + "mean_token_accuracy": 0.6705843210220337, + "num_tokens": 12364025.0, + "step": 491 + }, + { + "epoch": 0.054030309685921374, + "grad_norm": 3.3320274353027344, + "learning_rate": 2.6948408342480787e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6720086932182312, + "num_tokens": 12387214.0, + "step": 492 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 2.7505228519439697, + "learning_rate": 2.70032930845225e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6805022954940796, + "num_tokens": 12414078.0, + "step": 493 + }, + { + "epoch": 0.054249945091148694, + "grad_norm": 3.063718318939209, + "learning_rate": 2.7058177826564215e-07, + "loss": 1.1875, + "mean_token_accuracy": 0.6640757322311401, + "num_tokens": 12441046.0, + "step": 494 + }, + { + "epoch": 0.05435976279376235, + "grad_norm": 3.2623517513275146, + "learning_rate": 2.7113062568605927e-07, + "loss": 1.1163, + "mean_token_accuracy": 0.6752200722694397, + "num_tokens": 12463893.0, + "step": 495 + }, + { + "epoch": 0.05446958049637601, + "grad_norm": 2.884359359741211, + "learning_rate": 2.716794731064764e-07, + "loss": 1.1562, + "mean_token_accuracy": 0.6683013439178467, + "num_tokens": 12491277.0, + "step": 496 + }, + { + "epoch": 0.05457939819898968, + "grad_norm": 3.2793467044830322, + "learning_rate": 2.722283205268935e-07, + "loss": 1.0733, + "mean_token_accuracy": 0.6881523728370667, + "num_tokens": 12511922.0, + "step": 497 + }, + { + "epoch": 0.05468921590160334, + "grad_norm": 2.71043062210083, + "learning_rate": 2.7277716794731067e-07, + "loss": 1.2283, + "mean_token_accuracy": 0.6487423181533813, + "num_tokens": 12540327.0, + "step": 498 + }, + { + "epoch": 0.054799033604217, + "grad_norm": 3.007193088531494, + "learning_rate": 2.733260153677278e-07, + "loss": 1.1524, + "mean_token_accuracy": 0.6689199805259705, + "num_tokens": 12566273.0, + "step": 499 + }, + { + "epoch": 0.05490885130683066, + "grad_norm": 3.0157957077026367, + "learning_rate": 2.7387486278814485e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6781295537948608, + "num_tokens": 12588318.0, + "step": 500 + }, + { + "epoch": 0.05501866900944432, + "grad_norm": 2.6994736194610596, + "learning_rate": 2.74423710208562e-07, + "loss": 1.1173, + "mean_token_accuracy": 0.6748077869415283, + "num_tokens": 12612897.0, + "step": 501 + }, + { + "epoch": 0.055128486712057986, + "grad_norm": 2.5169265270233154, + "learning_rate": 2.7497255762897913e-07, + "loss": 1.18, + "mean_token_accuracy": 0.6554571986198425, + "num_tokens": 12640911.0, + "step": 502 + }, + { + "epoch": 0.055238304414671646, + "grad_norm": 3.1675124168395996, + "learning_rate": 2.7552140504939624e-07, + "loss": 1.167, + "mean_token_accuracy": 0.6536152362823486, + "num_tokens": 12661700.0, + "step": 503 + }, + { + "epoch": 0.055348122117285306, + "grad_norm": 2.5971412658691406, + "learning_rate": 2.7607025246981336e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.6761394739151001, + "num_tokens": 12689302.0, + "step": 504 + }, + { + "epoch": 0.055457939819898966, + "grad_norm": 3.5102386474609375, + "learning_rate": 2.7661909989023053e-07, + "loss": 1.0306, + "mean_token_accuracy": 0.6957042217254639, + "num_tokens": 12710512.0, + "step": 505 + }, + { + "epoch": 0.055567757522512626, + "grad_norm": 2.8974084854125977, + "learning_rate": 2.7716794731064764e-07, + "loss": 1.1835, + "mean_token_accuracy": 0.6641353368759155, + "num_tokens": 12737155.0, + "step": 506 + }, + { + "epoch": 0.05567757522512629, + "grad_norm": 3.195415735244751, + "learning_rate": 2.7771679473106476e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6837525367736816, + "num_tokens": 12759815.0, + "step": 507 + }, + { + "epoch": 0.05578739292773995, + "grad_norm": 3.1922717094421387, + "learning_rate": 2.7826564215148187e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6683025360107422, + "num_tokens": 12781197.0, + "step": 508 + }, + { + "epoch": 0.05589721063035361, + "grad_norm": 2.858783006668091, + "learning_rate": 2.78814489571899e-07, + "loss": 1.0216, + "mean_token_accuracy": 0.6911294460296631, + "num_tokens": 12802658.0, + "step": 509 + }, + { + "epoch": 0.05600702833296727, + "grad_norm": 3.0347704887390137, + "learning_rate": 2.793633369923161e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6989728212356567, + "num_tokens": 12822221.0, + "step": 510 + }, + { + "epoch": 0.05611684603558094, + "grad_norm": 2.7335269451141357, + "learning_rate": 2.799121844127332e-07, + "loss": 1.1728, + "mean_token_accuracy": 0.6628491878509521, + "num_tokens": 12847669.0, + "step": 511 + }, + { + "epoch": 0.0562266637381946, + "grad_norm": 3.074178695678711, + "learning_rate": 2.804610318331504e-07, + "loss": 1.1265, + "mean_token_accuracy": 0.6763770580291748, + "num_tokens": 12869865.0, + "step": 512 + }, + { + "epoch": 0.05633648144080826, + "grad_norm": 3.025353193283081, + "learning_rate": 2.810098792535675e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.6922295093536377, + "num_tokens": 12891072.0, + "step": 513 + }, + { + "epoch": 0.05644629914342192, + "grad_norm": 3.5103840827941895, + "learning_rate": 2.815587266739846e-07, + "loss": 1.0966, + "mean_token_accuracy": 0.6786463856697083, + "num_tokens": 12908700.0, + "step": 514 + }, + { + "epoch": 0.05655611684603558, + "grad_norm": 3.14211106300354, + "learning_rate": 2.8210757409440173e-07, + "loss": 1.0976, + "mean_token_accuracy": 0.6803863048553467, + "num_tokens": 12931023.0, + "step": 515 + }, + { + "epoch": 0.056665934548649245, + "grad_norm": 2.6387245655059814, + "learning_rate": 2.826564215148189e-07, + "loss": 1.1955, + "mean_token_accuracy": 0.6508357524871826, + "num_tokens": 12958356.0, + "step": 516 + }, + { + "epoch": 0.056775752251262905, + "grad_norm": 2.932206392288208, + "learning_rate": 2.83205268935236e-07, + "loss": 1.0753, + "mean_token_accuracy": 0.6937113404273987, + "num_tokens": 12980723.0, + "step": 517 + }, + { + "epoch": 0.056885569953876565, + "grad_norm": 2.565042495727539, + "learning_rate": 2.8375411635565313e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.6758621335029602, + "num_tokens": 13012119.0, + "step": 518 + }, + { + "epoch": 0.056995387656490225, + "grad_norm": 2.834526538848877, + "learning_rate": 2.843029637760702e-07, + "loss": 1.1888, + "mean_token_accuracy": 0.6590231657028198, + "num_tokens": 13037899.0, + "step": 519 + }, + { + "epoch": 0.057105205359103885, + "grad_norm": 2.714677572250366, + "learning_rate": 2.8485181119648736e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.7010228633880615, + "num_tokens": 13063443.0, + "step": 520 + }, + { + "epoch": 0.05721502306171755, + "grad_norm": 2.8339030742645264, + "learning_rate": 2.854006586169045e-07, + "loss": 1.1242, + "mean_token_accuracy": 0.6759758591651917, + "num_tokens": 13087624.0, + "step": 521 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 2.4563770294189453, + "learning_rate": 2.859495060373216e-07, + "loss": 1.1823, + "mean_token_accuracy": 0.6648232936859131, + "num_tokens": 13117947.0, + "step": 522 + }, + { + "epoch": 0.05743465846694487, + "grad_norm": 2.7016854286193848, + "learning_rate": 2.8649835345773876e-07, + "loss": 1.2058, + "mean_token_accuracy": 0.6620521545410156, + "num_tokens": 13142426.0, + "step": 523 + }, + { + "epoch": 0.05754447616955853, + "grad_norm": 2.8474929332733154, + "learning_rate": 2.870472008781559e-07, + "loss": 1.1577, + "mean_token_accuracy": 0.6612826585769653, + "num_tokens": 13164201.0, + "step": 524 + }, + { + "epoch": 0.05765429387217219, + "grad_norm": 3.0481925010681152, + "learning_rate": 2.87596048298573e-07, + "loss": 1.1779, + "mean_token_accuracy": 0.6579164266586304, + "num_tokens": 13186685.0, + "step": 525 + }, + { + "epoch": 0.05776411157478586, + "grad_norm": 3.155771017074585, + "learning_rate": 2.881448957189901e-07, + "loss": 1.0934, + "mean_token_accuracy": 0.6675970554351807, + "num_tokens": 13206879.0, + "step": 526 + }, + { + "epoch": 0.05787392927739952, + "grad_norm": 2.866953134536743, + "learning_rate": 2.886937431394073e-07, + "loss": 1.1357, + "mean_token_accuracy": 0.666412889957428, + "num_tokens": 13230075.0, + "step": 527 + }, + { + "epoch": 0.05798374698001318, + "grad_norm": 2.7802484035491943, + "learning_rate": 2.8924259055982434e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6851283311843872, + "num_tokens": 13251929.0, + "step": 528 + }, + { + "epoch": 0.05809356468262684, + "grad_norm": 3.2223222255706787, + "learning_rate": 2.8979143798024145e-07, + "loss": 1.1103, + "mean_token_accuracy": 0.677588939666748, + "num_tokens": 13270483.0, + "step": 529 + }, + { + "epoch": 0.058203382385240504, + "grad_norm": 2.7211976051330566, + "learning_rate": 2.9034028540065857e-07, + "loss": 1.1148, + "mean_token_accuracy": 0.6702684164047241, + "num_tokens": 13295014.0, + "step": 530 + }, + { + "epoch": 0.058313200087854164, + "grad_norm": 2.7614030838012695, + "learning_rate": 2.9088913282107574e-07, + "loss": 1.0862, + "mean_token_accuracy": 0.6816624402999878, + "num_tokens": 13319296.0, + "step": 531 + }, + { + "epoch": 0.058423017790467824, + "grad_norm": 2.7522804737091064, + "learning_rate": 2.9143798024149285e-07, + "loss": 1.1552, + "mean_token_accuracy": 0.6718891859054565, + "num_tokens": 13344937.0, + "step": 532 + }, + { + "epoch": 0.058532835493081484, + "grad_norm": 2.704559326171875, + "learning_rate": 2.9198682766190997e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.6860990524291992, + "num_tokens": 13371089.0, + "step": 533 + }, + { + "epoch": 0.058642653195695144, + "grad_norm": 2.9575717449188232, + "learning_rate": 2.9253567508232713e-07, + "loss": 1.0037, + "mean_token_accuracy": 0.7025306224822998, + "num_tokens": 13390716.0, + "step": 534 + }, + { + "epoch": 0.05875247089830881, + "grad_norm": 3.3728537559509277, + "learning_rate": 2.9308452250274425e-07, + "loss": 1.108, + "mean_token_accuracy": 0.6794275045394897, + "num_tokens": 13408869.0, + "step": 535 + }, + { + "epoch": 0.05886228860092247, + "grad_norm": 2.8569343090057373, + "learning_rate": 2.9363336992316136e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6820759773254395, + "num_tokens": 13432069.0, + "step": 536 + }, + { + "epoch": 0.05897210630353613, + "grad_norm": 2.992858648300171, + "learning_rate": 2.941822173435785e-07, + "loss": 1.0486, + "mean_token_accuracy": 0.6903185844421387, + "num_tokens": 13452861.0, + "step": 537 + }, + { + "epoch": 0.05908192400614979, + "grad_norm": 2.4164109230041504, + "learning_rate": 2.947310647639956e-07, + "loss": 1.1083, + "mean_token_accuracy": 0.6807413697242737, + "num_tokens": 13483057.0, + "step": 538 + }, + { + "epoch": 0.05919174170876345, + "grad_norm": 2.362417221069336, + "learning_rate": 2.952799121844127e-07, + "loss": 1.1783, + "mean_token_accuracy": 0.6556383371353149, + "num_tokens": 13514057.0, + "step": 539 + }, + { + "epoch": 0.05930155941137712, + "grad_norm": 2.571406364440918, + "learning_rate": 2.958287596048298e-07, + "loss": 1.2071, + "mean_token_accuracy": 0.6529611349105835, + "num_tokens": 13543097.0, + "step": 540 + }, + { + "epoch": 0.059411377113990776, + "grad_norm": 2.924015760421753, + "learning_rate": 2.9637760702524694e-07, + "loss": 1.1852, + "mean_token_accuracy": 0.6611159443855286, + "num_tokens": 13566797.0, + "step": 541 + }, + { + "epoch": 0.059521194816604436, + "grad_norm": 2.436736822128296, + "learning_rate": 2.969264544456641e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6837626695632935, + "num_tokens": 13593919.0, + "step": 542 + }, + { + "epoch": 0.059631012519218096, + "grad_norm": 2.702781915664673, + "learning_rate": 2.974753018660812e-07, + "loss": 1.0398, + "mean_token_accuracy": 0.7032043933868408, + "num_tokens": 13617363.0, + "step": 543 + }, + { + "epoch": 0.059740830221831756, + "grad_norm": 2.5794763565063477, + "learning_rate": 2.9802414928649834e-07, + "loss": 1.1803, + "mean_token_accuracy": 0.6606963276863098, + "num_tokens": 13647092.0, + "step": 544 + }, + { + "epoch": 0.05985064792444542, + "grad_norm": 2.647167444229126, + "learning_rate": 2.9857299670691546e-07, + "loss": 1.1785, + "mean_token_accuracy": 0.660561203956604, + "num_tokens": 13673040.0, + "step": 545 + }, + { + "epoch": 0.05996046562705908, + "grad_norm": 2.6458144187927246, + "learning_rate": 2.991218441273326e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6748185157775879, + "num_tokens": 13701386.0, + "step": 546 + }, + { + "epoch": 0.06007028332967274, + "grad_norm": 2.4780240058898926, + "learning_rate": 2.996706915477497e-07, + "loss": 1.1179, + "mean_token_accuracy": 0.6739858388900757, + "num_tokens": 13730419.0, + "step": 547 + }, + { + "epoch": 0.0601801010322864, + "grad_norm": 2.223224639892578, + "learning_rate": 3.002195389681668e-07, + "loss": 1.2463, + "mean_token_accuracy": 0.639043927192688, + "num_tokens": 13767486.0, + "step": 548 + }, + { + "epoch": 0.06028991873490007, + "grad_norm": 2.6384291648864746, + "learning_rate": 3.0076838638858397e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.6789321303367615, + "num_tokens": 13792205.0, + "step": 549 + }, + { + "epoch": 0.06039973643751373, + "grad_norm": 2.6970303058624268, + "learning_rate": 3.013172338090011e-07, + "loss": 1.1402, + "mean_token_accuracy": 0.6692195534706116, + "num_tokens": 13820589.0, + "step": 550 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 2.495123863220215, + "learning_rate": 3.018660812294182e-07, + "loss": 1.0785, + "mean_token_accuracy": 0.681854248046875, + "num_tokens": 13848803.0, + "step": 551 + }, + { + "epoch": 0.06061937184274105, + "grad_norm": 2.5954532623291016, + "learning_rate": 3.024149286498353e-07, + "loss": 1.1646, + "mean_token_accuracy": 0.6687313318252563, + "num_tokens": 13873437.0, + "step": 552 + }, + { + "epoch": 0.06072918954535471, + "grad_norm": 2.422405958175659, + "learning_rate": 3.029637760702525e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6816842555999756, + "num_tokens": 13904541.0, + "step": 553 + }, + { + "epoch": 0.060839007247968376, + "grad_norm": 2.375737428665161, + "learning_rate": 3.035126234906696e-07, + "loss": 1.2294, + "mean_token_accuracy": 0.65022212266922, + "num_tokens": 13937451.0, + "step": 554 + }, + { + "epoch": 0.060948824950582035, + "grad_norm": 2.712329387664795, + "learning_rate": 3.040614709110867e-07, + "loss": 1.1375, + "mean_token_accuracy": 0.6751110553741455, + "num_tokens": 13962458.0, + "step": 555 + }, + { + "epoch": 0.061058642653195695, + "grad_norm": 3.123279094696045, + "learning_rate": 3.046103183315038e-07, + "loss": 1.155, + "mean_token_accuracy": 0.6698493361473083, + "num_tokens": 13981827.0, + "step": 556 + }, + { + "epoch": 0.061168460355809355, + "grad_norm": 2.7981765270233154, + "learning_rate": 3.0515916575192094e-07, + "loss": 1.1088, + "mean_token_accuracy": 0.684118926525116, + "num_tokens": 14003338.0, + "step": 557 + }, + { + "epoch": 0.061278278058423015, + "grad_norm": 2.3893299102783203, + "learning_rate": 3.0570801317233806e-07, + "loss": 1.1267, + "mean_token_accuracy": 0.6740380525588989, + "num_tokens": 14032114.0, + "step": 558 + }, + { + "epoch": 0.06138809576103668, + "grad_norm": 2.5671164989471436, + "learning_rate": 3.062568605927552e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6936612129211426, + "num_tokens": 14059423.0, + "step": 559 + }, + { + "epoch": 0.06149791346365034, + "grad_norm": 2.917459726333618, + "learning_rate": 3.0680570801317234e-07, + "loss": 1.1289, + "mean_token_accuracy": 0.6695317029953003, + "num_tokens": 14082654.0, + "step": 560 + }, + { + "epoch": 0.061607731166264, + "grad_norm": 2.6267001628875732, + "learning_rate": 3.0735455543358946e-07, + "loss": 1.2025, + "mean_token_accuracy": 0.6656270027160645, + "num_tokens": 14107549.0, + "step": 561 + }, + { + "epoch": 0.06171754886887766, + "grad_norm": 2.1146886348724365, + "learning_rate": 3.0790340285400657e-07, + "loss": 1.097, + "mean_token_accuracy": 0.6823914051055908, + "num_tokens": 14144377.0, + "step": 562 + }, + { + "epoch": 0.06182736657149132, + "grad_norm": 2.7620744705200195, + "learning_rate": 3.084522502744237e-07, + "loss": 1.105, + "mean_token_accuracy": 0.6763653755187988, + "num_tokens": 14167018.0, + "step": 563 + }, + { + "epoch": 0.06193718427410499, + "grad_norm": 2.178720235824585, + "learning_rate": 3.0900109769484086e-07, + "loss": 1.2454, + "mean_token_accuracy": 0.6430041193962097, + "num_tokens": 14201464.0, + "step": 564 + }, + { + "epoch": 0.06204700197671865, + "grad_norm": 2.598701238632202, + "learning_rate": 3.0954994511525797e-07, + "loss": 1.0763, + "mean_token_accuracy": 0.6771224737167358, + "num_tokens": 14226878.0, + "step": 565 + }, + { + "epoch": 0.06215681967933231, + "grad_norm": 2.900466203689575, + "learning_rate": 3.1009879253567503e-07, + "loss": 1.1329, + "mean_token_accuracy": 0.6738263964653015, + "num_tokens": 14248369.0, + "step": 566 + }, + { + "epoch": 0.06226663738194597, + "grad_norm": 2.1868507862091064, + "learning_rate": 3.1064763995609215e-07, + "loss": 1.1663, + "mean_token_accuracy": 0.6683948636054993, + "num_tokens": 14283791.0, + "step": 567 + }, + { + "epoch": 0.062376455084559634, + "grad_norm": 2.2570345401763916, + "learning_rate": 3.111964873765093e-07, + "loss": 1.1553, + "mean_token_accuracy": 0.6642422676086426, + "num_tokens": 14314122.0, + "step": 568 + }, + { + "epoch": 0.062486272787173294, + "grad_norm": 2.769221305847168, + "learning_rate": 3.1174533479692643e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6844580173492432, + "num_tokens": 14337013.0, + "step": 569 + }, + { + "epoch": 0.06259609048978695, + "grad_norm": 2.205935478210449, + "learning_rate": 3.1229418221734355e-07, + "loss": 1.2209, + "mean_token_accuracy": 0.6511257886886597, + "num_tokens": 14371415.0, + "step": 570 + }, + { + "epoch": 0.06270590819240061, + "grad_norm": 3.369499444961548, + "learning_rate": 3.128430296377607e-07, + "loss": 0.9643, + "mean_token_accuracy": 0.7079017162322998, + "num_tokens": 14389294.0, + "step": 571 + }, + { + "epoch": 0.06281572589501427, + "grad_norm": 2.831822633743286, + "learning_rate": 3.1339187705817783e-07, + "loss": 1.1337, + "mean_token_accuracy": 0.6577800512313843, + "num_tokens": 14413326.0, + "step": 572 + }, + { + "epoch": 0.06292554359762793, + "grad_norm": 2.956851005554199, + "learning_rate": 3.1394072447859495e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.6827040314674377, + "num_tokens": 14434554.0, + "step": 573 + }, + { + "epoch": 0.0630353613002416, + "grad_norm": 2.597419023513794, + "learning_rate": 3.1448957189901206e-07, + "loss": 1.1258, + "mean_token_accuracy": 0.6745144128799438, + "num_tokens": 14462317.0, + "step": 574 + }, + { + "epoch": 0.06314517900285525, + "grad_norm": 2.527735710144043, + "learning_rate": 3.1503841931942923e-07, + "loss": 1.1346, + "mean_token_accuracy": 0.669684886932373, + "num_tokens": 14491287.0, + "step": 575 + }, + { + "epoch": 0.06325499670546893, + "grad_norm": 2.932400941848755, + "learning_rate": 3.155872667398463e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6792353391647339, + "num_tokens": 14512509.0, + "step": 576 + }, + { + "epoch": 0.06336481440808259, + "grad_norm": 2.140928268432617, + "learning_rate": 3.161361141602634e-07, + "loss": 1.301, + "mean_token_accuracy": 0.6278937458992004, + "num_tokens": 14549026.0, + "step": 577 + }, + { + "epoch": 0.06347463211069625, + "grad_norm": 2.5364344120025635, + "learning_rate": 3.166849615806805e-07, + "loss": 1.0845, + "mean_token_accuracy": 0.6779723167419434, + "num_tokens": 14574004.0, + "step": 578 + }, + { + "epoch": 0.0635844498133099, + "grad_norm": 2.5326955318450928, + "learning_rate": 3.172338090010977e-07, + "loss": 1.1844, + "mean_token_accuracy": 0.6621726155281067, + "num_tokens": 14601577.0, + "step": 579 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 2.7136070728302, + "learning_rate": 3.177826564215148e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6878523826599121, + "num_tokens": 14623511.0, + "step": 580 + }, + { + "epoch": 0.06380408521853723, + "grad_norm": 2.9820828437805176, + "learning_rate": 3.183315038419319e-07, + "loss": 1.1058, + "mean_token_accuracy": 0.6772074699401855, + "num_tokens": 14644735.0, + "step": 581 + }, + { + "epoch": 0.06391390292115089, + "grad_norm": 2.55003023147583, + "learning_rate": 3.188803512623491e-07, + "loss": 1.2286, + "mean_token_accuracy": 0.6418269872665405, + "num_tokens": 14671382.0, + "step": 582 + }, + { + "epoch": 0.06402372062376455, + "grad_norm": 2.715261220932007, + "learning_rate": 3.194291986827662e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6751627922058105, + "num_tokens": 14694602.0, + "step": 583 + }, + { + "epoch": 0.0641335383263782, + "grad_norm": 3.0670294761657715, + "learning_rate": 3.199780461031833e-07, + "loss": 1.1682, + "mean_token_accuracy": 0.6814348697662354, + "num_tokens": 14715328.0, + "step": 584 + }, + { + "epoch": 0.06424335602899188, + "grad_norm": 2.538588047027588, + "learning_rate": 3.205268935236004e-07, + "loss": 1.215, + "mean_token_accuracy": 0.6438687443733215, + "num_tokens": 14742235.0, + "step": 585 + }, + { + "epoch": 0.06435317373160554, + "grad_norm": 3.2017407417297363, + "learning_rate": 3.2107574094401755e-07, + "loss": 1.1401, + "mean_token_accuracy": 0.6781082153320312, + "num_tokens": 14760793.0, + "step": 586 + }, + { + "epoch": 0.0644629914342192, + "grad_norm": 2.435380220413208, + "learning_rate": 3.2162458836443467e-07, + "loss": 1.0312, + "mean_token_accuracy": 0.7024374008178711, + "num_tokens": 14787649.0, + "step": 587 + }, + { + "epoch": 0.06457280913683286, + "grad_norm": 2.5156304836273193, + "learning_rate": 3.221734357848518e-07, + "loss": 1.1919, + "mean_token_accuracy": 0.664486289024353, + "num_tokens": 14815595.0, + "step": 588 + }, + { + "epoch": 0.06468262683944652, + "grad_norm": 2.6581053733825684, + "learning_rate": 3.227222832052689e-07, + "loss": 1.1281, + "mean_token_accuracy": 0.6749162077903748, + "num_tokens": 14841792.0, + "step": 589 + }, + { + "epoch": 0.06479244454206018, + "grad_norm": 2.2568202018737793, + "learning_rate": 3.2327113062568606e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.6903637647628784, + "num_tokens": 14872403.0, + "step": 590 + }, + { + "epoch": 0.06490226224467384, + "grad_norm": 2.8868701457977295, + "learning_rate": 3.238199780461032e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.6867451071739197, + "num_tokens": 14893233.0, + "step": 591 + }, + { + "epoch": 0.0650120799472875, + "grad_norm": 2.4932243824005127, + "learning_rate": 3.243688254665203e-07, + "loss": 1.0983, + "mean_token_accuracy": 0.6815844178199768, + "num_tokens": 14921066.0, + "step": 592 + }, + { + "epoch": 0.06512189764990116, + "grad_norm": 3.0789310932159424, + "learning_rate": 3.2491767288693746e-07, + "loss": 0.9137, + "mean_token_accuracy": 0.724494218826294, + "num_tokens": 14938996.0, + "step": 593 + }, + { + "epoch": 0.06523171535251482, + "grad_norm": 2.5066134929656982, + "learning_rate": 3.254665203073546e-07, + "loss": 0.9889, + "mean_token_accuracy": 0.7081429958343506, + "num_tokens": 14964834.0, + "step": 594 + }, + { + "epoch": 0.06534153305512849, + "grad_norm": 2.669464111328125, + "learning_rate": 3.2601536772777164e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6845757961273193, + "num_tokens": 14988623.0, + "step": 595 + }, + { + "epoch": 0.06545135075774215, + "grad_norm": 2.3219332695007324, + "learning_rate": 3.2656421514818876e-07, + "loss": 1.0524, + "mean_token_accuracy": 0.6949588060379028, + "num_tokens": 15021120.0, + "step": 596 + }, + { + "epoch": 0.06556116846035581, + "grad_norm": 2.386697769165039, + "learning_rate": 3.271130625686059e-07, + "loss": 1.1378, + "mean_token_accuracy": 0.6717516183853149, + "num_tokens": 15048275.0, + "step": 597 + }, + { + "epoch": 0.06567098616296947, + "grad_norm": 2.355808734893799, + "learning_rate": 3.2766190998902304e-07, + "loss": 1.1128, + "mean_token_accuracy": 0.6774501800537109, + "num_tokens": 15079038.0, + "step": 598 + }, + { + "epoch": 0.06578080386558313, + "grad_norm": 2.776193618774414, + "learning_rate": 3.2821075740944015e-07, + "loss": 1.1387, + "mean_token_accuracy": 0.6754657030105591, + "num_tokens": 15101182.0, + "step": 599 + }, + { + "epoch": 0.06589062156819679, + "grad_norm": 2.7841551303863525, + "learning_rate": 3.2875960482985727e-07, + "loss": 1.0969, + "mean_token_accuracy": 0.6803438067436218, + "num_tokens": 15123138.0, + "step": 600 + }, + { + "epoch": 0.06600043927081045, + "grad_norm": 2.518110513687134, + "learning_rate": 3.2930845225027444e-07, + "loss": 1.2081, + "mean_token_accuracy": 0.6573193073272705, + "num_tokens": 15151806.0, + "step": 601 + }, + { + "epoch": 0.06611025697342411, + "grad_norm": 2.5540590286254883, + "learning_rate": 3.2985729967069155e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.697426438331604, + "num_tokens": 15176717.0, + "step": 602 + }, + { + "epoch": 0.06622007467603777, + "grad_norm": 2.8025527000427246, + "learning_rate": 3.3040614709110867e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6824004054069519, + "num_tokens": 15199816.0, + "step": 603 + }, + { + "epoch": 0.06632989237865144, + "grad_norm": 2.1592917442321777, + "learning_rate": 3.3095499451152573e-07, + "loss": 1.1171, + "mean_token_accuracy": 0.6749006509780884, + "num_tokens": 15232676.0, + "step": 604 + }, + { + "epoch": 0.0664397100812651, + "grad_norm": 2.843766212463379, + "learning_rate": 3.315038419319429e-07, + "loss": 1.071, + "mean_token_accuracy": 0.6817128658294678, + "num_tokens": 15251939.0, + "step": 605 + }, + { + "epoch": 0.06654952778387876, + "grad_norm": 2.554856777191162, + "learning_rate": 3.3205268935236e-07, + "loss": 1.0371, + "mean_token_accuracy": 0.7072479724884033, + "num_tokens": 15275445.0, + "step": 606 + }, + { + "epoch": 0.06665934548649242, + "grad_norm": 2.4253458976745605, + "learning_rate": 3.3260153677277713e-07, + "loss": 1.1552, + "mean_token_accuracy": 0.6618064045906067, + "num_tokens": 15304109.0, + "step": 607 + }, + { + "epoch": 0.06676916318910608, + "grad_norm": 2.737252712249756, + "learning_rate": 3.331503841931943e-07, + "loss": 1.1681, + "mean_token_accuracy": 0.6631264090538025, + "num_tokens": 15326477.0, + "step": 608 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 2.5833137035369873, + "learning_rate": 3.336992316136114e-07, + "loss": 1.061, + "mean_token_accuracy": 0.6841297149658203, + "num_tokens": 15350257.0, + "step": 609 + }, + { + "epoch": 0.0669887985943334, + "grad_norm": 2.252650499343872, + "learning_rate": 3.3424807903402853e-07, + "loss": 1.1351, + "mean_token_accuracy": 0.6647830009460449, + "num_tokens": 15381455.0, + "step": 610 + }, + { + "epoch": 0.06709861629694706, + "grad_norm": 2.532782554626465, + "learning_rate": 3.3479692645444564e-07, + "loss": 1.0628, + "mean_token_accuracy": 0.6941279172897339, + "num_tokens": 15407444.0, + "step": 611 + }, + { + "epoch": 0.06720843399956072, + "grad_norm": 2.6609697341918945, + "learning_rate": 3.353457738748628e-07, + "loss": 1.0489, + "mean_token_accuracy": 0.6901453137397766, + "num_tokens": 15430246.0, + "step": 612 + }, + { + "epoch": 0.06731825170217438, + "grad_norm": 2.5536208152770996, + "learning_rate": 3.3589462129527993e-07, + "loss": 1.1002, + "mean_token_accuracy": 0.6766078472137451, + "num_tokens": 15455119.0, + "step": 613 + }, + { + "epoch": 0.06742806940478806, + "grad_norm": 2.2744009494781494, + "learning_rate": 3.36443468715697e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6926919221878052, + "num_tokens": 15484737.0, + "step": 614 + }, + { + "epoch": 0.06753788710740172, + "grad_norm": 2.9599571228027344, + "learning_rate": 3.369923161361141e-07, + "loss": 1.1147, + "mean_token_accuracy": 0.6835256814956665, + "num_tokens": 15504860.0, + "step": 615 + }, + { + "epoch": 0.06764770481001538, + "grad_norm": 2.490388870239258, + "learning_rate": 3.3754116355653127e-07, + "loss": 1.06, + "mean_token_accuracy": 0.694509744644165, + "num_tokens": 15529071.0, + "step": 616 + }, + { + "epoch": 0.06775752251262904, + "grad_norm": 3.1287593841552734, + "learning_rate": 3.380900109769484e-07, + "loss": 1.0798, + "mean_token_accuracy": 0.696010410785675, + "num_tokens": 15547360.0, + "step": 617 + }, + { + "epoch": 0.0678673402152427, + "grad_norm": 2.6050028800964355, + "learning_rate": 3.386388583973655e-07, + "loss": 1.0898, + "mean_token_accuracy": 0.6743040084838867, + "num_tokens": 15571457.0, + "step": 618 + }, + { + "epoch": 0.06797715791785636, + "grad_norm": 2.2614266872406006, + "learning_rate": 3.3918770581778267e-07, + "loss": 1.123, + "mean_token_accuracy": 0.6660057902336121, + "num_tokens": 15604081.0, + "step": 619 + }, + { + "epoch": 0.06808697562047002, + "grad_norm": 2.612077236175537, + "learning_rate": 3.397365532381998e-07, + "loss": 1.097, + "mean_token_accuracy": 0.6800718307495117, + "num_tokens": 15630848.0, + "step": 620 + }, + { + "epoch": 0.06819679332308368, + "grad_norm": 2.4364631175994873, + "learning_rate": 3.402854006586169e-07, + "loss": 1.1809, + "mean_token_accuracy": 0.6522737741470337, + "num_tokens": 15659443.0, + "step": 621 + }, + { + "epoch": 0.06830661102569734, + "grad_norm": 2.6258833408355713, + "learning_rate": 3.40834248079034e-07, + "loss": 1.1515, + "mean_token_accuracy": 0.6640043258666992, + "num_tokens": 15683544.0, + "step": 622 + }, + { + "epoch": 0.06841642872831101, + "grad_norm": 2.7503700256347656, + "learning_rate": 3.4138309549945113e-07, + "loss": 1.0797, + "mean_token_accuracy": 0.682684600353241, + "num_tokens": 15706455.0, + "step": 623 + }, + { + "epoch": 0.06852624643092467, + "grad_norm": 2.823594570159912, + "learning_rate": 3.4193194291986825e-07, + "loss": 1.1217, + "mean_token_accuracy": 0.680978536605835, + "num_tokens": 15728407.0, + "step": 624 + }, + { + "epoch": 0.06863606413353833, + "grad_norm": 2.84145188331604, + "learning_rate": 3.4248079034028536e-07, + "loss": 0.9969, + "mean_token_accuracy": 0.6951892375946045, + "num_tokens": 15749042.0, + "step": 625 + }, + { + "epoch": 0.06874588183615199, + "grad_norm": 2.559129238128662, + "learning_rate": 3.430296377607025e-07, + "loss": 1.0994, + "mean_token_accuracy": 0.6802511215209961, + "num_tokens": 15773434.0, + "step": 626 + }, + { + "epoch": 0.06885569953876565, + "grad_norm": 2.523841142654419, + "learning_rate": 3.4357848518111965e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.6882240772247314, + "num_tokens": 15798249.0, + "step": 627 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 2.919356346130371, + "learning_rate": 3.4412733260153676e-07, + "loss": 1.1205, + "mean_token_accuracy": 0.6771379709243774, + "num_tokens": 15817526.0, + "step": 628 + }, + { + "epoch": 0.06907533494399297, + "grad_norm": 2.201631546020508, + "learning_rate": 3.446761800219539e-07, + "loss": 1.1016, + "mean_token_accuracy": 0.6898823976516724, + "num_tokens": 15848166.0, + "step": 629 + }, + { + "epoch": 0.06918515264660663, + "grad_norm": 3.011117935180664, + "learning_rate": 3.4522502744237104e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.695550799369812, + "num_tokens": 15867977.0, + "step": 630 + }, + { + "epoch": 0.06929497034922029, + "grad_norm": 2.135291337966919, + "learning_rate": 3.4577387486278816e-07, + "loss": 1.1188, + "mean_token_accuracy": 0.675157368183136, + "num_tokens": 15902517.0, + "step": 631 + }, + { + "epoch": 0.06940478805183395, + "grad_norm": 2.3771519660949707, + "learning_rate": 3.463227222832053e-07, + "loss": 1.0983, + "mean_token_accuracy": 0.6755086779594421, + "num_tokens": 15930594.0, + "step": 632 + }, + { + "epoch": 0.06951460575444762, + "grad_norm": 2.447812795639038, + "learning_rate": 3.4687156970362234e-07, + "loss": 1.0403, + "mean_token_accuracy": 0.6915919184684753, + "num_tokens": 15956137.0, + "step": 633 + }, + { + "epoch": 0.06962442345706128, + "grad_norm": 2.919663906097412, + "learning_rate": 3.474204171240395e-07, + "loss": 1.2446, + "mean_token_accuracy": 0.6598237752914429, + "num_tokens": 15976917.0, + "step": 634 + }, + { + "epoch": 0.06973424115967494, + "grad_norm": 2.775315999984741, + "learning_rate": 3.479692645444566e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6830012202262878, + "num_tokens": 15999875.0, + "step": 635 + }, + { + "epoch": 0.0698440588622886, + "grad_norm": 2.4943249225616455, + "learning_rate": 3.4851811196487374e-07, + "loss": 1.1859, + "mean_token_accuracy": 0.6599597930908203, + "num_tokens": 16025279.0, + "step": 636 + }, + { + "epoch": 0.06995387656490226, + "grad_norm": 2.8639883995056152, + "learning_rate": 3.4906695938529085e-07, + "loss": 1.1415, + "mean_token_accuracy": 0.6679060459136963, + "num_tokens": 16046828.0, + "step": 637 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 2.288648843765259, + "learning_rate": 3.49615806805708e-07, + "loss": 1.0602, + "mean_token_accuracy": 0.6913742423057556, + "num_tokens": 16074449.0, + "step": 638 + }, + { + "epoch": 0.07017351197012958, + "grad_norm": 3.060934543609619, + "learning_rate": 3.5016465422612513e-07, + "loss": 1.0301, + "mean_token_accuracy": 0.6977317333221436, + "num_tokens": 16093698.0, + "step": 639 + }, + { + "epoch": 0.07028332967274324, + "grad_norm": 3.141746759414673, + "learning_rate": 3.5071350164654225e-07, + "loss": 1.0044, + "mean_token_accuracy": 0.7021427750587463, + "num_tokens": 16113064.0, + "step": 640 + }, + { + "epoch": 0.0703931473753569, + "grad_norm": 2.2930939197540283, + "learning_rate": 3.512623490669594e-07, + "loss": 1.1892, + "mean_token_accuracy": 0.6548925638198853, + "num_tokens": 16141619.0, + "step": 641 + }, + { + "epoch": 0.07050296507797058, + "grad_norm": 2.7736563682556152, + "learning_rate": 3.518111964873765e-07, + "loss": 1.0443, + "mean_token_accuracy": 0.6886708736419678, + "num_tokens": 16162215.0, + "step": 642 + }, + { + "epoch": 0.07061278278058424, + "grad_norm": 3.1219687461853027, + "learning_rate": 3.523600439077936e-07, + "loss": 1.0495, + "mean_token_accuracy": 0.6877539157867432, + "num_tokens": 16180614.0, + "step": 643 + }, + { + "epoch": 0.0707226004831979, + "grad_norm": 2.500128984451294, + "learning_rate": 3.529088913282107e-07, + "loss": 1.1433, + "mean_token_accuracy": 0.6658889055252075, + "num_tokens": 16205027.0, + "step": 644 + }, + { + "epoch": 0.07083241818581155, + "grad_norm": 3.317782163619995, + "learning_rate": 3.534577387486279e-07, + "loss": 1.1211, + "mean_token_accuracy": 0.6750152111053467, + "num_tokens": 16228888.0, + "step": 645 + }, + { + "epoch": 0.07094223588842521, + "grad_norm": 2.3355777263641357, + "learning_rate": 3.54006586169045e-07, + "loss": 1.1808, + "mean_token_accuracy": 0.6562977433204651, + "num_tokens": 16258552.0, + "step": 646 + }, + { + "epoch": 0.07105205359103887, + "grad_norm": 2.423142910003662, + "learning_rate": 3.545554335894621e-07, + "loss": 1.1996, + "mean_token_accuracy": 0.6598789095878601, + "num_tokens": 16287548.0, + "step": 647 + }, + { + "epoch": 0.07116187129365253, + "grad_norm": 2.985440254211426, + "learning_rate": 3.551042810098792e-07, + "loss": 1.1267, + "mean_token_accuracy": 0.6716473698616028, + "num_tokens": 16306400.0, + "step": 648 + }, + { + "epoch": 0.0712716889962662, + "grad_norm": 2.6227874755859375, + "learning_rate": 3.556531284302964e-07, + "loss": 1.0255, + "mean_token_accuracy": 0.7012587785720825, + "num_tokens": 16328285.0, + "step": 649 + }, + { + "epoch": 0.07138150669887985, + "grad_norm": 2.4831182956695557, + "learning_rate": 3.562019758507135e-07, + "loss": 1.0764, + "mean_token_accuracy": 0.6842145323753357, + "num_tokens": 16353485.0, + "step": 650 + }, + { + "epoch": 0.07149132440149351, + "grad_norm": 2.816746711730957, + "learning_rate": 3.5675082327113057e-07, + "loss": 1.1285, + "mean_token_accuracy": 0.6763492226600647, + "num_tokens": 16373609.0, + "step": 651 + }, + { + "epoch": 0.07160114210410719, + "grad_norm": 2.397418975830078, + "learning_rate": 3.5729967069154774e-07, + "loss": 1.0537, + "mean_token_accuracy": 0.6881856918334961, + "num_tokens": 16400938.0, + "step": 652 + }, + { + "epoch": 0.07171095980672085, + "grad_norm": 2.3553593158721924, + "learning_rate": 3.5784851811196485e-07, + "loss": 1.097, + "mean_token_accuracy": 0.6788282990455627, + "num_tokens": 16429413.0, + "step": 653 + }, + { + "epoch": 0.07182077750933451, + "grad_norm": 2.6567392349243164, + "learning_rate": 3.5839736553238197e-07, + "loss": 1.1063, + "mean_token_accuracy": 0.6807007789611816, + "num_tokens": 16452770.0, + "step": 654 + }, + { + "epoch": 0.07193059521194817, + "grad_norm": 2.7698402404785156, + "learning_rate": 3.589462129527991e-07, + "loss": 1.0285, + "mean_token_accuracy": 0.6925464868545532, + "num_tokens": 16474513.0, + "step": 655 + }, + { + "epoch": 0.07204041291456183, + "grad_norm": 2.509031057357788, + "learning_rate": 3.5949506037321625e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.6816123127937317, + "num_tokens": 16500202.0, + "step": 656 + }, + { + "epoch": 0.07215023061717549, + "grad_norm": 2.668841600418091, + "learning_rate": 3.6004390779363337e-07, + "loss": 1.1877, + "mean_token_accuracy": 0.6545596718788147, + "num_tokens": 16523842.0, + "step": 657 + }, + { + "epoch": 0.07226004831978915, + "grad_norm": 2.281132698059082, + "learning_rate": 3.605927552140505e-07, + "loss": 1.1701, + "mean_token_accuracy": 0.6645733714103699, + "num_tokens": 16553702.0, + "step": 658 + }, + { + "epoch": 0.0723698660224028, + "grad_norm": 2.7057266235351562, + "learning_rate": 3.611416026344676e-07, + "loss": 1.1173, + "mean_token_accuracy": 0.6724645495414734, + "num_tokens": 16576178.0, + "step": 659 + }, + { + "epoch": 0.07247968372501647, + "grad_norm": 2.4740519523620605, + "learning_rate": 3.6169045005488477e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.6791301965713501, + "num_tokens": 16600755.0, + "step": 660 + }, + { + "epoch": 0.07258950142763014, + "grad_norm": 2.325007677078247, + "learning_rate": 3.6223929747530183e-07, + "loss": 1.098, + "mean_token_accuracy": 0.6857423186302185, + "num_tokens": 16630843.0, + "step": 661 + }, + { + "epoch": 0.0726993191302438, + "grad_norm": 2.4281527996063232, + "learning_rate": 3.6278814489571894e-07, + "loss": 1.12, + "mean_token_accuracy": 0.6729605197906494, + "num_tokens": 16657999.0, + "step": 662 + }, + { + "epoch": 0.07280913683285746, + "grad_norm": 2.3738715648651123, + "learning_rate": 3.6333699231613606e-07, + "loss": 1.1689, + "mean_token_accuracy": 0.6589012145996094, + "num_tokens": 16687160.0, + "step": 663 + }, + { + "epoch": 0.07291895453547112, + "grad_norm": 2.5183732509613037, + "learning_rate": 3.6388583973655323e-07, + "loss": 1.1519, + "mean_token_accuracy": 0.6636566519737244, + "num_tokens": 16713177.0, + "step": 664 + }, + { + "epoch": 0.07302877223808478, + "grad_norm": 2.4898664951324463, + "learning_rate": 3.6443468715697034e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6898940801620483, + "num_tokens": 16740122.0, + "step": 665 + }, + { + "epoch": 0.07313858994069844, + "grad_norm": 2.3184921741485596, + "learning_rate": 3.6498353457738746e-07, + "loss": 1.0855, + "mean_token_accuracy": 0.6807485222816467, + "num_tokens": 16766943.0, + "step": 666 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 2.4987833499908447, + "learning_rate": 3.655323819978046e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.6871308088302612, + "num_tokens": 16791226.0, + "step": 667 + }, + { + "epoch": 0.07335822534592576, + "grad_norm": 2.661381483078003, + "learning_rate": 3.6608122941822174e-07, + "loss": 1.163, + "mean_token_accuracy": 0.6714822053909302, + "num_tokens": 16814766.0, + "step": 668 + }, + { + "epoch": 0.07346804304853942, + "grad_norm": 2.688175916671753, + "learning_rate": 3.6663007683863886e-07, + "loss": 0.9994, + "mean_token_accuracy": 0.7026537656784058, + "num_tokens": 16834745.0, + "step": 669 + }, + { + "epoch": 0.07357786075115308, + "grad_norm": 2.436001777648926, + "learning_rate": 3.671789242590559e-07, + "loss": 1.1113, + "mean_token_accuracy": 0.6780606508255005, + "num_tokens": 16860336.0, + "step": 670 + }, + { + "epoch": 0.07368767845376675, + "grad_norm": 2.246095657348633, + "learning_rate": 3.677277716794731e-07, + "loss": 1.108, + "mean_token_accuracy": 0.6820236444473267, + "num_tokens": 16888837.0, + "step": 671 + }, + { + "epoch": 0.07379749615638041, + "grad_norm": 2.5293102264404297, + "learning_rate": 3.682766190998902e-07, + "loss": 1.1321, + "mean_token_accuracy": 0.6732383966445923, + "num_tokens": 16914960.0, + "step": 672 + }, + { + "epoch": 0.07390731385899407, + "grad_norm": 2.5909759998321533, + "learning_rate": 3.688254665203073e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.7030439376831055, + "num_tokens": 16938698.0, + "step": 673 + }, + { + "epoch": 0.07401713156160773, + "grad_norm": 2.352694272994995, + "learning_rate": 3.6937431394072443e-07, + "loss": 1.1539, + "mean_token_accuracy": 0.6600454449653625, + "num_tokens": 16965632.0, + "step": 674 + }, + { + "epoch": 0.07412694926422139, + "grad_norm": 2.34521222114563, + "learning_rate": 3.699231613611416e-07, + "loss": 1.1246, + "mean_token_accuracy": 0.6711335182189941, + "num_tokens": 16994819.0, + "step": 675 + }, + { + "epoch": 0.07423676696683505, + "grad_norm": 2.7320618629455566, + "learning_rate": 3.704720087815587e-07, + "loss": 1.0113, + "mean_token_accuracy": 0.7029097080230713, + "num_tokens": 17015523.0, + "step": 676 + }, + { + "epoch": 0.07434658466944871, + "grad_norm": 2.859461784362793, + "learning_rate": 3.7102085620197583e-07, + "loss": 1.1485, + "mean_token_accuracy": 0.6601946353912354, + "num_tokens": 17036843.0, + "step": 677 + }, + { + "epoch": 0.07445640237206237, + "grad_norm": 3.213627815246582, + "learning_rate": 3.71569703622393e-07, + "loss": 1.0958, + "mean_token_accuracy": 0.676249623298645, + "num_tokens": 17056997.0, + "step": 678 + }, + { + "epoch": 0.07456622007467603, + "grad_norm": 2.475321054458618, + "learning_rate": 3.721185510428101e-07, + "loss": 1.0792, + "mean_token_accuracy": 0.677234411239624, + "num_tokens": 17082197.0, + "step": 679 + }, + { + "epoch": 0.0746760377772897, + "grad_norm": 2.7407891750335693, + "learning_rate": 3.726673984632272e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.7026892304420471, + "num_tokens": 17104895.0, + "step": 680 + }, + { + "epoch": 0.07478585547990337, + "grad_norm": 2.495612144470215, + "learning_rate": 3.732162458836443e-07, + "loss": 1.1147, + "mean_token_accuracy": 0.669594407081604, + "num_tokens": 17130302.0, + "step": 681 + }, + { + "epoch": 0.07489567318251703, + "grad_norm": 2.362928628921509, + "learning_rate": 3.7376509330406146e-07, + "loss": 1.1911, + "mean_token_accuracy": 0.6531205177307129, + "num_tokens": 17158044.0, + "step": 682 + }, + { + "epoch": 0.07500549088513069, + "grad_norm": 2.5915651321411133, + "learning_rate": 3.743139407244786e-07, + "loss": 1.0763, + "mean_token_accuracy": 0.6817023754119873, + "num_tokens": 17182148.0, + "step": 683 + }, + { + "epoch": 0.07511530858774434, + "grad_norm": 2.484600305557251, + "learning_rate": 3.748627881448957e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.6711540222167969, + "num_tokens": 17207970.0, + "step": 684 + }, + { + "epoch": 0.075225126290358, + "grad_norm": 2.8422117233276367, + "learning_rate": 3.754116355653128e-07, + "loss": 1.0581, + "mean_token_accuracy": 0.6814311742782593, + "num_tokens": 17228647.0, + "step": 685 + }, + { + "epoch": 0.07533494399297166, + "grad_norm": 2.478968381881714, + "learning_rate": 3.7596048298573e-07, + "loss": 1.2345, + "mean_token_accuracy": 0.6462432146072388, + "num_tokens": 17255756.0, + "step": 686 + }, + { + "epoch": 0.07544476169558532, + "grad_norm": 2.8641412258148193, + "learning_rate": 3.765093304061471e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.6805387735366821, + "num_tokens": 17275945.0, + "step": 687 + }, + { + "epoch": 0.07555457939819898, + "grad_norm": 2.455841541290283, + "learning_rate": 3.770581778265642e-07, + "loss": 0.9902, + "mean_token_accuracy": 0.7069699764251709, + "num_tokens": 17301094.0, + "step": 688 + }, + { + "epoch": 0.07566439710081264, + "grad_norm": 2.493415117263794, + "learning_rate": 3.7760702524698137e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.692351758480072, + "num_tokens": 17325260.0, + "step": 689 + }, + { + "epoch": 0.07577421480342632, + "grad_norm": 2.231567621231079, + "learning_rate": 3.7815587266739844e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.6819167137145996, + "num_tokens": 17353963.0, + "step": 690 + }, + { + "epoch": 0.07588403250603998, + "grad_norm": 2.525164842605591, + "learning_rate": 3.7870472008781555e-07, + "loss": 1.1338, + "mean_token_accuracy": 0.6708736419677734, + "num_tokens": 17377776.0, + "step": 691 + }, + { + "epoch": 0.07599385020865364, + "grad_norm": 2.5719470977783203, + "learning_rate": 3.7925356750823267e-07, + "loss": 1.0814, + "mean_token_accuracy": 0.6776222586631775, + "num_tokens": 17399819.0, + "step": 692 + }, + { + "epoch": 0.0761036679112673, + "grad_norm": 2.1097819805145264, + "learning_rate": 3.7980241492864983e-07, + "loss": 1.1979, + "mean_token_accuracy": 0.6536275744438171, + "num_tokens": 17436329.0, + "step": 693 + }, + { + "epoch": 0.07621348561388096, + "grad_norm": 2.3771214485168457, + "learning_rate": 3.8035126234906695e-07, + "loss": 1.0839, + "mean_token_accuracy": 0.6846364736557007, + "num_tokens": 17466298.0, + "step": 694 + }, + { + "epoch": 0.07632330331649462, + "grad_norm": 3.1731460094451904, + "learning_rate": 3.8090010976948406e-07, + "loss": 0.9907, + "mean_token_accuracy": 0.6956117153167725, + "num_tokens": 17483220.0, + "step": 695 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 2.4175422191619873, + "learning_rate": 3.814489571899012e-07, + "loss": 1.098, + "mean_token_accuracy": 0.6763433814048767, + "num_tokens": 17510059.0, + "step": 696 + }, + { + "epoch": 0.07654293872172194, + "grad_norm": 2.741030216217041, + "learning_rate": 3.8199780461031835e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.6727316379547119, + "num_tokens": 17530401.0, + "step": 697 + }, + { + "epoch": 0.0766527564243356, + "grad_norm": 2.715291738510132, + "learning_rate": 3.8254665203073546e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6913964152336121, + "num_tokens": 17552671.0, + "step": 698 + }, + { + "epoch": 0.07676257412694927, + "grad_norm": 2.240792989730835, + "learning_rate": 3.830954994511525e-07, + "loss": 1.1312, + "mean_token_accuracy": 0.6762468814849854, + "num_tokens": 17583278.0, + "step": 699 + }, + { + "epoch": 0.07687239182956293, + "grad_norm": 2.778073310852051, + "learning_rate": 3.836443468715697e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6752023696899414, + "num_tokens": 17604367.0, + "step": 700 + }, + { + "epoch": 0.07698220953217659, + "grad_norm": 2.9019036293029785, + "learning_rate": 3.841931942919868e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.7062289714813232, + "num_tokens": 17623408.0, + "step": 701 + }, + { + "epoch": 0.07709202723479025, + "grad_norm": 3.062255859375, + "learning_rate": 3.847420417124039e-07, + "loss": 1.0044, + "mean_token_accuracy": 0.6991792917251587, + "num_tokens": 17640349.0, + "step": 702 + }, + { + "epoch": 0.07720184493740391, + "grad_norm": 2.6429600715637207, + "learning_rate": 3.8529088913282104e-07, + "loss": 0.9672, + "mean_token_accuracy": 0.7090498208999634, + "num_tokens": 17660862.0, + "step": 703 + }, + { + "epoch": 0.07731166264001757, + "grad_norm": 2.974980354309082, + "learning_rate": 3.858397365532382e-07, + "loss": 0.9556, + "mean_token_accuracy": 0.7049164772033691, + "num_tokens": 17678387.0, + "step": 704 + }, + { + "epoch": 0.07742148034263123, + "grad_norm": 2.383577823638916, + "learning_rate": 3.863885839736553e-07, + "loss": 1.1576, + "mean_token_accuracy": 0.6569070816040039, + "num_tokens": 17705777.0, + "step": 705 + }, + { + "epoch": 0.07753129804524489, + "grad_norm": 2.830853223800659, + "learning_rate": 3.8693743139407244e-07, + "loss": 1.0138, + "mean_token_accuracy": 0.704014003276825, + "num_tokens": 17725900.0, + "step": 706 + }, + { + "epoch": 0.07764111574785855, + "grad_norm": 2.3289849758148193, + "learning_rate": 3.8748627881448955e-07, + "loss": 1.1306, + "mean_token_accuracy": 0.669854998588562, + "num_tokens": 17755965.0, + "step": 707 + }, + { + "epoch": 0.07775093345047221, + "grad_norm": 2.509695529937744, + "learning_rate": 3.880351262349067e-07, + "loss": 1.0265, + "mean_token_accuracy": 0.696030855178833, + "num_tokens": 17779544.0, + "step": 708 + }, + { + "epoch": 0.07786075115308588, + "grad_norm": 2.6845343112945557, + "learning_rate": 3.885839736553238e-07, + "loss": 1.0192, + "mean_token_accuracy": 0.689469575881958, + "num_tokens": 17800817.0, + "step": 709 + }, + { + "epoch": 0.07797056885569954, + "grad_norm": 2.9255826473236084, + "learning_rate": 3.891328210757409e-07, + "loss": 1.0051, + "mean_token_accuracy": 0.6933987140655518, + "num_tokens": 17819670.0, + "step": 710 + }, + { + "epoch": 0.0780803865583132, + "grad_norm": 2.3016674518585205, + "learning_rate": 3.8968166849615807e-07, + "loss": 1.1217, + "mean_token_accuracy": 0.6755324006080627, + "num_tokens": 17848698.0, + "step": 711 + }, + { + "epoch": 0.07819020426092686, + "grad_norm": 2.1849048137664795, + "learning_rate": 3.902305159165752e-07, + "loss": 1.1366, + "mean_token_accuracy": 0.6751644015312195, + "num_tokens": 17879964.0, + "step": 712 + }, + { + "epoch": 0.07830002196354052, + "grad_norm": 2.610083818435669, + "learning_rate": 3.907793633369923e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6881343722343445, + "num_tokens": 17902410.0, + "step": 713 + }, + { + "epoch": 0.07840983966615418, + "grad_norm": 2.527125597000122, + "learning_rate": 3.913282107574094e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6832587718963623, + "num_tokens": 17923435.0, + "step": 714 + }, + { + "epoch": 0.07851965736876784, + "grad_norm": 2.491642713546753, + "learning_rate": 3.918770581778266e-07, + "loss": 1.0995, + "mean_token_accuracy": 0.6773709654808044, + "num_tokens": 17947736.0, + "step": 715 + }, + { + "epoch": 0.0786294750713815, + "grad_norm": 2.414281129837036, + "learning_rate": 3.924259055982437e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.7141035795211792, + "num_tokens": 17974535.0, + "step": 716 + }, + { + "epoch": 0.07873929277399516, + "grad_norm": 2.7107396125793457, + "learning_rate": 3.929747530186608e-07, + "loss": 1.0441, + "mean_token_accuracy": 0.690897524356842, + "num_tokens": 17997553.0, + "step": 717 + }, + { + "epoch": 0.07884911047660884, + "grad_norm": 2.3281288146972656, + "learning_rate": 3.935236004390779e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6967816352844238, + "num_tokens": 18026355.0, + "step": 718 + }, + { + "epoch": 0.0789589281792225, + "grad_norm": 2.2387726306915283, + "learning_rate": 3.9407244785949504e-07, + "loss": 1.1766, + "mean_token_accuracy": 0.6785626411437988, + "num_tokens": 18056745.0, + "step": 719 + }, + { + "epoch": 0.07906874588183616, + "grad_norm": 2.574486017227173, + "learning_rate": 3.9462129527991216e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6839204430580139, + "num_tokens": 18083952.0, + "step": 720 + }, + { + "epoch": 0.07917856358444982, + "grad_norm": 2.3752949237823486, + "learning_rate": 3.9517014270032927e-07, + "loss": 1.0608, + "mean_token_accuracy": 0.6853295564651489, + "num_tokens": 18111058.0, + "step": 721 + }, + { + "epoch": 0.07928838128706348, + "grad_norm": 2.696974039077759, + "learning_rate": 3.957189901207464e-07, + "loss": 1.1285, + "mean_token_accuracy": 0.6679500937461853, + "num_tokens": 18133413.0, + "step": 722 + }, + { + "epoch": 0.07939819898967714, + "grad_norm": 2.075904369354248, + "learning_rate": 3.9626783754116356e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6773852109909058, + "num_tokens": 18164401.0, + "step": 723 + }, + { + "epoch": 0.0795080166922908, + "grad_norm": 2.4150965213775635, + "learning_rate": 3.9681668496158067e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6826106309890747, + "num_tokens": 18189525.0, + "step": 724 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 2.263758420944214, + "learning_rate": 3.973655323819978e-07, + "loss": 1.0582, + "mean_token_accuracy": 0.6991511583328247, + "num_tokens": 18218364.0, + "step": 725 + }, + { + "epoch": 0.07972765209751811, + "grad_norm": 2.400459051132202, + "learning_rate": 3.9791437980241495e-07, + "loss": 1.0016, + "mean_token_accuracy": 0.6989072561264038, + "num_tokens": 18242601.0, + "step": 726 + }, + { + "epoch": 0.07983746980013177, + "grad_norm": 2.1713480949401855, + "learning_rate": 3.9846322722283207e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6905667781829834, + "num_tokens": 18275119.0, + "step": 727 + }, + { + "epoch": 0.07994728750274545, + "grad_norm": 2.369875431060791, + "learning_rate": 3.9901207464324913e-07, + "loss": 1.0266, + "mean_token_accuracy": 0.704297661781311, + "num_tokens": 18301684.0, + "step": 728 + }, + { + "epoch": 0.08005710520535911, + "grad_norm": 2.580699920654297, + "learning_rate": 3.9956092206366625e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6739622354507446, + "num_tokens": 18326629.0, + "step": 729 + }, + { + "epoch": 0.08016692290797277, + "grad_norm": 3.1892147064208984, + "learning_rate": 4.001097694840834e-07, + "loss": 1.1518, + "mean_token_accuracy": 0.685364842414856, + "num_tokens": 18344767.0, + "step": 730 + }, + { + "epoch": 0.08027674061058643, + "grad_norm": 2.5301458835601807, + "learning_rate": 4.0065861690450053e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6815541386604309, + "num_tokens": 18371431.0, + "step": 731 + }, + { + "epoch": 0.08038655831320009, + "grad_norm": 2.287410259246826, + "learning_rate": 4.0120746432491765e-07, + "loss": 1.2097, + "mean_token_accuracy": 0.6501646637916565, + "num_tokens": 18399725.0, + "step": 732 + }, + { + "epoch": 0.08049637601581375, + "grad_norm": 2.370901107788086, + "learning_rate": 4.0175631174533476e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6809190511703491, + "num_tokens": 18429058.0, + "step": 733 + }, + { + "epoch": 0.08060619371842741, + "grad_norm": 2.5451548099517822, + "learning_rate": 4.0230515916575193e-07, + "loss": 1.1143, + "mean_token_accuracy": 0.6746026873588562, + "num_tokens": 18451770.0, + "step": 734 + }, + { + "epoch": 0.08071601142104107, + "grad_norm": 2.1519432067871094, + "learning_rate": 4.0285400658616904e-07, + "loss": 1.0922, + "mean_token_accuracy": 0.6911386847496033, + "num_tokens": 18484523.0, + "step": 735 + }, + { + "epoch": 0.08082582912365473, + "grad_norm": 2.7023720741271973, + "learning_rate": 4.0340285400658616e-07, + "loss": 1.0696, + "mean_token_accuracy": 0.6856139898300171, + "num_tokens": 18506423.0, + "step": 736 + }, + { + "epoch": 0.0809356468262684, + "grad_norm": 2.7516117095947266, + "learning_rate": 4.039517014270033e-07, + "loss": 0.962, + "mean_token_accuracy": 0.7140951156616211, + "num_tokens": 18526450.0, + "step": 737 + }, + { + "epoch": 0.08104546452888206, + "grad_norm": 2.7616443634033203, + "learning_rate": 4.045005488474204e-07, + "loss": 1.1229, + "mean_token_accuracy": 0.6737448573112488, + "num_tokens": 18547073.0, + "step": 738 + }, + { + "epoch": 0.08115528223149572, + "grad_norm": 2.3890461921691895, + "learning_rate": 4.050493962678375e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6781058311462402, + "num_tokens": 18572269.0, + "step": 739 + }, + { + "epoch": 0.08126509993410938, + "grad_norm": 2.3769125938415527, + "learning_rate": 4.055982436882546e-07, + "loss": 0.9711, + "mean_token_accuracy": 0.7123682498931885, + "num_tokens": 18596438.0, + "step": 740 + }, + { + "epoch": 0.08137491763672304, + "grad_norm": 2.2425200939178467, + "learning_rate": 4.061470911086718e-07, + "loss": 1.0126, + "mean_token_accuracy": 0.6981117725372314, + "num_tokens": 18623586.0, + "step": 741 + }, + { + "epoch": 0.0814847353393367, + "grad_norm": 2.442629337310791, + "learning_rate": 4.066959385290889e-07, + "loss": 1.1204, + "mean_token_accuracy": 0.6715283989906311, + "num_tokens": 18648233.0, + "step": 742 + }, + { + "epoch": 0.08159455304195036, + "grad_norm": 2.229259729385376, + "learning_rate": 4.07244785949506e-07, + "loss": 1.1248, + "mean_token_accuracy": 0.6743441224098206, + "num_tokens": 18677234.0, + "step": 743 + }, + { + "epoch": 0.08170437074456402, + "grad_norm": 2.3289132118225098, + "learning_rate": 4.0779363336992313e-07, + "loss": 1.0679, + "mean_token_accuracy": 0.6845885515213013, + "num_tokens": 18704105.0, + "step": 744 + }, + { + "epoch": 0.08181418844717768, + "grad_norm": 2.3099961280822754, + "learning_rate": 4.083424807903403e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6729022860527039, + "num_tokens": 18732630.0, + "step": 745 + }, + { + "epoch": 0.08192400614979134, + "grad_norm": 2.2922425270080566, + "learning_rate": 4.0889132821075737e-07, + "loss": 0.9864, + "mean_token_accuracy": 0.7083084583282471, + "num_tokens": 18761319.0, + "step": 746 + }, + { + "epoch": 0.08203382385240501, + "grad_norm": 2.143730640411377, + "learning_rate": 4.094401756311745e-07, + "loss": 1.0135, + "mean_token_accuracy": 0.6971228718757629, + "num_tokens": 18791728.0, + "step": 747 + }, + { + "epoch": 0.08214364155501867, + "grad_norm": 2.2639365196228027, + "learning_rate": 4.0998902305159165e-07, + "loss": 1.0765, + "mean_token_accuracy": 0.6829141974449158, + "num_tokens": 18820167.0, + "step": 748 + }, + { + "epoch": 0.08225345925763233, + "grad_norm": 2.3911831378936768, + "learning_rate": 4.1053787047200876e-07, + "loss": 1.201, + "mean_token_accuracy": 0.6537766456604004, + "num_tokens": 18847209.0, + "step": 749 + }, + { + "epoch": 0.08236327696024599, + "grad_norm": 2.6750857830047607, + "learning_rate": 4.110867178924259e-07, + "loss": 1.112, + "mean_token_accuracy": 0.6769438982009888, + "num_tokens": 18868699.0, + "step": 750 + }, + { + "epoch": 0.08247309466285965, + "grad_norm": 2.1769769191741943, + "learning_rate": 4.11635565312843e-07, + "loss": 1.0542, + "mean_token_accuracy": 0.6899917721748352, + "num_tokens": 18900050.0, + "step": 751 + }, + { + "epoch": 0.08258291236547331, + "grad_norm": 2.099997043609619, + "learning_rate": 4.1218441273326016e-07, + "loss": 1.1457, + "mean_token_accuracy": 0.6689925789833069, + "num_tokens": 18931183.0, + "step": 752 + }, + { + "epoch": 0.08269273006808697, + "grad_norm": 2.4441781044006348, + "learning_rate": 4.127332601536773e-07, + "loss": 0.9885, + "mean_token_accuracy": 0.7065198421478271, + "num_tokens": 18955539.0, + "step": 753 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 2.4608800411224365, + "learning_rate": 4.132821075740944e-07, + "loss": 1.1275, + "mean_token_accuracy": 0.6783353090286255, + "num_tokens": 18982950.0, + "step": 754 + }, + { + "epoch": 0.08291236547331429, + "grad_norm": 2.5441646575927734, + "learning_rate": 4.138309549945115e-07, + "loss": 1.0857, + "mean_token_accuracy": 0.683007001876831, + "num_tokens": 19006852.0, + "step": 755 + }, + { + "epoch": 0.08302218317592797, + "grad_norm": 2.276115655899048, + "learning_rate": 4.143798024149286e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.686028003692627, + "num_tokens": 19033649.0, + "step": 756 + }, + { + "epoch": 0.08313200087854163, + "grad_norm": 2.6758370399475098, + "learning_rate": 4.1492864983534574e-07, + "loss": 1.0925, + "mean_token_accuracy": 0.677715539932251, + "num_tokens": 19054445.0, + "step": 757 + }, + { + "epoch": 0.08324181858115529, + "grad_norm": 2.6250526905059814, + "learning_rate": 4.1547749725576285e-07, + "loss": 1.029, + "mean_token_accuracy": 0.6953827738761902, + "num_tokens": 19075021.0, + "step": 758 + }, + { + "epoch": 0.08335163628376895, + "grad_norm": 2.299147605895996, + "learning_rate": 4.1602634467618e-07, + "loss": 1.0312, + "mean_token_accuracy": 0.6863264441490173, + "num_tokens": 19101290.0, + "step": 759 + }, + { + "epoch": 0.0834614539863826, + "grad_norm": 2.473689079284668, + "learning_rate": 4.1657519209659714e-07, + "loss": 0.9748, + "mean_token_accuracy": 0.7078771591186523, + "num_tokens": 19124324.0, + "step": 760 + }, + { + "epoch": 0.08357127168899627, + "grad_norm": 2.515947103500366, + "learning_rate": 4.1712403951701425e-07, + "loss": 1.0094, + "mean_token_accuracy": 0.7010884881019592, + "num_tokens": 19147942.0, + "step": 761 + }, + { + "epoch": 0.08368108939160993, + "grad_norm": 2.206089496612549, + "learning_rate": 4.1767288693743137e-07, + "loss": 1.072, + "mean_token_accuracy": 0.690072774887085, + "num_tokens": 19179176.0, + "step": 762 + }, + { + "epoch": 0.08379090709422359, + "grad_norm": 2.5874598026275635, + "learning_rate": 4.1822173435784854e-07, + "loss": 1.0922, + "mean_token_accuracy": 0.6856708526611328, + "num_tokens": 19205056.0, + "step": 763 + }, + { + "epoch": 0.08390072479683724, + "grad_norm": 2.03106689453125, + "learning_rate": 4.1877058177826565e-07, + "loss": 1.0806, + "mean_token_accuracy": 0.6774142384529114, + "num_tokens": 19236902.0, + "step": 764 + }, + { + "epoch": 0.0840105424994509, + "grad_norm": 2.080942392349243, + "learning_rate": 4.193194291986827e-07, + "loss": 1.1073, + "mean_token_accuracy": 0.6767574548721313, + "num_tokens": 19269725.0, + "step": 765 + }, + { + "epoch": 0.08412036020206458, + "grad_norm": 2.460195541381836, + "learning_rate": 4.1986827661909983e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6841942667961121, + "num_tokens": 19294880.0, + "step": 766 + }, + { + "epoch": 0.08423017790467824, + "grad_norm": 2.3112239837646484, + "learning_rate": 4.20417124039517e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.6685285568237305, + "num_tokens": 19322365.0, + "step": 767 + }, + { + "epoch": 0.0843399956072919, + "grad_norm": 2.2294657230377197, + "learning_rate": 4.209659714599341e-07, + "loss": 1.13, + "mean_token_accuracy": 0.668830931186676, + "num_tokens": 19352034.0, + "step": 768 + }, + { + "epoch": 0.08444981330990556, + "grad_norm": 2.229403257369995, + "learning_rate": 4.2151481888035123e-07, + "loss": 1.0927, + "mean_token_accuracy": 0.6797540187835693, + "num_tokens": 19380369.0, + "step": 769 + }, + { + "epoch": 0.08455963101251922, + "grad_norm": 2.5087451934814453, + "learning_rate": 4.2206366630076834e-07, + "loss": 1.1991, + "mean_token_accuracy": 0.6572609543800354, + "num_tokens": 19406358.0, + "step": 770 + }, + { + "epoch": 0.08466944871513288, + "grad_norm": 2.324122190475464, + "learning_rate": 4.226125137211855e-07, + "loss": 1.1635, + "mean_token_accuracy": 0.6705927848815918, + "num_tokens": 19433645.0, + "step": 771 + }, + { + "epoch": 0.08477926641774654, + "grad_norm": 3.032573699951172, + "learning_rate": 4.231613611416026e-07, + "loss": 1.02, + "mean_token_accuracy": 0.6939630508422852, + "num_tokens": 19451852.0, + "step": 772 + }, + { + "epoch": 0.0848890841203602, + "grad_norm": 2.156722068786621, + "learning_rate": 4.2371020856201974e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6861773729324341, + "num_tokens": 19483909.0, + "step": 773 + }, + { + "epoch": 0.08499890182297386, + "grad_norm": 2.5091583728790283, + "learning_rate": 4.242590559824369e-07, + "loss": 1.1984, + "mean_token_accuracy": 0.6562774777412415, + "num_tokens": 19510720.0, + "step": 774 + }, + { + "epoch": 0.08510871952558753, + "grad_norm": 2.2473690509796143, + "learning_rate": 4.2480790340285397e-07, + "loss": 1.1912, + "mean_token_accuracy": 0.6600497364997864, + "num_tokens": 19538979.0, + "step": 775 + }, + { + "epoch": 0.08521853722820119, + "grad_norm": 2.5408637523651123, + "learning_rate": 4.253567508232711e-07, + "loss": 1.1031, + "mean_token_accuracy": 0.676423192024231, + "num_tokens": 19561899.0, + "step": 776 + }, + { + "epoch": 0.08532835493081485, + "grad_norm": 2.3086934089660645, + "learning_rate": 4.259055982436882e-07, + "loss": 1.1659, + "mean_token_accuracy": 0.6650131344795227, + "num_tokens": 19590044.0, + "step": 777 + }, + { + "epoch": 0.08543817263342851, + "grad_norm": 2.442499876022339, + "learning_rate": 4.2645444566410537e-07, + "loss": 1.1004, + "mean_token_accuracy": 0.6828691959381104, + "num_tokens": 19614815.0, + "step": 778 + }, + { + "epoch": 0.08554799033604217, + "grad_norm": 2.241804361343384, + "learning_rate": 4.270032930845225e-07, + "loss": 1.1161, + "mean_token_accuracy": 0.6752232313156128, + "num_tokens": 19643705.0, + "step": 779 + }, + { + "epoch": 0.08565780803865583, + "grad_norm": 2.38517165184021, + "learning_rate": 4.275521405049396e-07, + "loss": 1.1375, + "mean_token_accuracy": 0.6697695255279541, + "num_tokens": 19672093.0, + "step": 780 + }, + { + "epoch": 0.08576762574126949, + "grad_norm": 2.5781502723693848, + "learning_rate": 4.281009879253567e-07, + "loss": 1.0149, + "mean_token_accuracy": 0.6981644630432129, + "num_tokens": 19695832.0, + "step": 781 + }, + { + "epoch": 0.08587744344388315, + "grad_norm": 2.341256856918335, + "learning_rate": 4.286498353457739e-07, + "loss": 1.1105, + "mean_token_accuracy": 0.6732995510101318, + "num_tokens": 19721732.0, + "step": 782 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 2.3587934970855713, + "learning_rate": 4.29198682766191e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6844557523727417, + "num_tokens": 19748557.0, + "step": 783 + }, + { + "epoch": 0.08609707884911047, + "grad_norm": 2.201663017272949, + "learning_rate": 4.2974753018660806e-07, + "loss": 1.1467, + "mean_token_accuracy": 0.6744986176490784, + "num_tokens": 19776664.0, + "step": 784 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.9689934253692627, + "learning_rate": 4.3029637760702523e-07, + "loss": 1.1274, + "mean_token_accuracy": 0.6792852878570557, + "num_tokens": 19796402.0, + "step": 785 + }, + { + "epoch": 0.0863167142543378, + "grad_norm": 2.7617709636688232, + "learning_rate": 4.3084522502744235e-07, + "loss": 1.0398, + "mean_token_accuracy": 0.6902846097946167, + "num_tokens": 19818325.0, + "step": 786 + }, + { + "epoch": 0.08642653195695146, + "grad_norm": 2.550889015197754, + "learning_rate": 4.3139407244785946e-07, + "loss": 1.0801, + "mean_token_accuracy": 0.6930069327354431, + "num_tokens": 19841101.0, + "step": 787 + }, + { + "epoch": 0.08653634965956512, + "grad_norm": 2.4984047412872314, + "learning_rate": 4.319429198682766e-07, + "loss": 1.1493, + "mean_token_accuracy": 0.6592329740524292, + "num_tokens": 19865240.0, + "step": 788 + }, + { + "epoch": 0.08664616736217878, + "grad_norm": 2.5299501419067383, + "learning_rate": 4.3249176728869374e-07, + "loss": 1.1123, + "mean_token_accuracy": 0.6750543713569641, + "num_tokens": 19890731.0, + "step": 789 + }, + { + "epoch": 0.08675598506479244, + "grad_norm": 2.3402957916259766, + "learning_rate": 4.3304061470911086e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6882466077804565, + "num_tokens": 19916785.0, + "step": 790 + }, + { + "epoch": 0.0868658027674061, + "grad_norm": 2.4673843383789062, + "learning_rate": 4.33589462129528e-07, + "loss": 1.0993, + "mean_token_accuracy": 0.6735259294509888, + "num_tokens": 19941464.0, + "step": 791 + }, + { + "epoch": 0.08697562047001976, + "grad_norm": 2.4480600357055664, + "learning_rate": 4.341383095499451e-07, + "loss": 1.0842, + "mean_token_accuracy": 0.6806681156158447, + "num_tokens": 19966542.0, + "step": 792 + }, + { + "epoch": 0.08708543817263342, + "grad_norm": 2.6560301780700684, + "learning_rate": 4.3468715697036226e-07, + "loss": 1.0404, + "mean_token_accuracy": 0.6949870586395264, + "num_tokens": 19988128.0, + "step": 793 + }, + { + "epoch": 0.0871952558752471, + "grad_norm": 2.310297727584839, + "learning_rate": 4.352360043907793e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6993522047996521, + "num_tokens": 20014456.0, + "step": 794 + }, + { + "epoch": 0.08730507357786076, + "grad_norm": 2.0741732120513916, + "learning_rate": 4.3578485181119644e-07, + "loss": 1.1057, + "mean_token_accuracy": 0.6675712466239929, + "num_tokens": 20046985.0, + "step": 795 + }, + { + "epoch": 0.08741489128047442, + "grad_norm": 2.4264767169952393, + "learning_rate": 4.363336992316136e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.6848944425582886, + "num_tokens": 20070178.0, + "step": 796 + }, + { + "epoch": 0.08752470898308808, + "grad_norm": 2.1327342987060547, + "learning_rate": 4.368825466520307e-07, + "loss": 1.1239, + "mean_token_accuracy": 0.6704805493354797, + "num_tokens": 20101783.0, + "step": 797 + }, + { + "epoch": 0.08763452668570174, + "grad_norm": 2.721620798110962, + "learning_rate": 4.3743139407244783e-07, + "loss": 1.0839, + "mean_token_accuracy": 0.6827415227890015, + "num_tokens": 20123870.0, + "step": 798 + }, + { + "epoch": 0.0877443443883154, + "grad_norm": 2.5003433227539062, + "learning_rate": 4.3798024149286495e-07, + "loss": 1.1428, + "mean_token_accuracy": 0.6667391657829285, + "num_tokens": 20148534.0, + "step": 799 + }, + { + "epoch": 0.08785416209092906, + "grad_norm": 2.6674084663391113, + "learning_rate": 4.385290889132821e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6855530142784119, + "num_tokens": 20170492.0, + "step": 800 + }, + { + "epoch": 0.08796397979354272, + "grad_norm": 2.1451785564422607, + "learning_rate": 4.3907793633369923e-07, + "loss": 1.0165, + "mean_token_accuracy": 0.6954480409622192, + "num_tokens": 20198901.0, + "step": 801 + }, + { + "epoch": 0.08807379749615638, + "grad_norm": 2.4731154441833496, + "learning_rate": 4.3962678375411635e-07, + "loss": 1.0168, + "mean_token_accuracy": 0.6948937177658081, + "num_tokens": 20223790.0, + "step": 802 + }, + { + "epoch": 0.08818361519877004, + "grad_norm": 2.4067907333374023, + "learning_rate": 4.401756311745334e-07, + "loss": 1.0943, + "mean_token_accuracy": 0.6696107387542725, + "num_tokens": 20248725.0, + "step": 803 + }, + { + "epoch": 0.08829343290138371, + "grad_norm": 2.556039571762085, + "learning_rate": 4.407244785949506e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.6962668895721436, + "num_tokens": 20272615.0, + "step": 804 + }, + { + "epoch": 0.08840325060399737, + "grad_norm": 2.7688100337982178, + "learning_rate": 4.412733260153677e-07, + "loss": 1.1355, + "mean_token_accuracy": 0.67090904712677, + "num_tokens": 20296537.0, + "step": 805 + }, + { + "epoch": 0.08851306830661103, + "grad_norm": 2.7379653453826904, + "learning_rate": 4.418221734357848e-07, + "loss": 1.1243, + "mean_token_accuracy": 0.667527437210083, + "num_tokens": 20318711.0, + "step": 806 + }, + { + "epoch": 0.08862288600922469, + "grad_norm": 2.456210136413574, + "learning_rate": 4.42371020856202e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.680047869682312, + "num_tokens": 20343173.0, + "step": 807 + }, + { + "epoch": 0.08873270371183835, + "grad_norm": 2.321338653564453, + "learning_rate": 4.429198682766191e-07, + "loss": 1.0866, + "mean_token_accuracy": 0.6738505363464355, + "num_tokens": 20370475.0, + "step": 808 + }, + { + "epoch": 0.08884252141445201, + "grad_norm": 2.5785067081451416, + "learning_rate": 4.434687156970362e-07, + "loss": 0.9198, + "mean_token_accuracy": 0.7182015180587769, + "num_tokens": 20393441.0, + "step": 809 + }, + { + "epoch": 0.08895233911706567, + "grad_norm": 2.418480157852173, + "learning_rate": 4.440175631174533e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.6744397878646851, + "num_tokens": 20419170.0, + "step": 810 + }, + { + "epoch": 0.08906215681967933, + "grad_norm": 2.4661738872528076, + "learning_rate": 4.445664105378705e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6681021451950073, + "num_tokens": 20445356.0, + "step": 811 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 2.38617205619812, + "learning_rate": 4.451152579582876e-07, + "loss": 1.0232, + "mean_token_accuracy": 0.7044554948806763, + "num_tokens": 20471978.0, + "step": 812 + }, + { + "epoch": 0.08928179222490666, + "grad_norm": 2.5185487270355225, + "learning_rate": 4.4566410537870467e-07, + "loss": 1.0126, + "mean_token_accuracy": 0.7002421617507935, + "num_tokens": 20494089.0, + "step": 813 + }, + { + "epoch": 0.08939160992752032, + "grad_norm": 2.703895092010498, + "learning_rate": 4.462129527991218e-07, + "loss": 1.111, + "mean_token_accuracy": 0.6876915693283081, + "num_tokens": 20514265.0, + "step": 814 + }, + { + "epoch": 0.08950142763013398, + "grad_norm": 2.407043695449829, + "learning_rate": 4.4676180021953895e-07, + "loss": 1.1474, + "mean_token_accuracy": 0.6639919281005859, + "num_tokens": 20540494.0, + "step": 815 + }, + { + "epoch": 0.08961124533274764, + "grad_norm": 2.6041643619537354, + "learning_rate": 4.4731064763995607e-07, + "loss": 0.958, + "mean_token_accuracy": 0.7076711654663086, + "num_tokens": 20561682.0, + "step": 816 + }, + { + "epoch": 0.0897210630353613, + "grad_norm": 2.824091672897339, + "learning_rate": 4.478594950603732e-07, + "loss": 0.9802, + "mean_token_accuracy": 0.7059307098388672, + "num_tokens": 20579255.0, + "step": 817 + }, + { + "epoch": 0.08983088073797496, + "grad_norm": 2.368406295776367, + "learning_rate": 4.4840834248079035e-07, + "loss": 1.0467, + "mean_token_accuracy": 0.6938477754592896, + "num_tokens": 20603716.0, + "step": 818 + }, + { + "epoch": 0.08994069844058862, + "grad_norm": 2.6480371952056885, + "learning_rate": 4.4895718990120747e-07, + "loss": 1.0869, + "mean_token_accuracy": 0.6816846132278442, + "num_tokens": 20625293.0, + "step": 819 + }, + { + "epoch": 0.09005051614320228, + "grad_norm": 2.1366310119628906, + "learning_rate": 4.495060373216246e-07, + "loss": 1.1096, + "mean_token_accuracy": 0.6740989685058594, + "num_tokens": 20654748.0, + "step": 820 + }, + { + "epoch": 0.09016033384581594, + "grad_norm": 2.491503953933716, + "learning_rate": 4.500548847420417e-07, + "loss": 1.164, + "mean_token_accuracy": 0.6629766821861267, + "num_tokens": 20679647.0, + "step": 821 + }, + { + "epoch": 0.0902701515484296, + "grad_norm": 2.4390156269073486, + "learning_rate": 4.5060373216245886e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.7003231048583984, + "num_tokens": 20701625.0, + "step": 822 + }, + { + "epoch": 0.09037996925104327, + "grad_norm": 2.2075862884521484, + "learning_rate": 4.5115257958287593e-07, + "loss": 1.1286, + "mean_token_accuracy": 0.672073245048523, + "num_tokens": 20730689.0, + "step": 823 + }, + { + "epoch": 0.09048978695365693, + "grad_norm": 2.509636640548706, + "learning_rate": 4.5170142700329304e-07, + "loss": 1.0819, + "mean_token_accuracy": 0.6796319484710693, + "num_tokens": 20755917.0, + "step": 824 + }, + { + "epoch": 0.0905996046562706, + "grad_norm": 2.383479118347168, + "learning_rate": 4.5225027442371016e-07, + "loss": 1.165, + "mean_token_accuracy": 0.6630784273147583, + "num_tokens": 20782292.0, + "step": 825 + }, + { + "epoch": 0.09070942235888425, + "grad_norm": 2.257028579711914, + "learning_rate": 4.527991218441273e-07, + "loss": 1.0907, + "mean_token_accuracy": 0.6772596836090088, + "num_tokens": 20810465.0, + "step": 826 + }, + { + "epoch": 0.09081924006149791, + "grad_norm": 2.3962745666503906, + "learning_rate": 4.5334796926454444e-07, + "loss": 1.1291, + "mean_token_accuracy": 0.6679153442382812, + "num_tokens": 20834832.0, + "step": 827 + }, + { + "epoch": 0.09092905776411157, + "grad_norm": 2.2641537189483643, + "learning_rate": 4.5389681668496156e-07, + "loss": 1.1955, + "mean_token_accuracy": 0.6609005928039551, + "num_tokens": 20863798.0, + "step": 828 + }, + { + "epoch": 0.09103887546672523, + "grad_norm": 2.5975911617279053, + "learning_rate": 4.5444566410537867e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6779806613922119, + "num_tokens": 20886425.0, + "step": 829 + }, + { + "epoch": 0.09114869316933889, + "grad_norm": 2.492394208908081, + "learning_rate": 4.5499451152579584e-07, + "loss": 1.0761, + "mean_token_accuracy": 0.6915173530578613, + "num_tokens": 20909468.0, + "step": 830 + }, + { + "epoch": 0.09125851087195255, + "grad_norm": 2.4197869300842285, + "learning_rate": 4.5554335894621295e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6706725358963013, + "num_tokens": 20933354.0, + "step": 831 + }, + { + "epoch": 0.09136832857456623, + "grad_norm": 2.237234354019165, + "learning_rate": 4.5609220636663e-07, + "loss": 1.1388, + "mean_token_accuracy": 0.6666435599327087, + "num_tokens": 20963749.0, + "step": 832 + }, + { + "epoch": 0.09147814627717989, + "grad_norm": 2.749314546585083, + "learning_rate": 4.566410537870472e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6842907667160034, + "num_tokens": 20984345.0, + "step": 833 + }, + { + "epoch": 0.09158796397979355, + "grad_norm": 2.4988088607788086, + "learning_rate": 4.571899012074643e-07, + "loss": 1.0182, + "mean_token_accuracy": 0.6949971914291382, + "num_tokens": 21005526.0, + "step": 834 + }, + { + "epoch": 0.0916977816824072, + "grad_norm": 2.346684217453003, + "learning_rate": 4.577387486278814e-07, + "loss": 1.0315, + "mean_token_accuracy": 0.6959075927734375, + "num_tokens": 21030931.0, + "step": 835 + }, + { + "epoch": 0.09180759938502087, + "grad_norm": 2.3003296852111816, + "learning_rate": 4.5828759604829853e-07, + "loss": 1.0123, + "mean_token_accuracy": 0.7025668621063232, + "num_tokens": 21055098.0, + "step": 836 + }, + { + "epoch": 0.09191741708763453, + "grad_norm": 2.8384270668029785, + "learning_rate": 4.588364434687157e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.6969307661056519, + "num_tokens": 21073127.0, + "step": 837 + }, + { + "epoch": 0.09202723479024819, + "grad_norm": 2.6575257778167725, + "learning_rate": 4.593852908891328e-07, + "loss": 1.1266, + "mean_token_accuracy": 0.6673088073730469, + "num_tokens": 21094392.0, + "step": 838 + }, + { + "epoch": 0.09213705249286185, + "grad_norm": 2.5212509632110596, + "learning_rate": 4.5993413830954993e-07, + "loss": 1.0343, + "mean_token_accuracy": 0.6988168954849243, + "num_tokens": 21117957.0, + "step": 839 + }, + { + "epoch": 0.0922468701954755, + "grad_norm": 2.222097635269165, + "learning_rate": 4.6048298572996704e-07, + "loss": 1.1192, + "mean_token_accuracy": 0.6747076511383057, + "num_tokens": 21148410.0, + "step": 840 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 2.887636423110962, + "learning_rate": 4.6103183315038416e-07, + "loss": 0.9827, + "mean_token_accuracy": 0.712795615196228, + "num_tokens": 21167276.0, + "step": 841 + }, + { + "epoch": 0.09246650560070284, + "grad_norm": 2.339564561843872, + "learning_rate": 4.615806805708013e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6823418140411377, + "num_tokens": 21193651.0, + "step": 842 + }, + { + "epoch": 0.0925763233033165, + "grad_norm": 2.0742926597595215, + "learning_rate": 4.621295279912184e-07, + "loss": 1.1707, + "mean_token_accuracy": 0.6626230478286743, + "num_tokens": 21225554.0, + "step": 843 + }, + { + "epoch": 0.09268614100593016, + "grad_norm": 2.555065393447876, + "learning_rate": 4.6267837541163556e-07, + "loss": 1.0301, + "mean_token_accuracy": 0.6984381079673767, + "num_tokens": 21248055.0, + "step": 844 + }, + { + "epoch": 0.09279595870854382, + "grad_norm": 2.6080784797668457, + "learning_rate": 4.632272228320527e-07, + "loss": 1.0277, + "mean_token_accuracy": 0.6967787742614746, + "num_tokens": 21269811.0, + "step": 845 + }, + { + "epoch": 0.09290577641115748, + "grad_norm": 2.2110798358917236, + "learning_rate": 4.637760702524698e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6984827518463135, + "num_tokens": 21298513.0, + "step": 846 + }, + { + "epoch": 0.09301559411377114, + "grad_norm": 2.6521096229553223, + "learning_rate": 4.643249176728869e-07, + "loss": 1.0809, + "mean_token_accuracy": 0.6837522983551025, + "num_tokens": 21319605.0, + "step": 847 + }, + { + "epoch": 0.0931254118163848, + "grad_norm": 2.6216797828674316, + "learning_rate": 4.6487376509330407e-07, + "loss": 1.1374, + "mean_token_accuracy": 0.6626375317573547, + "num_tokens": 21342318.0, + "step": 848 + }, + { + "epoch": 0.09323522951899846, + "grad_norm": 2.512744426727295, + "learning_rate": 4.654226125137212e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.6905454397201538, + "num_tokens": 21363819.0, + "step": 849 + }, + { + "epoch": 0.09334504722161212, + "grad_norm": 2.8271782398223877, + "learning_rate": 4.659714599341383e-07, + "loss": 1.0183, + "mean_token_accuracy": 0.6885323524475098, + "num_tokens": 21381739.0, + "step": 850 + }, + { + "epoch": 0.09345486492422579, + "grad_norm": 2.2980458736419678, + "learning_rate": 4.6652030735455537e-07, + "loss": 1.0603, + "mean_token_accuracy": 0.690467357635498, + "num_tokens": 21408894.0, + "step": 851 + }, + { + "epoch": 0.09356468262683945, + "grad_norm": 2.2302744388580322, + "learning_rate": 4.6706915477497253e-07, + "loss": 1.0213, + "mean_token_accuracy": 0.6944552659988403, + "num_tokens": 21439894.0, + "step": 852 + }, + { + "epoch": 0.09367450032945311, + "grad_norm": 2.3592684268951416, + "learning_rate": 4.6761800219538965e-07, + "loss": 1.0964, + "mean_token_accuracy": 0.6738836765289307, + "num_tokens": 21465848.0, + "step": 853 + }, + { + "epoch": 0.09378431803206677, + "grad_norm": 2.4879672527313232, + "learning_rate": 4.6816684961580676e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.6946306228637695, + "num_tokens": 21487830.0, + "step": 854 + }, + { + "epoch": 0.09389413573468043, + "grad_norm": 2.481966733932495, + "learning_rate": 4.6871569703622393e-07, + "loss": 1.0167, + "mean_token_accuracy": 0.6957288384437561, + "num_tokens": 21510618.0, + "step": 855 + }, + { + "epoch": 0.09400395343729409, + "grad_norm": 2.339132308959961, + "learning_rate": 4.6926454445664105e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.6790715456008911, + "num_tokens": 21535719.0, + "step": 856 + }, + { + "epoch": 0.09411377113990775, + "grad_norm": 2.3753626346588135, + "learning_rate": 4.6981339187705816e-07, + "loss": 0.9937, + "mean_token_accuracy": 0.7001358270645142, + "num_tokens": 21559100.0, + "step": 857 + }, + { + "epoch": 0.09422358884252141, + "grad_norm": 2.381005048751831, + "learning_rate": 4.703622392974753e-07, + "loss": 1.024, + "mean_token_accuracy": 0.6924189329147339, + "num_tokens": 21583811.0, + "step": 858 + }, + { + "epoch": 0.09433340654513507, + "grad_norm": 2.334106683731079, + "learning_rate": 4.7091108671789245e-07, + "loss": 1.0487, + "mean_token_accuracy": 0.6882277131080627, + "num_tokens": 21608170.0, + "step": 859 + }, + { + "epoch": 0.09444322424774873, + "grad_norm": 2.2029640674591064, + "learning_rate": 4.714599341383095e-07, + "loss": 1.101, + "mean_token_accuracy": 0.6761236786842346, + "num_tokens": 21635622.0, + "step": 860 + }, + { + "epoch": 0.0945530419503624, + "grad_norm": 2.4200046062469482, + "learning_rate": 4.720087815587266e-07, + "loss": 1.0936, + "mean_token_accuracy": 0.6736648678779602, + "num_tokens": 21658978.0, + "step": 861 + }, + { + "epoch": 0.09466285965297606, + "grad_norm": 2.294257640838623, + "learning_rate": 4.7255762897914374e-07, + "loss": 1.0604, + "mean_token_accuracy": 0.6910416483879089, + "num_tokens": 21685352.0, + "step": 862 + }, + { + "epoch": 0.09477267735558972, + "grad_norm": 2.5176122188568115, + "learning_rate": 4.731064763995609e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.687953770160675, + "num_tokens": 21708627.0, + "step": 863 + }, + { + "epoch": 0.09488249505820338, + "grad_norm": 2.1999497413635254, + "learning_rate": 4.73655323819978e-07, + "loss": 1.0273, + "mean_token_accuracy": 0.6942378282546997, + "num_tokens": 21736802.0, + "step": 864 + }, + { + "epoch": 0.09499231276081704, + "grad_norm": 2.058142900466919, + "learning_rate": 4.7420417124039514e-07, + "loss": 1.1226, + "mean_token_accuracy": 0.6660559177398682, + "num_tokens": 21774006.0, + "step": 865 + }, + { + "epoch": 0.0951021304634307, + "grad_norm": 2.5618515014648438, + "learning_rate": 4.747530186608123e-07, + "loss": 0.9905, + "mean_token_accuracy": 0.7015260457992554, + "num_tokens": 21795676.0, + "step": 866 + }, + { + "epoch": 0.09521194816604436, + "grad_norm": 2.390770196914673, + "learning_rate": 4.753018660812294e-07, + "loss": 0.9876, + "mean_token_accuracy": 0.6980971693992615, + "num_tokens": 21817777.0, + "step": 867 + }, + { + "epoch": 0.09532176586865802, + "grad_norm": 2.4706363677978516, + "learning_rate": 4.7585071350164654e-07, + "loss": 0.9817, + "mean_token_accuracy": 0.701801061630249, + "num_tokens": 21842559.0, + "step": 868 + }, + { + "epoch": 0.09543158357127168, + "grad_norm": 2.125854015350342, + "learning_rate": 4.7639956092206365e-07, + "loss": 1.0185, + "mean_token_accuracy": 0.7038275599479675, + "num_tokens": 21871508.0, + "step": 869 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 2.5766663551330566, + "learning_rate": 4.769484083424808e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6739910840988159, + "num_tokens": 21894341.0, + "step": 870 + }, + { + "epoch": 0.09565121897649902, + "grad_norm": 2.858837366104126, + "learning_rate": 4.774972557628979e-07, + "loss": 1.0707, + "mean_token_accuracy": 0.6775890588760376, + "num_tokens": 21911921.0, + "step": 871 + }, + { + "epoch": 0.09576103667911268, + "grad_norm": 2.309492826461792, + "learning_rate": 4.78046103183315e-07, + "loss": 1.1536, + "mean_token_accuracy": 0.6666935682296753, + "num_tokens": 21938327.0, + "step": 872 + }, + { + "epoch": 0.09587085438172634, + "grad_norm": 2.7331314086914062, + "learning_rate": 4.785949506037321e-07, + "loss": 1.0296, + "mean_token_accuracy": 0.6945579051971436, + "num_tokens": 21958850.0, + "step": 873 + }, + { + "epoch": 0.09598067208434, + "grad_norm": 2.6386687755584717, + "learning_rate": 4.791437980241493e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6824563145637512, + "num_tokens": 21979253.0, + "step": 874 + }, + { + "epoch": 0.09609048978695366, + "grad_norm": 2.6943352222442627, + "learning_rate": 4.796926454445663e-07, + "loss": 1.0349, + "mean_token_accuracy": 0.6901556849479675, + "num_tokens": 22000168.0, + "step": 875 + }, + { + "epoch": 0.09620030748956732, + "grad_norm": 2.552124500274658, + "learning_rate": 4.802414928649835e-07, + "loss": 1.0575, + "mean_token_accuracy": 0.6905218362808228, + "num_tokens": 22022398.0, + "step": 876 + }, + { + "epoch": 0.09631012519218098, + "grad_norm": 2.2633745670318604, + "learning_rate": 4.807903402854007e-07, + "loss": 1.0082, + "mean_token_accuracy": 0.694648027420044, + "num_tokens": 22048458.0, + "step": 877 + }, + { + "epoch": 0.09641994289479464, + "grad_norm": 2.4747509956359863, + "learning_rate": 4.813391877058177e-07, + "loss": 1.013, + "mean_token_accuracy": 0.7023074626922607, + "num_tokens": 22071625.0, + "step": 878 + }, + { + "epoch": 0.0965297605974083, + "grad_norm": 2.443753480911255, + "learning_rate": 4.818880351262349e-07, + "loss": 1.056, + "mean_token_accuracy": 0.6808047294616699, + "num_tokens": 22096625.0, + "step": 879 + }, + { + "epoch": 0.09663957830002197, + "grad_norm": 2.5758862495422363, + "learning_rate": 4.82436882546652e-07, + "loss": 1.0815, + "mean_token_accuracy": 0.6848198175430298, + "num_tokens": 22118907.0, + "step": 880 + }, + { + "epoch": 0.09674939600263563, + "grad_norm": 2.6844747066497803, + "learning_rate": 4.829857299670691e-07, + "loss": 1.0677, + "mean_token_accuracy": 0.6836543083190918, + "num_tokens": 22138880.0, + "step": 881 + }, + { + "epoch": 0.09685921370524929, + "grad_norm": 2.2548155784606934, + "learning_rate": 4.835345773874863e-07, + "loss": 1.039, + "mean_token_accuracy": 0.6921529769897461, + "num_tokens": 22167318.0, + "step": 882 + }, + { + "epoch": 0.09696903140786295, + "grad_norm": 2.207911729812622, + "learning_rate": 4.840834248079034e-07, + "loss": 1.0457, + "mean_token_accuracy": 0.6902364492416382, + "num_tokens": 22196857.0, + "step": 883 + }, + { + "epoch": 0.09707884911047661, + "grad_norm": 2.6282715797424316, + "learning_rate": 4.846322722283204e-07, + "loss": 1.0007, + "mean_token_accuracy": 0.7079868316650391, + "num_tokens": 22218902.0, + "step": 884 + }, + { + "epoch": 0.09718866681309027, + "grad_norm": 2.376833915710449, + "learning_rate": 4.851811196487376e-07, + "loss": 1.0005, + "mean_token_accuracy": 0.6985609531402588, + "num_tokens": 22244850.0, + "step": 885 + }, + { + "epoch": 0.09729848451570393, + "grad_norm": 2.127777338027954, + "learning_rate": 4.857299670691548e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6881585121154785, + "num_tokens": 22274336.0, + "step": 886 + }, + { + "epoch": 0.09740830221831759, + "grad_norm": 2.959315538406372, + "learning_rate": 4.862788144895718e-07, + "loss": 1.084, + "mean_token_accuracy": 0.6837207078933716, + "num_tokens": 22294101.0, + "step": 887 + }, + { + "epoch": 0.09751811992093125, + "grad_norm": 2.4583640098571777, + "learning_rate": 4.86827661909989e-07, + "loss": 1.0251, + "mean_token_accuracy": 0.6929985284805298, + "num_tokens": 22318503.0, + "step": 888 + }, + { + "epoch": 0.09762793762354492, + "grad_norm": 2.4568991661071777, + "learning_rate": 4.873765093304062e-07, + "loss": 0.9626, + "mean_token_accuracy": 0.7161979675292969, + "num_tokens": 22344017.0, + "step": 889 + }, + { + "epoch": 0.09773775532615858, + "grad_norm": 2.5007052421569824, + "learning_rate": 4.879253567508232e-07, + "loss": 0.9776, + "mean_token_accuracy": 0.7009639739990234, + "num_tokens": 22367532.0, + "step": 890 + }, + { + "epoch": 0.09784757302877224, + "grad_norm": 2.3286452293395996, + "learning_rate": 4.884742041712404e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6890332698822021, + "num_tokens": 22393695.0, + "step": 891 + }, + { + "epoch": 0.0979573907313859, + "grad_norm": 2.2368674278259277, + "learning_rate": 4.890230515916576e-07, + "loss": 1.0934, + "mean_token_accuracy": 0.6784074306488037, + "num_tokens": 22420525.0, + "step": 892 + }, + { + "epoch": 0.09806720843399956, + "grad_norm": 2.2179012298583984, + "learning_rate": 4.895718990120746e-07, + "loss": 1.2271, + "mean_token_accuracy": 0.6468400955200195, + "num_tokens": 22451379.0, + "step": 893 + }, + { + "epoch": 0.09817702613661322, + "grad_norm": 2.3889105319976807, + "learning_rate": 4.901207464324917e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.6968410015106201, + "num_tokens": 22475910.0, + "step": 894 + }, + { + "epoch": 0.09828684383922688, + "grad_norm": 2.7844302654266357, + "learning_rate": 4.906695938529089e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6810652017593384, + "num_tokens": 22496337.0, + "step": 895 + }, + { + "epoch": 0.09839666154184054, + "grad_norm": 2.4800946712493896, + "learning_rate": 4.91218441273326e-07, + "loss": 0.9272, + "mean_token_accuracy": 0.7231663465499878, + "num_tokens": 22519351.0, + "step": 896 + }, + { + "epoch": 0.0985064792444542, + "grad_norm": 2.280940532684326, + "learning_rate": 4.917672886937431e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.6857730150222778, + "num_tokens": 22545695.0, + "step": 897 + }, + { + "epoch": 0.09861629694706786, + "grad_norm": 2.3716254234313965, + "learning_rate": 4.923161361141603e-07, + "loss": 1.085, + "mean_token_accuracy": 0.6748492121696472, + "num_tokens": 22570248.0, + "step": 898 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 2.468926191329956, + "learning_rate": 4.928649835345773e-07, + "loss": 1.1323, + "mean_token_accuracy": 0.6881464719772339, + "num_tokens": 22593913.0, + "step": 899 + }, + { + "epoch": 0.0988359323522952, + "grad_norm": 2.374145269393921, + "learning_rate": 4.934138309549945e-07, + "loss": 1.1354, + "mean_token_accuracy": 0.665580153465271, + "num_tokens": 22622384.0, + "step": 900 + }, + { + "epoch": 0.09894575005490885, + "grad_norm": 2.633408546447754, + "learning_rate": 4.939626783754117e-07, + "loss": 1.106, + "mean_token_accuracy": 0.67401123046875, + "num_tokens": 22644832.0, + "step": 901 + }, + { + "epoch": 0.09905556775752251, + "grad_norm": 2.104170799255371, + "learning_rate": 4.945115257958287e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6798223853111267, + "num_tokens": 22676962.0, + "step": 902 + }, + { + "epoch": 0.09916538546013617, + "grad_norm": 2.355372428894043, + "learning_rate": 4.950603732162459e-07, + "loss": 0.9513, + "mean_token_accuracy": 0.7159616947174072, + "num_tokens": 22701964.0, + "step": 903 + }, + { + "epoch": 0.09927520316274983, + "grad_norm": 2.5312843322753906, + "learning_rate": 4.95609220636663e-07, + "loss": 1.0596, + "mean_token_accuracy": 0.6920157074928284, + "num_tokens": 22725021.0, + "step": 904 + }, + { + "epoch": 0.0993850208653635, + "grad_norm": 2.2222254276275635, + "learning_rate": 4.961580680570801e-07, + "loss": 1.1191, + "mean_token_accuracy": 0.675093412399292, + "num_tokens": 22753776.0, + "step": 905 + }, + { + "epoch": 0.09949483856797715, + "grad_norm": 2.403637409210205, + "learning_rate": 4.967069154774972e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.677470862865448, + "num_tokens": 22780035.0, + "step": 906 + }, + { + "epoch": 0.09960465627059081, + "grad_norm": 2.3282554149627686, + "learning_rate": 4.972557628979143e-07, + "loss": 1.107, + "mean_token_accuracy": 0.675840437412262, + "num_tokens": 22805995.0, + "step": 907 + }, + { + "epoch": 0.09971447397320449, + "grad_norm": 2.2347381114959717, + "learning_rate": 4.978046103183315e-07, + "loss": 1.1065, + "mean_token_accuracy": 0.675991415977478, + "num_tokens": 22833633.0, + "step": 908 + }, + { + "epoch": 0.09982429167581815, + "grad_norm": 2.323246955871582, + "learning_rate": 4.983534577387486e-07, + "loss": 1.0579, + "mean_token_accuracy": 0.6924059391021729, + "num_tokens": 22859647.0, + "step": 909 + }, + { + "epoch": 0.0999341093784318, + "grad_norm": 2.181335926055908, + "learning_rate": 4.989023051591657e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6869229078292847, + "num_tokens": 22886967.0, + "step": 910 + }, + { + "epoch": 0.10004392708104547, + "grad_norm": 2.463024854660034, + "learning_rate": 4.994511525795829e-07, + "loss": 1.0665, + "mean_token_accuracy": 0.6799628138542175, + "num_tokens": 22910651.0, + "step": 911 + }, + { + "epoch": 0.10015374478365913, + "grad_norm": 2.131221294403076, + "learning_rate": 5e-07, + "loss": 1.146, + "mean_token_accuracy": 0.6670068502426147, + "num_tokens": 22941736.0, + "step": 912 + }, + { + "epoch": 0.10026356248627279, + "grad_norm": 2.6411333084106445, + "learning_rate": 5.005488474204171e-07, + "loss": 1.1287, + "mean_token_accuracy": 0.6733171939849854, + "num_tokens": 22963322.0, + "step": 913 + }, + { + "epoch": 0.10037338018888645, + "grad_norm": 2.5139763355255127, + "learning_rate": 5.010976948408342e-07, + "loss": 1.0121, + "mean_token_accuracy": 0.695862889289856, + "num_tokens": 22986322.0, + "step": 914 + }, + { + "epoch": 0.1004831978915001, + "grad_norm": 2.0465457439422607, + "learning_rate": 5.016465422612514e-07, + "loss": 1.1078, + "mean_token_accuracy": 0.6763637661933899, + "num_tokens": 23018287.0, + "step": 915 + }, + { + "epoch": 0.10059301559411377, + "grad_norm": 2.5700325965881348, + "learning_rate": 5.021953896816685e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6825653910636902, + "num_tokens": 23041217.0, + "step": 916 + }, + { + "epoch": 0.10070283329672743, + "grad_norm": 2.447124481201172, + "learning_rate": 5.027442371020856e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6826503276824951, + "num_tokens": 23066143.0, + "step": 917 + }, + { + "epoch": 0.1008126509993411, + "grad_norm": 2.552096366882324, + "learning_rate": 5.032930845225028e-07, + "loss": 0.9419, + "mean_token_accuracy": 0.7246436476707458, + "num_tokens": 23088341.0, + "step": 918 + }, + { + "epoch": 0.10092246870195476, + "grad_norm": 2.1542038917541504, + "learning_rate": 5.038419319429198e-07, + "loss": 1.0535, + "mean_token_accuracy": 0.6964471340179443, + "num_tokens": 23116872.0, + "step": 919 + }, + { + "epoch": 0.10103228640456842, + "grad_norm": 2.1952807903289795, + "learning_rate": 5.04390779363337e-07, + "loss": 1.0177, + "mean_token_accuracy": 0.6941949725151062, + "num_tokens": 23144659.0, + "step": 920 + }, + { + "epoch": 0.10114210410718208, + "grad_norm": 2.4453766345977783, + "learning_rate": 5.049396267837542e-07, + "loss": 0.9875, + "mean_token_accuracy": 0.703750491142273, + "num_tokens": 23169073.0, + "step": 921 + }, + { + "epoch": 0.10125192180979574, + "grad_norm": 2.3562188148498535, + "learning_rate": 5.054884742041711e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6780539751052856, + "num_tokens": 23193441.0, + "step": 922 + }, + { + "epoch": 0.1013617395124094, + "grad_norm": 2.364938735961914, + "learning_rate": 5.060373216245883e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6861487627029419, + "num_tokens": 23218122.0, + "step": 923 + }, + { + "epoch": 0.10147155721502306, + "grad_norm": 2.0452780723571777, + "learning_rate": 5.065861690450055e-07, + "loss": 1.1452, + "mean_token_accuracy": 0.6639504432678223, + "num_tokens": 23249149.0, + "step": 924 + }, + { + "epoch": 0.10158137491763672, + "grad_norm": 2.3297247886657715, + "learning_rate": 5.071350164654225e-07, + "loss": 1.147, + "mean_token_accuracy": 0.662360429763794, + "num_tokens": 23277861.0, + "step": 925 + }, + { + "epoch": 0.10169119262025038, + "grad_norm": 2.9496347904205322, + "learning_rate": 5.076838638858397e-07, + "loss": 0.9989, + "mean_token_accuracy": 0.7058139443397522, + "num_tokens": 23295939.0, + "step": 926 + }, + { + "epoch": 0.10180101032286405, + "grad_norm": 2.5978994369506836, + "learning_rate": 5.082327113062569e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6955448389053345, + "num_tokens": 23316516.0, + "step": 927 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 2.220484972000122, + "learning_rate": 5.087815587266739e-07, + "loss": 1.0381, + "mean_token_accuracy": 0.6856184601783752, + "num_tokens": 23345109.0, + "step": 928 + }, + { + "epoch": 0.10202064572809137, + "grad_norm": 2.5975656509399414, + "learning_rate": 5.093304061470911e-07, + "loss": 1.0748, + "mean_token_accuracy": 0.6777538061141968, + "num_tokens": 23368747.0, + "step": 929 + }, + { + "epoch": 0.10213046343070503, + "grad_norm": 2.5538055896759033, + "learning_rate": 5.098792535675082e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6899117231369019, + "num_tokens": 23392638.0, + "step": 930 + }, + { + "epoch": 0.10224028113331869, + "grad_norm": 2.301287889480591, + "learning_rate": 5.104281009879253e-07, + "loss": 1.1179, + "mean_token_accuracy": 0.6708561778068542, + "num_tokens": 23422524.0, + "step": 931 + }, + { + "epoch": 0.10235009883593235, + "grad_norm": 2.900459051132202, + "learning_rate": 5.109769484083425e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6800824403762817, + "num_tokens": 23442701.0, + "step": 932 + }, + { + "epoch": 0.10245991653854601, + "grad_norm": 2.4559714794158936, + "learning_rate": 5.115257958287596e-07, + "loss": 1.0478, + "mean_token_accuracy": 0.6889334917068481, + "num_tokens": 23467187.0, + "step": 933 + }, + { + "epoch": 0.10256973424115967, + "grad_norm": 2.4182069301605225, + "learning_rate": 5.120746432491767e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6871665716171265, + "num_tokens": 23491359.0, + "step": 934 + }, + { + "epoch": 0.10267955194377333, + "grad_norm": 2.408069372177124, + "learning_rate": 5.126234906695939e-07, + "loss": 1.1423, + "mean_token_accuracy": 0.6688168048858643, + "num_tokens": 23517216.0, + "step": 935 + }, + { + "epoch": 0.10278936964638699, + "grad_norm": 2.454733371734619, + "learning_rate": 5.13172338090011e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.6786847114562988, + "num_tokens": 23542694.0, + "step": 936 + }, + { + "epoch": 0.10289918734900066, + "grad_norm": 2.3577113151550293, + "learning_rate": 5.137211855104281e-07, + "loss": 1.0896, + "mean_token_accuracy": 0.6778371334075928, + "num_tokens": 23569987.0, + "step": 937 + }, + { + "epoch": 0.10300900505161432, + "grad_norm": 2.5832321643829346, + "learning_rate": 5.142700329308453e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.6954995393753052, + "num_tokens": 23591185.0, + "step": 938 + }, + { + "epoch": 0.10311882275422798, + "grad_norm": 2.871330976486206, + "learning_rate": 5.148188803512624e-07, + "loss": 0.9268, + "mean_token_accuracy": 0.7190417051315308, + "num_tokens": 23610054.0, + "step": 939 + }, + { + "epoch": 0.10322864045684164, + "grad_norm": 2.3961124420166016, + "learning_rate": 5.153677277716795e-07, + "loss": 1.0476, + "mean_token_accuracy": 0.6898287534713745, + "num_tokens": 23633579.0, + "step": 940 + }, + { + "epoch": 0.1033384581594553, + "grad_norm": 2.8790459632873535, + "learning_rate": 5.159165751920965e-07, + "loss": 0.982, + "mean_token_accuracy": 0.7073743343353271, + "num_tokens": 23653300.0, + "step": 941 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 2.1873581409454346, + "learning_rate": 5.164654226125136e-07, + "loss": 1.1572, + "mean_token_accuracy": 0.6578869819641113, + "num_tokens": 23684516.0, + "step": 942 + }, + { + "epoch": 0.10355809356468262, + "grad_norm": 2.249979019165039, + "learning_rate": 5.170142700329308e-07, + "loss": 1.1154, + "mean_token_accuracy": 0.6707329750061035, + "num_tokens": 23712764.0, + "step": 943 + }, + { + "epoch": 0.10366791126729628, + "grad_norm": 2.8322224617004395, + "learning_rate": 5.175631174533479e-07, + "loss": 1.0835, + "mean_token_accuracy": 0.676125168800354, + "num_tokens": 23737126.0, + "step": 944 + }, + { + "epoch": 0.10377772896990994, + "grad_norm": 2.25746750831604, + "learning_rate": 5.18111964873765e-07, + "loss": 1.052, + "mean_token_accuracy": 0.6901339292526245, + "num_tokens": 23764361.0, + "step": 945 + }, + { + "epoch": 0.10388754667252362, + "grad_norm": 2.3490850925445557, + "learning_rate": 5.186608122941822e-07, + "loss": 0.9711, + "mean_token_accuracy": 0.703517735004425, + "num_tokens": 23791402.0, + "step": 946 + }, + { + "epoch": 0.10399736437513728, + "grad_norm": 2.4732778072357178, + "learning_rate": 5.192096597145993e-07, + "loss": 1.0681, + "mean_token_accuracy": 0.6846323013305664, + "num_tokens": 23814498.0, + "step": 947 + }, + { + "epoch": 0.10410718207775094, + "grad_norm": 2.669682264328003, + "learning_rate": 5.197585071350164e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7011044025421143, + "num_tokens": 23836034.0, + "step": 948 + }, + { + "epoch": 0.1042169997803646, + "grad_norm": 2.4967360496520996, + "learning_rate": 5.203073545554336e-07, + "loss": 1.1217, + "mean_token_accuracy": 0.6636039018630981, + "num_tokens": 23858609.0, + "step": 949 + }, + { + "epoch": 0.10432681748297826, + "grad_norm": 2.2788302898406982, + "learning_rate": 5.208562019758507e-07, + "loss": 1.0495, + "mean_token_accuracy": 0.6941999197006226, + "num_tokens": 23885111.0, + "step": 950 + }, + { + "epoch": 0.10443663518559192, + "grad_norm": 2.2731869220733643, + "learning_rate": 5.214050493962678e-07, + "loss": 1.1137, + "mean_token_accuracy": 0.6738319396972656, + "num_tokens": 23911491.0, + "step": 951 + }, + { + "epoch": 0.10454645288820558, + "grad_norm": 2.7688398361206055, + "learning_rate": 5.219538968166849e-07, + "loss": 1.0593, + "mean_token_accuracy": 0.6899579763412476, + "num_tokens": 23932074.0, + "step": 952 + }, + { + "epoch": 0.10465627059081924, + "grad_norm": 2.213602304458618, + "learning_rate": 5.225027442371021e-07, + "loss": 1.0714, + "mean_token_accuracy": 0.6929447650909424, + "num_tokens": 23958709.0, + "step": 953 + }, + { + "epoch": 0.1047660882934329, + "grad_norm": 2.4362943172454834, + "learning_rate": 5.230515916575192e-07, + "loss": 0.8932, + "mean_token_accuracy": 0.7174278497695923, + "num_tokens": 23980405.0, + "step": 954 + }, + { + "epoch": 0.10487590599604656, + "grad_norm": 2.702129364013672, + "learning_rate": 5.236004390779363e-07, + "loss": 1.0019, + "mean_token_accuracy": 0.697670042514801, + "num_tokens": 24000448.0, + "step": 955 + }, + { + "epoch": 0.10498572369866023, + "grad_norm": 2.3301408290863037, + "learning_rate": 5.241492864983535e-07, + "loss": 1.0682, + "mean_token_accuracy": 0.688544750213623, + "num_tokens": 24025541.0, + "step": 956 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 2.3735392093658447, + "learning_rate": 5.246981339187706e-07, + "loss": 1.0852, + "mean_token_accuracy": 0.67574143409729, + "num_tokens": 24052267.0, + "step": 957 + }, + { + "epoch": 0.10520535910388755, + "grad_norm": 2.178314447402954, + "learning_rate": 5.252469813391877e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6892435550689697, + "num_tokens": 24079550.0, + "step": 958 + }, + { + "epoch": 0.10531517680650121, + "grad_norm": 2.4896328449249268, + "learning_rate": 5.257958287596049e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.6920881271362305, + "num_tokens": 24102679.0, + "step": 959 + }, + { + "epoch": 0.10542499450911487, + "grad_norm": 2.2282369136810303, + "learning_rate": 5.263446761800219e-07, + "loss": 1.1271, + "mean_token_accuracy": 0.6687636375427246, + "num_tokens": 24128564.0, + "step": 960 + }, + { + "epoch": 0.10553481221172853, + "grad_norm": 2.1657094955444336, + "learning_rate": 5.26893523600439e-07, + "loss": 1.0057, + "mean_token_accuracy": 0.7004499435424805, + "num_tokens": 24157944.0, + "step": 961 + }, + { + "epoch": 0.10564462991434219, + "grad_norm": 2.3163650035858154, + "learning_rate": 5.274423710208562e-07, + "loss": 1.0108, + "mean_token_accuracy": 0.7038469910621643, + "num_tokens": 24184770.0, + "step": 962 + }, + { + "epoch": 0.10575444761695585, + "grad_norm": 2.450075626373291, + "learning_rate": 5.279912184412732e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.693107545375824, + "num_tokens": 24208432.0, + "step": 963 + }, + { + "epoch": 0.10586426531956951, + "grad_norm": 2.2068915367126465, + "learning_rate": 5.285400658616904e-07, + "loss": 1.0807, + "mean_token_accuracy": 0.6793701648712158, + "num_tokens": 24238614.0, + "step": 964 + }, + { + "epoch": 0.10597408302218318, + "grad_norm": 2.4411704540252686, + "learning_rate": 5.290889132821076e-07, + "loss": 1.0698, + "mean_token_accuracy": 0.6908191442489624, + "num_tokens": 24261943.0, + "step": 965 + }, + { + "epoch": 0.10608390072479684, + "grad_norm": 2.4520931243896484, + "learning_rate": 5.296377607025246e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6988320350646973, + "num_tokens": 24283543.0, + "step": 966 + }, + { + "epoch": 0.1061937184274105, + "grad_norm": 2.060312271118164, + "learning_rate": 5.301866081229418e-07, + "loss": 1.0649, + "mean_token_accuracy": 0.6790937185287476, + "num_tokens": 24314025.0, + "step": 967 + }, + { + "epoch": 0.10630353613002416, + "grad_norm": 2.328713893890381, + "learning_rate": 5.30735455543359e-07, + "loss": 1.0412, + "mean_token_accuracy": 0.6916562914848328, + "num_tokens": 24340121.0, + "step": 968 + }, + { + "epoch": 0.10641335383263782, + "grad_norm": 2.6175925731658936, + "learning_rate": 5.31284302963776e-07, + "loss": 1.0275, + "mean_token_accuracy": 0.6947113275527954, + "num_tokens": 24361034.0, + "step": 969 + }, + { + "epoch": 0.10652317153525148, + "grad_norm": 2.400751829147339, + "learning_rate": 5.318331503841932e-07, + "loss": 0.9562, + "mean_token_accuracy": 0.7114961743354797, + "num_tokens": 24385005.0, + "step": 970 + }, + { + "epoch": 0.10663298923786514, + "grad_norm": 2.197197198867798, + "learning_rate": 5.323819978046103e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.6907795667648315, + "num_tokens": 24414426.0, + "step": 971 + }, + { + "epoch": 0.1067428069404788, + "grad_norm": 2.5782508850097656, + "learning_rate": 5.329308452250274e-07, + "loss": 1.0468, + "mean_token_accuracy": 0.6944550275802612, + "num_tokens": 24435918.0, + "step": 972 + }, + { + "epoch": 0.10685262464309246, + "grad_norm": 2.1175694465637207, + "learning_rate": 5.334796926454446e-07, + "loss": 1.0114, + "mean_token_accuracy": 0.700299859046936, + "num_tokens": 24465417.0, + "step": 973 + }, + { + "epoch": 0.10696244234570612, + "grad_norm": 2.1504857540130615, + "learning_rate": 5.340285400658617e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.6822065711021423, + "num_tokens": 24493625.0, + "step": 974 + }, + { + "epoch": 0.1070722600483198, + "grad_norm": 2.5601325035095215, + "learning_rate": 5.345773874862788e-07, + "loss": 0.9923, + "mean_token_accuracy": 0.6944771409034729, + "num_tokens": 24513994.0, + "step": 975 + }, + { + "epoch": 0.10718207775093345, + "grad_norm": 2.5479400157928467, + "learning_rate": 5.35126234906696e-07, + "loss": 0.9678, + "mean_token_accuracy": 0.7051277160644531, + "num_tokens": 24535593.0, + "step": 976 + }, + { + "epoch": 0.10729189545354711, + "grad_norm": 2.164855718612671, + "learning_rate": 5.35675082327113e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.679193377494812, + "num_tokens": 24564164.0, + "step": 977 + }, + { + "epoch": 0.10740171315616077, + "grad_norm": 2.4527227878570557, + "learning_rate": 5.362239297475302e-07, + "loss": 1.1045, + "mean_token_accuracy": 0.6723153591156006, + "num_tokens": 24586371.0, + "step": 978 + }, + { + "epoch": 0.10751153085877443, + "grad_norm": 2.3807687759399414, + "learning_rate": 5.367727771679473e-07, + "loss": 1.0306, + "mean_token_accuracy": 0.6965030431747437, + "num_tokens": 24612497.0, + "step": 979 + }, + { + "epoch": 0.1076213485613881, + "grad_norm": 2.403308391571045, + "learning_rate": 5.373216245883643e-07, + "loss": 0.9869, + "mean_token_accuracy": 0.7007906436920166, + "num_tokens": 24636402.0, + "step": 980 + }, + { + "epoch": 0.10773116626400175, + "grad_norm": 2.318551778793335, + "learning_rate": 5.378704720087815e-07, + "loss": 1.1408, + "mean_token_accuracy": 0.6757409572601318, + "num_tokens": 24663185.0, + "step": 981 + }, + { + "epoch": 0.10784098396661541, + "grad_norm": 2.0584311485290527, + "learning_rate": 5.384193194291986e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.6939719915390015, + "num_tokens": 24693110.0, + "step": 982 + }, + { + "epoch": 0.10795080166922907, + "grad_norm": 2.229341506958008, + "learning_rate": 5.389681668496157e-07, + "loss": 1.1641, + "mean_token_accuracy": 0.6651270389556885, + "num_tokens": 24723045.0, + "step": 983 + }, + { + "epoch": 0.10806061937184275, + "grad_norm": 2.127307891845703, + "learning_rate": 5.395170142700329e-07, + "loss": 1.0699, + "mean_token_accuracy": 0.6776160597801208, + "num_tokens": 24751961.0, + "step": 984 + }, + { + "epoch": 0.10817043707445641, + "grad_norm": 2.543550729751587, + "learning_rate": 5.4006586169045e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.7006267309188843, + "num_tokens": 24775122.0, + "step": 985 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 2.499063491821289, + "learning_rate": 5.406147091108671e-07, + "loss": 1.073, + "mean_token_accuracy": 0.6807543635368347, + "num_tokens": 24797781.0, + "step": 986 + }, + { + "epoch": 0.10839007247968373, + "grad_norm": 2.565565347671509, + "learning_rate": 5.411635565312843e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.6925099492073059, + "num_tokens": 24818658.0, + "step": 987 + }, + { + "epoch": 0.10849989018229739, + "grad_norm": 2.3866491317749023, + "learning_rate": 5.417124039517014e-07, + "loss": 1.1375, + "mean_token_accuracy": 0.6711760759353638, + "num_tokens": 24845133.0, + "step": 988 + }, + { + "epoch": 0.10860970788491105, + "grad_norm": 2.474252223968506, + "learning_rate": 5.422612513721185e-07, + "loss": 1.0777, + "mean_token_accuracy": 0.6859853267669678, + "num_tokens": 24869704.0, + "step": 989 + }, + { + "epoch": 0.1087195255875247, + "grad_norm": 2.315899610519409, + "learning_rate": 5.428100987925357e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.6919935941696167, + "num_tokens": 24895669.0, + "step": 990 + }, + { + "epoch": 0.10882934329013837, + "grad_norm": 2.546391725540161, + "learning_rate": 5.433589462129528e-07, + "loss": 1.0636, + "mean_token_accuracy": 0.704862117767334, + "num_tokens": 24918321.0, + "step": 991 + }, + { + "epoch": 0.10893916099275203, + "grad_norm": 2.454775810241699, + "learning_rate": 5.439077936333699e-07, + "loss": 0.983, + "mean_token_accuracy": 0.7029774188995361, + "num_tokens": 24942015.0, + "step": 992 + }, + { + "epoch": 0.10904897869536569, + "grad_norm": 2.766876220703125, + "learning_rate": 5.44456641053787e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.6957250237464905, + "num_tokens": 24960629.0, + "step": 993 + }, + { + "epoch": 0.10915879639797936, + "grad_norm": 2.1180524826049805, + "learning_rate": 5.450054884742042e-07, + "loss": 1.1319, + "mean_token_accuracy": 0.6610060930252075, + "num_tokens": 24992352.0, + "step": 994 + }, + { + "epoch": 0.10926861410059302, + "grad_norm": 2.5175023078918457, + "learning_rate": 5.455543358946213e-07, + "loss": 0.9993, + "mean_token_accuracy": 0.7026944160461426, + "num_tokens": 25014698.0, + "step": 995 + }, + { + "epoch": 0.10937843180320668, + "grad_norm": 2.808338165283203, + "learning_rate": 5.461031833150384e-07, + "loss": 0.9676, + "mean_token_accuracy": 0.7052673101425171, + "num_tokens": 25033271.0, + "step": 996 + }, + { + "epoch": 0.10948824950582034, + "grad_norm": 2.434610366821289, + "learning_rate": 5.466520307354556e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6873899102210999, + "num_tokens": 25057576.0, + "step": 997 + }, + { + "epoch": 0.109598067208434, + "grad_norm": 2.4688127040863037, + "learning_rate": 5.472008781558726e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6913226842880249, + "num_tokens": 25080113.0, + "step": 998 + }, + { + "epoch": 0.10970788491104766, + "grad_norm": 2.415132999420166, + "learning_rate": 5.477497255762897e-07, + "loss": 1.1728, + "mean_token_accuracy": 0.6698585152626038, + "num_tokens": 25107414.0, + "step": 999 + }, + { + "epoch": 0.10981770261366132, + "grad_norm": 2.0956485271453857, + "learning_rate": 5.482985729967069e-07, + "loss": 0.9823, + "mean_token_accuracy": 0.704224705696106, + "num_tokens": 25135071.0, + "step": 1000 + }, + { + "epoch": 0.10992752031627498, + "grad_norm": 2.380638837814331, + "learning_rate": 5.48847420417124e-07, + "loss": 1.043, + "mean_token_accuracy": 0.6852465271949768, + "num_tokens": 25161281.0, + "step": 1001 + }, + { + "epoch": 0.11003733801888864, + "grad_norm": 2.1449103355407715, + "learning_rate": 5.493962678375411e-07, + "loss": 1.158, + "mean_token_accuracy": 0.6641315817832947, + "num_tokens": 25191513.0, + "step": 1002 + }, + { + "epoch": 0.11014715572150231, + "grad_norm": 2.260031223297119, + "learning_rate": 5.499451152579583e-07, + "loss": 1.0968, + "mean_token_accuracy": 0.6806942224502563, + "num_tokens": 25218747.0, + "step": 1003 + }, + { + "epoch": 0.11025697342411597, + "grad_norm": 2.2110776901245117, + "learning_rate": 5.504939626783753e-07, + "loss": 1.0146, + "mean_token_accuracy": 0.6939783692359924, + "num_tokens": 25245775.0, + "step": 1004 + }, + { + "epoch": 0.11036679112672963, + "grad_norm": 2.3983724117279053, + "learning_rate": 5.510428100987925e-07, + "loss": 1.1569, + "mean_token_accuracy": 0.6562002897262573, + "num_tokens": 25271043.0, + "step": 1005 + }, + { + "epoch": 0.11047660882934329, + "grad_norm": 2.791862726211548, + "learning_rate": 5.515916575192097e-07, + "loss": 0.9934, + "mean_token_accuracy": 0.7007826566696167, + "num_tokens": 25289180.0, + "step": 1006 + }, + { + "epoch": 0.11058642653195695, + "grad_norm": 2.3723654747009277, + "learning_rate": 5.521405049396267e-07, + "loss": 1.0761, + "mean_token_accuracy": 0.6850502490997314, + "num_tokens": 25314560.0, + "step": 1007 + }, + { + "epoch": 0.11069624423457061, + "grad_norm": 2.3591952323913574, + "learning_rate": 5.526893523600439e-07, + "loss": 0.9666, + "mean_token_accuracy": 0.7063155174255371, + "num_tokens": 25337858.0, + "step": 1008 + }, + { + "epoch": 0.11080606193718427, + "grad_norm": 2.8171815872192383, + "learning_rate": 5.532381997804611e-07, + "loss": 0.9404, + "mean_token_accuracy": 0.7140970230102539, + "num_tokens": 25355792.0, + "step": 1009 + }, + { + "epoch": 0.11091587963979793, + "grad_norm": 2.3403210639953613, + "learning_rate": 5.537870472008781e-07, + "loss": 1.0902, + "mean_token_accuracy": 0.6771759986877441, + "num_tokens": 25380664.0, + "step": 1010 + }, + { + "epoch": 0.11102569734241159, + "grad_norm": 2.553299903869629, + "learning_rate": 5.543358946212953e-07, + "loss": 0.9881, + "mean_token_accuracy": 0.7043403387069702, + "num_tokens": 25401511.0, + "step": 1011 + }, + { + "epoch": 0.11113551504502525, + "grad_norm": 2.319180488586426, + "learning_rate": 5.548847420417125e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6715924739837646, + "num_tokens": 25428175.0, + "step": 1012 + }, + { + "epoch": 0.11124533274763893, + "grad_norm": 2.0644609928131104, + "learning_rate": 5.554335894621295e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6932858228683472, + "num_tokens": 25460222.0, + "step": 1013 + }, + { + "epoch": 0.11135515045025259, + "grad_norm": 2.1955528259277344, + "learning_rate": 5.559824368825467e-07, + "loss": 1.1233, + "mean_token_accuracy": 0.6717449426651001, + "num_tokens": 25489889.0, + "step": 1014 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 2.254382848739624, + "learning_rate": 5.565312843029637e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.689989447593689, + "num_tokens": 25516474.0, + "step": 1015 + }, + { + "epoch": 0.1115747858554799, + "grad_norm": 2.9740819931030273, + "learning_rate": 5.570801317233809e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7065136432647705, + "num_tokens": 25532872.0, + "step": 1016 + }, + { + "epoch": 0.11168460355809356, + "grad_norm": 2.2914910316467285, + "learning_rate": 5.57628979143798e-07, + "loss": 1.0604, + "mean_token_accuracy": 0.6855620741844177, + "num_tokens": 25559575.0, + "step": 1017 + }, + { + "epoch": 0.11179442126070722, + "grad_norm": 2.3864214420318604, + "learning_rate": 5.58177826564215e-07, + "loss": 0.9957, + "mean_token_accuracy": 0.6945513486862183, + "num_tokens": 25583347.0, + "step": 1018 + }, + { + "epoch": 0.11190423896332088, + "grad_norm": 2.367465019226074, + "learning_rate": 5.587266739846322e-07, + "loss": 0.9906, + "mean_token_accuracy": 0.7003063559532166, + "num_tokens": 25607639.0, + "step": 1019 + }, + { + "epoch": 0.11201405666593454, + "grad_norm": 2.2165286540985107, + "learning_rate": 5.592755214050494e-07, + "loss": 1.1285, + "mean_token_accuracy": 0.6672803163528442, + "num_tokens": 25636328.0, + "step": 1020 + }, + { + "epoch": 0.1121238743685482, + "grad_norm": 2.8034074306488037, + "learning_rate": 5.598243688254664e-07, + "loss": 1.076, + "mean_token_accuracy": 0.6887694597244263, + "num_tokens": 25663746.0, + "step": 1021 + }, + { + "epoch": 0.11223369207116188, + "grad_norm": 2.728563070297241, + "learning_rate": 5.603732162458836e-07, + "loss": 1.0422, + "mean_token_accuracy": 0.6907607316970825, + "num_tokens": 25683844.0, + "step": 1022 + }, + { + "epoch": 0.11234350977377554, + "grad_norm": 2.1737613677978516, + "learning_rate": 5.609220636663008e-07, + "loss": 0.9649, + "mean_token_accuracy": 0.7095215916633606, + "num_tokens": 25710473.0, + "step": 1023 + }, + { + "epoch": 0.1124533274763892, + "grad_norm": 2.450287103652954, + "learning_rate": 5.614709110867178e-07, + "loss": 1.0668, + "mean_token_accuracy": 0.6875818967819214, + "num_tokens": 25733937.0, + "step": 1024 + }, + { + "epoch": 0.11256314517900286, + "grad_norm": 2.4066720008850098, + "learning_rate": 5.62019758507135e-07, + "loss": 1.0183, + "mean_token_accuracy": 0.6970179677009583, + "num_tokens": 25759079.0, + "step": 1025 + }, + { + "epoch": 0.11267296288161652, + "grad_norm": 2.252352476119995, + "learning_rate": 5.625686059275521e-07, + "loss": 1.0794, + "mean_token_accuracy": 0.6797641515731812, + "num_tokens": 25786959.0, + "step": 1026 + }, + { + "epoch": 0.11278278058423018, + "grad_norm": 2.4356000423431396, + "learning_rate": 5.631174533479692e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.6826174259185791, + "num_tokens": 25810820.0, + "step": 1027 + }, + { + "epoch": 0.11289259828684384, + "grad_norm": 2.4927432537078857, + "learning_rate": 5.636663007683864e-07, + "loss": 1.1231, + "mean_token_accuracy": 0.6658422946929932, + "num_tokens": 25833660.0, + "step": 1028 + }, + { + "epoch": 0.1130024159894575, + "grad_norm": 2.323660373687744, + "learning_rate": 5.642151481888035e-07, + "loss": 1.1828, + "mean_token_accuracy": 0.6559768319129944, + "num_tokens": 25861441.0, + "step": 1029 + }, + { + "epoch": 0.11311223369207116, + "grad_norm": 2.2148072719573975, + "learning_rate": 5.647639956092206e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.6774550080299377, + "num_tokens": 25888733.0, + "step": 1030 + }, + { + "epoch": 0.11322205139468482, + "grad_norm": 2.5005877017974854, + "learning_rate": 5.653128430296378e-07, + "loss": 1.0333, + "mean_token_accuracy": 0.6983201503753662, + "num_tokens": 25911271.0, + "step": 1031 + }, + { + "epoch": 0.11333186909729849, + "grad_norm": 2.335399866104126, + "learning_rate": 5.658616904500549e-07, + "loss": 1.0815, + "mean_token_accuracy": 0.6778108477592468, + "num_tokens": 25938315.0, + "step": 1032 + }, + { + "epoch": 0.11344168679991215, + "grad_norm": 2.1524550914764404, + "learning_rate": 5.66410537870472e-07, + "loss": 1.1004, + "mean_token_accuracy": 0.6824955940246582, + "num_tokens": 25967791.0, + "step": 1033 + }, + { + "epoch": 0.11355150450252581, + "grad_norm": 2.147449016571045, + "learning_rate": 5.669593852908892e-07, + "loss": 1.148, + "mean_token_accuracy": 0.662798285484314, + "num_tokens": 25996685.0, + "step": 1034 + }, + { + "epoch": 0.11366132220513947, + "grad_norm": 2.416269302368164, + "learning_rate": 5.675082327113063e-07, + "loss": 1.0113, + "mean_token_accuracy": 0.695821225643158, + "num_tokens": 26020491.0, + "step": 1035 + }, + { + "epoch": 0.11377113990775313, + "grad_norm": 2.6586196422576904, + "learning_rate": 5.680570801317233e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.6913597583770752, + "num_tokens": 26041142.0, + "step": 1036 + }, + { + "epoch": 0.11388095761036679, + "grad_norm": 2.3885884284973145, + "learning_rate": 5.686059275521404e-07, + "loss": 1.0335, + "mean_token_accuracy": 0.6918933987617493, + "num_tokens": 26066624.0, + "step": 1037 + }, + { + "epoch": 0.11399077531298045, + "grad_norm": 2.5344438552856445, + "learning_rate": 5.691547749725576e-07, + "loss": 1.0366, + "mean_token_accuracy": 0.6918326616287231, + "num_tokens": 26089698.0, + "step": 1038 + }, + { + "epoch": 0.11410059301559411, + "grad_norm": 2.4329583644866943, + "learning_rate": 5.697036223929747e-07, + "loss": 1.1591, + "mean_token_accuracy": 0.6638151407241821, + "num_tokens": 26115095.0, + "step": 1039 + }, + { + "epoch": 0.11421041071820777, + "grad_norm": 2.2689754962921143, + "learning_rate": 5.702524698133918e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.6995898485183716, + "num_tokens": 26140022.0, + "step": 1040 + }, + { + "epoch": 0.11432022842082144, + "grad_norm": 2.1097288131713867, + "learning_rate": 5.70801317233809e-07, + "loss": 1.1674, + "mean_token_accuracy": 0.651312530040741, + "num_tokens": 26173382.0, + "step": 1041 + }, + { + "epoch": 0.1144300461234351, + "grad_norm": 2.2672247886657715, + "learning_rate": 5.713501646542261e-07, + "loss": 1.0944, + "mean_token_accuracy": 0.6711663007736206, + "num_tokens": 26201302.0, + "step": 1042 + }, + { + "epoch": 0.11453986382604876, + "grad_norm": 1.9950770139694214, + "learning_rate": 5.718990120746432e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6781672835350037, + "num_tokens": 26233158.0, + "step": 1043 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 2.6929757595062256, + "learning_rate": 5.724478594950604e-07, + "loss": 0.9458, + "mean_token_accuracy": 0.7103266716003418, + "num_tokens": 26251642.0, + "step": 1044 + }, + { + "epoch": 0.11475949923127608, + "grad_norm": 2.533076047897339, + "learning_rate": 5.729967069154775e-07, + "loss": 1.1219, + "mean_token_accuracy": 0.6685177683830261, + "num_tokens": 26275692.0, + "step": 1045 + }, + { + "epoch": 0.11486931693388974, + "grad_norm": 2.3183345794677734, + "learning_rate": 5.735455543358946e-07, + "loss": 0.9987, + "mean_token_accuracy": 0.7001446485519409, + "num_tokens": 26300391.0, + "step": 1046 + }, + { + "epoch": 0.1149791346365034, + "grad_norm": 2.1845548152923584, + "learning_rate": 5.740944017563118e-07, + "loss": 1.02, + "mean_token_accuracy": 0.6916340589523315, + "num_tokens": 26327649.0, + "step": 1047 + }, + { + "epoch": 0.11508895233911706, + "grad_norm": 2.1317365169525146, + "learning_rate": 5.746432491767288e-07, + "loss": 1.137, + "mean_token_accuracy": 0.6614333391189575, + "num_tokens": 26356657.0, + "step": 1048 + }, + { + "epoch": 0.11519877004173072, + "grad_norm": 2.311332941055298, + "learning_rate": 5.75192096597146e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6854938864707947, + "num_tokens": 26382976.0, + "step": 1049 + }, + { + "epoch": 0.11530858774434438, + "grad_norm": 2.2776739597320557, + "learning_rate": 5.757409440175632e-07, + "loss": 1.0183, + "mean_token_accuracy": 0.69223552942276, + "num_tokens": 26409599.0, + "step": 1050 + }, + { + "epoch": 0.11541840544695806, + "grad_norm": 2.362339496612549, + "learning_rate": 5.762897914379802e-07, + "loss": 1.0593, + "mean_token_accuracy": 0.6946919560432434, + "num_tokens": 26435522.0, + "step": 1051 + }, + { + "epoch": 0.11552822314957172, + "grad_norm": 2.5270965099334717, + "learning_rate": 5.768386388583974e-07, + "loss": 1.0008, + "mean_token_accuracy": 0.7031856775283813, + "num_tokens": 26459098.0, + "step": 1052 + }, + { + "epoch": 0.11563804085218538, + "grad_norm": 2.4089701175689697, + "learning_rate": 5.773874862788145e-07, + "loss": 1.1712, + "mean_token_accuracy": 0.660057544708252, + "num_tokens": 26485150.0, + "step": 1053 + }, + { + "epoch": 0.11574785855479904, + "grad_norm": 2.199164628982544, + "learning_rate": 5.779363336992316e-07, + "loss": 1.1114, + "mean_token_accuracy": 0.664351761341095, + "num_tokens": 26514647.0, + "step": 1054 + }, + { + "epoch": 0.1158576762574127, + "grad_norm": 2.1941027641296387, + "learning_rate": 5.784851811196487e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.670891523361206, + "num_tokens": 26544442.0, + "step": 1055 + }, + { + "epoch": 0.11596749396002635, + "grad_norm": 1.9439892768859863, + "learning_rate": 5.790340285400658e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.68705153465271, + "num_tokens": 26578352.0, + "step": 1056 + }, + { + "epoch": 0.11607731166264001, + "grad_norm": 2.587045431137085, + "learning_rate": 5.795828759604829e-07, + "loss": 1.188, + "mean_token_accuracy": 0.6542661786079407, + "num_tokens": 26600064.0, + "step": 1057 + }, + { + "epoch": 0.11618712936525367, + "grad_norm": 2.3196732997894287, + "learning_rate": 5.801317233809001e-07, + "loss": 1.1039, + "mean_token_accuracy": 0.6869385838508606, + "num_tokens": 26625881.0, + "step": 1058 + }, + { + "epoch": 0.11629694706786733, + "grad_norm": 2.5721540451049805, + "learning_rate": 5.806805708013171e-07, + "loss": 1.0437, + "mean_token_accuracy": 0.6927542686462402, + "num_tokens": 26649240.0, + "step": 1059 + }, + { + "epoch": 0.11640676477048101, + "grad_norm": 2.605855703353882, + "learning_rate": 5.812294182217343e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.6978256702423096, + "num_tokens": 26671781.0, + "step": 1060 + }, + { + "epoch": 0.11651658247309467, + "grad_norm": 2.334444284439087, + "learning_rate": 5.817782656421515e-07, + "loss": 1.0632, + "mean_token_accuracy": 0.6875900626182556, + "num_tokens": 26696810.0, + "step": 1061 + }, + { + "epoch": 0.11662640017570833, + "grad_norm": 3.107168436050415, + "learning_rate": 5.823271130625685e-07, + "loss": 1.0422, + "mean_token_accuracy": 0.6942867636680603, + "num_tokens": 26717913.0, + "step": 1062 + }, + { + "epoch": 0.11673621787832199, + "grad_norm": 2.402308940887451, + "learning_rate": 5.828759604829857e-07, + "loss": 1.0316, + "mean_token_accuracy": 0.6923893690109253, + "num_tokens": 26743099.0, + "step": 1063 + }, + { + "epoch": 0.11684603558093565, + "grad_norm": 2.4021151065826416, + "learning_rate": 5.834248079034029e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6858847737312317, + "num_tokens": 26767735.0, + "step": 1064 + }, + { + "epoch": 0.11695585328354931, + "grad_norm": 2.1635444164276123, + "learning_rate": 5.839736553238199e-07, + "loss": 1.1522, + "mean_token_accuracy": 0.6763821244239807, + "num_tokens": 26798675.0, + "step": 1065 + }, + { + "epoch": 0.11706567098616297, + "grad_norm": 2.1216366291046143, + "learning_rate": 5.845225027442371e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6593403816223145, + "num_tokens": 26828780.0, + "step": 1066 + }, + { + "epoch": 0.11717548868877663, + "grad_norm": 2.591707944869995, + "learning_rate": 5.850713501646543e-07, + "loss": 1.008, + "mean_token_accuracy": 0.6959253549575806, + "num_tokens": 26851147.0, + "step": 1067 + }, + { + "epoch": 0.11728530639139029, + "grad_norm": 2.179983615875244, + "learning_rate": 5.856201975850713e-07, + "loss": 1.1193, + "mean_token_accuracy": 0.6754016280174255, + "num_tokens": 26879361.0, + "step": 1068 + }, + { + "epoch": 0.11739512409400395, + "grad_norm": 2.064356803894043, + "learning_rate": 5.861690450054885e-07, + "loss": 1.1604, + "mean_token_accuracy": 0.6575138568878174, + "num_tokens": 26911708.0, + "step": 1069 + }, + { + "epoch": 0.11750494179661762, + "grad_norm": 2.237522840499878, + "learning_rate": 5.867178924259056e-07, + "loss": 1.0404, + "mean_token_accuracy": 0.6942808628082275, + "num_tokens": 26937907.0, + "step": 1070 + }, + { + "epoch": 0.11761475949923128, + "grad_norm": 2.538572311401367, + "learning_rate": 5.872667398463227e-07, + "loss": 1.0025, + "mean_token_accuracy": 0.6966465711593628, + "num_tokens": 26958641.0, + "step": 1071 + }, + { + "epoch": 0.11772457720184494, + "grad_norm": 2.169652223587036, + "learning_rate": 5.878155872667399e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.6786889433860779, + "num_tokens": 26987408.0, + "step": 1072 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 2.6633853912353516, + "learning_rate": 5.88364434687157e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6929398775100708, + "num_tokens": 27007359.0, + "step": 1073 + }, + { + "epoch": 0.11794421260707226, + "grad_norm": 2.6179287433624268, + "learning_rate": 5.88913282107574e-07, + "loss": 1.1477, + "mean_token_accuracy": 0.678154706954956, + "num_tokens": 27029997.0, + "step": 1074 + }, + { + "epoch": 0.11805403030968592, + "grad_norm": 2.2314538955688477, + "learning_rate": 5.894621295279912e-07, + "loss": 1.0215, + "mean_token_accuracy": 0.6939265131950378, + "num_tokens": 27056678.0, + "step": 1075 + }, + { + "epoch": 0.11816384801229958, + "grad_norm": 2.2718842029571533, + "learning_rate": 5.900109769484083e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6817851662635803, + "num_tokens": 27082712.0, + "step": 1076 + }, + { + "epoch": 0.11827366571491324, + "grad_norm": 2.3622565269470215, + "learning_rate": 5.905598243688254e-07, + "loss": 1.0682, + "mean_token_accuracy": 0.6782752275466919, + "num_tokens": 27108630.0, + "step": 1077 + }, + { + "epoch": 0.1183834834175269, + "grad_norm": 2.6052565574645996, + "learning_rate": 5.911086717892426e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.667505145072937, + "num_tokens": 27132044.0, + "step": 1078 + }, + { + "epoch": 0.11849330112014057, + "grad_norm": 2.4168858528137207, + "learning_rate": 5.916575192096597e-07, + "loss": 1.1506, + "mean_token_accuracy": 0.6602126359939575, + "num_tokens": 27156917.0, + "step": 1079 + }, + { + "epoch": 0.11860311882275423, + "grad_norm": 2.045844316482544, + "learning_rate": 5.922063666300768e-07, + "loss": 1.0833, + "mean_token_accuracy": 0.6760967373847961, + "num_tokens": 27189397.0, + "step": 1080 + }, + { + "epoch": 0.1187129365253679, + "grad_norm": 2.3183932304382324, + "learning_rate": 5.927552140504939e-07, + "loss": 0.9981, + "mean_token_accuracy": 0.7027731537818909, + "num_tokens": 27213462.0, + "step": 1081 + }, + { + "epoch": 0.11882275422798155, + "grad_norm": 2.572843074798584, + "learning_rate": 5.93304061470911e-07, + "loss": 1.0681, + "mean_token_accuracy": 0.6819947957992554, + "num_tokens": 27236300.0, + "step": 1082 + }, + { + "epoch": 0.11893257193059521, + "grad_norm": 2.474104404449463, + "learning_rate": 5.938529088913282e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6837098598480225, + "num_tokens": 27260011.0, + "step": 1083 + }, + { + "epoch": 0.11904238963320887, + "grad_norm": 2.257878065109253, + "learning_rate": 5.944017563117453e-07, + "loss": 1.0969, + "mean_token_accuracy": 0.6778831481933594, + "num_tokens": 27287338.0, + "step": 1084 + }, + { + "epoch": 0.11915220733582253, + "grad_norm": 2.5371627807617188, + "learning_rate": 5.949506037321624e-07, + "loss": 1.1241, + "mean_token_accuracy": 0.6780568361282349, + "num_tokens": 27311344.0, + "step": 1085 + }, + { + "epoch": 0.11926202503843619, + "grad_norm": 2.4268429279327393, + "learning_rate": 5.954994511525796e-07, + "loss": 1.065, + "mean_token_accuracy": 0.693743109703064, + "num_tokens": 27335623.0, + "step": 1086 + }, + { + "epoch": 0.11937184274104985, + "grad_norm": 2.341245412826538, + "learning_rate": 5.960482985729967e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.697058916091919, + "num_tokens": 27359822.0, + "step": 1087 + }, + { + "epoch": 0.11948166044366351, + "grad_norm": 2.3709349632263184, + "learning_rate": 5.965971459934138e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6730930209159851, + "num_tokens": 27386380.0, + "step": 1088 + }, + { + "epoch": 0.11959147814627719, + "grad_norm": 2.7429826259613037, + "learning_rate": 5.971459934138309e-07, + "loss": 0.9804, + "mean_token_accuracy": 0.7058590054512024, + "num_tokens": 27405863.0, + "step": 1089 + }, + { + "epoch": 0.11970129584889085, + "grad_norm": 2.2671823501586914, + "learning_rate": 5.976948408342481e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6887243390083313, + "num_tokens": 27433458.0, + "step": 1090 + }, + { + "epoch": 0.1198111135515045, + "grad_norm": 2.59247088432312, + "learning_rate": 5.982436882546652e-07, + "loss": 0.993, + "mean_token_accuracy": 0.7000553607940674, + "num_tokens": 27456333.0, + "step": 1091 + }, + { + "epoch": 0.11992093125411817, + "grad_norm": 2.3383803367614746, + "learning_rate": 5.987925356750822e-07, + "loss": 1.1272, + "mean_token_accuracy": 0.6680220365524292, + "num_tokens": 27485493.0, + "step": 1092 + }, + { + "epoch": 0.12003074895673183, + "grad_norm": 2.362011432647705, + "learning_rate": 5.993413830954994e-07, + "loss": 1.0239, + "mean_token_accuracy": 0.6973022222518921, + "num_tokens": 27511856.0, + "step": 1093 + }, + { + "epoch": 0.12014056665934549, + "grad_norm": 2.3198330402374268, + "learning_rate": 5.998902305159165e-07, + "loss": 1.0668, + "mean_token_accuracy": 0.6934029459953308, + "num_tokens": 27535199.0, + "step": 1094 + }, + { + "epoch": 0.12025038436195915, + "grad_norm": 2.192518949508667, + "learning_rate": 6.004390779363336e-07, + "loss": 1.082, + "mean_token_accuracy": 0.681018054485321, + "num_tokens": 27565172.0, + "step": 1095 + }, + { + "epoch": 0.1203602020645728, + "grad_norm": 2.508449077606201, + "learning_rate": 6.009879253567508e-07, + "loss": 1.0334, + "mean_token_accuracy": 0.6982046365737915, + "num_tokens": 27589455.0, + "step": 1096 + }, + { + "epoch": 0.12047001976718646, + "grad_norm": 2.4107673168182373, + "learning_rate": 6.015367727771679e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.6936025619506836, + "num_tokens": 27614325.0, + "step": 1097 + }, + { + "epoch": 0.12057983746980014, + "grad_norm": 2.5362977981567383, + "learning_rate": 6.02085620197585e-07, + "loss": 0.9632, + "mean_token_accuracy": 0.7187292575836182, + "num_tokens": 27637164.0, + "step": 1098 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 2.3942604064941406, + "learning_rate": 6.026344676180022e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.6860554814338684, + "num_tokens": 27662548.0, + "step": 1099 + }, + { + "epoch": 0.12079947287502746, + "grad_norm": 2.318336248397827, + "learning_rate": 6.031833150384192e-07, + "loss": 1.0675, + "mean_token_accuracy": 0.6890044212341309, + "num_tokens": 27688784.0, + "step": 1100 + }, + { + "epoch": 0.12090929057764112, + "grad_norm": 2.6793038845062256, + "learning_rate": 6.037321624588364e-07, + "loss": 1.0763, + "mean_token_accuracy": 0.6830413341522217, + "num_tokens": 27710431.0, + "step": 1101 + }, + { + "epoch": 0.12101910828025478, + "grad_norm": 2.3081295490264893, + "learning_rate": 6.042810098792536e-07, + "loss": 1.0792, + "mean_token_accuracy": 0.6828701496124268, + "num_tokens": 27737069.0, + "step": 1102 + }, + { + "epoch": 0.12112892598286844, + "grad_norm": 2.696058988571167, + "learning_rate": 6.048298572996706e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7005802392959595, + "num_tokens": 27757144.0, + "step": 1103 + }, + { + "epoch": 0.1212387436854821, + "grad_norm": 2.490670919418335, + "learning_rate": 6.053787047200878e-07, + "loss": 1.029, + "mean_token_accuracy": 0.6947356462478638, + "num_tokens": 27779183.0, + "step": 1104 + }, + { + "epoch": 0.12134856138809576, + "grad_norm": 2.672840118408203, + "learning_rate": 6.05927552140505e-07, + "loss": 0.9537, + "mean_token_accuracy": 0.7201066017150879, + "num_tokens": 27798358.0, + "step": 1105 + }, + { + "epoch": 0.12145837909070942, + "grad_norm": 2.380380868911743, + "learning_rate": 6.06476399560922e-07, + "loss": 1.0477, + "mean_token_accuracy": 0.6874430775642395, + "num_tokens": 27823073.0, + "step": 1106 + }, + { + "epoch": 0.12156819679332308, + "grad_norm": 2.5757598876953125, + "learning_rate": 6.070252469813392e-07, + "loss": 1.0549, + "mean_token_accuracy": 0.6801700592041016, + "num_tokens": 27845900.0, + "step": 1107 + }, + { + "epoch": 0.12167801449593675, + "grad_norm": 2.514864444732666, + "learning_rate": 6.075740944017564e-07, + "loss": 1.0957, + "mean_token_accuracy": 0.6834297180175781, + "num_tokens": 27871486.0, + "step": 1108 + }, + { + "epoch": 0.12178783219855041, + "grad_norm": 2.339977741241455, + "learning_rate": 6.081229418221734e-07, + "loss": 1.053, + "mean_token_accuracy": 0.68446946144104, + "num_tokens": 27896962.0, + "step": 1109 + }, + { + "epoch": 0.12189764990116407, + "grad_norm": 2.4726905822753906, + "learning_rate": 6.086717892425906e-07, + "loss": 1.0736, + "mean_token_accuracy": 0.684185266494751, + "num_tokens": 27923317.0, + "step": 1110 + }, + { + "epoch": 0.12200746760377773, + "grad_norm": 2.4602158069610596, + "learning_rate": 6.092206366630076e-07, + "loss": 0.9944, + "mean_token_accuracy": 0.7159419059753418, + "num_tokens": 27944855.0, + "step": 1111 + }, + { + "epoch": 0.12211728530639139, + "grad_norm": 2.439174175262451, + "learning_rate": 6.097694840834247e-07, + "loss": 0.9926, + "mean_token_accuracy": 0.699160635471344, + "num_tokens": 27967625.0, + "step": 1112 + }, + { + "epoch": 0.12222710300900505, + "grad_norm": 2.2439446449279785, + "learning_rate": 6.103183315038419e-07, + "loss": 0.9713, + "mean_token_accuracy": 0.7132754325866699, + "num_tokens": 27992727.0, + "step": 1113 + }, + { + "epoch": 0.12233692071161871, + "grad_norm": 2.363856315612793, + "learning_rate": 6.10867178924259e-07, + "loss": 1.1015, + "mean_token_accuracy": 0.6760891675949097, + "num_tokens": 28018955.0, + "step": 1114 + }, + { + "epoch": 0.12244673841423237, + "grad_norm": 2.5793416500091553, + "learning_rate": 6.114160263446761e-07, + "loss": 0.9729, + "mean_token_accuracy": 0.7087422013282776, + "num_tokens": 28040274.0, + "step": 1115 + }, + { + "epoch": 0.12255655611684603, + "grad_norm": 2.2644405364990234, + "learning_rate": 6.119648737650933e-07, + "loss": 1.0264, + "mean_token_accuracy": 0.6978238821029663, + "num_tokens": 28066681.0, + "step": 1116 + }, + { + "epoch": 0.1226663738194597, + "grad_norm": 2.416656732559204, + "learning_rate": 6.125137211855103e-07, + "loss": 1.004, + "mean_token_accuracy": 0.6931289434432983, + "num_tokens": 28089837.0, + "step": 1117 + }, + { + "epoch": 0.12277619152207336, + "grad_norm": 2.7097926139831543, + "learning_rate": 6.130625686059275e-07, + "loss": 1.1214, + "mean_token_accuracy": 0.6685055494308472, + "num_tokens": 28111370.0, + "step": 1118 + }, + { + "epoch": 0.12288600922468702, + "grad_norm": 2.289337158203125, + "learning_rate": 6.136114160263447e-07, + "loss": 0.9705, + "mean_token_accuracy": 0.706321656703949, + "num_tokens": 28136081.0, + "step": 1119 + }, + { + "epoch": 0.12299582692730068, + "grad_norm": 2.2373046875, + "learning_rate": 6.141602634467617e-07, + "loss": 1.03, + "mean_token_accuracy": 0.6935609579086304, + "num_tokens": 28162497.0, + "step": 1120 + }, + { + "epoch": 0.12310564462991434, + "grad_norm": 2.150697946548462, + "learning_rate": 6.147091108671789e-07, + "loss": 1.0876, + "mean_token_accuracy": 0.6779536604881287, + "num_tokens": 28190320.0, + "step": 1121 + }, + { + "epoch": 0.123215462332528, + "grad_norm": 2.621511697769165, + "learning_rate": 6.15257958287596e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6915658712387085, + "num_tokens": 28212428.0, + "step": 1122 + }, + { + "epoch": 0.12332528003514166, + "grad_norm": 2.544776678085327, + "learning_rate": 6.158068057080131e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6811121106147766, + "num_tokens": 28235717.0, + "step": 1123 + }, + { + "epoch": 0.12343509773775532, + "grad_norm": 2.3819949626922607, + "learning_rate": 6.163556531284303e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.6851065158843994, + "num_tokens": 28260455.0, + "step": 1124 + }, + { + "epoch": 0.12354491544036898, + "grad_norm": 2.2760794162750244, + "learning_rate": 6.169045005488474e-07, + "loss": 1.0504, + "mean_token_accuracy": 0.6866464614868164, + "num_tokens": 28287746.0, + "step": 1125 + }, + { + "epoch": 0.12365473314298264, + "grad_norm": 2.491302251815796, + "learning_rate": 6.174533479692645e-07, + "loss": 0.996, + "mean_token_accuracy": 0.7068252563476562, + "num_tokens": 28309026.0, + "step": 1126 + }, + { + "epoch": 0.12376455084559632, + "grad_norm": 2.254221200942993, + "learning_rate": 6.180021953896817e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.687201976776123, + "num_tokens": 28335603.0, + "step": 1127 + }, + { + "epoch": 0.12387436854820998, + "grad_norm": 2.1324832439422607, + "learning_rate": 6.185510428100988e-07, + "loss": 1.1371, + "mean_token_accuracy": 0.6598135232925415, + "num_tokens": 28365440.0, + "step": 1128 + }, + { + "epoch": 0.12398418625082364, + "grad_norm": 2.1745705604553223, + "learning_rate": 6.190998902305159e-07, + "loss": 1.047, + "mean_token_accuracy": 0.6917715668678284, + "num_tokens": 28393021.0, + "step": 1129 + }, + { + "epoch": 0.1240940039534373, + "grad_norm": 2.335282325744629, + "learning_rate": 6.196487376509331e-07, + "loss": 1.0223, + "mean_token_accuracy": 0.6939451098442078, + "num_tokens": 28416034.0, + "step": 1130 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 2.360264778137207, + "learning_rate": 6.201975850713501e-07, + "loss": 1.1132, + "mean_token_accuracy": 0.6680254936218262, + "num_tokens": 28440406.0, + "step": 1131 + }, + { + "epoch": 0.12431363935866462, + "grad_norm": 2.537277936935425, + "learning_rate": 6.207464324917672e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6842485666275024, + "num_tokens": 28465690.0, + "step": 1132 + }, + { + "epoch": 0.12442345706127828, + "grad_norm": 2.5100979804992676, + "learning_rate": 6.212952799121843e-07, + "loss": 1.0528, + "mean_token_accuracy": 0.6917760372161865, + "num_tokens": 28487416.0, + "step": 1133 + }, + { + "epoch": 0.12453327476389194, + "grad_norm": 2.4283926486968994, + "learning_rate": 6.218441273326015e-07, + "loss": 1.0653, + "mean_token_accuracy": 0.6881484985351562, + "num_tokens": 28511773.0, + "step": 1134 + }, + { + "epoch": 0.1246430924665056, + "grad_norm": 2.372854232788086, + "learning_rate": 6.223929747530186e-07, + "loss": 1.0726, + "mean_token_accuracy": 0.6798744797706604, + "num_tokens": 28537228.0, + "step": 1135 + }, + { + "epoch": 0.12475291016911927, + "grad_norm": 2.556612730026245, + "learning_rate": 6.229418221734357e-07, + "loss": 0.9631, + "mean_token_accuracy": 0.7103633284568787, + "num_tokens": 28557397.0, + "step": 1136 + }, + { + "epoch": 0.12486272787173293, + "grad_norm": 2.4432942867279053, + "learning_rate": 6.234906695938529e-07, + "loss": 1.114, + "mean_token_accuracy": 0.6713842153549194, + "num_tokens": 28581757.0, + "step": 1137 + }, + { + "epoch": 0.12497254557434659, + "grad_norm": 2.56561541557312, + "learning_rate": 6.2403951701427e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.7014361619949341, + "num_tokens": 28603857.0, + "step": 1138 + }, + { + "epoch": 0.12508236327696023, + "grad_norm": 2.3308932781219482, + "learning_rate": 6.245883644346871e-07, + "loss": 1.0038, + "mean_token_accuracy": 0.6983658075332642, + "num_tokens": 28629962.0, + "step": 1139 + }, + { + "epoch": 0.1251921809795739, + "grad_norm": 2.187873601913452, + "learning_rate": 6.251372118551043e-07, + "loss": 1.0308, + "mean_token_accuracy": 0.6939769983291626, + "num_tokens": 28656736.0, + "step": 1140 + }, + { + "epoch": 0.12530199868218758, + "grad_norm": 2.372724771499634, + "learning_rate": 6.256860592755214e-07, + "loss": 1.1232, + "mean_token_accuracy": 0.6723412871360779, + "num_tokens": 28680838.0, + "step": 1141 + }, + { + "epoch": 0.12541181638480123, + "grad_norm": 2.635265827178955, + "learning_rate": 6.262349066959385e-07, + "loss": 1.1089, + "mean_token_accuracy": 0.6793608069419861, + "num_tokens": 28703729.0, + "step": 1142 + }, + { + "epoch": 0.1255216340874149, + "grad_norm": 2.339897871017456, + "learning_rate": 6.267837541163557e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.6819376945495605, + "num_tokens": 28729050.0, + "step": 1143 + }, + { + "epoch": 0.12563145179002855, + "grad_norm": 2.5602500438690186, + "learning_rate": 6.273326015367727e-07, + "loss": 1.0176, + "mean_token_accuracy": 0.6951220035552979, + "num_tokens": 28750513.0, + "step": 1144 + }, + { + "epoch": 0.12574126949264222, + "grad_norm": 2.5799965858459473, + "learning_rate": 6.278814489571899e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6897681951522827, + "num_tokens": 28771752.0, + "step": 1145 + }, + { + "epoch": 0.12585108719525587, + "grad_norm": 2.527050495147705, + "learning_rate": 6.284302963776071e-07, + "loss": 1.0374, + "mean_token_accuracy": 0.703777551651001, + "num_tokens": 28793328.0, + "step": 1146 + }, + { + "epoch": 0.12596090489786954, + "grad_norm": 2.457806348800659, + "learning_rate": 6.289791437980241e-07, + "loss": 0.885, + "mean_token_accuracy": 0.7308540940284729, + "num_tokens": 28817275.0, + "step": 1147 + }, + { + "epoch": 0.1260707226004832, + "grad_norm": 2.4204812049865723, + "learning_rate": 6.295279912184413e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6740754246711731, + "num_tokens": 28840296.0, + "step": 1148 + }, + { + "epoch": 0.12618054030309686, + "grad_norm": 2.7779624462127686, + "learning_rate": 6.300768386388585e-07, + "loss": 1.0492, + "mean_token_accuracy": 0.6904221773147583, + "num_tokens": 28859755.0, + "step": 1149 + }, + { + "epoch": 0.1262903580057105, + "grad_norm": 1.8883113861083984, + "learning_rate": 6.306256860592754e-07, + "loss": 1.0045, + "mean_token_accuracy": 0.6979160308837891, + "num_tokens": 28895415.0, + "step": 1150 + }, + { + "epoch": 0.12640017570832418, + "grad_norm": 2.1908349990844727, + "learning_rate": 6.311745334796926e-07, + "loss": 1.1163, + "mean_token_accuracy": 0.671488881111145, + "num_tokens": 28925097.0, + "step": 1151 + }, + { + "epoch": 0.12650999341093785, + "grad_norm": 2.4313080310821533, + "learning_rate": 6.317233809001098e-07, + "loss": 1.0438, + "mean_token_accuracy": 0.699648380279541, + "num_tokens": 28949781.0, + "step": 1152 + }, + { + "epoch": 0.1266198111135515, + "grad_norm": 2.3794965744018555, + "learning_rate": 6.322722283205268e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6860392093658447, + "num_tokens": 28974551.0, + "step": 1153 + }, + { + "epoch": 0.12672962881616517, + "grad_norm": 2.9758431911468506, + "learning_rate": 6.32821075740944e-07, + "loss": 0.9689, + "mean_token_accuracy": 0.7043944001197815, + "num_tokens": 28992961.0, + "step": 1154 + }, + { + "epoch": 0.12683944651877882, + "grad_norm": 2.119018793106079, + "learning_rate": 6.33369923161361e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6882965564727783, + "num_tokens": 29021775.0, + "step": 1155 + }, + { + "epoch": 0.1269492642213925, + "grad_norm": 2.1051695346832275, + "learning_rate": 6.339187705817782e-07, + "loss": 1.0495, + "mean_token_accuracy": 0.691261351108551, + "num_tokens": 29050551.0, + "step": 1156 + }, + { + "epoch": 0.12705908192400614, + "grad_norm": 2.3860061168670654, + "learning_rate": 6.344676180021954e-07, + "loss": 0.9319, + "mean_token_accuracy": 0.7182657122612, + "num_tokens": 29072933.0, + "step": 1157 + }, + { + "epoch": 0.1271688996266198, + "grad_norm": 2.417447566986084, + "learning_rate": 6.350164654226124e-07, + "loss": 1.1049, + "mean_token_accuracy": 0.6734758615493774, + "num_tokens": 29097267.0, + "step": 1158 + }, + { + "epoch": 0.12727871732923346, + "grad_norm": 2.3775475025177, + "learning_rate": 6.355653128430296e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.6904564499855042, + "num_tokens": 29121137.0, + "step": 1159 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 2.27984881401062, + "learning_rate": 6.361141602634468e-07, + "loss": 1.0265, + "mean_token_accuracy": 0.691722571849823, + "num_tokens": 29147199.0, + "step": 1160 + }, + { + "epoch": 0.1274983527344608, + "grad_norm": 2.1773018836975098, + "learning_rate": 6.366630076838638e-07, + "loss": 0.9859, + "mean_token_accuracy": 0.701601505279541, + "num_tokens": 29174280.0, + "step": 1161 + }, + { + "epoch": 0.12760817043707445, + "grad_norm": 2.2490482330322266, + "learning_rate": 6.37211855104281e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.6687562465667725, + "num_tokens": 29202134.0, + "step": 1162 + }, + { + "epoch": 0.12771798813968813, + "grad_norm": 2.2807140350341797, + "learning_rate": 6.377607025246982e-07, + "loss": 1.0837, + "mean_token_accuracy": 0.6759827136993408, + "num_tokens": 29228180.0, + "step": 1163 + }, + { + "epoch": 0.12782780584230177, + "grad_norm": 2.226738691329956, + "learning_rate": 6.383095499451152e-07, + "loss": 1.0907, + "mean_token_accuracy": 0.6864488124847412, + "num_tokens": 29257159.0, + "step": 1164 + }, + { + "epoch": 0.12793762354491545, + "grad_norm": 2.1535227298736572, + "learning_rate": 6.388583973655324e-07, + "loss": 1.0581, + "mean_token_accuracy": 0.6871017217636108, + "num_tokens": 29286584.0, + "step": 1165 + }, + { + "epoch": 0.1280474412475291, + "grad_norm": 2.3927195072174072, + "learning_rate": 6.394072447859495e-07, + "loss": 1.024, + "mean_token_accuracy": 0.6904647350311279, + "num_tokens": 29309511.0, + "step": 1166 + }, + { + "epoch": 0.12815725895014277, + "grad_norm": 2.2153048515319824, + "learning_rate": 6.399560922063666e-07, + "loss": 0.9435, + "mean_token_accuracy": 0.7183014154434204, + "num_tokens": 29336177.0, + "step": 1167 + }, + { + "epoch": 0.1282670766527564, + "grad_norm": 2.3506007194519043, + "learning_rate": 6.405049396267838e-07, + "loss": 1.0056, + "mean_token_accuracy": 0.7021580934524536, + "num_tokens": 29362265.0, + "step": 1168 + }, + { + "epoch": 0.12837689435537009, + "grad_norm": 2.436110019683838, + "learning_rate": 6.410537870472008e-07, + "loss": 1.0333, + "mean_token_accuracy": 0.6912612915039062, + "num_tokens": 29385615.0, + "step": 1169 + }, + { + "epoch": 0.12848671205798376, + "grad_norm": 2.7244224548339844, + "learning_rate": 6.416026344676179e-07, + "loss": 1.0555, + "mean_token_accuracy": 0.6830066442489624, + "num_tokens": 29405848.0, + "step": 1170 + }, + { + "epoch": 0.1285965297605974, + "grad_norm": 2.1406185626983643, + "learning_rate": 6.421514818880351e-07, + "loss": 1.005, + "mean_token_accuracy": 0.7032278776168823, + "num_tokens": 29434314.0, + "step": 1171 + }, + { + "epoch": 0.12870634746321108, + "grad_norm": 2.4847347736358643, + "learning_rate": 6.427003293084522e-07, + "loss": 1.0048, + "mean_token_accuracy": 0.6917538642883301, + "num_tokens": 29456212.0, + "step": 1172 + }, + { + "epoch": 0.12881616516582473, + "grad_norm": 2.19641375541687, + "learning_rate": 6.432491767288693e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6863774657249451, + "num_tokens": 29482804.0, + "step": 1173 + }, + { + "epoch": 0.1289259828684384, + "grad_norm": 2.156919240951538, + "learning_rate": 6.437980241492865e-07, + "loss": 1.0576, + "mean_token_accuracy": 0.6978520154953003, + "num_tokens": 29511651.0, + "step": 1174 + }, + { + "epoch": 0.12903580057105205, + "grad_norm": 2.459094285964966, + "learning_rate": 6.443468715697036e-07, + "loss": 1.0221, + "mean_token_accuracy": 0.7082746028900146, + "num_tokens": 29535567.0, + "step": 1175 + }, + { + "epoch": 0.12914561827366572, + "grad_norm": 2.426699638366699, + "learning_rate": 6.448957189901207e-07, + "loss": 1.0219, + "mean_token_accuracy": 0.7195491790771484, + "num_tokens": 29558393.0, + "step": 1176 + }, + { + "epoch": 0.12925543597627936, + "grad_norm": 2.5086138248443604, + "learning_rate": 6.454445664105378e-07, + "loss": 1.078, + "mean_token_accuracy": 0.6878311634063721, + "num_tokens": 29583065.0, + "step": 1177 + }, + { + "epoch": 0.12936525367889304, + "grad_norm": 2.481623649597168, + "learning_rate": 6.45993413830955e-07, + "loss": 1.0317, + "mean_token_accuracy": 0.7009872794151306, + "num_tokens": 29605291.0, + "step": 1178 + }, + { + "epoch": 0.1294750713815067, + "grad_norm": 2.666429281234741, + "learning_rate": 6.465422612513721e-07, + "loss": 1.0401, + "mean_token_accuracy": 0.6872227191925049, + "num_tokens": 29625918.0, + "step": 1179 + }, + { + "epoch": 0.12958488908412036, + "grad_norm": 2.528545379638672, + "learning_rate": 6.470911086717892e-07, + "loss": 1.0808, + "mean_token_accuracy": 0.6817375421524048, + "num_tokens": 29648784.0, + "step": 1180 + }, + { + "epoch": 0.12969470678673403, + "grad_norm": 2.1149871349334717, + "learning_rate": 6.476399560922064e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6660103797912598, + "num_tokens": 29679984.0, + "step": 1181 + }, + { + "epoch": 0.12980452448934768, + "grad_norm": 2.725560426712036, + "learning_rate": 6.481888035126235e-07, + "loss": 1.0135, + "mean_token_accuracy": 0.7038834095001221, + "num_tokens": 29700192.0, + "step": 1182 + }, + { + "epoch": 0.12991434219196135, + "grad_norm": 2.543581485748291, + "learning_rate": 6.487376509330406e-07, + "loss": 0.9922, + "mean_token_accuracy": 0.7065889835357666, + "num_tokens": 29723121.0, + "step": 1183 + }, + { + "epoch": 0.130024159894575, + "grad_norm": 2.341614246368408, + "learning_rate": 6.492864983534578e-07, + "loss": 1.1731, + "mean_token_accuracy": 0.6691802740097046, + "num_tokens": 29750261.0, + "step": 1184 + }, + { + "epoch": 0.13013397759718867, + "grad_norm": 2.3689873218536377, + "learning_rate": 6.498353457738749e-07, + "loss": 0.9948, + "mean_token_accuracy": 0.7030237913131714, + "num_tokens": 29772890.0, + "step": 1185 + }, + { + "epoch": 0.13024379529980232, + "grad_norm": 2.1390316486358643, + "learning_rate": 6.50384193194292e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6958863735198975, + "num_tokens": 29801528.0, + "step": 1186 + }, + { + "epoch": 0.130353613002416, + "grad_norm": 2.4972286224365234, + "learning_rate": 6.509330406147092e-07, + "loss": 1.0492, + "mean_token_accuracy": 0.6835654973983765, + "num_tokens": 29825654.0, + "step": 1187 + }, + { + "epoch": 0.13046343070502964, + "grad_norm": 3.223085403442383, + "learning_rate": 6.514818880351261e-07, + "loss": 0.9354, + "mean_token_accuracy": 0.7174443006515503, + "num_tokens": 29841189.0, + "step": 1188 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 2.6600210666656494, + "learning_rate": 6.520307354555433e-07, + "loss": 1.1011, + "mean_token_accuracy": 0.688363254070282, + "num_tokens": 29862197.0, + "step": 1189 + }, + { + "epoch": 0.13068306611025698, + "grad_norm": 2.5370779037475586, + "learning_rate": 6.525795828759604e-07, + "loss": 0.9992, + "mean_token_accuracy": 0.6975914239883423, + "num_tokens": 29883584.0, + "step": 1190 + }, + { + "epoch": 0.13079288381287063, + "grad_norm": 2.4974660873413086, + "learning_rate": 6.531284302963775e-07, + "loss": 1.0617, + "mean_token_accuracy": 0.6935686469078064, + "num_tokens": 29905968.0, + "step": 1191 + }, + { + "epoch": 0.1309027015154843, + "grad_norm": 2.5117053985595703, + "learning_rate": 6.536772777167947e-07, + "loss": 0.9909, + "mean_token_accuracy": 0.703594446182251, + "num_tokens": 29928785.0, + "step": 1192 + }, + { + "epoch": 0.13101251921809795, + "grad_norm": 2.377807140350342, + "learning_rate": 6.542261251372118e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6928077936172485, + "num_tokens": 29953737.0, + "step": 1193 + }, + { + "epoch": 0.13112233692071162, + "grad_norm": 2.4185612201690674, + "learning_rate": 6.547749725576289e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.6983730792999268, + "num_tokens": 29977094.0, + "step": 1194 + }, + { + "epoch": 0.13123215462332527, + "grad_norm": 2.4840075969696045, + "learning_rate": 6.553238199780461e-07, + "loss": 1.1129, + "mean_token_accuracy": 0.6781190633773804, + "num_tokens": 30004347.0, + "step": 1195 + }, + { + "epoch": 0.13134197232593894, + "grad_norm": 2.5198471546173096, + "learning_rate": 6.558726673984632e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6933634877204895, + "num_tokens": 30027614.0, + "step": 1196 + }, + { + "epoch": 0.1314517900285526, + "grad_norm": 2.5193593502044678, + "learning_rate": 6.564215148188803e-07, + "loss": 1.0201, + "mean_token_accuracy": 0.6966559886932373, + "num_tokens": 30049059.0, + "step": 1197 + }, + { + "epoch": 0.13156160773116626, + "grad_norm": 2.464150905609131, + "learning_rate": 6.569703622392975e-07, + "loss": 1.0162, + "mean_token_accuracy": 0.694063663482666, + "num_tokens": 30071449.0, + "step": 1198 + }, + { + "epoch": 0.13167142543377994, + "grad_norm": 2.313471555709839, + "learning_rate": 6.575192096597145e-07, + "loss": 0.9641, + "mean_token_accuracy": 0.7105789184570312, + "num_tokens": 30094955.0, + "step": 1199 + }, + { + "epoch": 0.13178124313639358, + "grad_norm": 2.4281506538391113, + "learning_rate": 6.580680570801317e-07, + "loss": 1.0086, + "mean_token_accuracy": 0.6944973468780518, + "num_tokens": 30118008.0, + "step": 1200 + }, + { + "epoch": 0.13189106083900726, + "grad_norm": 2.1015493869781494, + "learning_rate": 6.586169045005489e-07, + "loss": 0.9828, + "mean_token_accuracy": 0.7025058269500732, + "num_tokens": 30144936.0, + "step": 1201 + }, + { + "epoch": 0.1320008785416209, + "grad_norm": 2.3186707496643066, + "learning_rate": 6.591657519209659e-07, + "loss": 0.9994, + "mean_token_accuracy": 0.7075050473213196, + "num_tokens": 30169026.0, + "step": 1202 + }, + { + "epoch": 0.13211069624423458, + "grad_norm": 2.0986249446868896, + "learning_rate": 6.597145993413831e-07, + "loss": 1.1249, + "mean_token_accuracy": 0.6654796600341797, + "num_tokens": 30199217.0, + "step": 1203 + }, + { + "epoch": 0.13222051394684822, + "grad_norm": 2.3669776916503906, + "learning_rate": 6.602634467618003e-07, + "loss": 1.0271, + "mean_token_accuracy": 0.6955165863037109, + "num_tokens": 30224916.0, + "step": 1204 + }, + { + "epoch": 0.1323303316494619, + "grad_norm": 1.954027533531189, + "learning_rate": 6.608122941822173e-07, + "loss": 1.0588, + "mean_token_accuracy": 0.6814873218536377, + "num_tokens": 30261606.0, + "step": 1205 + }, + { + "epoch": 0.13244014935207554, + "grad_norm": 2.345764636993408, + "learning_rate": 6.613611416026345e-07, + "loss": 1.0277, + "mean_token_accuracy": 0.6852288842201233, + "num_tokens": 30286745.0, + "step": 1206 + }, + { + "epoch": 0.13254996705468922, + "grad_norm": 2.31714129447937, + "learning_rate": 6.619099890230515e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6889185905456543, + "num_tokens": 30313693.0, + "step": 1207 + }, + { + "epoch": 0.1326597847573029, + "grad_norm": 2.256124258041382, + "learning_rate": 6.624588364434686e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.692707896232605, + "num_tokens": 30340393.0, + "step": 1208 + }, + { + "epoch": 0.13276960245991654, + "grad_norm": 2.4972405433654785, + "learning_rate": 6.630076838638858e-07, + "loss": 1.0568, + "mean_token_accuracy": 0.6866587400436401, + "num_tokens": 30363522.0, + "step": 1209 + }, + { + "epoch": 0.1328794201625302, + "grad_norm": 2.1433587074279785, + "learning_rate": 6.635565312843029e-07, + "loss": 1.0531, + "mean_token_accuracy": 0.6980574131011963, + "num_tokens": 30391509.0, + "step": 1210 + }, + { + "epoch": 0.13298923786514386, + "grad_norm": 2.083603620529175, + "learning_rate": 6.6410537870472e-07, + "loss": 1.0836, + "mean_token_accuracy": 0.6829323172569275, + "num_tokens": 30423910.0, + "step": 1211 + }, + { + "epoch": 0.13309905556775753, + "grad_norm": 2.2505288124084473, + "learning_rate": 6.646542261251372e-07, + "loss": 1.0617, + "mean_token_accuracy": 0.6840540170669556, + "num_tokens": 30452547.0, + "step": 1212 + }, + { + "epoch": 0.13320887327037118, + "grad_norm": 2.314493417739868, + "learning_rate": 6.652030735455543e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.696753978729248, + "num_tokens": 30478299.0, + "step": 1213 + }, + { + "epoch": 0.13331869097298485, + "grad_norm": 2.410780191421509, + "learning_rate": 6.657519209659714e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.6988143920898438, + "num_tokens": 30502184.0, + "step": 1214 + }, + { + "epoch": 0.1334285086755985, + "grad_norm": 2.5865304470062256, + "learning_rate": 6.663007683863886e-07, + "loss": 1.032, + "mean_token_accuracy": 0.697053849697113, + "num_tokens": 30523880.0, + "step": 1215 + }, + { + "epoch": 0.13353832637821217, + "grad_norm": 2.533975601196289, + "learning_rate": 6.668496158068057e-07, + "loss": 1.0714, + "mean_token_accuracy": 0.6821579933166504, + "num_tokens": 30547013.0, + "step": 1216 + }, + { + "epoch": 0.13364814408082584, + "grad_norm": 2.3272945880889893, + "learning_rate": 6.673984632272228e-07, + "loss": 0.9763, + "mean_token_accuracy": 0.6994010806083679, + "num_tokens": 30570887.0, + "step": 1217 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 2.4282870292663574, + "learning_rate": 6.679473106476399e-07, + "loss": 0.9996, + "mean_token_accuracy": 0.700444757938385, + "num_tokens": 30593935.0, + "step": 1218 + }, + { + "epoch": 0.13386777948605316, + "grad_norm": 2.2231743335723877, + "learning_rate": 6.684961580680571e-07, + "loss": 1.073, + "mean_token_accuracy": 0.6756887435913086, + "num_tokens": 30621360.0, + "step": 1219 + }, + { + "epoch": 0.1339775971886668, + "grad_norm": 2.1960690021514893, + "learning_rate": 6.690450054884742e-07, + "loss": 0.9642, + "mean_token_accuracy": 0.711060643196106, + "num_tokens": 30649663.0, + "step": 1220 + }, + { + "epoch": 0.13408741489128048, + "grad_norm": 2.3789565563201904, + "learning_rate": 6.695938529088913e-07, + "loss": 1.0142, + "mean_token_accuracy": 0.6944167613983154, + "num_tokens": 30674394.0, + "step": 1221 + }, + { + "epoch": 0.13419723259389413, + "grad_norm": 2.4707956314086914, + "learning_rate": 6.701427003293085e-07, + "loss": 1.037, + "mean_token_accuracy": 0.6938780546188354, + "num_tokens": 30698137.0, + "step": 1222 + }, + { + "epoch": 0.1343070502965078, + "grad_norm": 2.2993340492248535, + "learning_rate": 6.706915477497256e-07, + "loss": 1.0287, + "mean_token_accuracy": 0.6923698782920837, + "num_tokens": 30724845.0, + "step": 1223 + }, + { + "epoch": 0.13441686799912145, + "grad_norm": 2.4919419288635254, + "learning_rate": 6.712403951701427e-07, + "loss": 1.1381, + "mean_token_accuracy": 0.6657485365867615, + "num_tokens": 30748802.0, + "step": 1224 + }, + { + "epoch": 0.13452668570173512, + "grad_norm": 2.198054075241089, + "learning_rate": 6.717892425905599e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.6614638566970825, + "num_tokens": 30777944.0, + "step": 1225 + }, + { + "epoch": 0.13463650340434877, + "grad_norm": 2.7070164680480957, + "learning_rate": 6.723380900109769e-07, + "loss": 0.9144, + "mean_token_accuracy": 0.7303860783576965, + "num_tokens": 30796290.0, + "step": 1226 + }, + { + "epoch": 0.13474632110696244, + "grad_norm": 2.7234930992126465, + "learning_rate": 6.72886937431394e-07, + "loss": 1.0532, + "mean_token_accuracy": 0.6911415457725525, + "num_tokens": 30819078.0, + "step": 1227 + }, + { + "epoch": 0.13485613880957611, + "grad_norm": 2.0709588527679443, + "learning_rate": 6.734357848518111e-07, + "loss": 1.055, + "mean_token_accuracy": 0.6883975267410278, + "num_tokens": 30849079.0, + "step": 1228 + }, + { + "epoch": 0.13496595651218976, + "grad_norm": 2.210167646408081, + "learning_rate": 6.739846322722282e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.6844400763511658, + "num_tokens": 30878929.0, + "step": 1229 + }, + { + "epoch": 0.13507577421480343, + "grad_norm": 2.3037898540496826, + "learning_rate": 6.745334796926454e-07, + "loss": 1.0687, + "mean_token_accuracy": 0.683539867401123, + "num_tokens": 30904849.0, + "step": 1230 + }, + { + "epoch": 0.13518559191741708, + "grad_norm": 2.0463662147521973, + "learning_rate": 6.750823271130625e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6661313772201538, + "num_tokens": 30936663.0, + "step": 1231 + }, + { + "epoch": 0.13529540962003075, + "grad_norm": 2.0296363830566406, + "learning_rate": 6.756311745334796e-07, + "loss": 1.0457, + "mean_token_accuracy": 0.6876791715621948, + "num_tokens": 30967497.0, + "step": 1232 + }, + { + "epoch": 0.1354052273226444, + "grad_norm": 2.487713575363159, + "learning_rate": 6.761800219538968e-07, + "loss": 1.1154, + "mean_token_accuracy": 0.6789094805717468, + "num_tokens": 30991774.0, + "step": 1233 + }, + { + "epoch": 0.13551504502525807, + "grad_norm": 2.2005789279937744, + "learning_rate": 6.767288693743139e-07, + "loss": 1.0728, + "mean_token_accuracy": 0.6880407333374023, + "num_tokens": 31019375.0, + "step": 1234 + }, + { + "epoch": 0.13562486272787172, + "grad_norm": 2.3782083988189697, + "learning_rate": 6.77277716794731e-07, + "loss": 0.9722, + "mean_token_accuracy": 0.7111836671829224, + "num_tokens": 31043590.0, + "step": 1235 + }, + { + "epoch": 0.1357346804304854, + "grad_norm": 2.353605031967163, + "learning_rate": 6.778265642151482e-07, + "loss": 1.067, + "mean_token_accuracy": 0.6888073086738586, + "num_tokens": 31068419.0, + "step": 1236 + }, + { + "epoch": 0.13584449813309907, + "grad_norm": 2.298191547393799, + "learning_rate": 6.783754116355653e-07, + "loss": 1.061, + "mean_token_accuracy": 0.6897018551826477, + "num_tokens": 31094684.0, + "step": 1237 + }, + { + "epoch": 0.1359543158357127, + "grad_norm": 2.3368492126464844, + "learning_rate": 6.789242590559824e-07, + "loss": 1.1265, + "mean_token_accuracy": 0.6705570220947266, + "num_tokens": 31121273.0, + "step": 1238 + }, + { + "epoch": 0.1360641335383264, + "grad_norm": 2.4595119953155518, + "learning_rate": 6.794731064763996e-07, + "loss": 1.0335, + "mean_token_accuracy": 0.6885470151901245, + "num_tokens": 31144843.0, + "step": 1239 + }, + { + "epoch": 0.13617395124094003, + "grad_norm": 2.4682188034057617, + "learning_rate": 6.800219538968166e-07, + "loss": 0.9922, + "mean_token_accuracy": 0.6973428726196289, + "num_tokens": 31167105.0, + "step": 1240 + }, + { + "epoch": 0.1362837689435537, + "grad_norm": 2.1941332817077637, + "learning_rate": 6.805708013172338e-07, + "loss": 0.9799, + "mean_token_accuracy": 0.7011840343475342, + "num_tokens": 31194705.0, + "step": 1241 + }, + { + "epoch": 0.13639358664616735, + "grad_norm": 2.797415256500244, + "learning_rate": 6.81119648737651e-07, + "loss": 1.003, + "mean_token_accuracy": 0.6969583630561829, + "num_tokens": 31213163.0, + "step": 1242 + }, + { + "epoch": 0.13650340434878103, + "grad_norm": 2.3194210529327393, + "learning_rate": 6.81668496158068e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.687375545501709, + "num_tokens": 31240214.0, + "step": 1243 + }, + { + "epoch": 0.13661322205139467, + "grad_norm": 2.2632675170898438, + "learning_rate": 6.822173435784852e-07, + "loss": 1.0399, + "mean_token_accuracy": 0.6897954940795898, + "num_tokens": 31266743.0, + "step": 1244 + }, + { + "epoch": 0.13672303975400835, + "grad_norm": 2.4544289112091064, + "learning_rate": 6.827661909989023e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6952857971191406, + "num_tokens": 31289398.0, + "step": 1245 + }, + { + "epoch": 0.13683285745662202, + "grad_norm": 2.36418080329895, + "learning_rate": 6.833150384193193e-07, + "loss": 1.1305, + "mean_token_accuracy": 0.6789097189903259, + "num_tokens": 31318508.0, + "step": 1246 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 2.343902826309204, + "learning_rate": 6.838638858397365e-07, + "loss": 1.1115, + "mean_token_accuracy": 0.6729546189308167, + "num_tokens": 31346602.0, + "step": 1247 + }, + { + "epoch": 0.13705249286184934, + "grad_norm": 2.83482027053833, + "learning_rate": 6.844127332601537e-07, + "loss": 0.9499, + "mean_token_accuracy": 0.7106227278709412, + "num_tokens": 31365432.0, + "step": 1248 + }, + { + "epoch": 0.13716231056446299, + "grad_norm": 2.1249876022338867, + "learning_rate": 6.849615806805707e-07, + "loss": 1.1475, + "mean_token_accuracy": 0.6705546379089355, + "num_tokens": 31397921.0, + "step": 1249 + }, + { + "epoch": 0.13727212826707666, + "grad_norm": 2.52762508392334, + "learning_rate": 6.855104281009879e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.6948179006576538, + "num_tokens": 31421624.0, + "step": 1250 + }, + { + "epoch": 0.1373819459696903, + "grad_norm": 2.5443663597106934, + "learning_rate": 6.86059275521405e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6735204458236694, + "num_tokens": 31444729.0, + "step": 1251 + }, + { + "epoch": 0.13749176367230398, + "grad_norm": 2.409569025039673, + "learning_rate": 6.866081229418221e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.6820551156997681, + "num_tokens": 31469267.0, + "step": 1252 + }, + { + "epoch": 0.13760158137491763, + "grad_norm": 2.335305690765381, + "learning_rate": 6.871569703622393e-07, + "loss": 0.9431, + "mean_token_accuracy": 0.7111176252365112, + "num_tokens": 31492865.0, + "step": 1253 + }, + { + "epoch": 0.1377113990775313, + "grad_norm": 2.0791549682617188, + "learning_rate": 6.877058177826564e-07, + "loss": 1.089, + "mean_token_accuracy": 0.6832231879234314, + "num_tokens": 31524429.0, + "step": 1254 + }, + { + "epoch": 0.13782121678014497, + "grad_norm": 2.300354480743408, + "learning_rate": 6.882546652030735e-07, + "loss": 0.9477, + "mean_token_accuracy": 0.7170947790145874, + "num_tokens": 31550599.0, + "step": 1255 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.2052218914031982, + "learning_rate": 6.888035126234907e-07, + "loss": 1.0909, + "mean_token_accuracy": 0.685070276260376, + "num_tokens": 31576901.0, + "step": 1256 + }, + { + "epoch": 0.1380408521853723, + "grad_norm": 1.9150100946426392, + "learning_rate": 6.893523600439078e-07, + "loss": 1.048, + "mean_token_accuracy": 0.6883069276809692, + "num_tokens": 31609437.0, + "step": 1257 + }, + { + "epoch": 0.13815066988798594, + "grad_norm": 2.423095941543579, + "learning_rate": 6.899012074643249e-07, + "loss": 1.0667, + "mean_token_accuracy": 0.6831873059272766, + "num_tokens": 31636367.0, + "step": 1258 + }, + { + "epoch": 0.1382604875905996, + "grad_norm": 2.665032386779785, + "learning_rate": 6.904500548847421e-07, + "loss": 1.0469, + "mean_token_accuracy": 0.6914893984794617, + "num_tokens": 31657221.0, + "step": 1259 + }, + { + "epoch": 0.13837030529321326, + "grad_norm": 2.4249162673950195, + "learning_rate": 6.909989023051592e-07, + "loss": 1.1164, + "mean_token_accuracy": 0.6656757593154907, + "num_tokens": 31681999.0, + "step": 1260 + }, + { + "epoch": 0.13848012299582693, + "grad_norm": 2.56069278717041, + "learning_rate": 6.915477497255763e-07, + "loss": 1.069, + "mean_token_accuracy": 0.6786240339279175, + "num_tokens": 31704530.0, + "step": 1261 + }, + { + "epoch": 0.13858994069844058, + "grad_norm": 2.7906334400177, + "learning_rate": 6.920965971459934e-07, + "loss": 1.0728, + "mean_token_accuracy": 0.6810013651847839, + "num_tokens": 31724424.0, + "step": 1262 + }, + { + "epoch": 0.13869975840105425, + "grad_norm": 2.308953285217285, + "learning_rate": 6.926454445664105e-07, + "loss": 1.0134, + "mean_token_accuracy": 0.69651859998703, + "num_tokens": 31750349.0, + "step": 1263 + }, + { + "epoch": 0.1388095761036679, + "grad_norm": 2.3934431076049805, + "learning_rate": 6.931942919868276e-07, + "loss": 0.989, + "mean_token_accuracy": 0.6981557607650757, + "num_tokens": 31773727.0, + "step": 1264 + }, + { + "epoch": 0.13891939380628157, + "grad_norm": 2.2753732204437256, + "learning_rate": 6.937431394072447e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.68993079662323, + "num_tokens": 31801540.0, + "step": 1265 + }, + { + "epoch": 0.13902921150889525, + "grad_norm": 2.39715576171875, + "learning_rate": 6.942919868276618e-07, + "loss": 0.9855, + "mean_token_accuracy": 0.7080968022346497, + "num_tokens": 31826004.0, + "step": 1266 + }, + { + "epoch": 0.1391390292115089, + "grad_norm": 2.5159385204315186, + "learning_rate": 6.94840834248079e-07, + "loss": 1.0787, + "mean_token_accuracy": 0.6802542209625244, + "num_tokens": 31849140.0, + "step": 1267 + }, + { + "epoch": 0.13924884691412256, + "grad_norm": 2.124213457107544, + "learning_rate": 6.953896816684961e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6868577599525452, + "num_tokens": 31879467.0, + "step": 1268 + }, + { + "epoch": 0.1393586646167362, + "grad_norm": 2.361788749694824, + "learning_rate": 6.959385290889132e-07, + "loss": 0.9182, + "mean_token_accuracy": 0.7214388847351074, + "num_tokens": 31903887.0, + "step": 1269 + }, + { + "epoch": 0.13946848231934988, + "grad_norm": 2.241323947906494, + "learning_rate": 6.964873765093304e-07, + "loss": 1.1021, + "mean_token_accuracy": 0.6745392084121704, + "num_tokens": 31933226.0, + "step": 1270 + }, + { + "epoch": 0.13957830002196353, + "grad_norm": 2.4074628353118896, + "learning_rate": 6.970362239297475e-07, + "loss": 0.902, + "mean_token_accuracy": 0.7285845279693604, + "num_tokens": 31954480.0, + "step": 1271 + }, + { + "epoch": 0.1396881177245772, + "grad_norm": 2.320513963699341, + "learning_rate": 6.975850713501646e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.6819272637367249, + "num_tokens": 31981936.0, + "step": 1272 + }, + { + "epoch": 0.13979793542719085, + "grad_norm": 2.1797406673431396, + "learning_rate": 6.981339187705817e-07, + "loss": 1.0432, + "mean_token_accuracy": 0.6960906982421875, + "num_tokens": 32009546.0, + "step": 1273 + }, + { + "epoch": 0.13990775312980452, + "grad_norm": 2.0258607864379883, + "learning_rate": 6.986827661909989e-07, + "loss": 1.1167, + "mean_token_accuracy": 0.6640499830245972, + "num_tokens": 32043007.0, + "step": 1274 + }, + { + "epoch": 0.1400175708324182, + "grad_norm": 2.5608606338500977, + "learning_rate": 6.99231613611416e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.7013552188873291, + "num_tokens": 32064815.0, + "step": 1275 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 2.4699127674102783, + "learning_rate": 6.997804610318331e-07, + "loss": 1.0611, + "mean_token_accuracy": 0.6979429721832275, + "num_tokens": 32087506.0, + "step": 1276 + }, + { + "epoch": 0.14023720623764552, + "grad_norm": 2.2589547634124756, + "learning_rate": 7.003293084522503e-07, + "loss": 1.1395, + "mean_token_accuracy": 0.6660817265510559, + "num_tokens": 32115508.0, + "step": 1277 + }, + { + "epoch": 0.14034702394025916, + "grad_norm": 2.6866507530212402, + "learning_rate": 7.008781558726674e-07, + "loss": 1.0521, + "mean_token_accuracy": 0.682885468006134, + "num_tokens": 32134929.0, + "step": 1278 + }, + { + "epoch": 0.14045684164287284, + "grad_norm": 2.3245913982391357, + "learning_rate": 7.014270032930845e-07, + "loss": 1.0556, + "mean_token_accuracy": 0.6893695592880249, + "num_tokens": 32160743.0, + "step": 1279 + }, + { + "epoch": 0.14056665934548648, + "grad_norm": 2.557309627532959, + "learning_rate": 7.019758507135017e-07, + "loss": 1.0099, + "mean_token_accuracy": 0.6932941675186157, + "num_tokens": 32181197.0, + "step": 1280 + }, + { + "epoch": 0.14067647704810016, + "grad_norm": 2.4484710693359375, + "learning_rate": 7.025246981339188e-07, + "loss": 0.9225, + "mean_token_accuracy": 0.726508617401123, + "num_tokens": 32203614.0, + "step": 1281 + }, + { + "epoch": 0.1407862947507138, + "grad_norm": 2.307565689086914, + "learning_rate": 7.030735455543358e-07, + "loss": 1.0061, + "mean_token_accuracy": 0.7018078565597534, + "num_tokens": 32227301.0, + "step": 1282 + }, + { + "epoch": 0.14089611245332748, + "grad_norm": 2.383793354034424, + "learning_rate": 7.03622392974753e-07, + "loss": 0.9841, + "mean_token_accuracy": 0.7123017311096191, + "num_tokens": 32251293.0, + "step": 1283 + }, + { + "epoch": 0.14100593015594115, + "grad_norm": 2.406294584274292, + "learning_rate": 7.0417124039517e-07, + "loss": 1.0193, + "mean_token_accuracy": 0.6993950605392456, + "num_tokens": 32274619.0, + "step": 1284 + }, + { + "epoch": 0.1411157478585548, + "grad_norm": 2.5482006072998047, + "learning_rate": 7.047200878155872e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6860535144805908, + "num_tokens": 32295930.0, + "step": 1285 + }, + { + "epoch": 0.14122556556116847, + "grad_norm": 2.5412960052490234, + "learning_rate": 7.052689352360044e-07, + "loss": 0.979, + "mean_token_accuracy": 0.7035924196243286, + "num_tokens": 32317952.0, + "step": 1286 + }, + { + "epoch": 0.14133538326378212, + "grad_norm": 2.307206869125366, + "learning_rate": 7.058177826564214e-07, + "loss": 1.08, + "mean_token_accuracy": 0.6906325817108154, + "num_tokens": 32343859.0, + "step": 1287 + }, + { + "epoch": 0.1414452009663958, + "grad_norm": 2.4982028007507324, + "learning_rate": 7.063666300768386e-07, + "loss": 1.1473, + "mean_token_accuracy": 0.6605150699615479, + "num_tokens": 32371967.0, + "step": 1288 + }, + { + "epoch": 0.14155501866900944, + "grad_norm": 2.148237943649292, + "learning_rate": 7.069154774972558e-07, + "loss": 1.0901, + "mean_token_accuracy": 0.6776302456855774, + "num_tokens": 32401990.0, + "step": 1289 + }, + { + "epoch": 0.1416648363716231, + "grad_norm": 2.3146402835845947, + "learning_rate": 7.074643249176728e-07, + "loss": 1.0037, + "mean_token_accuracy": 0.6971079111099243, + "num_tokens": 32426402.0, + "step": 1290 + }, + { + "epoch": 0.14177465407423676, + "grad_norm": 2.4205663204193115, + "learning_rate": 7.0801317233809e-07, + "loss": 0.9381, + "mean_token_accuracy": 0.7235709428787231, + "num_tokens": 32447472.0, + "step": 1291 + }, + { + "epoch": 0.14188447177685043, + "grad_norm": 2.5051326751708984, + "learning_rate": 7.085620197585072e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.6763509511947632, + "num_tokens": 32472627.0, + "step": 1292 + }, + { + "epoch": 0.1419942894794641, + "grad_norm": 1.941112756729126, + "learning_rate": 7.091108671789242e-07, + "loss": 1.1064, + "mean_token_accuracy": 0.6738348007202148, + "num_tokens": 32506752.0, + "step": 1293 + }, + { + "epoch": 0.14210410718207775, + "grad_norm": 2.2701141834259033, + "learning_rate": 7.096597145993414e-07, + "loss": 1.0143, + "mean_token_accuracy": 0.6927616596221924, + "num_tokens": 32532980.0, + "step": 1294 + }, + { + "epoch": 0.14221392488469142, + "grad_norm": 2.5060527324676514, + "learning_rate": 7.102085620197584e-07, + "loss": 1.0658, + "mean_token_accuracy": 0.6729046106338501, + "num_tokens": 32555421.0, + "step": 1295 + }, + { + "epoch": 0.14232374258730507, + "grad_norm": 2.1771278381347656, + "learning_rate": 7.107574094401756e-07, + "loss": 1.0079, + "mean_token_accuracy": 0.7060142755508423, + "num_tokens": 32580766.0, + "step": 1296 + }, + { + "epoch": 0.14243356028991874, + "grad_norm": 2.537412405014038, + "learning_rate": 7.113062568605928e-07, + "loss": 1.0294, + "mean_token_accuracy": 0.6878150105476379, + "num_tokens": 32602441.0, + "step": 1297 + }, + { + "epoch": 0.1425433779925324, + "grad_norm": 2.1446189880371094, + "learning_rate": 7.118551042810098e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6847150325775146, + "num_tokens": 32629284.0, + "step": 1298 + }, + { + "epoch": 0.14265319569514606, + "grad_norm": 2.407275915145874, + "learning_rate": 7.12403951701427e-07, + "loss": 1.1439, + "mean_token_accuracy": 0.6605674028396606, + "num_tokens": 32654567.0, + "step": 1299 + }, + { + "epoch": 0.1427630133977597, + "grad_norm": 2.229330539703369, + "learning_rate": 7.129527991218442e-07, + "loss": 1.0434, + "mean_token_accuracy": 0.68440842628479, + "num_tokens": 32679956.0, + "step": 1300 + }, + { + "epoch": 0.14287283110037338, + "grad_norm": 2.5421805381774902, + "learning_rate": 7.135016465422611e-07, + "loss": 0.9789, + "mean_token_accuracy": 0.7094631195068359, + "num_tokens": 32701126.0, + "step": 1301 + }, + { + "epoch": 0.14298264880298703, + "grad_norm": 2.32916522026062, + "learning_rate": 7.140504939626783e-07, + "loss": 0.9109, + "mean_token_accuracy": 0.723305344581604, + "num_tokens": 32724863.0, + "step": 1302 + }, + { + "epoch": 0.1430924665056007, + "grad_norm": 2.260162830352783, + "learning_rate": 7.145993413830955e-07, + "loss": 1.0074, + "mean_token_accuracy": 0.7057392597198486, + "num_tokens": 32751158.0, + "step": 1303 + }, + { + "epoch": 0.14320228420821438, + "grad_norm": 2.1237077713012695, + "learning_rate": 7.151481888035125e-07, + "loss": 0.9982, + "mean_token_accuracy": 0.7017419934272766, + "num_tokens": 32779752.0, + "step": 1304 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 2.278259515762329, + "learning_rate": 7.156970362239297e-07, + "loss": 0.9866, + "mean_token_accuracy": 0.705629825592041, + "num_tokens": 32804354.0, + "step": 1305 + }, + { + "epoch": 0.1434219196134417, + "grad_norm": 1.9974240064620972, + "learning_rate": 7.162458836443468e-07, + "loss": 1.0778, + "mean_token_accuracy": 0.6843931078910828, + "num_tokens": 32837832.0, + "step": 1306 + }, + { + "epoch": 0.14353173731605534, + "grad_norm": 2.3079285621643066, + "learning_rate": 7.167947310647639e-07, + "loss": 0.9753, + "mean_token_accuracy": 0.7283300161361694, + "num_tokens": 32861801.0, + "step": 1307 + }, + { + "epoch": 0.14364155501866901, + "grad_norm": 2.031486749649048, + "learning_rate": 7.173435784851811e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6797066926956177, + "num_tokens": 32893035.0, + "step": 1308 + }, + { + "epoch": 0.14375137272128266, + "grad_norm": 2.408975839614868, + "learning_rate": 7.178924259055982e-07, + "loss": 1.0547, + "mean_token_accuracy": 0.6845706105232239, + "num_tokens": 32916332.0, + "step": 1309 + }, + { + "epoch": 0.14386119042389633, + "grad_norm": 2.256378173828125, + "learning_rate": 7.184412733260153e-07, + "loss": 0.9219, + "mean_token_accuracy": 0.7203900814056396, + "num_tokens": 32944196.0, + "step": 1310 + }, + { + "epoch": 0.14397100812650998, + "grad_norm": 2.394702196121216, + "learning_rate": 7.189901207464325e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6850335597991943, + "num_tokens": 32968976.0, + "step": 1311 + }, + { + "epoch": 0.14408082582912365, + "grad_norm": 2.307651996612549, + "learning_rate": 7.195389681668496e-07, + "loss": 1.0711, + "mean_token_accuracy": 0.6799653768539429, + "num_tokens": 32993413.0, + "step": 1312 + }, + { + "epoch": 0.14419064353173733, + "grad_norm": 2.543945550918579, + "learning_rate": 7.200878155872667e-07, + "loss": 0.885, + "mean_token_accuracy": 0.730444073677063, + "num_tokens": 33012633.0, + "step": 1313 + }, + { + "epoch": 0.14430046123435097, + "grad_norm": 2.423027276992798, + "learning_rate": 7.206366630076838e-07, + "loss": 1.0542, + "mean_token_accuracy": 0.6840363144874573, + "num_tokens": 33036237.0, + "step": 1314 + }, + { + "epoch": 0.14441027893696465, + "grad_norm": 2.4701976776123047, + "learning_rate": 7.21185510428101e-07, + "loss": 1.0163, + "mean_token_accuracy": 0.7020743489265442, + "num_tokens": 33061003.0, + "step": 1315 + }, + { + "epoch": 0.1445200966395783, + "grad_norm": 2.0371501445770264, + "learning_rate": 7.217343578485181e-07, + "loss": 0.9941, + "mean_token_accuracy": 0.7031715512275696, + "num_tokens": 33091773.0, + "step": 1316 + }, + { + "epoch": 0.14462991434219197, + "grad_norm": 2.26076340675354, + "learning_rate": 7.222832052689352e-07, + "loss": 0.9346, + "mean_token_accuracy": 0.716229259967804, + "num_tokens": 33115623.0, + "step": 1317 + }, + { + "epoch": 0.1447397320448056, + "grad_norm": 2.431871175765991, + "learning_rate": 7.228320526893524e-07, + "loss": 1.0504, + "mean_token_accuracy": 0.6956100463867188, + "num_tokens": 33140592.0, + "step": 1318 + }, + { + "epoch": 0.1448495497474193, + "grad_norm": 2.11822509765625, + "learning_rate": 7.233809001097695e-07, + "loss": 1.0732, + "mean_token_accuracy": 0.6803802847862244, + "num_tokens": 33170519.0, + "step": 1319 + }, + { + "epoch": 0.14495936745003293, + "grad_norm": 2.4155235290527344, + "learning_rate": 7.239297475301865e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6660611033439636, + "num_tokens": 33194682.0, + "step": 1320 + }, + { + "epoch": 0.1450691851526466, + "grad_norm": 1.9430997371673584, + "learning_rate": 7.244785949506037e-07, + "loss": 1.1301, + "mean_token_accuracy": 0.6678909063339233, + "num_tokens": 33229825.0, + "step": 1321 + }, + { + "epoch": 0.14517900285526028, + "grad_norm": 2.5022284984588623, + "learning_rate": 7.250274423710208e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6764153242111206, + "num_tokens": 33253063.0, + "step": 1322 + }, + { + "epoch": 0.14528882055787393, + "grad_norm": 2.33554744720459, + "learning_rate": 7.255762897914379e-07, + "loss": 1.062, + "mean_token_accuracy": 0.6865671873092651, + "num_tokens": 33279528.0, + "step": 1323 + }, + { + "epoch": 0.1453986382604876, + "grad_norm": 1.9982432126998901, + "learning_rate": 7.261251372118551e-07, + "loss": 1.1176, + "mean_token_accuracy": 0.6796197295188904, + "num_tokens": 33313205.0, + "step": 1324 + }, + { + "epoch": 0.14550845596310125, + "grad_norm": 2.2401278018951416, + "learning_rate": 7.266739846322721e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6665233373641968, + "num_tokens": 33340947.0, + "step": 1325 + }, + { + "epoch": 0.14561827366571492, + "grad_norm": 2.3364417552948, + "learning_rate": 7.272228320526893e-07, + "loss": 0.8979, + "mean_token_accuracy": 0.7263665795326233, + "num_tokens": 33364575.0, + "step": 1326 + }, + { + "epoch": 0.14572809136832857, + "grad_norm": 2.518137216567993, + "learning_rate": 7.277716794731065e-07, + "loss": 0.9691, + "mean_token_accuracy": 0.7071563005447388, + "num_tokens": 33386230.0, + "step": 1327 + }, + { + "epoch": 0.14583790907094224, + "grad_norm": 2.186096429824829, + "learning_rate": 7.283205268935235e-07, + "loss": 1.0098, + "mean_token_accuracy": 0.6970985531806946, + "num_tokens": 33413199.0, + "step": 1328 + }, + { + "epoch": 0.14594772677355589, + "grad_norm": 2.5100958347320557, + "learning_rate": 7.288693743139407e-07, + "loss": 0.9852, + "mean_token_accuracy": 0.6980108618736267, + "num_tokens": 33434945.0, + "step": 1329 + }, + { + "epoch": 0.14605754447616956, + "grad_norm": 2.36519193649292, + "learning_rate": 7.294182217343579e-07, + "loss": 0.9926, + "mean_token_accuracy": 0.6978111267089844, + "num_tokens": 33458754.0, + "step": 1330 + }, + { + "epoch": 0.14616736217878323, + "grad_norm": 2.3045248985290527, + "learning_rate": 7.299670691547749e-07, + "loss": 1.0753, + "mean_token_accuracy": 0.688157320022583, + "num_tokens": 33483356.0, + "step": 1331 + }, + { + "epoch": 0.14627717988139688, + "grad_norm": 2.5368828773498535, + "learning_rate": 7.305159165751921e-07, + "loss": 1.0173, + "mean_token_accuracy": 0.6958284378051758, + "num_tokens": 33506036.0, + "step": 1332 + }, + { + "epoch": 0.14638699758401055, + "grad_norm": 2.4809086322784424, + "learning_rate": 7.310647639956093e-07, + "loss": 1.0939, + "mean_token_accuracy": 0.6814085245132446, + "num_tokens": 33530813.0, + "step": 1333 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 2.4640984535217285, + "learning_rate": 7.316136114160263e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6848637461662292, + "num_tokens": 33555344.0, + "step": 1334 + }, + { + "epoch": 0.14660663298923787, + "grad_norm": 2.2157795429229736, + "learning_rate": 7.321624588364435e-07, + "loss": 0.9903, + "mean_token_accuracy": 0.7036306858062744, + "num_tokens": 33582352.0, + "step": 1335 + }, + { + "epoch": 0.14671645069185152, + "grad_norm": 2.2308130264282227, + "learning_rate": 7.327113062568605e-07, + "loss": 1.1352, + "mean_token_accuracy": 0.6671068072319031, + "num_tokens": 33610665.0, + "step": 1336 + }, + { + "epoch": 0.1468262683944652, + "grad_norm": 2.617030382156372, + "learning_rate": 7.332601536772777e-07, + "loss": 1.1036, + "mean_token_accuracy": 0.6800092458724976, + "num_tokens": 33633038.0, + "step": 1337 + }, + { + "epoch": 0.14693608609707884, + "grad_norm": 2.2984378337860107, + "learning_rate": 7.338090010976949e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6915469765663147, + "num_tokens": 33659047.0, + "step": 1338 + }, + { + "epoch": 0.1470459037996925, + "grad_norm": 2.6109395027160645, + "learning_rate": 7.343578485181118e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.6775262355804443, + "num_tokens": 33683536.0, + "step": 1339 + }, + { + "epoch": 0.14715572150230616, + "grad_norm": 2.5613131523132324, + "learning_rate": 7.34906695938529e-07, + "loss": 1.062, + "mean_token_accuracy": 0.6809219121932983, + "num_tokens": 33703019.0, + "step": 1340 + }, + { + "epoch": 0.14726553920491983, + "grad_norm": 2.3089427947998047, + "learning_rate": 7.354555433589462e-07, + "loss": 0.9308, + "mean_token_accuracy": 0.7147542238235474, + "num_tokens": 33724953.0, + "step": 1341 + }, + { + "epoch": 0.1473753569075335, + "grad_norm": 2.261340856552124, + "learning_rate": 7.360043907793632e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6840497255325317, + "num_tokens": 33750967.0, + "step": 1342 + }, + { + "epoch": 0.14748517461014715, + "grad_norm": 2.311962366104126, + "learning_rate": 7.365532381997804e-07, + "loss": 1.0847, + "mean_token_accuracy": 0.6742965579032898, + "num_tokens": 33777383.0, + "step": 1343 + }, + { + "epoch": 0.14759499231276083, + "grad_norm": 2.433703899383545, + "learning_rate": 7.371020856201976e-07, + "loss": 0.9307, + "mean_token_accuracy": 0.7108976244926453, + "num_tokens": 33798973.0, + "step": 1344 + }, + { + "epoch": 0.14770481001537447, + "grad_norm": 2.391653537750244, + "learning_rate": 7.376509330406146e-07, + "loss": 0.9949, + "mean_token_accuracy": 0.706216037273407, + "num_tokens": 33822611.0, + "step": 1345 + }, + { + "epoch": 0.14781462771798815, + "grad_norm": 2.4508068561553955, + "learning_rate": 7.381997804610318e-07, + "loss": 1.069, + "mean_token_accuracy": 0.6913122534751892, + "num_tokens": 33846081.0, + "step": 1346 + }, + { + "epoch": 0.1479244454206018, + "grad_norm": 2.3932411670684814, + "learning_rate": 7.387486278814489e-07, + "loss": 1.0314, + "mean_token_accuracy": 0.694403886795044, + "num_tokens": 33870210.0, + "step": 1347 + }, + { + "epoch": 0.14803426312321546, + "grad_norm": 2.456691265106201, + "learning_rate": 7.39297475301866e-07, + "loss": 1.1072, + "mean_token_accuracy": 0.671416699886322, + "num_tokens": 33894404.0, + "step": 1348 + }, + { + "epoch": 0.1481440808258291, + "grad_norm": 2.488664388656616, + "learning_rate": 7.398463227222832e-07, + "loss": 0.9634, + "mean_token_accuracy": 0.706531822681427, + "num_tokens": 33917454.0, + "step": 1349 + }, + { + "epoch": 0.14825389852844278, + "grad_norm": 2.5187950134277344, + "learning_rate": 7.403951701427003e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.6925587058067322, + "num_tokens": 33939667.0, + "step": 1350 + }, + { + "epoch": 0.14836371623105646, + "grad_norm": 2.2610695362091064, + "learning_rate": 7.409440175631174e-07, + "loss": 1.116, + "mean_token_accuracy": 0.6736181378364563, + "num_tokens": 33966707.0, + "step": 1351 + }, + { + "epoch": 0.1484735339336701, + "grad_norm": 2.56129789352417, + "learning_rate": 7.414928649835346e-07, + "loss": 0.9746, + "mean_token_accuracy": 0.7037318348884583, + "num_tokens": 33987279.0, + "step": 1352 + }, + { + "epoch": 0.14858335163628378, + "grad_norm": 2.358837366104126, + "learning_rate": 7.420417124039517e-07, + "loss": 0.9827, + "mean_token_accuracy": 0.6984211206436157, + "num_tokens": 34011643.0, + "step": 1353 + }, + { + "epoch": 0.14869316933889742, + "grad_norm": 2.4467568397521973, + "learning_rate": 7.425905598243688e-07, + "loss": 0.9437, + "mean_token_accuracy": 0.7102735042572021, + "num_tokens": 34033842.0, + "step": 1354 + }, + { + "epoch": 0.1488029870415111, + "grad_norm": 2.4432787895202637, + "learning_rate": 7.43139407244786e-07, + "loss": 1.067, + "mean_token_accuracy": 0.691628098487854, + "num_tokens": 34057375.0, + "step": 1355 + }, + { + "epoch": 0.14891280474412474, + "grad_norm": 2.3409910202026367, + "learning_rate": 7.436882546652031e-07, + "loss": 0.9872, + "mean_token_accuracy": 0.7019619941711426, + "num_tokens": 34082347.0, + "step": 1356 + }, + { + "epoch": 0.14902262244673842, + "grad_norm": 2.1755728721618652, + "learning_rate": 7.442371020856202e-07, + "loss": 1.0319, + "mean_token_accuracy": 0.6908451914787292, + "num_tokens": 34111158.0, + "step": 1357 + }, + { + "epoch": 0.14913244014935206, + "grad_norm": 2.5606017112731934, + "learning_rate": 7.447859495060372e-07, + "loss": 1.0437, + "mean_token_accuracy": 0.6894384026527405, + "num_tokens": 34132435.0, + "step": 1358 + }, + { + "epoch": 0.14924225785196574, + "grad_norm": 2.1485934257507324, + "learning_rate": 7.453347969264544e-07, + "loss": 0.9971, + "mean_token_accuracy": 0.6946344375610352, + "num_tokens": 34158968.0, + "step": 1359 + }, + { + "epoch": 0.1493520755545794, + "grad_norm": 2.4211409091949463, + "learning_rate": 7.458836443468715e-07, + "loss": 0.9986, + "mean_token_accuracy": 0.6975734233856201, + "num_tokens": 34181929.0, + "step": 1360 + }, + { + "epoch": 0.14946189325719306, + "grad_norm": 2.389845132827759, + "learning_rate": 7.464324917672886e-07, + "loss": 1.0661, + "mean_token_accuracy": 0.6808556318283081, + "num_tokens": 34207721.0, + "step": 1361 + }, + { + "epoch": 0.14957171095980673, + "grad_norm": 2.360943555831909, + "learning_rate": 7.469813391877058e-07, + "loss": 1.0723, + "mean_token_accuracy": 0.6789240837097168, + "num_tokens": 34233874.0, + "step": 1362 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 2.1022627353668213, + "learning_rate": 7.475301866081229e-07, + "loss": 1.0933, + "mean_token_accuracy": 0.6755452156066895, + "num_tokens": 34263648.0, + "step": 1363 + }, + { + "epoch": 0.14979134636503405, + "grad_norm": 2.149785041809082, + "learning_rate": 7.4807903402854e-07, + "loss": 0.9823, + "mean_token_accuracy": 0.7044111490249634, + "num_tokens": 34292844.0, + "step": 1364 + }, + { + "epoch": 0.1499011640676477, + "grad_norm": 2.1186697483062744, + "learning_rate": 7.486278814489572e-07, + "loss": 1.0521, + "mean_token_accuracy": 0.6838041543960571, + "num_tokens": 34323383.0, + "step": 1365 + }, + { + "epoch": 0.15001098177026137, + "grad_norm": 2.221229076385498, + "learning_rate": 7.491767288693743e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.6783987879753113, + "num_tokens": 34352535.0, + "step": 1366 + }, + { + "epoch": 0.15012079947287502, + "grad_norm": 2.446065664291382, + "learning_rate": 7.497255762897914e-07, + "loss": 0.9769, + "mean_token_accuracy": 0.7047669887542725, + "num_tokens": 34375559.0, + "step": 1367 + }, + { + "epoch": 0.1502306171754887, + "grad_norm": 2.227482795715332, + "learning_rate": 7.502744237102086e-07, + "loss": 1.02, + "mean_token_accuracy": 0.6996102333068848, + "num_tokens": 34403364.0, + "step": 1368 + }, + { + "epoch": 0.15034043487810236, + "grad_norm": 2.4147255420684814, + "learning_rate": 7.508232711306256e-07, + "loss": 1.0515, + "mean_token_accuracy": 0.6840662956237793, + "num_tokens": 34426780.0, + "step": 1369 + }, + { + "epoch": 0.150450252580716, + "grad_norm": 2.3622119426727295, + "learning_rate": 7.513721185510428e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.6796591281890869, + "num_tokens": 34454277.0, + "step": 1370 + }, + { + "epoch": 0.15056007028332968, + "grad_norm": 2.37243914604187, + "learning_rate": 7.5192096597146e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.7033490538597107, + "num_tokens": 34478894.0, + "step": 1371 + }, + { + "epoch": 0.15066988798594333, + "grad_norm": 2.596853256225586, + "learning_rate": 7.52469813391877e-07, + "loss": 0.9669, + "mean_token_accuracy": 0.7160942554473877, + "num_tokens": 34499183.0, + "step": 1372 + }, + { + "epoch": 0.150779705688557, + "grad_norm": 2.367884635925293, + "learning_rate": 7.530186608122942e-07, + "loss": 1.0927, + "mean_token_accuracy": 0.6823912858963013, + "num_tokens": 34525095.0, + "step": 1373 + }, + { + "epoch": 0.15088952339117065, + "grad_norm": 2.7288100719451904, + "learning_rate": 7.535675082327113e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6891135573387146, + "num_tokens": 34546205.0, + "step": 1374 + }, + { + "epoch": 0.15099934109378432, + "grad_norm": 2.4739990234375, + "learning_rate": 7.541163556531284e-07, + "loss": 1.0005, + "mean_token_accuracy": 0.7015960812568665, + "num_tokens": 34568936.0, + "step": 1375 + }, + { + "epoch": 0.15110915879639797, + "grad_norm": 2.4376540184020996, + "learning_rate": 7.546652030735456e-07, + "loss": 1.1126, + "mean_token_accuracy": 0.6866626739501953, + "num_tokens": 34592952.0, + "step": 1376 + }, + { + "epoch": 0.15121897649901164, + "grad_norm": 2.354947328567505, + "learning_rate": 7.552140504939627e-07, + "loss": 0.9399, + "mean_token_accuracy": 0.7095980048179626, + "num_tokens": 34615173.0, + "step": 1377 + }, + { + "epoch": 0.1513287942016253, + "grad_norm": 2.23492169380188, + "learning_rate": 7.557628979143797e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6830579042434692, + "num_tokens": 34641443.0, + "step": 1378 + }, + { + "epoch": 0.15143861190423896, + "grad_norm": 2.4257404804229736, + "learning_rate": 7.563117453347969e-07, + "loss": 1.0727, + "mean_token_accuracy": 0.6832407712936401, + "num_tokens": 34664731.0, + "step": 1379 + }, + { + "epoch": 0.15154842960685264, + "grad_norm": 2.125270366668701, + "learning_rate": 7.568605927552139e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6852811574935913, + "num_tokens": 34692355.0, + "step": 1380 + }, + { + "epoch": 0.15165824730946628, + "grad_norm": 2.4305357933044434, + "learning_rate": 7.574094401756311e-07, + "loss": 0.9726, + "mean_token_accuracy": 0.7049511671066284, + "num_tokens": 34716222.0, + "step": 1381 + }, + { + "epoch": 0.15176806501207996, + "grad_norm": 2.384218215942383, + "learning_rate": 7.579582875960483e-07, + "loss": 1.1328, + "mean_token_accuracy": 0.6607423424720764, + "num_tokens": 34739641.0, + "step": 1382 + }, + { + "epoch": 0.1518778827146936, + "grad_norm": 2.3021738529205322, + "learning_rate": 7.585071350164653e-07, + "loss": 1.081, + "mean_token_accuracy": 0.6772789359092712, + "num_tokens": 34767433.0, + "step": 1383 + }, + { + "epoch": 0.15198770041730728, + "grad_norm": 2.2204554080963135, + "learning_rate": 7.590559824368825e-07, + "loss": 0.9675, + "mean_token_accuracy": 0.7099024653434753, + "num_tokens": 34791963.0, + "step": 1384 + }, + { + "epoch": 0.15209751811992092, + "grad_norm": 2.137282371520996, + "learning_rate": 7.596048298572997e-07, + "loss": 1.1162, + "mean_token_accuracy": 0.6683439016342163, + "num_tokens": 34821570.0, + "step": 1385 + }, + { + "epoch": 0.1522073358225346, + "grad_norm": 2.467531204223633, + "learning_rate": 7.601536772777167e-07, + "loss": 0.9931, + "mean_token_accuracy": 0.7006394863128662, + "num_tokens": 34846249.0, + "step": 1386 + }, + { + "epoch": 0.15231715352514824, + "grad_norm": 2.188359022140503, + "learning_rate": 7.607025246981339e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6778285503387451, + "num_tokens": 34874058.0, + "step": 1387 + }, + { + "epoch": 0.15242697122776191, + "grad_norm": 2.4405858516693115, + "learning_rate": 7.612513721185511e-07, + "loss": 1.0118, + "mean_token_accuracy": 0.7091536521911621, + "num_tokens": 34896273.0, + "step": 1388 + }, + { + "epoch": 0.1525367889303756, + "grad_norm": 2.483816385269165, + "learning_rate": 7.618002195389681e-07, + "loss": 1.0633, + "mean_token_accuracy": 0.6884913444519043, + "num_tokens": 34920322.0, + "step": 1389 + }, + { + "epoch": 0.15264660663298923, + "grad_norm": 2.2812490463256836, + "learning_rate": 7.623490669593853e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6862673759460449, + "num_tokens": 34948170.0, + "step": 1390 + }, + { + "epoch": 0.1527564243356029, + "grad_norm": 2.427373170852661, + "learning_rate": 7.628979143798024e-07, + "loss": 1.0217, + "mean_token_accuracy": 0.6915424466133118, + "num_tokens": 34971679.0, + "step": 1391 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 2.464067220687866, + "learning_rate": 7.634467618002195e-07, + "loss": 1.0548, + "mean_token_accuracy": 0.68646240234375, + "num_tokens": 34997431.0, + "step": 1392 + }, + { + "epoch": 0.15297605974083023, + "grad_norm": 2.437122344970703, + "learning_rate": 7.639956092206367e-07, + "loss": 1.0592, + "mean_token_accuracy": 0.6845401525497437, + "num_tokens": 35021289.0, + "step": 1393 + }, + { + "epoch": 0.15308587744344387, + "grad_norm": 2.3534271717071533, + "learning_rate": 7.645444566410538e-07, + "loss": 1.0911, + "mean_token_accuracy": 0.6740949153900146, + "num_tokens": 35046869.0, + "step": 1394 + }, + { + "epoch": 0.15319569514605755, + "grad_norm": 2.2532060146331787, + "learning_rate": 7.650933040614709e-07, + "loss": 0.9518, + "mean_token_accuracy": 0.7240962386131287, + "num_tokens": 35071764.0, + "step": 1395 + }, + { + "epoch": 0.1533055128486712, + "grad_norm": 2.222245931625366, + "learning_rate": 7.656421514818881e-07, + "loss": 1.048, + "mean_token_accuracy": 0.6884398460388184, + "num_tokens": 35099149.0, + "step": 1396 + }, + { + "epoch": 0.15341533055128487, + "grad_norm": 2.3091394901275635, + "learning_rate": 7.66190998902305e-07, + "loss": 1.0144, + "mean_token_accuracy": 0.7005404829978943, + "num_tokens": 35125718.0, + "step": 1397 + }, + { + "epoch": 0.15352514825389854, + "grad_norm": 2.044922113418579, + "learning_rate": 7.667398463227222e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6882592439651489, + "num_tokens": 35155960.0, + "step": 1398 + }, + { + "epoch": 0.1536349659565122, + "grad_norm": 2.2034683227539062, + "learning_rate": 7.672886937431394e-07, + "loss": 1.1305, + "mean_token_accuracy": 0.6636358499526978, + "num_tokens": 35184543.0, + "step": 1399 + }, + { + "epoch": 0.15374478365912586, + "grad_norm": 2.5525894165039062, + "learning_rate": 7.678375411635564e-07, + "loss": 1.0441, + "mean_token_accuracy": 0.6898270845413208, + "num_tokens": 35206573.0, + "step": 1400 + }, + { + "epoch": 0.1538546013617395, + "grad_norm": 2.4209859371185303, + "learning_rate": 7.683863885839736e-07, + "loss": 0.997, + "mean_token_accuracy": 0.7014549970626831, + "num_tokens": 35229089.0, + "step": 1401 + }, + { + "epoch": 0.15396441906435318, + "grad_norm": 2.0957016944885254, + "learning_rate": 7.689352360043907e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6844193935394287, + "num_tokens": 35257930.0, + "step": 1402 + }, + { + "epoch": 0.15407423676696683, + "grad_norm": 2.384202480316162, + "learning_rate": 7.694840834248078e-07, + "loss": 1.052, + "mean_token_accuracy": 0.6906245350837708, + "num_tokens": 35280602.0, + "step": 1403 + }, + { + "epoch": 0.1541840544695805, + "grad_norm": 2.5037338733673096, + "learning_rate": 7.70032930845225e-07, + "loss": 1.118, + "mean_token_accuracy": 0.6679539680480957, + "num_tokens": 35302630.0, + "step": 1404 + }, + { + "epoch": 0.15429387217219415, + "grad_norm": 2.1926400661468506, + "learning_rate": 7.705817782656421e-07, + "loss": 0.9673, + "mean_token_accuracy": 0.7076467275619507, + "num_tokens": 35328760.0, + "step": 1405 + }, + { + "epoch": 0.15440368987480782, + "grad_norm": 2.505310535430908, + "learning_rate": 7.711306256860592e-07, + "loss": 1.0333, + "mean_token_accuracy": 0.6907936334609985, + "num_tokens": 35350856.0, + "step": 1406 + }, + { + "epoch": 0.1545135075774215, + "grad_norm": 2.7119786739349365, + "learning_rate": 7.716794731064764e-07, + "loss": 1.0372, + "mean_token_accuracy": 0.6881084442138672, + "num_tokens": 35372189.0, + "step": 1407 + }, + { + "epoch": 0.15462332528003514, + "grad_norm": 2.588273525238037, + "learning_rate": 7.722283205268935e-07, + "loss": 1.007, + "mean_token_accuracy": 0.7004454731941223, + "num_tokens": 35394128.0, + "step": 1408 + }, + { + "epoch": 0.1547331429826488, + "grad_norm": 2.4407405853271484, + "learning_rate": 7.727771679473106e-07, + "loss": 1.0399, + "mean_token_accuracy": 0.6878803968429565, + "num_tokens": 35418448.0, + "step": 1409 + }, + { + "epoch": 0.15484296068526246, + "grad_norm": 2.3043711185455322, + "learning_rate": 7.733260153677278e-07, + "loss": 1.1229, + "mean_token_accuracy": 0.6770443916320801, + "num_tokens": 35444830.0, + "step": 1410 + }, + { + "epoch": 0.15495277838787613, + "grad_norm": 2.496854782104492, + "learning_rate": 7.738748627881449e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.692380428314209, + "num_tokens": 35467680.0, + "step": 1411 + }, + { + "epoch": 0.15506259609048978, + "grad_norm": 2.2586328983306885, + "learning_rate": 7.74423710208562e-07, + "loss": 1.0261, + "mean_token_accuracy": 0.7041636109352112, + "num_tokens": 35493769.0, + "step": 1412 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 2.1798765659332275, + "learning_rate": 7.749725576289791e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6789826154708862, + "num_tokens": 35520089.0, + "step": 1413 + }, + { + "epoch": 0.1552822314957171, + "grad_norm": 2.423844337463379, + "learning_rate": 7.755214050493963e-07, + "loss": 1.0487, + "mean_token_accuracy": 0.6873781681060791, + "num_tokens": 35541958.0, + "step": 1414 + }, + { + "epoch": 0.15539204919833077, + "grad_norm": 2.176468849182129, + "learning_rate": 7.760702524698134e-07, + "loss": 0.9411, + "mean_token_accuracy": 0.7126966714859009, + "num_tokens": 35566944.0, + "step": 1415 + }, + { + "epoch": 0.15550186690094442, + "grad_norm": 2.3042197227478027, + "learning_rate": 7.766190998902304e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.7051127552986145, + "num_tokens": 35592239.0, + "step": 1416 + }, + { + "epoch": 0.1556116846035581, + "grad_norm": 2.273038625717163, + "learning_rate": 7.771679473106476e-07, + "loss": 0.9757, + "mean_token_accuracy": 0.7090061902999878, + "num_tokens": 35615909.0, + "step": 1417 + }, + { + "epoch": 0.15572150230617177, + "grad_norm": 2.3548507690429688, + "learning_rate": 7.777167947310647e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.697195291519165, + "num_tokens": 35640403.0, + "step": 1418 + }, + { + "epoch": 0.1558313200087854, + "grad_norm": 2.35784912109375, + "learning_rate": 7.782656421514818e-07, + "loss": 0.9973, + "mean_token_accuracy": 0.7071335315704346, + "num_tokens": 35662902.0, + "step": 1419 + }, + { + "epoch": 0.15594113771139909, + "grad_norm": 2.0391035079956055, + "learning_rate": 7.78814489571899e-07, + "loss": 1.1109, + "mean_token_accuracy": 0.6730285882949829, + "num_tokens": 35697609.0, + "step": 1420 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 2.1730892658233643, + "learning_rate": 7.793633369923161e-07, + "loss": 1.0761, + "mean_token_accuracy": 0.6796181201934814, + "num_tokens": 35727112.0, + "step": 1421 + }, + { + "epoch": 0.1561607731166264, + "grad_norm": 2.546123743057251, + "learning_rate": 7.799121844127332e-07, + "loss": 1.1326, + "mean_token_accuracy": 0.6650257110595703, + "num_tokens": 35750133.0, + "step": 1422 + }, + { + "epoch": 0.15627059081924005, + "grad_norm": 2.185746908187866, + "learning_rate": 7.804610318331504e-07, + "loss": 1.1377, + "mean_token_accuracy": 0.6618881225585938, + "num_tokens": 35780030.0, + "step": 1423 + }, + { + "epoch": 0.15638040852185373, + "grad_norm": 2.446215867996216, + "learning_rate": 7.810098792535674e-07, + "loss": 1.0111, + "mean_token_accuracy": 0.6961355209350586, + "num_tokens": 35803663.0, + "step": 1424 + }, + { + "epoch": 0.15649022622446737, + "grad_norm": 2.4050965309143066, + "learning_rate": 7.815587266739846e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.6971710324287415, + "num_tokens": 35827781.0, + "step": 1425 + }, + { + "epoch": 0.15660004392708105, + "grad_norm": 2.3654873371124268, + "learning_rate": 7.821075740944018e-07, + "loss": 1.0306, + "mean_token_accuracy": 0.6963926553726196, + "num_tokens": 35852347.0, + "step": 1426 + }, + { + "epoch": 0.15670986162969472, + "grad_norm": 2.0755627155303955, + "learning_rate": 7.826564215148188e-07, + "loss": 1.0913, + "mean_token_accuracy": 0.6808547377586365, + "num_tokens": 35884406.0, + "step": 1427 + }, + { + "epoch": 0.15681967933230836, + "grad_norm": 2.4201314449310303, + "learning_rate": 7.83205268935236e-07, + "loss": 1.0819, + "mean_token_accuracy": 0.6835185289382935, + "num_tokens": 35908441.0, + "step": 1428 + }, + { + "epoch": 0.15692949703492204, + "grad_norm": 2.186647891998291, + "learning_rate": 7.837541163556532e-07, + "loss": 1.0073, + "mean_token_accuracy": 0.698293149471283, + "num_tokens": 35935928.0, + "step": 1429 + }, + { + "epoch": 0.15703931473753568, + "grad_norm": 2.184596061706543, + "learning_rate": 7.843029637760702e-07, + "loss": 0.9744, + "mean_token_accuracy": 0.7184908390045166, + "num_tokens": 35962201.0, + "step": 1430 + }, + { + "epoch": 0.15714913244014936, + "grad_norm": 2.467151641845703, + "learning_rate": 7.848518111964874e-07, + "loss": 0.9029, + "mean_token_accuracy": 0.7228104472160339, + "num_tokens": 35982234.0, + "step": 1431 + }, + { + "epoch": 0.157258950142763, + "grad_norm": 2.553513288497925, + "learning_rate": 7.854006586169045e-07, + "loss": 1.0294, + "mean_token_accuracy": 0.6892896890640259, + "num_tokens": 36003530.0, + "step": 1432 + }, + { + "epoch": 0.15736876784537668, + "grad_norm": 2.572268009185791, + "learning_rate": 7.859495060373216e-07, + "loss": 1.111, + "mean_token_accuracy": 0.6747630834579468, + "num_tokens": 36027317.0, + "step": 1433 + }, + { + "epoch": 0.15747858554799032, + "grad_norm": 2.274662733078003, + "learning_rate": 7.864983534577388e-07, + "loss": 0.9659, + "mean_token_accuracy": 0.709114670753479, + "num_tokens": 36052762.0, + "step": 1434 + }, + { + "epoch": 0.157588403250604, + "grad_norm": 2.5270211696624756, + "learning_rate": 7.870472008781557e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.6966100931167603, + "num_tokens": 36076813.0, + "step": 1435 + }, + { + "epoch": 0.15769822095321767, + "grad_norm": 2.4193320274353027, + "learning_rate": 7.875960482985729e-07, + "loss": 0.943, + "mean_token_accuracy": 0.7200340628623962, + "num_tokens": 36100537.0, + "step": 1436 + }, + { + "epoch": 0.15780803865583132, + "grad_norm": 2.2673754692077637, + "learning_rate": 7.881448957189901e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6712539196014404, + "num_tokens": 36128957.0, + "step": 1437 + }, + { + "epoch": 0.157917856358445, + "grad_norm": 2.171753168106079, + "learning_rate": 7.886937431394071e-07, + "loss": 1.083, + "mean_token_accuracy": 0.6741511225700378, + "num_tokens": 36157890.0, + "step": 1438 + }, + { + "epoch": 0.15802767406105864, + "grad_norm": 2.434563636779785, + "learning_rate": 7.892425905598243e-07, + "loss": 0.9915, + "mean_token_accuracy": 0.6975425481796265, + "num_tokens": 36181636.0, + "step": 1439 + }, + { + "epoch": 0.1581374917636723, + "grad_norm": 2.472973346710205, + "learning_rate": 7.897914379802415e-07, + "loss": 0.9359, + "mean_token_accuracy": 0.7159822583198547, + "num_tokens": 36204913.0, + "step": 1440 + }, + { + "epoch": 0.15824730946628596, + "grad_norm": 2.3523123264312744, + "learning_rate": 7.903402854006585e-07, + "loss": 0.9751, + "mean_token_accuracy": 0.7070481777191162, + "num_tokens": 36229503.0, + "step": 1441 + }, + { + "epoch": 0.15835712716889963, + "grad_norm": 2.527815341949463, + "learning_rate": 7.908891328210757e-07, + "loss": 1.0615, + "mean_token_accuracy": 0.6881087422370911, + "num_tokens": 36253279.0, + "step": 1442 + }, + { + "epoch": 0.15846694487151328, + "grad_norm": 2.5256996154785156, + "learning_rate": 7.914379802414928e-07, + "loss": 0.9194, + "mean_token_accuracy": 0.7186449766159058, + "num_tokens": 36274981.0, + "step": 1443 + }, + { + "epoch": 0.15857676257412695, + "grad_norm": 2.179611921310425, + "learning_rate": 7.919868276619099e-07, + "loss": 0.9444, + "mean_token_accuracy": 0.7103243470191956, + "num_tokens": 36304133.0, + "step": 1444 + }, + { + "epoch": 0.15868658027674062, + "grad_norm": 2.3028581142425537, + "learning_rate": 7.925356750823271e-07, + "loss": 1.0413, + "mean_token_accuracy": 0.6957898736000061, + "num_tokens": 36328496.0, + "step": 1445 + }, + { + "epoch": 0.15879639797935427, + "grad_norm": 1.927564263343811, + "learning_rate": 7.930845225027442e-07, + "loss": 0.9824, + "mean_token_accuracy": 0.7000219225883484, + "num_tokens": 36359548.0, + "step": 1446 + }, + { + "epoch": 0.15890621568196794, + "grad_norm": 2.227750062942505, + "learning_rate": 7.936333699231613e-07, + "loss": 1.0914, + "mean_token_accuracy": 0.6756489276885986, + "num_tokens": 36387398.0, + "step": 1447 + }, + { + "epoch": 0.1590160333845816, + "grad_norm": 2.5606017112731934, + "learning_rate": 7.941822173435785e-07, + "loss": 0.9985, + "mean_token_accuracy": 0.7013879418373108, + "num_tokens": 36409357.0, + "step": 1448 + }, + { + "epoch": 0.15912585108719526, + "grad_norm": 2.3935539722442627, + "learning_rate": 7.947310647639956e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6931943893432617, + "num_tokens": 36432340.0, + "step": 1449 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 2.2555043697357178, + "learning_rate": 7.952799121844127e-07, + "loss": 0.9669, + "mean_token_accuracy": 0.7083638906478882, + "num_tokens": 36456921.0, + "step": 1450 + }, + { + "epoch": 0.15934548649242258, + "grad_norm": 2.4643945693969727, + "learning_rate": 7.958287596048299e-07, + "loss": 1.0196, + "mean_token_accuracy": 0.6894985437393188, + "num_tokens": 36479664.0, + "step": 1451 + }, + { + "epoch": 0.15945530419503623, + "grad_norm": 2.091534376144409, + "learning_rate": 7.96377607025247e-07, + "loss": 0.9656, + "mean_token_accuracy": 0.7098536491394043, + "num_tokens": 36507903.0, + "step": 1452 + }, + { + "epoch": 0.1595651218976499, + "grad_norm": 2.3005433082580566, + "learning_rate": 7.969264544456641e-07, + "loss": 1.0576, + "mean_token_accuracy": 0.6894838809967041, + "num_tokens": 36531454.0, + "step": 1453 + }, + { + "epoch": 0.15967493960026355, + "grad_norm": 2.6075642108917236, + "learning_rate": 7.974753018660811e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.6684011220932007, + "num_tokens": 36554438.0, + "step": 1454 + }, + { + "epoch": 0.15978475730287722, + "grad_norm": 2.0977859497070312, + "learning_rate": 7.980241492864983e-07, + "loss": 1.051, + "mean_token_accuracy": 0.6937994360923767, + "num_tokens": 36582499.0, + "step": 1455 + }, + { + "epoch": 0.1598945750054909, + "grad_norm": 2.2791993618011475, + "learning_rate": 7.985729967069154e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.6870162487030029, + "num_tokens": 36610425.0, + "step": 1456 + }, + { + "epoch": 0.16000439270810454, + "grad_norm": 2.4897663593292236, + "learning_rate": 7.991218441273325e-07, + "loss": 1.0032, + "mean_token_accuracy": 0.6930490732192993, + "num_tokens": 36633282.0, + "step": 1457 + }, + { + "epoch": 0.16011421041071822, + "grad_norm": 2.5141868591308594, + "learning_rate": 7.996706915477497e-07, + "loss": 1.0066, + "mean_token_accuracy": 0.7051894068717957, + "num_tokens": 36654778.0, + "step": 1458 + }, + { + "epoch": 0.16022402811333186, + "grad_norm": 2.159623146057129, + "learning_rate": 8.002195389681668e-07, + "loss": 1.0084, + "mean_token_accuracy": 0.6984792947769165, + "num_tokens": 36682913.0, + "step": 1459 + }, + { + "epoch": 0.16033384581594554, + "grad_norm": 2.3418264389038086, + "learning_rate": 8.007683863885839e-07, + "loss": 1.0709, + "mean_token_accuracy": 0.6885960102081299, + "num_tokens": 36709006.0, + "step": 1460 + }, + { + "epoch": 0.16044366351855918, + "grad_norm": 2.152913808822632, + "learning_rate": 8.013172338090011e-07, + "loss": 0.969, + "mean_token_accuracy": 0.7139444351196289, + "num_tokens": 36736093.0, + "step": 1461 + }, + { + "epoch": 0.16055348122117286, + "grad_norm": 2.6763551235198975, + "learning_rate": 8.018660812294182e-07, + "loss": 1.0851, + "mean_token_accuracy": 0.6880163550376892, + "num_tokens": 36756205.0, + "step": 1462 + }, + { + "epoch": 0.1606632989237865, + "grad_norm": 2.20117449760437, + "learning_rate": 8.024149286498353e-07, + "loss": 0.9913, + "mean_token_accuracy": 0.6987872123718262, + "num_tokens": 36784892.0, + "step": 1463 + }, + { + "epoch": 0.16077311662640018, + "grad_norm": 2.2680141925811768, + "learning_rate": 8.029637760702525e-07, + "loss": 1.0276, + "mean_token_accuracy": 0.6972304582595825, + "num_tokens": 36812059.0, + "step": 1464 + }, + { + "epoch": 0.16088293432901385, + "grad_norm": 2.2305428981781006, + "learning_rate": 8.035126234906695e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6867828965187073, + "num_tokens": 36839978.0, + "step": 1465 + }, + { + "epoch": 0.1609927520316275, + "grad_norm": 2.195760488510132, + "learning_rate": 8.040614709110867e-07, + "loss": 0.9947, + "mean_token_accuracy": 0.7079178094863892, + "num_tokens": 36869033.0, + "step": 1466 + }, + { + "epoch": 0.16110256973424117, + "grad_norm": 2.2870571613311768, + "learning_rate": 8.046103183315039e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7064690589904785, + "num_tokens": 36894935.0, + "step": 1467 + }, + { + "epoch": 0.16121238743685481, + "grad_norm": 2.359522819519043, + "learning_rate": 8.051591657519209e-07, + "loss": 0.9932, + "mean_token_accuracy": 0.7017046213150024, + "num_tokens": 36919223.0, + "step": 1468 + }, + { + "epoch": 0.1613222051394685, + "grad_norm": 2.2246084213256836, + "learning_rate": 8.057080131723381e-07, + "loss": 1.1449, + "mean_token_accuracy": 0.6590299606323242, + "num_tokens": 36948685.0, + "step": 1469 + }, + { + "epoch": 0.16143202284208213, + "grad_norm": 2.438646078109741, + "learning_rate": 8.062568605927553e-07, + "loss": 0.9949, + "mean_token_accuracy": 0.7021481990814209, + "num_tokens": 36971919.0, + "step": 1470 + }, + { + "epoch": 0.1615418405446958, + "grad_norm": 2.4661824703216553, + "learning_rate": 8.068057080131723e-07, + "loss": 0.9827, + "mean_token_accuracy": 0.7029803395271301, + "num_tokens": 36992833.0, + "step": 1471 + }, + { + "epoch": 0.16165165824730945, + "grad_norm": 2.5369837284088135, + "learning_rate": 8.073545554335894e-07, + "loss": 0.9856, + "mean_token_accuracy": 0.7057191729545593, + "num_tokens": 37013769.0, + "step": 1472 + }, + { + "epoch": 0.16176147594992313, + "grad_norm": 2.187673330307007, + "learning_rate": 8.079034028540066e-07, + "loss": 1.123, + "mean_token_accuracy": 0.6681352853775024, + "num_tokens": 37045007.0, + "step": 1473 + }, + { + "epoch": 0.1618712936525368, + "grad_norm": 2.373729944229126, + "learning_rate": 8.084522502744236e-07, + "loss": 0.9965, + "mean_token_accuracy": 0.7033218145370483, + "num_tokens": 37070246.0, + "step": 1474 + }, + { + "epoch": 0.16198111135515045, + "grad_norm": 2.083935499191284, + "learning_rate": 8.090010976948408e-07, + "loss": 1.1195, + "mean_token_accuracy": 0.6792782545089722, + "num_tokens": 37101592.0, + "step": 1475 + }, + { + "epoch": 0.16209092905776412, + "grad_norm": 2.395087718963623, + "learning_rate": 8.095499451152578e-07, + "loss": 1.1099, + "mean_token_accuracy": 0.6747811436653137, + "num_tokens": 37125624.0, + "step": 1476 + }, + { + "epoch": 0.16220074676037777, + "grad_norm": 2.655198812484741, + "learning_rate": 8.10098792535675e-07, + "loss": 0.9376, + "mean_token_accuracy": 0.7172510623931885, + "num_tokens": 37145814.0, + "step": 1477 + }, + { + "epoch": 0.16231056446299144, + "grad_norm": 2.6695785522460938, + "learning_rate": 8.106476399560922e-07, + "loss": 1.0221, + "mean_token_accuracy": 0.6956014633178711, + "num_tokens": 37166445.0, + "step": 1478 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 2.2587270736694336, + "learning_rate": 8.111964873765092e-07, + "loss": 1.0198, + "mean_token_accuracy": 0.694007158279419, + "num_tokens": 37191362.0, + "step": 1479 + }, + { + "epoch": 0.16253019986821876, + "grad_norm": 2.7259762287139893, + "learning_rate": 8.117453347969264e-07, + "loss": 0.9515, + "mean_token_accuracy": 0.7101019024848938, + "num_tokens": 37211204.0, + "step": 1480 + }, + { + "epoch": 0.1626400175708324, + "grad_norm": 2.343792200088501, + "learning_rate": 8.122941822173436e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.6977241039276123, + "num_tokens": 37235772.0, + "step": 1481 + }, + { + "epoch": 0.16274983527344608, + "grad_norm": 2.076993703842163, + "learning_rate": 8.128430296377606e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6675161123275757, + "num_tokens": 37268119.0, + "step": 1482 + }, + { + "epoch": 0.16285965297605975, + "grad_norm": 2.2484405040740967, + "learning_rate": 8.133918770581778e-07, + "loss": 0.962, + "mean_token_accuracy": 0.7090801000595093, + "num_tokens": 37294460.0, + "step": 1483 + }, + { + "epoch": 0.1629694706786734, + "grad_norm": 2.3163836002349854, + "learning_rate": 8.13940724478595e-07, + "loss": 0.9872, + "mean_token_accuracy": 0.7005183696746826, + "num_tokens": 37319397.0, + "step": 1484 + }, + { + "epoch": 0.16307928838128707, + "grad_norm": 2.451967716217041, + "learning_rate": 8.14489571899012e-07, + "loss": 1.047, + "mean_token_accuracy": 0.6913277506828308, + "num_tokens": 37342850.0, + "step": 1485 + }, + { + "epoch": 0.16318910608390072, + "grad_norm": 2.4477791786193848, + "learning_rate": 8.150384193194292e-07, + "loss": 1.0953, + "mean_token_accuracy": 0.6829548478126526, + "num_tokens": 37367420.0, + "step": 1486 + }, + { + "epoch": 0.1632989237865144, + "grad_norm": 2.264281988143921, + "learning_rate": 8.155872667398463e-07, + "loss": 1.1278, + "mean_token_accuracy": 0.6848916411399841, + "num_tokens": 37394024.0, + "step": 1487 + }, + { + "epoch": 0.16340874148912804, + "grad_norm": 2.9826784133911133, + "learning_rate": 8.161361141602634e-07, + "loss": 0.8683, + "mean_token_accuracy": 0.7326562404632568, + "num_tokens": 37411035.0, + "step": 1488 + }, + { + "epoch": 0.1635185591917417, + "grad_norm": 2.4763424396514893, + "learning_rate": 8.166849615806806e-07, + "loss": 1.069, + "mean_token_accuracy": 0.6813057661056519, + "num_tokens": 37435801.0, + "step": 1489 + }, + { + "epoch": 0.16362837689435536, + "grad_norm": 2.252890110015869, + "learning_rate": 8.172338090010977e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6781739592552185, + "num_tokens": 37466733.0, + "step": 1490 + }, + { + "epoch": 0.16373819459696903, + "grad_norm": 2.4702558517456055, + "learning_rate": 8.177826564215147e-07, + "loss": 1.0813, + "mean_token_accuracy": 0.6913646459579468, + "num_tokens": 37489178.0, + "step": 1491 + }, + { + "epoch": 0.16384801229958268, + "grad_norm": 2.245356321334839, + "learning_rate": 8.183315038419319e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.7003697156906128, + "num_tokens": 37515494.0, + "step": 1492 + }, + { + "epoch": 0.16395783000219635, + "grad_norm": 2.552849531173706, + "learning_rate": 8.18880351262349e-07, + "loss": 0.9015, + "mean_token_accuracy": 0.7234364748001099, + "num_tokens": 37535503.0, + "step": 1493 + }, + { + "epoch": 0.16406764770481003, + "grad_norm": 2.4821979999542236, + "learning_rate": 8.194291986827661e-07, + "loss": 1.0532, + "mean_token_accuracy": 0.6908234357833862, + "num_tokens": 37560172.0, + "step": 1494 + }, + { + "epoch": 0.16417746540742367, + "grad_norm": 2.653841018676758, + "learning_rate": 8.199780461031833e-07, + "loss": 0.9537, + "mean_token_accuracy": 0.710858941078186, + "num_tokens": 37582483.0, + "step": 1495 + }, + { + "epoch": 0.16428728311003735, + "grad_norm": 2.1300718784332275, + "learning_rate": 8.205268935236004e-07, + "loss": 1.0059, + "mean_token_accuracy": 0.6962745189666748, + "num_tokens": 37610662.0, + "step": 1496 + }, + { + "epoch": 0.164397100812651, + "grad_norm": 2.723930835723877, + "learning_rate": 8.210757409440175e-07, + "loss": 1.0837, + "mean_token_accuracy": 0.6778481006622314, + "num_tokens": 37632017.0, + "step": 1497 + }, + { + "epoch": 0.16450691851526467, + "grad_norm": 2.4895870685577393, + "learning_rate": 8.216245883644346e-07, + "loss": 1.0565, + "mean_token_accuracy": 0.6892681121826172, + "num_tokens": 37654055.0, + "step": 1498 + }, + { + "epoch": 0.1646167362178783, + "grad_norm": 2.411417245864868, + "learning_rate": 8.221734357848518e-07, + "loss": 1.0788, + "mean_token_accuracy": 0.6865249872207642, + "num_tokens": 37678977.0, + "step": 1499 + }, + { + "epoch": 0.16472655392049199, + "grad_norm": 2.3076436519622803, + "learning_rate": 8.227222832052689e-07, + "loss": 0.9592, + "mean_token_accuracy": 0.7174220681190491, + "num_tokens": 37702127.0, + "step": 1500 + }, + { + "epoch": 0.16483637162310563, + "grad_norm": 2.237382650375366, + "learning_rate": 8.23271130625686e-07, + "loss": 1.024, + "mean_token_accuracy": 0.6973133087158203, + "num_tokens": 37728185.0, + "step": 1501 + }, + { + "epoch": 0.1649461893257193, + "grad_norm": 2.4270365238189697, + "learning_rate": 8.238199780461032e-07, + "loss": 0.9511, + "mean_token_accuracy": 0.7169910669326782, + "num_tokens": 37750812.0, + "step": 1502 + }, + { + "epoch": 0.16505600702833298, + "grad_norm": 2.374197483062744, + "learning_rate": 8.243688254665203e-07, + "loss": 0.8405, + "mean_token_accuracy": 0.7355517148971558, + "num_tokens": 37771178.0, + "step": 1503 + }, + { + "epoch": 0.16516582473094663, + "grad_norm": 2.4464876651763916, + "learning_rate": 8.249176728869374e-07, + "loss": 0.9646, + "mean_token_accuracy": 0.7192834615707397, + "num_tokens": 37792790.0, + "step": 1504 + }, + { + "epoch": 0.1652756424335603, + "grad_norm": 2.4594950675964355, + "learning_rate": 8.254665203073546e-07, + "loss": 0.9396, + "mean_token_accuracy": 0.7163498401641846, + "num_tokens": 37815124.0, + "step": 1505 + }, + { + "epoch": 0.16538546013617395, + "grad_norm": 2.4215686321258545, + "learning_rate": 8.260153677277717e-07, + "loss": 0.9631, + "mean_token_accuracy": 0.7084594964981079, + "num_tokens": 37836819.0, + "step": 1506 + }, + { + "epoch": 0.16549527783878762, + "grad_norm": 2.5831611156463623, + "learning_rate": 8.265642151481888e-07, + "loss": 0.998, + "mean_token_accuracy": 0.6972143054008484, + "num_tokens": 37857753.0, + "step": 1507 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 2.424161195755005, + "learning_rate": 8.27113062568606e-07, + "loss": 0.9529, + "mean_token_accuracy": 0.7112107276916504, + "num_tokens": 37880239.0, + "step": 1508 + }, + { + "epoch": 0.16571491324401494, + "grad_norm": 2.5259854793548584, + "learning_rate": 8.27661909989023e-07, + "loss": 1.0437, + "mean_token_accuracy": 0.6992644667625427, + "num_tokens": 37902611.0, + "step": 1509 + }, + { + "epoch": 0.16582473094662858, + "grad_norm": 2.426912546157837, + "learning_rate": 8.282107574094401e-07, + "loss": 1.0326, + "mean_token_accuracy": 0.6870989203453064, + "num_tokens": 37925571.0, + "step": 1510 + }, + { + "epoch": 0.16593454864924226, + "grad_norm": 2.4511501789093018, + "learning_rate": 8.287596048298572e-07, + "loss": 1.005, + "mean_token_accuracy": 0.695046067237854, + "num_tokens": 37949305.0, + "step": 1511 + }, + { + "epoch": 0.16604436635185593, + "grad_norm": 2.250136137008667, + "learning_rate": 8.293084522502743e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.6897773146629333, + "num_tokens": 37973599.0, + "step": 1512 + }, + { + "epoch": 0.16615418405446958, + "grad_norm": 2.3653528690338135, + "learning_rate": 8.298572996706915e-07, + "loss": 1.0115, + "mean_token_accuracy": 0.7071911096572876, + "num_tokens": 37997836.0, + "step": 1513 + }, + { + "epoch": 0.16626400175708325, + "grad_norm": 2.2556588649749756, + "learning_rate": 8.304061470911086e-07, + "loss": 1.101, + "mean_token_accuracy": 0.6716217994689941, + "num_tokens": 38024163.0, + "step": 1514 + }, + { + "epoch": 0.1663738194596969, + "grad_norm": 2.667789936065674, + "learning_rate": 8.309549945115257e-07, + "loss": 0.9674, + "mean_token_accuracy": 0.7166512608528137, + "num_tokens": 38043898.0, + "step": 1515 + }, + { + "epoch": 0.16648363716231057, + "grad_norm": 2.2065653800964355, + "learning_rate": 8.315038419319429e-07, + "loss": 0.8996, + "mean_token_accuracy": 0.7329157590866089, + "num_tokens": 38068770.0, + "step": 1516 + }, + { + "epoch": 0.16659345486492422, + "grad_norm": 2.075514554977417, + "learning_rate": 8.3205268935236e-07, + "loss": 1.0162, + "mean_token_accuracy": 0.7016498446464539, + "num_tokens": 38098192.0, + "step": 1517 + }, + { + "epoch": 0.1667032725675379, + "grad_norm": 2.7059082984924316, + "learning_rate": 8.326015367727771e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.675512433052063, + "num_tokens": 38118224.0, + "step": 1518 + }, + { + "epoch": 0.16681309027015154, + "grad_norm": 2.3970491886138916, + "learning_rate": 8.331503841931943e-07, + "loss": 1.089, + "mean_token_accuracy": 0.6795108914375305, + "num_tokens": 38142152.0, + "step": 1519 + }, + { + "epoch": 0.1669229079727652, + "grad_norm": 2.2711329460144043, + "learning_rate": 8.336992316136113e-07, + "loss": 1.0201, + "mean_token_accuracy": 0.6960911154747009, + "num_tokens": 38167803.0, + "step": 1520 + }, + { + "epoch": 0.16703272567537888, + "grad_norm": 2.716015577316284, + "learning_rate": 8.342480790340285e-07, + "loss": 0.9654, + "mean_token_accuracy": 0.7121863961219788, + "num_tokens": 38186777.0, + "step": 1521 + }, + { + "epoch": 0.16714254337799253, + "grad_norm": 2.253830671310425, + "learning_rate": 8.347969264544457e-07, + "loss": 1.0534, + "mean_token_accuracy": 0.6855176687240601, + "num_tokens": 38213394.0, + "step": 1522 + }, + { + "epoch": 0.1672523610806062, + "grad_norm": 2.291825532913208, + "learning_rate": 8.353457738748627e-07, + "loss": 1.0202, + "mean_token_accuracy": 0.6988116502761841, + "num_tokens": 38237781.0, + "step": 1523 + }, + { + "epoch": 0.16736217878321985, + "grad_norm": 2.2136008739471436, + "learning_rate": 8.358946212952799e-07, + "loss": 0.9975, + "mean_token_accuracy": 0.7098481059074402, + "num_tokens": 38264251.0, + "step": 1524 + }, + { + "epoch": 0.16747199648583352, + "grad_norm": 2.0565218925476074, + "learning_rate": 8.364434687156971e-07, + "loss": 1.0531, + "mean_token_accuracy": 0.6846892833709717, + "num_tokens": 38295736.0, + "step": 1525 + }, + { + "epoch": 0.16758181418844717, + "grad_norm": 2.3099026679992676, + "learning_rate": 8.369923161361141e-07, + "loss": 1.0152, + "mean_token_accuracy": 0.6984436511993408, + "num_tokens": 38319627.0, + "step": 1526 + }, + { + "epoch": 0.16769163189106084, + "grad_norm": 2.2496092319488525, + "learning_rate": 8.375411635565313e-07, + "loss": 1.015, + "mean_token_accuracy": 0.6956290602684021, + "num_tokens": 38345949.0, + "step": 1527 + }, + { + "epoch": 0.1678014495936745, + "grad_norm": 2.3123838901519775, + "learning_rate": 8.380900109769485e-07, + "loss": 0.9867, + "mean_token_accuracy": 0.7068179845809937, + "num_tokens": 38370799.0, + "step": 1528 + }, + { + "epoch": 0.16791126729628816, + "grad_norm": 2.30070161819458, + "learning_rate": 8.386388583973654e-07, + "loss": 1.0511, + "mean_token_accuracy": 0.698466956615448, + "num_tokens": 38396377.0, + "step": 1529 + }, + { + "epoch": 0.1680210849989018, + "grad_norm": 2.269742965698242, + "learning_rate": 8.391877058177826e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.6741380095481873, + "num_tokens": 38423969.0, + "step": 1530 + }, + { + "epoch": 0.16813090270151548, + "grad_norm": 2.093283176422119, + "learning_rate": 8.397365532381997e-07, + "loss": 0.9174, + "mean_token_accuracy": 0.7294264435768127, + "num_tokens": 38450584.0, + "step": 1531 + }, + { + "epoch": 0.16824072040412916, + "grad_norm": 2.6344962120056152, + "learning_rate": 8.402854006586168e-07, + "loss": 1.0181, + "mean_token_accuracy": 0.6912378072738647, + "num_tokens": 38471028.0, + "step": 1532 + }, + { + "epoch": 0.1683505381067428, + "grad_norm": 2.148557424545288, + "learning_rate": 8.40834248079034e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.6793046593666077, + "num_tokens": 38500783.0, + "step": 1533 + }, + { + "epoch": 0.16846035580935648, + "grad_norm": 2.494333505630493, + "learning_rate": 8.413830954994511e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.6866065859794617, + "num_tokens": 38524908.0, + "step": 1534 + }, + { + "epoch": 0.16857017351197012, + "grad_norm": 2.588632822036743, + "learning_rate": 8.419319429198682e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6867592334747314, + "num_tokens": 38547259.0, + "step": 1535 + }, + { + "epoch": 0.1686799912145838, + "grad_norm": 2.388157844543457, + "learning_rate": 8.424807903402854e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.7073384523391724, + "num_tokens": 38570318.0, + "step": 1536 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 2.3723745346069336, + "learning_rate": 8.430296377607025e-07, + "loss": 1.0239, + "mean_token_accuracy": 0.6922621726989746, + "num_tokens": 38595879.0, + "step": 1537 + }, + { + "epoch": 0.16889962661981112, + "grad_norm": 2.5938704013824463, + "learning_rate": 8.435784851811196e-07, + "loss": 1.0593, + "mean_token_accuracy": 0.6806243658065796, + "num_tokens": 38618389.0, + "step": 1538 + }, + { + "epoch": 0.16900944432242476, + "grad_norm": 2.401284694671631, + "learning_rate": 8.441273326015367e-07, + "loss": 1.0553, + "mean_token_accuracy": 0.6947208642959595, + "num_tokens": 38643327.0, + "step": 1539 + }, + { + "epoch": 0.16911926202503844, + "grad_norm": 2.107990026473999, + "learning_rate": 8.446761800219539e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6637616157531738, + "num_tokens": 38674311.0, + "step": 1540 + }, + { + "epoch": 0.1692290797276521, + "grad_norm": 2.244741678237915, + "learning_rate": 8.45225027442371e-07, + "loss": 1.0932, + "mean_token_accuracy": 0.6761878132820129, + "num_tokens": 38700082.0, + "step": 1541 + }, + { + "epoch": 0.16933889743026576, + "grad_norm": 2.3174965381622314, + "learning_rate": 8.457738748627881e-07, + "loss": 1.0379, + "mean_token_accuracy": 0.6916471123695374, + "num_tokens": 38723921.0, + "step": 1542 + }, + { + "epoch": 0.16944871513287943, + "grad_norm": 2.52295184135437, + "learning_rate": 8.463227222832053e-07, + "loss": 0.9817, + "mean_token_accuracy": 0.702964723110199, + "num_tokens": 38745410.0, + "step": 1543 + }, + { + "epoch": 0.16955853283549308, + "grad_norm": 2.559875011444092, + "learning_rate": 8.468715697036224e-07, + "loss": 0.9925, + "mean_token_accuracy": 0.7077068090438843, + "num_tokens": 38765005.0, + "step": 1544 + }, + { + "epoch": 0.16966835053810675, + "grad_norm": 2.392996072769165, + "learning_rate": 8.474204171240395e-07, + "loss": 1.0206, + "mean_token_accuracy": 0.6970354318618774, + "num_tokens": 38788003.0, + "step": 1545 + }, + { + "epoch": 0.1697781682407204, + "grad_norm": 2.5525267124176025, + "learning_rate": 8.479692645444567e-07, + "loss": 0.9454, + "mean_token_accuracy": 0.7085498571395874, + "num_tokens": 38807065.0, + "step": 1546 + }, + { + "epoch": 0.16988798594333407, + "grad_norm": 2.6110029220581055, + "learning_rate": 8.485181119648738e-07, + "loss": 1.068, + "mean_token_accuracy": 0.6866448521614075, + "num_tokens": 38829406.0, + "step": 1547 + }, + { + "epoch": 0.16999780364594771, + "grad_norm": 2.2229931354522705, + "learning_rate": 8.490669593852908e-07, + "loss": 0.9942, + "mean_token_accuracy": 0.7015323042869568, + "num_tokens": 38855737.0, + "step": 1548 + }, + { + "epoch": 0.1701076213485614, + "grad_norm": 2.2556583881378174, + "learning_rate": 8.496158068057079e-07, + "loss": 1.0199, + "mean_token_accuracy": 0.7009989619255066, + "num_tokens": 38882009.0, + "step": 1549 + }, + { + "epoch": 0.17021743905117506, + "grad_norm": 2.351280689239502, + "learning_rate": 8.50164654226125e-07, + "loss": 0.9853, + "mean_token_accuracy": 0.7015206813812256, + "num_tokens": 38906692.0, + "step": 1550 + }, + { + "epoch": 0.1703272567537887, + "grad_norm": 2.6658506393432617, + "learning_rate": 8.507135016465422e-07, + "loss": 1.0053, + "mean_token_accuracy": 0.6994467377662659, + "num_tokens": 38926927.0, + "step": 1551 + }, + { + "epoch": 0.17043707445640238, + "grad_norm": 2.470278024673462, + "learning_rate": 8.512623490669593e-07, + "loss": 1.021, + "mean_token_accuracy": 0.7034562826156616, + "num_tokens": 38949653.0, + "step": 1552 + }, + { + "epoch": 0.17054689215901603, + "grad_norm": 2.2730484008789062, + "learning_rate": 8.518111964873764e-07, + "loss": 1.0565, + "mean_token_accuracy": 0.683006763458252, + "num_tokens": 38976201.0, + "step": 1553 + }, + { + "epoch": 0.1706567098616297, + "grad_norm": 2.059715747833252, + "learning_rate": 8.523600439077936e-07, + "loss": 0.9964, + "mean_token_accuracy": 0.6954790949821472, + "num_tokens": 39004438.0, + "step": 1554 + }, + { + "epoch": 0.17076652756424335, + "grad_norm": 2.009270668029785, + "learning_rate": 8.529088913282107e-07, + "loss": 1.0912, + "mean_token_accuracy": 0.6734645962715149, + "num_tokens": 39037563.0, + "step": 1555 + }, + { + "epoch": 0.17087634526685702, + "grad_norm": 2.1332058906555176, + "learning_rate": 8.534577387486278e-07, + "loss": 1.0854, + "mean_token_accuracy": 0.6704635620117188, + "num_tokens": 39067455.0, + "step": 1556 + }, + { + "epoch": 0.17098616296947067, + "grad_norm": 2.1271679401397705, + "learning_rate": 8.54006586169045e-07, + "loss": 0.9888, + "mean_token_accuracy": 0.6981230974197388, + "num_tokens": 39095498.0, + "step": 1557 + }, + { + "epoch": 0.17109598067208434, + "grad_norm": 2.195411205291748, + "learning_rate": 8.545554335894621e-07, + "loss": 1.1245, + "mean_token_accuracy": 0.669039785861969, + "num_tokens": 39124814.0, + "step": 1558 + }, + { + "epoch": 0.17120579837469801, + "grad_norm": 2.1776208877563477, + "learning_rate": 8.551042810098792e-07, + "loss": 1.0709, + "mean_token_accuracy": 0.6820230484008789, + "num_tokens": 39153803.0, + "step": 1559 + }, + { + "epoch": 0.17131561607731166, + "grad_norm": 2.296776533126831, + "learning_rate": 8.556531284302964e-07, + "loss": 1.1255, + "mean_token_accuracy": 0.6712146997451782, + "num_tokens": 39181625.0, + "step": 1560 + }, + { + "epoch": 0.17142543377992533, + "grad_norm": 2.4804091453552246, + "learning_rate": 8.562019758507134e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.7036570906639099, + "num_tokens": 39203353.0, + "step": 1561 + }, + { + "epoch": 0.17153525148253898, + "grad_norm": 2.2178211212158203, + "learning_rate": 8.567508232711306e-07, + "loss": 1.0645, + "mean_token_accuracy": 0.6831389665603638, + "num_tokens": 39232818.0, + "step": 1562 + }, + { + "epoch": 0.17164506918515265, + "grad_norm": 2.379441499710083, + "learning_rate": 8.572996706915478e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.7103729248046875, + "num_tokens": 39256419.0, + "step": 1563 + }, + { + "epoch": 0.1717548868877663, + "grad_norm": 2.6649341583251953, + "learning_rate": 8.578485181119648e-07, + "loss": 0.9342, + "mean_token_accuracy": 0.7146073579788208, + "num_tokens": 39275001.0, + "step": 1564 + }, + { + "epoch": 0.17186470459037997, + "grad_norm": 2.162524700164795, + "learning_rate": 8.58397365532382e-07, + "loss": 1.0033, + "mean_token_accuracy": 0.6978363394737244, + "num_tokens": 39305065.0, + "step": 1565 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 2.2616159915924072, + "learning_rate": 8.589462129527992e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6746540069580078, + "num_tokens": 39333311.0, + "step": 1566 + }, + { + "epoch": 0.1720843399956073, + "grad_norm": 2.0760891437530518, + "learning_rate": 8.594950603732161e-07, + "loss": 1.0361, + "mean_token_accuracy": 0.6924809813499451, + "num_tokens": 39365070.0, + "step": 1567 + }, + { + "epoch": 0.17219415769822094, + "grad_norm": 2.4976840019226074, + "learning_rate": 8.600439077936333e-07, + "loss": 0.9577, + "mean_token_accuracy": 0.7090214490890503, + "num_tokens": 39387357.0, + "step": 1568 + }, + { + "epoch": 0.1723039754008346, + "grad_norm": 2.400301456451416, + "learning_rate": 8.605927552140505e-07, + "loss": 0.9688, + "mean_token_accuracy": 0.7084990739822388, + "num_tokens": 39410509.0, + "step": 1569 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.72672176361084, + "learning_rate": 8.611416026344675e-07, + "loss": 0.961, + "mean_token_accuracy": 0.7063259482383728, + "num_tokens": 39428287.0, + "step": 1570 + }, + { + "epoch": 0.17252361080606193, + "grad_norm": 2.332404375076294, + "learning_rate": 8.616904500548847e-07, + "loss": 1.0241, + "mean_token_accuracy": 0.6918227076530457, + "num_tokens": 39452251.0, + "step": 1571 + }, + { + "epoch": 0.1726334285086756, + "grad_norm": 2.18835186958313, + "learning_rate": 8.622392974753018e-07, + "loss": 0.9835, + "mean_token_accuracy": 0.7042074203491211, + "num_tokens": 39478380.0, + "step": 1572 + }, + { + "epoch": 0.17274324621128925, + "grad_norm": 2.1983745098114014, + "learning_rate": 8.627881448957189e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.671446681022644, + "num_tokens": 39509183.0, + "step": 1573 + }, + { + "epoch": 0.17285306391390293, + "grad_norm": 2.474571704864502, + "learning_rate": 8.633369923161361e-07, + "loss": 1.0043, + "mean_token_accuracy": 0.6949336528778076, + "num_tokens": 39530979.0, + "step": 1574 + }, + { + "epoch": 0.17296288161651657, + "grad_norm": 2.044685125350952, + "learning_rate": 8.638858397365532e-07, + "loss": 0.9898, + "mean_token_accuracy": 0.7035763263702393, + "num_tokens": 39560573.0, + "step": 1575 + }, + { + "epoch": 0.17307269931913025, + "grad_norm": 2.681269407272339, + "learning_rate": 8.644346871569703e-07, + "loss": 0.9895, + "mean_token_accuracy": 0.702472984790802, + "num_tokens": 39580097.0, + "step": 1576 + }, + { + "epoch": 0.1731825170217439, + "grad_norm": 2.3050992488861084, + "learning_rate": 8.649835345773875e-07, + "loss": 1.0213, + "mean_token_accuracy": 0.6936273574829102, + "num_tokens": 39604160.0, + "step": 1577 + }, + { + "epoch": 0.17329233472435757, + "grad_norm": 2.3729991912841797, + "learning_rate": 8.655323819978046e-07, + "loss": 1.0148, + "mean_token_accuracy": 0.7043059468269348, + "num_tokens": 39631723.0, + "step": 1578 + }, + { + "epoch": 0.17340215242697124, + "grad_norm": 2.3001115322113037, + "learning_rate": 8.660812294182217e-07, + "loss": 1.0186, + "mean_token_accuracy": 0.6854057908058167, + "num_tokens": 39658462.0, + "step": 1579 + }, + { + "epoch": 0.17351197012958489, + "grad_norm": 2.450852394104004, + "learning_rate": 8.666300768386389e-07, + "loss": 0.9282, + "mean_token_accuracy": 0.7160696387290955, + "num_tokens": 39680599.0, + "step": 1580 + }, + { + "epoch": 0.17362178783219856, + "grad_norm": 2.3577404022216797, + "learning_rate": 8.67178924259056e-07, + "loss": 1.0603, + "mean_token_accuracy": 0.683512806892395, + "num_tokens": 39704873.0, + "step": 1581 + }, + { + "epoch": 0.1737316055348122, + "grad_norm": 2.508795976638794, + "learning_rate": 8.677277716794731e-07, + "loss": 0.9858, + "mean_token_accuracy": 0.6976510286331177, + "num_tokens": 39727182.0, + "step": 1582 + }, + { + "epoch": 0.17384142323742588, + "grad_norm": 2.2672665119171143, + "learning_rate": 8.682766190998902e-07, + "loss": 1.1209, + "mean_token_accuracy": 0.670975923538208, + "num_tokens": 39757755.0, + "step": 1583 + }, + { + "epoch": 0.17395124094003953, + "grad_norm": 2.0423669815063477, + "learning_rate": 8.688254665203073e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.685632586479187, + "num_tokens": 39791156.0, + "step": 1584 + }, + { + "epoch": 0.1740610586426532, + "grad_norm": 2.311213493347168, + "learning_rate": 8.693743139407245e-07, + "loss": 1.1377, + "mean_token_accuracy": 0.6660580039024353, + "num_tokens": 39818958.0, + "step": 1585 + }, + { + "epoch": 0.17417087634526685, + "grad_norm": 2.400590658187866, + "learning_rate": 8.699231613611415e-07, + "loss": 1.0151, + "mean_token_accuracy": 0.698007345199585, + "num_tokens": 39842460.0, + "step": 1586 + }, + { + "epoch": 0.17428069404788052, + "grad_norm": 2.2537357807159424, + "learning_rate": 8.704720087815586e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6920486688613892, + "num_tokens": 39868999.0, + "step": 1587 + }, + { + "epoch": 0.1743905117504942, + "grad_norm": 2.454854726791382, + "learning_rate": 8.710208562019758e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.7006983757019043, + "num_tokens": 39893197.0, + "step": 1588 + }, + { + "epoch": 0.17450032945310784, + "grad_norm": 2.1814558506011963, + "learning_rate": 8.715697036223929e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.6926057934761047, + "num_tokens": 39920155.0, + "step": 1589 + }, + { + "epoch": 0.1746101471557215, + "grad_norm": 2.385262966156006, + "learning_rate": 8.7211855104281e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.700104832649231, + "num_tokens": 39944928.0, + "step": 1590 + }, + { + "epoch": 0.17471996485833516, + "grad_norm": 2.350090503692627, + "learning_rate": 8.726673984632272e-07, + "loss": 0.9928, + "mean_token_accuracy": 0.7032067775726318, + "num_tokens": 39970124.0, + "step": 1591 + }, + { + "epoch": 0.17482978256094883, + "grad_norm": 2.195762872695923, + "learning_rate": 8.732162458836443e-07, + "loss": 1.1038, + "mean_token_accuracy": 0.6777533292770386, + "num_tokens": 39998028.0, + "step": 1592 + }, + { + "epoch": 0.17493960026356248, + "grad_norm": 2.224858045578003, + "learning_rate": 8.737650933040614e-07, + "loss": 1.0294, + "mean_token_accuracy": 0.695146381855011, + "num_tokens": 40026835.0, + "step": 1593 + }, + { + "epoch": 0.17504941796617615, + "grad_norm": 2.4622628688812256, + "learning_rate": 8.743139407244785e-07, + "loss": 0.8989, + "mean_token_accuracy": 0.7201989889144897, + "num_tokens": 40047904.0, + "step": 1594 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 2.2899248600006104, + "learning_rate": 8.748627881448957e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.690053403377533, + "num_tokens": 40073261.0, + "step": 1595 + }, + { + "epoch": 0.17526905337140347, + "grad_norm": 2.5651113986968994, + "learning_rate": 8.754116355653128e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.6832301616668701, + "num_tokens": 40094933.0, + "step": 1596 + }, + { + "epoch": 0.17537887107401715, + "grad_norm": 2.500401496887207, + "learning_rate": 8.759604829857299e-07, + "loss": 0.9581, + "mean_token_accuracy": 0.7102678418159485, + "num_tokens": 40117612.0, + "step": 1597 + }, + { + "epoch": 0.1754886887766308, + "grad_norm": 2.3512706756591797, + "learning_rate": 8.765093304061471e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.6991238594055176, + "num_tokens": 40142066.0, + "step": 1598 + }, + { + "epoch": 0.17559850647924446, + "grad_norm": 2.127328872680664, + "learning_rate": 8.770581778265642e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6873540878295898, + "num_tokens": 40170231.0, + "step": 1599 + }, + { + "epoch": 0.1757083241818581, + "grad_norm": 2.120166540145874, + "learning_rate": 8.776070252469813e-07, + "loss": 0.9523, + "mean_token_accuracy": 0.7101098895072937, + "num_tokens": 40195627.0, + "step": 1600 + }, + { + "epoch": 0.17581814188447178, + "grad_norm": 2.3741884231567383, + "learning_rate": 8.781558726673985e-07, + "loss": 1.0298, + "mean_token_accuracy": 0.6974712014198303, + "num_tokens": 40218687.0, + "step": 1601 + }, + { + "epoch": 0.17592795958708543, + "grad_norm": 2.724947452545166, + "learning_rate": 8.787047200878156e-07, + "loss": 0.967, + "mean_token_accuracy": 0.7092569470405579, + "num_tokens": 40237215.0, + "step": 1602 + }, + { + "epoch": 0.1760377772896991, + "grad_norm": 2.1600182056427, + "learning_rate": 8.792535675082327e-07, + "loss": 1.1437, + "mean_token_accuracy": 0.6652284264564514, + "num_tokens": 40265018.0, + "step": 1603 + }, + { + "epoch": 0.17614759499231275, + "grad_norm": 2.296051263809204, + "learning_rate": 8.798024149286499e-07, + "loss": 1.0327, + "mean_token_accuracy": 0.6921944618225098, + "num_tokens": 40290458.0, + "step": 1604 + }, + { + "epoch": 0.17625741269492642, + "grad_norm": 2.4110770225524902, + "learning_rate": 8.803512623490668e-07, + "loss": 0.9238, + "mean_token_accuracy": 0.7180379033088684, + "num_tokens": 40313237.0, + "step": 1605 + }, + { + "epoch": 0.17636723039754007, + "grad_norm": 2.1847550868988037, + "learning_rate": 8.80900109769484e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.7085896730422974, + "num_tokens": 40340504.0, + "step": 1606 + }, + { + "epoch": 0.17647704810015374, + "grad_norm": 2.624114990234375, + "learning_rate": 8.814489571899012e-07, + "loss": 1.0276, + "mean_token_accuracy": 0.698607325553894, + "num_tokens": 40361571.0, + "step": 1607 + }, + { + "epoch": 0.17658686580276742, + "grad_norm": 2.5952656269073486, + "learning_rate": 8.819978046103182e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.6970947980880737, + "num_tokens": 40383151.0, + "step": 1608 + }, + { + "epoch": 0.17669668350538106, + "grad_norm": 2.3094749450683594, + "learning_rate": 8.825466520307354e-07, + "loss": 1.0933, + "mean_token_accuracy": 0.6828062534332275, + "num_tokens": 40408657.0, + "step": 1609 + }, + { + "epoch": 0.17680650120799474, + "grad_norm": 2.852212429046631, + "learning_rate": 8.830954994511526e-07, + "loss": 0.9728, + "mean_token_accuracy": 0.7051982879638672, + "num_tokens": 40425485.0, + "step": 1610 + }, + { + "epoch": 0.17691631891060838, + "grad_norm": 2.156700849533081, + "learning_rate": 8.836443468715696e-07, + "loss": 1.056, + "mean_token_accuracy": 0.6821581125259399, + "num_tokens": 40453993.0, + "step": 1611 + }, + { + "epoch": 0.17702613661322206, + "grad_norm": 2.091895818710327, + "learning_rate": 8.841931942919868e-07, + "loss": 0.9783, + "mean_token_accuracy": 0.7028154730796814, + "num_tokens": 40481541.0, + "step": 1612 + }, + { + "epoch": 0.1771359543158357, + "grad_norm": 2.51049542427063, + "learning_rate": 8.84742041712404e-07, + "loss": 0.9461, + "mean_token_accuracy": 0.7172898054122925, + "num_tokens": 40502736.0, + "step": 1613 + }, + { + "epoch": 0.17724577201844938, + "grad_norm": 2.3666555881500244, + "learning_rate": 8.85290889132821e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.6843013763427734, + "num_tokens": 40528853.0, + "step": 1614 + }, + { + "epoch": 0.17735558972106302, + "grad_norm": 2.43229603767395, + "learning_rate": 8.858397365532382e-07, + "loss": 1.0138, + "mean_token_accuracy": 0.6977855563163757, + "num_tokens": 40551145.0, + "step": 1615 + }, + { + "epoch": 0.1774654074236767, + "grad_norm": 2.4595189094543457, + "learning_rate": 8.863885839736552e-07, + "loss": 1.077, + "mean_token_accuracy": 0.6798230409622192, + "num_tokens": 40574814.0, + "step": 1616 + }, + { + "epoch": 0.17757522512629037, + "grad_norm": 2.4262771606445312, + "learning_rate": 8.869374313940724e-07, + "loss": 0.9288, + "mean_token_accuracy": 0.7179219722747803, + "num_tokens": 40595960.0, + "step": 1617 + }, + { + "epoch": 0.17768504282890402, + "grad_norm": 2.9111011028289795, + "learning_rate": 8.874862788144896e-07, + "loss": 0.8987, + "mean_token_accuracy": 0.7240045070648193, + "num_tokens": 40612985.0, + "step": 1618 + }, + { + "epoch": 0.1777948605315177, + "grad_norm": 2.402726411819458, + "learning_rate": 8.880351262349066e-07, + "loss": 1.0498, + "mean_token_accuracy": 0.6939969062805176, + "num_tokens": 40637091.0, + "step": 1619 + }, + { + "epoch": 0.17790467823413134, + "grad_norm": 2.417564630508423, + "learning_rate": 8.885839736553238e-07, + "loss": 1.0241, + "mean_token_accuracy": 0.6988537311553955, + "num_tokens": 40660283.0, + "step": 1620 + }, + { + "epoch": 0.178014495936745, + "grad_norm": 2.2198987007141113, + "learning_rate": 8.89132821075741e-07, + "loss": 1.0619, + "mean_token_accuracy": 0.6826525330543518, + "num_tokens": 40688430.0, + "step": 1621 + }, + { + "epoch": 0.17812431363935866, + "grad_norm": 2.3579437732696533, + "learning_rate": 8.89681668496158e-07, + "loss": 1.0805, + "mean_token_accuracy": 0.6764974594116211, + "num_tokens": 40713489.0, + "step": 1622 + }, + { + "epoch": 0.17823413134197233, + "grad_norm": 2.567878484725952, + "learning_rate": 8.902305159165752e-07, + "loss": 0.9928, + "mean_token_accuracy": 0.697608470916748, + "num_tokens": 40733222.0, + "step": 1623 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 2.215341329574585, + "learning_rate": 8.907793633369924e-07, + "loss": 1.065, + "mean_token_accuracy": 0.6881544589996338, + "num_tokens": 40761201.0, + "step": 1624 + }, + { + "epoch": 0.17845376674719965, + "grad_norm": 2.2017717361450195, + "learning_rate": 8.913282107574093e-07, + "loss": 1.0265, + "mean_token_accuracy": 0.6959012746810913, + "num_tokens": 40789191.0, + "step": 1625 + }, + { + "epoch": 0.17856358444981332, + "grad_norm": 2.849832773208618, + "learning_rate": 8.918770581778265e-07, + "loss": 1.0013, + "mean_token_accuracy": 0.6967610120773315, + "num_tokens": 40808356.0, + "step": 1626 + }, + { + "epoch": 0.17867340215242697, + "grad_norm": 2.469109535217285, + "learning_rate": 8.924259055982436e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6819090247154236, + "num_tokens": 40831487.0, + "step": 1627 + }, + { + "epoch": 0.17878321985504064, + "grad_norm": 2.6976029872894287, + "learning_rate": 8.929747530186607e-07, + "loss": 0.9696, + "mean_token_accuracy": 0.7060035467147827, + "num_tokens": 40850114.0, + "step": 1628 + }, + { + "epoch": 0.1788930375576543, + "grad_norm": 2.2894198894500732, + "learning_rate": 8.935236004390779e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6950651407241821, + "num_tokens": 40874041.0, + "step": 1629 + }, + { + "epoch": 0.17900285526026796, + "grad_norm": 2.252915859222412, + "learning_rate": 8.94072447859495e-07, + "loss": 1.04, + "mean_token_accuracy": 0.6959065794944763, + "num_tokens": 40902113.0, + "step": 1630 + }, + { + "epoch": 0.1791126729628816, + "grad_norm": 2.488880157470703, + "learning_rate": 8.946212952799121e-07, + "loss": 1.061, + "mean_token_accuracy": 0.7085107564926147, + "num_tokens": 40925003.0, + "step": 1631 + }, + { + "epoch": 0.17922249066549528, + "grad_norm": 2.639948606491089, + "learning_rate": 8.951701427003293e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.6950253248214722, + "num_tokens": 40944557.0, + "step": 1632 + }, + { + "epoch": 0.17933230836810893, + "grad_norm": 2.100996732711792, + "learning_rate": 8.957189901207464e-07, + "loss": 0.9914, + "mean_token_accuracy": 0.7057632803916931, + "num_tokens": 40972359.0, + "step": 1633 + }, + { + "epoch": 0.1794421260707226, + "grad_norm": 2.316922664642334, + "learning_rate": 8.962678375411635e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6802111864089966, + "num_tokens": 40997509.0, + "step": 1634 + }, + { + "epoch": 0.17955194377333628, + "grad_norm": 2.41292405128479, + "learning_rate": 8.968166849615807e-07, + "loss": 0.9529, + "mean_token_accuracy": 0.7132318019866943, + "num_tokens": 41020414.0, + "step": 1635 + }, + { + "epoch": 0.17966176147594992, + "grad_norm": 2.310288190841675, + "learning_rate": 8.973655323819978e-07, + "loss": 1.005, + "mean_token_accuracy": 0.701606035232544, + "num_tokens": 41045713.0, + "step": 1636 + }, + { + "epoch": 0.1797715791785636, + "grad_norm": 2.335914373397827, + "learning_rate": 8.979143798024149e-07, + "loss": 0.9303, + "mean_token_accuracy": 0.7173244953155518, + "num_tokens": 41069370.0, + "step": 1637 + }, + { + "epoch": 0.17988139688117724, + "grad_norm": 2.3447725772857666, + "learning_rate": 8.98463227222832e-07, + "loss": 1.0042, + "mean_token_accuracy": 0.7005221247673035, + "num_tokens": 41092750.0, + "step": 1638 + }, + { + "epoch": 0.17999121458379091, + "grad_norm": 2.277125120162964, + "learning_rate": 8.990120746432492e-07, + "loss": 1.0373, + "mean_token_accuracy": 0.6961750984191895, + "num_tokens": 41118237.0, + "step": 1639 + }, + { + "epoch": 0.18010103228640456, + "grad_norm": 2.4044063091278076, + "learning_rate": 8.995609220636663e-07, + "loss": 0.9404, + "mean_token_accuracy": 0.7147624492645264, + "num_tokens": 41142531.0, + "step": 1640 + }, + { + "epoch": 0.18021084998901823, + "grad_norm": 2.492001533508301, + "learning_rate": 9.001097694840834e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6848770976066589, + "num_tokens": 41172556.0, + "step": 1641 + }, + { + "epoch": 0.18032066769163188, + "grad_norm": 2.560771942138672, + "learning_rate": 9.006586169045006e-07, + "loss": 0.8885, + "mean_token_accuracy": 0.723531186580658, + "num_tokens": 41192969.0, + "step": 1642 + }, + { + "epoch": 0.18043048539424555, + "grad_norm": 2.0723073482513428, + "learning_rate": 9.012074643249177e-07, + "loss": 0.9694, + "mean_token_accuracy": 0.7091586589813232, + "num_tokens": 41223732.0, + "step": 1643 + }, + { + "epoch": 0.1805403030968592, + "grad_norm": 2.138472318649292, + "learning_rate": 9.017563117453347e-07, + "loss": 1.1192, + "mean_token_accuracy": 0.6692399382591248, + "num_tokens": 41254759.0, + "step": 1644 + }, + { + "epoch": 0.18065012079947287, + "grad_norm": 2.280691623687744, + "learning_rate": 9.023051591657519e-07, + "loss": 1.054, + "mean_token_accuracy": 0.6794633865356445, + "num_tokens": 41279087.0, + "step": 1645 + }, + { + "epoch": 0.18075993850208655, + "grad_norm": 2.2392172813415527, + "learning_rate": 9.02854006586169e-07, + "loss": 1.0252, + "mean_token_accuracy": 0.704105019569397, + "num_tokens": 41305377.0, + "step": 1646 + }, + { + "epoch": 0.1808697562047002, + "grad_norm": 2.450572967529297, + "learning_rate": 9.034028540065861e-07, + "loss": 1.0811, + "mean_token_accuracy": 0.6745385527610779, + "num_tokens": 41329135.0, + "step": 1647 + }, + { + "epoch": 0.18097957390731387, + "grad_norm": 2.4506709575653076, + "learning_rate": 9.039517014270033e-07, + "loss": 1.1397, + "mean_token_accuracy": 0.6617738604545593, + "num_tokens": 41353413.0, + "step": 1648 + }, + { + "epoch": 0.1810893916099275, + "grad_norm": 2.5688583850860596, + "learning_rate": 9.045005488474203e-07, + "loss": 1.0272, + "mean_token_accuracy": 0.6924328804016113, + "num_tokens": 41373119.0, + "step": 1649 + }, + { + "epoch": 0.1811992093125412, + "grad_norm": 2.166485548019409, + "learning_rate": 9.050493962678375e-07, + "loss": 1.0244, + "mean_token_accuracy": 0.6956197023391724, + "num_tokens": 41401199.0, + "step": 1650 + }, + { + "epoch": 0.18130902701515483, + "grad_norm": 2.5778725147247314, + "learning_rate": 9.055982436882547e-07, + "loss": 1.0049, + "mean_token_accuracy": 0.6917974948883057, + "num_tokens": 41421866.0, + "step": 1651 + }, + { + "epoch": 0.1814188447177685, + "grad_norm": 2.3787829875946045, + "learning_rate": 9.061470911086717e-07, + "loss": 1.0772, + "mean_token_accuracy": 0.6794676780700684, + "num_tokens": 41446038.0, + "step": 1652 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 2.3699800968170166, + "learning_rate": 9.066959385290889e-07, + "loss": 1.075, + "mean_token_accuracy": 0.6836529970169067, + "num_tokens": 41471476.0, + "step": 1653 + }, + { + "epoch": 0.18163848012299583, + "grad_norm": 2.1702404022216797, + "learning_rate": 9.07244785949506e-07, + "loss": 1.0609, + "mean_token_accuracy": 0.6910940408706665, + "num_tokens": 41499939.0, + "step": 1654 + }, + { + "epoch": 0.1817482978256095, + "grad_norm": 2.264455556869507, + "learning_rate": 9.077936333699231e-07, + "loss": 0.9884, + "mean_token_accuracy": 0.6994353532791138, + "num_tokens": 41527055.0, + "step": 1655 + }, + { + "epoch": 0.18185811552822315, + "grad_norm": 2.2578611373901367, + "learning_rate": 9.083424807903403e-07, + "loss": 0.9395, + "mean_token_accuracy": 0.7162142395973206, + "num_tokens": 41552976.0, + "step": 1656 + }, + { + "epoch": 0.18196793323083682, + "grad_norm": 2.4871904850006104, + "learning_rate": 9.088913282107573e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6746183037757874, + "num_tokens": 41578910.0, + "step": 1657 + }, + { + "epoch": 0.18207775093345047, + "grad_norm": 2.422405481338501, + "learning_rate": 9.094401756311745e-07, + "loss": 1.0094, + "mean_token_accuracy": 0.6953117847442627, + "num_tokens": 41601676.0, + "step": 1658 + }, + { + "epoch": 0.18218756863606414, + "grad_norm": 2.535583257675171, + "learning_rate": 9.099890230515917e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6727770566940308, + "num_tokens": 41624189.0, + "step": 1659 + }, + { + "epoch": 0.18229738633867779, + "grad_norm": 2.5064053535461426, + "learning_rate": 9.105378704720087e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.707844078540802, + "num_tokens": 41646430.0, + "step": 1660 + }, + { + "epoch": 0.18240720404129146, + "grad_norm": 2.3602747917175293, + "learning_rate": 9.110867178924259e-07, + "loss": 1.0878, + "mean_token_accuracy": 0.6827608346939087, + "num_tokens": 41669567.0, + "step": 1661 + }, + { + "epoch": 0.1825170217439051, + "grad_norm": 2.018280267715454, + "learning_rate": 9.11635565312843e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.694729745388031, + "num_tokens": 41698753.0, + "step": 1662 + }, + { + "epoch": 0.18262683944651878, + "grad_norm": 2.66686749458313, + "learning_rate": 9.1218441273326e-07, + "loss": 0.9835, + "mean_token_accuracy": 0.7060818672180176, + "num_tokens": 41718066.0, + "step": 1663 + }, + { + "epoch": 0.18273665714913245, + "grad_norm": 2.3219215869903564, + "learning_rate": 9.127332601536772e-07, + "loss": 0.9683, + "mean_token_accuracy": 0.709147572517395, + "num_tokens": 41741563.0, + "step": 1664 + }, + { + "epoch": 0.1828464748517461, + "grad_norm": 2.427075147628784, + "learning_rate": 9.132821075740944e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6939967274665833, + "num_tokens": 41766207.0, + "step": 1665 + }, + { + "epoch": 0.18295629255435977, + "grad_norm": 2.548576593399048, + "learning_rate": 9.138309549945114e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.704830527305603, + "num_tokens": 41787090.0, + "step": 1666 + }, + { + "epoch": 0.18306611025697342, + "grad_norm": 2.447998285293579, + "learning_rate": 9.143798024149286e-07, + "loss": 0.929, + "mean_token_accuracy": 0.7131144404411316, + "num_tokens": 41810001.0, + "step": 1667 + }, + { + "epoch": 0.1831759279595871, + "grad_norm": 2.2171599864959717, + "learning_rate": 9.149286498353457e-07, + "loss": 1.0292, + "mean_token_accuracy": 0.7054816484451294, + "num_tokens": 41838026.0, + "step": 1668 + }, + { + "epoch": 0.18328574566220074, + "grad_norm": 2.471632957458496, + "learning_rate": 9.154774972557628e-07, + "loss": 1.021, + "mean_token_accuracy": 0.6907703876495361, + "num_tokens": 41861379.0, + "step": 1669 + }, + { + "epoch": 0.1833955633648144, + "grad_norm": 2.3942253589630127, + "learning_rate": 9.1602634467618e-07, + "loss": 1.0139, + "mean_token_accuracy": 0.6985797882080078, + "num_tokens": 41885943.0, + "step": 1670 + }, + { + "epoch": 0.18350538106742806, + "grad_norm": 2.4367587566375732, + "learning_rate": 9.165751920965971e-07, + "loss": 0.9654, + "mean_token_accuracy": 0.7014157772064209, + "num_tokens": 41908201.0, + "step": 1671 + }, + { + "epoch": 0.18361519877004173, + "grad_norm": 2.55257511138916, + "learning_rate": 9.171240395170142e-07, + "loss": 0.9776, + "mean_token_accuracy": 0.7075232267379761, + "num_tokens": 41929319.0, + "step": 1672 + }, + { + "epoch": 0.1837250164726554, + "grad_norm": 2.212718963623047, + "learning_rate": 9.176728869374314e-07, + "loss": 1.0805, + "mean_token_accuracy": 0.6772615909576416, + "num_tokens": 41957868.0, + "step": 1673 + }, + { + "epoch": 0.18383483417526905, + "grad_norm": 2.63783597946167, + "learning_rate": 9.182217343578485e-07, + "loss": 1.0738, + "mean_token_accuracy": 0.6887055039405823, + "num_tokens": 41978794.0, + "step": 1674 + }, + { + "epoch": 0.18394465187788273, + "grad_norm": 2.207087755203247, + "learning_rate": 9.187705817782656e-07, + "loss": 1.0006, + "mean_token_accuracy": 0.7058848142623901, + "num_tokens": 42004593.0, + "step": 1675 + }, + { + "epoch": 0.18405446958049637, + "grad_norm": 2.557203531265259, + "learning_rate": 9.193194291986828e-07, + "loss": 1.0004, + "mean_token_accuracy": 0.7008964419364929, + "num_tokens": 42024660.0, + "step": 1676 + }, + { + "epoch": 0.18416428728311005, + "grad_norm": 2.677619218826294, + "learning_rate": 9.198682766190999e-07, + "loss": 1.0079, + "mean_token_accuracy": 0.6996495723724365, + "num_tokens": 42045381.0, + "step": 1677 + }, + { + "epoch": 0.1842741049857237, + "grad_norm": 2.1829917430877686, + "learning_rate": 9.20417124039517e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.6681711673736572, + "num_tokens": 42074403.0, + "step": 1678 + }, + { + "epoch": 0.18438392268833736, + "grad_norm": 2.2890427112579346, + "learning_rate": 9.209659714599341e-07, + "loss": 1.0789, + "mean_token_accuracy": 0.6806681156158447, + "num_tokens": 42100085.0, + "step": 1679 + }, + { + "epoch": 0.184493740390951, + "grad_norm": 2.019843578338623, + "learning_rate": 9.215148188803513e-07, + "loss": 1.0855, + "mean_token_accuracy": 0.6758274435997009, + "num_tokens": 42131641.0, + "step": 1680 + }, + { + "epoch": 0.18460355809356468, + "grad_norm": 1.9745631217956543, + "learning_rate": 9.220636663007683e-07, + "loss": 1.0315, + "mean_token_accuracy": 0.6978966593742371, + "num_tokens": 42164436.0, + "step": 1681 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 2.5126912593841553, + "learning_rate": 9.226125137211854e-07, + "loss": 1.0755, + "mean_token_accuracy": 0.6833436489105225, + "num_tokens": 42187644.0, + "step": 1682 + }, + { + "epoch": 0.184823193498792, + "grad_norm": 2.0924301147460938, + "learning_rate": 9.231613611416026e-07, + "loss": 1.0474, + "mean_token_accuracy": 0.6917504072189331, + "num_tokens": 42216550.0, + "step": 1683 + }, + { + "epoch": 0.18493301120140568, + "grad_norm": 2.374368667602539, + "learning_rate": 9.237102085620197e-07, + "loss": 0.9788, + "mean_token_accuracy": 0.6981596350669861, + "num_tokens": 42239739.0, + "step": 1684 + }, + { + "epoch": 0.18504282890401932, + "grad_norm": 2.2711400985717773, + "learning_rate": 9.242590559824368e-07, + "loss": 0.9415, + "mean_token_accuracy": 0.7155580520629883, + "num_tokens": 42264864.0, + "step": 1685 + }, + { + "epoch": 0.185152646606633, + "grad_norm": 2.5004332065582275, + "learning_rate": 9.24807903402854e-07, + "loss": 1.0163, + "mean_token_accuracy": 0.7027220726013184, + "num_tokens": 42286754.0, + "step": 1686 + }, + { + "epoch": 0.18526246430924664, + "grad_norm": 2.1517059803009033, + "learning_rate": 9.253567508232711e-07, + "loss": 1.0456, + "mean_token_accuracy": 0.6928144097328186, + "num_tokens": 42315288.0, + "step": 1687 + }, + { + "epoch": 0.18537228201186032, + "grad_norm": 2.337183713912964, + "learning_rate": 9.259055982436882e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6882188320159912, + "num_tokens": 42339405.0, + "step": 1688 + }, + { + "epoch": 0.18548209971447396, + "grad_norm": 2.387774705886841, + "learning_rate": 9.264544456641053e-07, + "loss": 0.8782, + "mean_token_accuracy": 0.7326235771179199, + "num_tokens": 42361931.0, + "step": 1689 + }, + { + "epoch": 0.18559191741708764, + "grad_norm": 2.6231372356414795, + "learning_rate": 9.270032930845224e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.6975599527359009, + "num_tokens": 42382208.0, + "step": 1690 + }, + { + "epoch": 0.18570173511970128, + "grad_norm": 2.33967924118042, + "learning_rate": 9.275521405049396e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.697332501411438, + "num_tokens": 42406246.0, + "step": 1691 + }, + { + "epoch": 0.18581155282231496, + "grad_norm": 2.585603713989258, + "learning_rate": 9.281009879253567e-07, + "loss": 1.0077, + "mean_token_accuracy": 0.6953816413879395, + "num_tokens": 42428244.0, + "step": 1692 + }, + { + "epoch": 0.18592137052492863, + "grad_norm": 2.064666986465454, + "learning_rate": 9.286498353457738e-07, + "loss": 0.9609, + "mean_token_accuracy": 0.7145482897758484, + "num_tokens": 42457477.0, + "step": 1693 + }, + { + "epoch": 0.18603118822754228, + "grad_norm": 2.107158660888672, + "learning_rate": 9.29198682766191e-07, + "loss": 1.0177, + "mean_token_accuracy": 0.6935683488845825, + "num_tokens": 42485955.0, + "step": 1694 + }, + { + "epoch": 0.18614100593015595, + "grad_norm": 2.2056119441986084, + "learning_rate": 9.297475301866081e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6854350566864014, + "num_tokens": 42513679.0, + "step": 1695 + }, + { + "epoch": 0.1862508236327696, + "grad_norm": 2.5432639122009277, + "learning_rate": 9.302963776070252e-07, + "loss": 1.0421, + "mean_token_accuracy": 0.6877461075782776, + "num_tokens": 42535756.0, + "step": 1696 + }, + { + "epoch": 0.18636064133538327, + "grad_norm": 2.5346839427948, + "learning_rate": 9.308452250274424e-07, + "loss": 1.0406, + "mean_token_accuracy": 0.7001357078552246, + "num_tokens": 42557115.0, + "step": 1697 + }, + { + "epoch": 0.18647045903799692, + "grad_norm": 2.561086893081665, + "learning_rate": 9.313940724478595e-07, + "loss": 1.0658, + "mean_token_accuracy": 0.675683856010437, + "num_tokens": 42577966.0, + "step": 1698 + }, + { + "epoch": 0.1865802767406106, + "grad_norm": 2.50807523727417, + "learning_rate": 9.319429198682766e-07, + "loss": 0.9847, + "mean_token_accuracy": 0.702197790145874, + "num_tokens": 42598674.0, + "step": 1699 + }, + { + "epoch": 0.18669009444322424, + "grad_norm": 2.1029586791992188, + "learning_rate": 9.324917672886937e-07, + "loss": 1.0844, + "mean_token_accuracy": 0.671958327293396, + "num_tokens": 42627398.0, + "step": 1700 + }, + { + "epoch": 0.1867999121458379, + "grad_norm": 2.330773115158081, + "learning_rate": 9.330406147091107e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6843385696411133, + "num_tokens": 42652544.0, + "step": 1701 + }, + { + "epoch": 0.18690972984845158, + "grad_norm": 2.3516645431518555, + "learning_rate": 9.335894621295279e-07, + "loss": 1.049, + "mean_token_accuracy": 0.6979575753211975, + "num_tokens": 42677311.0, + "step": 1702 + }, + { + "epoch": 0.18701954755106523, + "grad_norm": 2.237575054168701, + "learning_rate": 9.341383095499451e-07, + "loss": 1.0195, + "mean_token_accuracy": 0.6968290209770203, + "num_tokens": 42704485.0, + "step": 1703 + }, + { + "epoch": 0.1871293652536789, + "grad_norm": 2.4649202823638916, + "learning_rate": 9.346871569703621e-07, + "loss": 0.9496, + "mean_token_accuracy": 0.7075503468513489, + "num_tokens": 42726826.0, + "step": 1704 + }, + { + "epoch": 0.18723918295629255, + "grad_norm": 2.4227190017700195, + "learning_rate": 9.352360043907793e-07, + "loss": 1.0943, + "mean_token_accuracy": 0.6808358430862427, + "num_tokens": 42749180.0, + "step": 1705 + }, + { + "epoch": 0.18734900065890622, + "grad_norm": 2.3833658695220947, + "learning_rate": 9.357848518111965e-07, + "loss": 0.9391, + "mean_token_accuracy": 0.7102451324462891, + "num_tokens": 42774548.0, + "step": 1706 + }, + { + "epoch": 0.18745881836151987, + "grad_norm": 2.17659330368042, + "learning_rate": 9.363336992316135e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6847667694091797, + "num_tokens": 42803890.0, + "step": 1707 + }, + { + "epoch": 0.18756863606413354, + "grad_norm": 2.0407192707061768, + "learning_rate": 9.368825466520307e-07, + "loss": 0.9651, + "mean_token_accuracy": 0.710651159286499, + "num_tokens": 42834516.0, + "step": 1708 + }, + { + "epoch": 0.1876784537667472, + "grad_norm": 2.3033411502838135, + "learning_rate": 9.374313940724479e-07, + "loss": 1.0919, + "mean_token_accuracy": 0.6743879318237305, + "num_tokens": 42859994.0, + "step": 1709 + }, + { + "epoch": 0.18778827146936086, + "grad_norm": 2.8631813526153564, + "learning_rate": 9.379802414928649e-07, + "loss": 0.9498, + "mean_token_accuracy": 0.7106931209564209, + "num_tokens": 42878921.0, + "step": 1710 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 2.27752423286438, + "learning_rate": 9.385290889132821e-07, + "loss": 1.1113, + "mean_token_accuracy": 0.6677027940750122, + "num_tokens": 42909043.0, + "step": 1711 + }, + { + "epoch": 0.18800790687458818, + "grad_norm": 2.303981304168701, + "learning_rate": 9.390779363336992e-07, + "loss": 1.0566, + "mean_token_accuracy": 0.6858080625534058, + "num_tokens": 42935096.0, + "step": 1712 + }, + { + "epoch": 0.18811772457720186, + "grad_norm": 2.301063060760498, + "learning_rate": 9.396267837541163e-07, + "loss": 1.0446, + "mean_token_accuracy": 0.6880849003791809, + "num_tokens": 42961044.0, + "step": 1713 + }, + { + "epoch": 0.1882275422798155, + "grad_norm": 2.041837453842163, + "learning_rate": 9.401756311745335e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6941601037979126, + "num_tokens": 42994847.0, + "step": 1714 + }, + { + "epoch": 0.18833735998242918, + "grad_norm": 2.2207207679748535, + "learning_rate": 9.407244785949506e-07, + "loss": 1.0204, + "mean_token_accuracy": 0.6954166889190674, + "num_tokens": 43021074.0, + "step": 1715 + }, + { + "epoch": 0.18844717768504282, + "grad_norm": 2.1598713397979736, + "learning_rate": 9.412733260153677e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.6822290420532227, + "num_tokens": 43049322.0, + "step": 1716 + }, + { + "epoch": 0.1885569953876565, + "grad_norm": 2.301452398300171, + "learning_rate": 9.418221734357849e-07, + "loss": 0.9221, + "mean_token_accuracy": 0.7223076820373535, + "num_tokens": 43073059.0, + "step": 1717 + }, + { + "epoch": 0.18866681309027014, + "grad_norm": 2.12516450881958, + "learning_rate": 9.42371020856202e-07, + "loss": 1.0317, + "mean_token_accuracy": 0.6999780535697937, + "num_tokens": 43100405.0, + "step": 1718 + }, + { + "epoch": 0.18877663079288381, + "grad_norm": 2.561347723007202, + "learning_rate": 9.42919868276619e-07, + "loss": 1.0131, + "mean_token_accuracy": 0.6906530261039734, + "num_tokens": 43119959.0, + "step": 1719 + }, + { + "epoch": 0.18888644849549746, + "grad_norm": 2.0343894958496094, + "learning_rate": 9.434687156970362e-07, + "loss": 0.973, + "mean_token_accuracy": 0.6935677528381348, + "num_tokens": 43148542.0, + "step": 1720 + }, + { + "epoch": 0.18899626619811113, + "grad_norm": 2.125866651535034, + "learning_rate": 9.440175631174532e-07, + "loss": 0.999, + "mean_token_accuracy": 0.7004241347312927, + "num_tokens": 43176263.0, + "step": 1721 + }, + { + "epoch": 0.1891060839007248, + "grad_norm": 2.2086400985717773, + "learning_rate": 9.445664105378704e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.6798086166381836, + "num_tokens": 43203919.0, + "step": 1722 + }, + { + "epoch": 0.18921590160333845, + "grad_norm": 2.074054718017578, + "learning_rate": 9.451152579582875e-07, + "loss": 1.033, + "mean_token_accuracy": 0.6947149634361267, + "num_tokens": 43233475.0, + "step": 1723 + }, + { + "epoch": 0.18932571930595213, + "grad_norm": 2.1414127349853516, + "learning_rate": 9.456641053787046e-07, + "loss": 0.9308, + "mean_token_accuracy": 0.707471489906311, + "num_tokens": 43259385.0, + "step": 1724 + }, + { + "epoch": 0.18943553700856577, + "grad_norm": 2.3735125064849854, + "learning_rate": 9.462129527991218e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.6878987550735474, + "num_tokens": 43283133.0, + "step": 1725 + }, + { + "epoch": 0.18954535471117945, + "grad_norm": 2.238878011703491, + "learning_rate": 9.467618002195389e-07, + "loss": 1.082, + "mean_token_accuracy": 0.6880273818969727, + "num_tokens": 43312087.0, + "step": 1726 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 2.3622002601623535, + "learning_rate": 9.47310647639956e-07, + "loss": 0.9403, + "mean_token_accuracy": 0.7133525609970093, + "num_tokens": 43338051.0, + "step": 1727 + }, + { + "epoch": 0.18976499011640677, + "grad_norm": 2.291858434677124, + "learning_rate": 9.478594950603732e-07, + "loss": 1.0581, + "mean_token_accuracy": 0.6852458119392395, + "num_tokens": 43363304.0, + "step": 1728 + }, + { + "epoch": 0.1898748078190204, + "grad_norm": 2.4687464237213135, + "learning_rate": 9.484083424807903e-07, + "loss": 0.9849, + "mean_token_accuracy": 0.7012256383895874, + "num_tokens": 43385517.0, + "step": 1729 + }, + { + "epoch": 0.1899846255216341, + "grad_norm": 2.067894697189331, + "learning_rate": 9.489571899012074e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6883291006088257, + "num_tokens": 43416747.0, + "step": 1730 + }, + { + "epoch": 0.19009444322424776, + "grad_norm": 2.0250890254974365, + "learning_rate": 9.495060373216246e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.7054661512374878, + "num_tokens": 43446059.0, + "step": 1731 + }, + { + "epoch": 0.1902042609268614, + "grad_norm": 2.2478809356689453, + "learning_rate": 9.500548847420417e-07, + "loss": 0.9946, + "mean_token_accuracy": 0.6985995769500732, + "num_tokens": 43470756.0, + "step": 1732 + }, + { + "epoch": 0.19031407862947508, + "grad_norm": 2.4246816635131836, + "learning_rate": 9.506037321624588e-07, + "loss": 0.9496, + "mean_token_accuracy": 0.7073412537574768, + "num_tokens": 43492956.0, + "step": 1733 + }, + { + "epoch": 0.19042389633208873, + "grad_norm": 2.407482385635376, + "learning_rate": 9.511525795828759e-07, + "loss": 0.9858, + "mean_token_accuracy": 0.705222487449646, + "num_tokens": 43516773.0, + "step": 1734 + }, + { + "epoch": 0.1905337140347024, + "grad_norm": 2.012791633605957, + "learning_rate": 9.517014270032931e-07, + "loss": 1.1042, + "mean_token_accuracy": 0.6800713539123535, + "num_tokens": 43548600.0, + "step": 1735 + }, + { + "epoch": 0.19064353173731605, + "grad_norm": 2.5509588718414307, + "learning_rate": 9.522502744237102e-07, + "loss": 0.9522, + "mean_token_accuracy": 0.7120956182479858, + "num_tokens": 43570324.0, + "step": 1736 + }, + { + "epoch": 0.19075334943992972, + "grad_norm": 2.002777099609375, + "learning_rate": 9.527991218441273e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6814852952957153, + "num_tokens": 43604476.0, + "step": 1737 + }, + { + "epoch": 0.19086316714254337, + "grad_norm": 2.104506731033325, + "learning_rate": 9.533479692645444e-07, + "loss": 1.0711, + "mean_token_accuracy": 0.678946316242218, + "num_tokens": 43635782.0, + "step": 1738 + }, + { + "epoch": 0.19097298484515704, + "grad_norm": 2.378434658050537, + "learning_rate": 9.538968166849616e-07, + "loss": 0.9325, + "mean_token_accuracy": 0.7149689197540283, + "num_tokens": 43658965.0, + "step": 1739 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 2.6023573875427246, + "learning_rate": 9.544456641053787e-07, + "loss": 0.9559, + "mean_token_accuracy": 0.7115628123283386, + "num_tokens": 43677990.0, + "step": 1740 + }, + { + "epoch": 0.19119262025038436, + "grad_norm": 2.3048059940338135, + "learning_rate": 9.549945115257958e-07, + "loss": 1.0938, + "mean_token_accuracy": 0.6739806532859802, + "num_tokens": 43705257.0, + "step": 1741 + }, + { + "epoch": 0.19130243795299803, + "grad_norm": 2.2801272869110107, + "learning_rate": 9.55543358946213e-07, + "loss": 0.9714, + "mean_token_accuracy": 0.7025678157806396, + "num_tokens": 43730465.0, + "step": 1742 + }, + { + "epoch": 0.19141225565561168, + "grad_norm": 2.019120931625366, + "learning_rate": 9.5609220636663e-07, + "loss": 1.0375, + "mean_token_accuracy": 0.6830954551696777, + "num_tokens": 43762500.0, + "step": 1743 + }, + { + "epoch": 0.19152207335822535, + "grad_norm": 2.345323085784912, + "learning_rate": 9.566410537870472e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6744175553321838, + "num_tokens": 43786982.0, + "step": 1744 + }, + { + "epoch": 0.191631891060839, + "grad_norm": 2.2429072856903076, + "learning_rate": 9.571899012074642e-07, + "loss": 1.1017, + "mean_token_accuracy": 0.6682946681976318, + "num_tokens": 43813886.0, + "step": 1745 + }, + { + "epoch": 0.19174170876345267, + "grad_norm": 2.604389190673828, + "learning_rate": 9.577387486278815e-07, + "loss": 0.9778, + "mean_token_accuracy": 0.6988329291343689, + "num_tokens": 43834292.0, + "step": 1746 + }, + { + "epoch": 0.19185152646606632, + "grad_norm": 2.3763911724090576, + "learning_rate": 9.582875960482986e-07, + "loss": 0.9113, + "mean_token_accuracy": 0.7183761596679688, + "num_tokens": 43856902.0, + "step": 1747 + }, + { + "epoch": 0.19196134416868, + "grad_norm": 2.203001022338867, + "learning_rate": 9.588364434687156e-07, + "loss": 1.0889, + "mean_token_accuracy": 0.6840697526931763, + "num_tokens": 43884906.0, + "step": 1748 + }, + { + "epoch": 0.19207116187129367, + "grad_norm": 2.5260088443756104, + "learning_rate": 9.593852908891327e-07, + "loss": 0.8955, + "mean_token_accuracy": 0.7202033996582031, + "num_tokens": 43905654.0, + "step": 1749 + }, + { + "epoch": 0.1921809795739073, + "grad_norm": 2.273998737335205, + "learning_rate": 9.5993413830955e-07, + "loss": 0.9869, + "mean_token_accuracy": 0.699866533279419, + "num_tokens": 43929113.0, + "step": 1750 + }, + { + "epoch": 0.19229079727652099, + "grad_norm": 2.4463353157043457, + "learning_rate": 9.60482985729967e-07, + "loss": 1.0602, + "mean_token_accuracy": 0.6855489015579224, + "num_tokens": 43951492.0, + "step": 1751 + }, + { + "epoch": 0.19240061497913463, + "grad_norm": 2.330721139907837, + "learning_rate": 9.61031833150384e-07, + "loss": 1.0041, + "mean_token_accuracy": 0.6973105669021606, + "num_tokens": 43976885.0, + "step": 1752 + }, + { + "epoch": 0.1925104326817483, + "grad_norm": 2.5114166736602783, + "learning_rate": 9.615806805708014e-07, + "loss": 1.0293, + "mean_token_accuracy": 0.6941750049591064, + "num_tokens": 43999679.0, + "step": 1753 + }, + { + "epoch": 0.19262025038436195, + "grad_norm": 2.0689303874969482, + "learning_rate": 9.621295279912184e-07, + "loss": 1.0847, + "mean_token_accuracy": 0.6774717569351196, + "num_tokens": 44028717.0, + "step": 1754 + }, + { + "epoch": 0.19273006808697563, + "grad_norm": 2.42579984664917, + "learning_rate": 9.626783754116355e-07, + "loss": 0.999, + "mean_token_accuracy": 0.7036312818527222, + "num_tokens": 44058615.0, + "step": 1755 + }, + { + "epoch": 0.19283988578958927, + "grad_norm": 2.5613579750061035, + "learning_rate": 9.632272228320525e-07, + "loss": 0.9365, + "mean_token_accuracy": 0.7139190435409546, + "num_tokens": 44078378.0, + "step": 1756 + }, + { + "epoch": 0.19294970349220295, + "grad_norm": 2.1200196743011475, + "learning_rate": 9.637760702524698e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6818646192550659, + "num_tokens": 44106859.0, + "step": 1757 + }, + { + "epoch": 0.1930595211948166, + "grad_norm": 2.396864891052246, + "learning_rate": 9.643249176728869e-07, + "loss": 1.0354, + "mean_token_accuracy": 0.6885242462158203, + "num_tokens": 44132268.0, + "step": 1758 + }, + { + "epoch": 0.19316933889743026, + "grad_norm": 2.3867690563201904, + "learning_rate": 9.64873765093304e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6796213388442993, + "num_tokens": 44157131.0, + "step": 1759 + }, + { + "epoch": 0.19327915660004394, + "grad_norm": 2.1986711025238037, + "learning_rate": 9.654226125137212e-07, + "loss": 1.0409, + "mean_token_accuracy": 0.6897527575492859, + "num_tokens": 44184567.0, + "step": 1760 + }, + { + "epoch": 0.19338897430265758, + "grad_norm": 2.230116844177246, + "learning_rate": 9.659714599341383e-07, + "loss": 1.1144, + "mean_token_accuracy": 0.6797081232070923, + "num_tokens": 44213724.0, + "step": 1761 + }, + { + "epoch": 0.19349879200527126, + "grad_norm": 1.9991823434829712, + "learning_rate": 9.665203073545553e-07, + "loss": 1.1069, + "mean_token_accuracy": 0.6755950450897217, + "num_tokens": 44245947.0, + "step": 1762 + }, + { + "epoch": 0.1936086097078849, + "grad_norm": 2.455017328262329, + "learning_rate": 9.670691547749726e-07, + "loss": 1.043, + "mean_token_accuracy": 0.6801794767379761, + "num_tokens": 44271529.0, + "step": 1763 + }, + { + "epoch": 0.19371842741049858, + "grad_norm": 2.2331607341766357, + "learning_rate": 9.676180021953897e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.6942092180252075, + "num_tokens": 44299335.0, + "step": 1764 + }, + { + "epoch": 0.19382824511311222, + "grad_norm": 2.329925060272217, + "learning_rate": 9.681668496158067e-07, + "loss": 0.9452, + "mean_token_accuracy": 0.7165969610214233, + "num_tokens": 44323375.0, + "step": 1765 + }, + { + "epoch": 0.1939380628157259, + "grad_norm": 2.63470721244812, + "learning_rate": 9.68715697036224e-07, + "loss": 1.0224, + "mean_token_accuracy": 0.6907235383987427, + "num_tokens": 44343128.0, + "step": 1766 + }, + { + "epoch": 0.19404788051833954, + "grad_norm": 2.8419189453125, + "learning_rate": 9.692645444566409e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.6929669380187988, + "num_tokens": 44361695.0, + "step": 1767 + }, + { + "epoch": 0.19415769822095322, + "grad_norm": 2.471193552017212, + "learning_rate": 9.698133918770581e-07, + "loss": 1.0303, + "mean_token_accuracy": 0.6929081678390503, + "num_tokens": 44385021.0, + "step": 1768 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 2.3825535774230957, + "learning_rate": 9.703622392974752e-07, + "loss": 1.048, + "mean_token_accuracy": 0.6894923448562622, + "num_tokens": 44408055.0, + "step": 1769 + }, + { + "epoch": 0.19437733362618054, + "grad_norm": 2.2983059883117676, + "learning_rate": 9.709110867178923e-07, + "loss": 0.9671, + "mean_token_accuracy": 0.706169605255127, + "num_tokens": 44431320.0, + "step": 1770 + }, + { + "epoch": 0.1944871513287942, + "grad_norm": 2.3688249588012695, + "learning_rate": 9.714599341383095e-07, + "loss": 1.0649, + "mean_token_accuracy": 0.6843432188034058, + "num_tokens": 44456297.0, + "step": 1771 + }, + { + "epoch": 0.19459696903140786, + "grad_norm": 2.012962579727173, + "learning_rate": 9.720087815587266e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.7019718885421753, + "num_tokens": 44485968.0, + "step": 1772 + }, + { + "epoch": 0.19470678673402153, + "grad_norm": 2.60888671875, + "learning_rate": 9.725576289791437e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.6796259880065918, + "num_tokens": 44507285.0, + "step": 1773 + }, + { + "epoch": 0.19481660443663518, + "grad_norm": 2.2229747772216797, + "learning_rate": 9.73106476399561e-07, + "loss": 1.0024, + "mean_token_accuracy": 0.6911605596542358, + "num_tokens": 44534362.0, + "step": 1774 + }, + { + "epoch": 0.19492642213924885, + "grad_norm": 2.34677791595459, + "learning_rate": 9.73655323819978e-07, + "loss": 1.0149, + "mean_token_accuracy": 0.6915988326072693, + "num_tokens": 44560700.0, + "step": 1775 + }, + { + "epoch": 0.1950362398418625, + "grad_norm": 2.476642608642578, + "learning_rate": 9.74204171240395e-07, + "loss": 0.9826, + "mean_token_accuracy": 0.7025781273841858, + "num_tokens": 44582511.0, + "step": 1776 + }, + { + "epoch": 0.19514605754447617, + "grad_norm": 2.051626682281494, + "learning_rate": 9.747530186608123e-07, + "loss": 1.1075, + "mean_token_accuracy": 0.6756128072738647, + "num_tokens": 44615940.0, + "step": 1777 + }, + { + "epoch": 0.19525587524708984, + "grad_norm": 2.107551336288452, + "learning_rate": 9.753018660812294e-07, + "loss": 1.107, + "mean_token_accuracy": 0.6683595180511475, + "num_tokens": 44644920.0, + "step": 1778 + }, + { + "epoch": 0.1953656929497035, + "grad_norm": 2.3616080284118652, + "learning_rate": 9.758507135016465e-07, + "loss": 1.0225, + "mean_token_accuracy": 0.6896378397941589, + "num_tokens": 44668972.0, + "step": 1779 + }, + { + "epoch": 0.19547551065231716, + "grad_norm": 2.2911112308502197, + "learning_rate": 9.763995609220637e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.6844778656959534, + "num_tokens": 44694128.0, + "step": 1780 + }, + { + "epoch": 0.1955853283549308, + "grad_norm": 2.1423187255859375, + "learning_rate": 9.769484083424808e-07, + "loss": 1.0296, + "mean_token_accuracy": 0.6949794292449951, + "num_tokens": 44722981.0, + "step": 1781 + }, + { + "epoch": 0.19569514605754448, + "grad_norm": 2.3456718921661377, + "learning_rate": 9.774972557628979e-07, + "loss": 1.1347, + "mean_token_accuracy": 0.6649633646011353, + "num_tokens": 44748049.0, + "step": 1782 + }, + { + "epoch": 0.19580496376015813, + "grad_norm": 2.215397596359253, + "learning_rate": 9.780461031833151e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.6952489614486694, + "num_tokens": 44774542.0, + "step": 1783 + }, + { + "epoch": 0.1959147814627718, + "grad_norm": 2.7501282691955566, + "learning_rate": 9.785949506037322e-07, + "loss": 0.9739, + "mean_token_accuracy": 0.6991872787475586, + "num_tokens": 44793912.0, + "step": 1784 + }, + { + "epoch": 0.19602459916538545, + "grad_norm": 2.0865561962127686, + "learning_rate": 9.791437980241493e-07, + "loss": 0.9906, + "mean_token_accuracy": 0.7038302421569824, + "num_tokens": 44822735.0, + "step": 1785 + }, + { + "epoch": 0.19613441686799912, + "grad_norm": 2.6799216270446777, + "learning_rate": 9.796926454445663e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6923709511756897, + "num_tokens": 44842855.0, + "step": 1786 + }, + { + "epoch": 0.1962442345706128, + "grad_norm": 2.609262228012085, + "learning_rate": 9.802414928649834e-07, + "loss": 1.0243, + "mean_token_accuracy": 0.6931425929069519, + "num_tokens": 44863851.0, + "step": 1787 + }, + { + "epoch": 0.19635405227322644, + "grad_norm": 2.209914207458496, + "learning_rate": 9.807903402854007e-07, + "loss": 0.9294, + "mean_token_accuracy": 0.7247280478477478, + "num_tokens": 44888832.0, + "step": 1788 + }, + { + "epoch": 0.19646386997584012, + "grad_norm": 2.050537586212158, + "learning_rate": 9.813391877058177e-07, + "loss": 1.058, + "mean_token_accuracy": 0.685721755027771, + "num_tokens": 44920750.0, + "step": 1789 + }, + { + "epoch": 0.19657368767845376, + "grad_norm": 2.3405275344848633, + "learning_rate": 9.818880351262348e-07, + "loss": 0.9198, + "mean_token_accuracy": 0.7159318923950195, + "num_tokens": 44943306.0, + "step": 1790 + }, + { + "epoch": 0.19668350538106744, + "grad_norm": 2.4028639793395996, + "learning_rate": 9.82436882546652e-07, + "loss": 1.0817, + "mean_token_accuracy": 0.6755192279815674, + "num_tokens": 44967949.0, + "step": 1791 + }, + { + "epoch": 0.19679332308368108, + "grad_norm": 2.28031063079834, + "learning_rate": 9.829857299670691e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.7008576393127441, + "num_tokens": 44993012.0, + "step": 1792 + }, + { + "epoch": 0.19690314078629476, + "grad_norm": 2.3331472873687744, + "learning_rate": 9.835345773874862e-07, + "loss": 0.9691, + "mean_token_accuracy": 0.7054033279418945, + "num_tokens": 45017204.0, + "step": 1793 + }, + { + "epoch": 0.1970129584889084, + "grad_norm": 2.296215772628784, + "learning_rate": 9.840834248079035e-07, + "loss": 0.9979, + "mean_token_accuracy": 0.7080838084220886, + "num_tokens": 45044531.0, + "step": 1794 + }, + { + "epoch": 0.19712277619152208, + "grad_norm": 2.6079466342926025, + "learning_rate": 9.846322722283205e-07, + "loss": 0.9315, + "mean_token_accuracy": 0.7162137031555176, + "num_tokens": 45064908.0, + "step": 1795 + }, + { + "epoch": 0.19723259389413572, + "grad_norm": 2.3505091667175293, + "learning_rate": 9.851811196487376e-07, + "loss": 1.0469, + "mean_token_accuracy": 0.689140260219574, + "num_tokens": 45088420.0, + "step": 1796 + }, + { + "epoch": 0.1973424115967494, + "grad_norm": 2.550914764404297, + "learning_rate": 9.857299670691546e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6846412420272827, + "num_tokens": 45109052.0, + "step": 1797 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 2.2871174812316895, + "learning_rate": 9.86278814489572e-07, + "loss": 0.9911, + "mean_token_accuracy": 0.6995629072189331, + "num_tokens": 45132677.0, + "step": 1798 + }, + { + "epoch": 0.19756204700197671, + "grad_norm": 2.048579692840576, + "learning_rate": 9.86827661909989e-07, + "loss": 1.0825, + "mean_token_accuracy": 0.6789509654045105, + "num_tokens": 45163741.0, + "step": 1799 + }, + { + "epoch": 0.1976718647045904, + "grad_norm": 2.238921880722046, + "learning_rate": 9.87376509330406e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.6960175037384033, + "num_tokens": 45191249.0, + "step": 1800 + }, + { + "epoch": 0.19778168240720403, + "grad_norm": 2.402751922607422, + "learning_rate": 9.879253567508233e-07, + "loss": 1.0131, + "mean_token_accuracy": 0.6950619220733643, + "num_tokens": 45215309.0, + "step": 1801 + }, + { + "epoch": 0.1978915001098177, + "grad_norm": 2.5687413215637207, + "learning_rate": 9.884742041712404e-07, + "loss": 0.9638, + "mean_token_accuracy": 0.7086860537528992, + "num_tokens": 45234585.0, + "step": 1802 + }, + { + "epoch": 0.19800131781243135, + "grad_norm": 2.268040180206299, + "learning_rate": 9.890230515916574e-07, + "loss": 1.013, + "mean_token_accuracy": 0.6958433389663696, + "num_tokens": 45260154.0, + "step": 1803 + }, + { + "epoch": 0.19811113551504503, + "grad_norm": 2.6004791259765625, + "learning_rate": 9.895718990120747e-07, + "loss": 0.8868, + "mean_token_accuracy": 0.7240527868270874, + "num_tokens": 45281629.0, + "step": 1804 + }, + { + "epoch": 0.19822095321765867, + "grad_norm": 2.2561757564544678, + "learning_rate": 9.901207464324918e-07, + "loss": 0.9793, + "mean_token_accuracy": 0.7094178795814514, + "num_tokens": 45306375.0, + "step": 1805 + }, + { + "epoch": 0.19833077092027235, + "grad_norm": 2.294628143310547, + "learning_rate": 9.906695938529088e-07, + "loss": 0.9591, + "mean_token_accuracy": 0.7150183916091919, + "num_tokens": 45331238.0, + "step": 1806 + }, + { + "epoch": 0.19844058862288602, + "grad_norm": 2.1299214363098145, + "learning_rate": 9.91218441273326e-07, + "loss": 1.0037, + "mean_token_accuracy": 0.6962621212005615, + "num_tokens": 45359519.0, + "step": 1807 + }, + { + "epoch": 0.19855040632549967, + "grad_norm": 2.486145257949829, + "learning_rate": 9.91767288693743e-07, + "loss": 1.1176, + "mean_token_accuracy": 0.691024899482727, + "num_tokens": 45384826.0, + "step": 1808 + }, + { + "epoch": 0.19866022402811334, + "grad_norm": 2.5691699981689453, + "learning_rate": 9.923161361141602e-07, + "loss": 1.0408, + "mean_token_accuracy": 0.6869226098060608, + "num_tokens": 45406611.0, + "step": 1809 + }, + { + "epoch": 0.198770041730727, + "grad_norm": 2.467947483062744, + "learning_rate": 9.928649835345773e-07, + "loss": 1.0075, + "mean_token_accuracy": 0.6929025650024414, + "num_tokens": 45430919.0, + "step": 1810 + }, + { + "epoch": 0.19887985943334066, + "grad_norm": 2.368643045425415, + "learning_rate": 9.934138309549944e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6806666254997253, + "num_tokens": 45456588.0, + "step": 1811 + }, + { + "epoch": 0.1989896771359543, + "grad_norm": 2.511106491088867, + "learning_rate": 9.939626783754116e-07, + "loss": 0.9654, + "mean_token_accuracy": 0.7089542150497437, + "num_tokens": 45478367.0, + "step": 1812 + }, + { + "epoch": 0.19909949483856798, + "grad_norm": 2.2909157276153564, + "learning_rate": 9.945115257958287e-07, + "loss": 1.0705, + "mean_token_accuracy": 0.6865952014923096, + "num_tokens": 45503510.0, + "step": 1813 + }, + { + "epoch": 0.19920931254118163, + "grad_norm": 2.3099446296691895, + "learning_rate": 9.950603732162458e-07, + "loss": 0.984, + "mean_token_accuracy": 0.707017183303833, + "num_tokens": 45530501.0, + "step": 1814 + }, + { + "epoch": 0.1993191302437953, + "grad_norm": 2.2454166412353516, + "learning_rate": 9.95609220636663e-07, + "loss": 1.1582, + "mean_token_accuracy": 0.6588767766952515, + "num_tokens": 45557220.0, + "step": 1815 + }, + { + "epoch": 0.19942894794640897, + "grad_norm": 2.342351198196411, + "learning_rate": 9.9615806805708e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.691767156124115, + "num_tokens": 45581333.0, + "step": 1816 + }, + { + "epoch": 0.19953876564902262, + "grad_norm": 2.926016330718994, + "learning_rate": 9.967069154774972e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7037192583084106, + "num_tokens": 45597998.0, + "step": 1817 + }, + { + "epoch": 0.1996485833516363, + "grad_norm": 2.1625638008117676, + "learning_rate": 9.972557628979144e-07, + "loss": 1.072, + "mean_token_accuracy": 0.6872937679290771, + "num_tokens": 45631148.0, + "step": 1818 + }, + { + "epoch": 0.19975840105424994, + "grad_norm": 2.850992441177368, + "learning_rate": 9.978046103183315e-07, + "loss": 0.9363, + "mean_token_accuracy": 0.7072917222976685, + "num_tokens": 45648214.0, + "step": 1819 + }, + { + "epoch": 0.1998682187568636, + "grad_norm": 2.2086668014526367, + "learning_rate": 9.983534577387486e-07, + "loss": 1.0415, + "mean_token_accuracy": 0.6867694854736328, + "num_tokens": 45677606.0, + "step": 1820 + }, + { + "epoch": 0.19997803645947726, + "grad_norm": 2.331897735595703, + "learning_rate": 9.989023051591658e-07, + "loss": 1.0342, + "mean_token_accuracy": 0.6922863721847534, + "num_tokens": 45702727.0, + "step": 1821 + }, + { + "epoch": 0.20008785416209093, + "grad_norm": 1.9602701663970947, + "learning_rate": 9.994511525795829e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6709454655647278, + "num_tokens": 45734821.0, + "step": 1822 + }, + { + "epoch": 0.20019767186470458, + "grad_norm": 2.332500457763672, + "learning_rate": 1e-06, + "loss": 1.1228, + "mean_token_accuracy": 0.6672888398170471, + "num_tokens": 45759174.0, + "step": 1823 + }, + { + "epoch": 0.20030748956731825, + "grad_norm": 2.877493143081665, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6984584331512451, + "num_tokens": 45780887.0, + "step": 1824 + }, + { + "epoch": 0.2004173072699319, + "grad_norm": 2.659433126449585, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7228422164916992, + "num_tokens": 45797764.0, + "step": 1825 + }, + { + "epoch": 0.20052712497254557, + "grad_norm": 2.597841262817383, + "learning_rate": 1e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.6815930604934692, + "num_tokens": 45824579.0, + "step": 1826 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 2.182420253753662, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6953352689743042, + "num_tokens": 45852624.0, + "step": 1827 + }, + { + "epoch": 0.2007467603777729, + "grad_norm": 2.4284796714782715, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7069715261459351, + "num_tokens": 45875265.0, + "step": 1828 + }, + { + "epoch": 0.20085657808038657, + "grad_norm": 2.161654472351074, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7109653353691101, + "num_tokens": 45901910.0, + "step": 1829 + }, + { + "epoch": 0.2009663957830002, + "grad_norm": 2.5271005630493164, + "learning_rate": 1e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6749303340911865, + "num_tokens": 45924999.0, + "step": 1830 + }, + { + "epoch": 0.20107621348561389, + "grad_norm": 2.000173807144165, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7067615985870361, + "num_tokens": 45954822.0, + "step": 1831 + }, + { + "epoch": 0.20118603118822753, + "grad_norm": 2.4837119579315186, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.716705322265625, + "num_tokens": 45975659.0, + "step": 1832 + }, + { + "epoch": 0.2012958488908412, + "grad_norm": 2.239671230316162, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6933795213699341, + "num_tokens": 46002715.0, + "step": 1833 + }, + { + "epoch": 0.20140566659345485, + "grad_norm": 2.7628753185272217, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6992185711860657, + "num_tokens": 46021698.0, + "step": 1834 + }, + { + "epoch": 0.20151548429606853, + "grad_norm": 2.2260639667510986, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7002230286598206, + "num_tokens": 46050663.0, + "step": 1835 + }, + { + "epoch": 0.2016253019986822, + "grad_norm": 2.714000701904297, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7043194770812988, + "num_tokens": 46069855.0, + "step": 1836 + }, + { + "epoch": 0.20173511970129585, + "grad_norm": 2.1122708320617676, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7019820213317871, + "num_tokens": 46098091.0, + "step": 1837 + }, + { + "epoch": 0.20184493740390952, + "grad_norm": 2.252415657043457, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.7042381167411804, + "num_tokens": 46124507.0, + "step": 1838 + }, + { + "epoch": 0.20195475510652316, + "grad_norm": 2.488413095474243, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7324352860450745, + "num_tokens": 46144690.0, + "step": 1839 + }, + { + "epoch": 0.20206457280913684, + "grad_norm": 2.6993038654327393, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6868875026702881, + "num_tokens": 46165910.0, + "step": 1840 + }, + { + "epoch": 0.20217439051175048, + "grad_norm": 2.5401129722595215, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6948670148849487, + "num_tokens": 46186630.0, + "step": 1841 + }, + { + "epoch": 0.20228420821436416, + "grad_norm": 2.3213307857513428, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7093467712402344, + "num_tokens": 46210179.0, + "step": 1842 + }, + { + "epoch": 0.2023940259169778, + "grad_norm": 2.320511817932129, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7095175981521606, + "num_tokens": 46234401.0, + "step": 1843 + }, + { + "epoch": 0.20250384361959148, + "grad_norm": 2.3195438385009766, + "learning_rate": 1e-06, + "loss": 1.0997, + "mean_token_accuracy": 0.6755520105361938, + "num_tokens": 46260293.0, + "step": 1844 + }, + { + "epoch": 0.20261366132220515, + "grad_norm": 2.5734288692474365, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6903430223464966, + "num_tokens": 46280736.0, + "step": 1845 + }, + { + "epoch": 0.2027234790248188, + "grad_norm": 2.126220464706421, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6777124404907227, + "num_tokens": 46308124.0, + "step": 1846 + }, + { + "epoch": 0.20283329672743247, + "grad_norm": 2.080719232559204, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7151888608932495, + "num_tokens": 46334079.0, + "step": 1847 + }, + { + "epoch": 0.20294311443004612, + "grad_norm": 2.5089619159698486, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6977742910385132, + "num_tokens": 46355826.0, + "step": 1848 + }, + { + "epoch": 0.2030529321326598, + "grad_norm": 2.4360415935516357, + "learning_rate": 1e-06, + "loss": 1.1114, + "mean_token_accuracy": 0.670729398727417, + "num_tokens": 46380262.0, + "step": 1849 + }, + { + "epoch": 0.20316274983527344, + "grad_norm": 2.5217292308807373, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6917667388916016, + "num_tokens": 46401444.0, + "step": 1850 + }, + { + "epoch": 0.2032725675378871, + "grad_norm": 2.5430078506469727, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7181265950202942, + "num_tokens": 46422419.0, + "step": 1851 + }, + { + "epoch": 0.20338238524050076, + "grad_norm": 2.178847074508667, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6799485087394714, + "num_tokens": 46449155.0, + "step": 1852 + }, + { + "epoch": 0.20349220294311443, + "grad_norm": 2.1530840396881104, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6974393129348755, + "num_tokens": 46477633.0, + "step": 1853 + }, + { + "epoch": 0.2036020206457281, + "grad_norm": 2.2106897830963135, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.679712176322937, + "num_tokens": 46505162.0, + "step": 1854 + }, + { + "epoch": 0.20371183834834175, + "grad_norm": 2.19187331199646, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.697385311126709, + "num_tokens": 46532688.0, + "step": 1855 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 2.1004207134246826, + "learning_rate": 1e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6738922595977783, + "num_tokens": 46561861.0, + "step": 1856 + }, + { + "epoch": 0.20393147375356907, + "grad_norm": 2.061755418777466, + "learning_rate": 1e-06, + "loss": 1.0927, + "mean_token_accuracy": 0.6851357221603394, + "num_tokens": 46593989.0, + "step": 1857 + }, + { + "epoch": 0.20404129145618274, + "grad_norm": 2.372373580932617, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6988562345504761, + "num_tokens": 46617386.0, + "step": 1858 + }, + { + "epoch": 0.2041511091587964, + "grad_norm": 2.4207870960235596, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.685111403465271, + "num_tokens": 46643212.0, + "step": 1859 + }, + { + "epoch": 0.20426092686141006, + "grad_norm": 2.402000665664673, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7141134142875671, + "num_tokens": 46666037.0, + "step": 1860 + }, + { + "epoch": 0.2043707445640237, + "grad_norm": 2.4642579555511475, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6938260793685913, + "num_tokens": 46688231.0, + "step": 1861 + }, + { + "epoch": 0.20448056226663738, + "grad_norm": 2.4717819690704346, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6787846088409424, + "num_tokens": 46711690.0, + "step": 1862 + }, + { + "epoch": 0.20459037996925103, + "grad_norm": 2.346247911453247, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7066881656646729, + "num_tokens": 46735568.0, + "step": 1863 + }, + { + "epoch": 0.2047001976718647, + "grad_norm": 2.1213927268981934, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6968448162078857, + "num_tokens": 46766118.0, + "step": 1864 + }, + { + "epoch": 0.20481001537447838, + "grad_norm": 2.04801082611084, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7112756967544556, + "num_tokens": 46794202.0, + "step": 1865 + }, + { + "epoch": 0.20491983307709202, + "grad_norm": 2.3650994300842285, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6754552721977234, + "num_tokens": 46817834.0, + "step": 1866 + }, + { + "epoch": 0.2050296507797057, + "grad_norm": 2.9096157550811768, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6997114419937134, + "num_tokens": 46834488.0, + "step": 1867 + }, + { + "epoch": 0.20513946848231934, + "grad_norm": 2.3178577423095703, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7137268781661987, + "num_tokens": 46859666.0, + "step": 1868 + }, + { + "epoch": 0.20524928618493302, + "grad_norm": 2.232032299041748, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6993412971496582, + "num_tokens": 46883644.0, + "step": 1869 + }, + { + "epoch": 0.20535910388754666, + "grad_norm": 2.295186758041382, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.689527153968811, + "num_tokens": 46911743.0, + "step": 1870 + }, + { + "epoch": 0.20546892159016034, + "grad_norm": 2.3661394119262695, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6951914429664612, + "num_tokens": 46937281.0, + "step": 1871 + }, + { + "epoch": 0.20557873929277398, + "grad_norm": 2.4005203247070312, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6852455139160156, + "num_tokens": 46961172.0, + "step": 1872 + }, + { + "epoch": 0.20568855699538766, + "grad_norm": 2.1156229972839355, + "learning_rate": 1e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.6778573989868164, + "num_tokens": 46992233.0, + "step": 1873 + }, + { + "epoch": 0.20579837469800133, + "grad_norm": 2.04036021232605, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6971650123596191, + "num_tokens": 47024965.0, + "step": 1874 + }, + { + "epoch": 0.20590819240061498, + "grad_norm": 2.3179712295532227, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7069308161735535, + "num_tokens": 47050078.0, + "step": 1875 + }, + { + "epoch": 0.20601801010322865, + "grad_norm": 2.36108136177063, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6895307898521423, + "num_tokens": 47074244.0, + "step": 1876 + }, + { + "epoch": 0.2061278278058423, + "grad_norm": 2.551071882247925, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7157670259475708, + "num_tokens": 47094420.0, + "step": 1877 + }, + { + "epoch": 0.20623764550845597, + "grad_norm": 2.1399905681610107, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.693347156047821, + "num_tokens": 47122132.0, + "step": 1878 + }, + { + "epoch": 0.20634746321106961, + "grad_norm": 2.6085572242736816, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.688361644744873, + "num_tokens": 47146148.0, + "step": 1879 + }, + { + "epoch": 0.2064572809136833, + "grad_norm": 2.2206685543060303, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7060161828994751, + "num_tokens": 47171357.0, + "step": 1880 + }, + { + "epoch": 0.20656709861629693, + "grad_norm": 2.176064968109131, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6711708903312683, + "num_tokens": 47200191.0, + "step": 1881 + }, + { + "epoch": 0.2066769163189106, + "grad_norm": 2.296125888824463, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6900334358215332, + "num_tokens": 47226142.0, + "step": 1882 + }, + { + "epoch": 0.20678673402152428, + "grad_norm": 2.264289140701294, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7004483342170715, + "num_tokens": 47253276.0, + "step": 1883 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 2.445981740951538, + "learning_rate": 1e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6814303994178772, + "num_tokens": 47274940.0, + "step": 1884 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 2.189636707305908, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7224977016448975, + "num_tokens": 47300609.0, + "step": 1885 + }, + { + "epoch": 0.20711618712936525, + "grad_norm": 2.1311354637145996, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6880583763122559, + "num_tokens": 47329516.0, + "step": 1886 + }, + { + "epoch": 0.20722600483197892, + "grad_norm": 2.3758225440979004, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6901623010635376, + "num_tokens": 47352270.0, + "step": 1887 + }, + { + "epoch": 0.20733582253459257, + "grad_norm": 2.274099826812744, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6859369277954102, + "num_tokens": 47378459.0, + "step": 1888 + }, + { + "epoch": 0.20744564023720624, + "grad_norm": 2.7482125759124756, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6981379389762878, + "num_tokens": 47399768.0, + "step": 1889 + }, + { + "epoch": 0.2075554579398199, + "grad_norm": 2.209374189376831, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6866070628166199, + "num_tokens": 47427175.0, + "step": 1890 + }, + { + "epoch": 0.20766527564243356, + "grad_norm": 2.1284732818603516, + "learning_rate": 1e-06, + "loss": 1.1446, + "mean_token_accuracy": 0.6621620059013367, + "num_tokens": 47459075.0, + "step": 1891 + }, + { + "epoch": 0.20777509334504723, + "grad_norm": 2.478017568588257, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7220011353492737, + "num_tokens": 47480385.0, + "step": 1892 + }, + { + "epoch": 0.20788491104766088, + "grad_norm": 2.2805674076080322, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6914801597595215, + "num_tokens": 47506756.0, + "step": 1893 + }, + { + "epoch": 0.20799472875027455, + "grad_norm": 2.299717426300049, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6712988615036011, + "num_tokens": 47531533.0, + "step": 1894 + }, + { + "epoch": 0.2081045464528882, + "grad_norm": 2.3913705348968506, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6927531957626343, + "num_tokens": 47554936.0, + "step": 1895 + }, + { + "epoch": 0.20821436415550187, + "grad_norm": 2.347529649734497, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6855113506317139, + "num_tokens": 47578793.0, + "step": 1896 + }, + { + "epoch": 0.20832418185811552, + "grad_norm": 2.7965505123138428, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7172163724899292, + "num_tokens": 47595995.0, + "step": 1897 + }, + { + "epoch": 0.2084339995607292, + "grad_norm": 2.705332040786743, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6928755044937134, + "num_tokens": 47615814.0, + "step": 1898 + }, + { + "epoch": 0.20854381726334284, + "grad_norm": 2.4818453788757324, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6857303380966187, + "num_tokens": 47639574.0, + "step": 1899 + }, + { + "epoch": 0.2086536349659565, + "grad_norm": 2.5723226070404053, + "learning_rate": 1e-06, + "loss": 1.1018, + "mean_token_accuracy": 0.6835685968399048, + "num_tokens": 47661955.0, + "step": 1900 + }, + { + "epoch": 0.20876345266857016, + "grad_norm": 1.978360652923584, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6851505041122437, + "num_tokens": 47692713.0, + "step": 1901 + }, + { + "epoch": 0.20887327037118383, + "grad_norm": 2.037982702255249, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7125787734985352, + "num_tokens": 47722267.0, + "step": 1902 + }, + { + "epoch": 0.2089830880737975, + "grad_norm": 2.4859044551849365, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6935073137283325, + "num_tokens": 47742537.0, + "step": 1903 + }, + { + "epoch": 0.20909290577641115, + "grad_norm": 2.100677967071533, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6957434415817261, + "num_tokens": 47769407.0, + "step": 1904 + }, + { + "epoch": 0.20920272347902483, + "grad_norm": 2.3379995822906494, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7252732515335083, + "num_tokens": 47792973.0, + "step": 1905 + }, + { + "epoch": 0.20931254118163847, + "grad_norm": 2.3494741916656494, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6990890502929688, + "num_tokens": 47815184.0, + "step": 1906 + }, + { + "epoch": 0.20942235888425215, + "grad_norm": 2.4743733406066895, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6965169906616211, + "num_tokens": 47836177.0, + "step": 1907 + }, + { + "epoch": 0.2095321765868658, + "grad_norm": 2.488767623901367, + "learning_rate": 1e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.6825988292694092, + "num_tokens": 47857926.0, + "step": 1908 + }, + { + "epoch": 0.20964199428947947, + "grad_norm": 2.2876968383789062, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6899821758270264, + "num_tokens": 47882448.0, + "step": 1909 + }, + { + "epoch": 0.2097518119920931, + "grad_norm": 2.4408090114593506, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6902826428413391, + "num_tokens": 47907513.0, + "step": 1910 + }, + { + "epoch": 0.20986162969470679, + "grad_norm": 2.016500949859619, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7196493148803711, + "num_tokens": 47936029.0, + "step": 1911 + }, + { + "epoch": 0.20997144739732046, + "grad_norm": 2.5585999488830566, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7278463244438171, + "num_tokens": 47954188.0, + "step": 1912 + }, + { + "epoch": 0.2100812650999341, + "grad_norm": 2.6101346015930176, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6861413717269897, + "num_tokens": 47976395.0, + "step": 1913 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 2.1683807373046875, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6977226734161377, + "num_tokens": 48005282.0, + "step": 1914 + }, + { + "epoch": 0.21030090050516143, + "grad_norm": 2.2994754314422607, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6951814889907837, + "num_tokens": 48029835.0, + "step": 1915 + }, + { + "epoch": 0.2104107182077751, + "grad_norm": 2.667273759841919, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7361762523651123, + "num_tokens": 48048416.0, + "step": 1916 + }, + { + "epoch": 0.21052053591038875, + "grad_norm": 2.2748219966888428, + "learning_rate": 1e-06, + "loss": 1.0796, + "mean_token_accuracy": 0.6759737730026245, + "num_tokens": 48072994.0, + "step": 1917 + }, + { + "epoch": 0.21063035361300242, + "grad_norm": 2.517132520675659, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6979280114173889, + "num_tokens": 48094533.0, + "step": 1918 + }, + { + "epoch": 0.21074017131561606, + "grad_norm": 2.544318675994873, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6876654624938965, + "num_tokens": 48115283.0, + "step": 1919 + }, + { + "epoch": 0.21084998901822974, + "grad_norm": 2.0667498111724854, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6891981363296509, + "num_tokens": 48146448.0, + "step": 1920 + }, + { + "epoch": 0.2109598067208434, + "grad_norm": 2.0953800678253174, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6895343065261841, + "num_tokens": 48173426.0, + "step": 1921 + }, + { + "epoch": 0.21106962442345706, + "grad_norm": 2.3323419094085693, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6922422647476196, + "num_tokens": 48199937.0, + "step": 1922 + }, + { + "epoch": 0.21117944212607073, + "grad_norm": 2.106785297393799, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7164175510406494, + "num_tokens": 48227240.0, + "step": 1923 + }, + { + "epoch": 0.21128925982868438, + "grad_norm": 2.2926955223083496, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6946828961372375, + "num_tokens": 48251277.0, + "step": 1924 + }, + { + "epoch": 0.21139907753129805, + "grad_norm": 2.283785104751587, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.707811713218689, + "num_tokens": 48277589.0, + "step": 1925 + }, + { + "epoch": 0.2115088952339117, + "grad_norm": 2.4667553901672363, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7087637186050415, + "num_tokens": 48299546.0, + "step": 1926 + }, + { + "epoch": 0.21161871293652537, + "grad_norm": 2.0930309295654297, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6809521913528442, + "num_tokens": 48330034.0, + "step": 1927 + }, + { + "epoch": 0.21172853063913902, + "grad_norm": 2.254617929458618, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6881766319274902, + "num_tokens": 48355682.0, + "step": 1928 + }, + { + "epoch": 0.2118383483417527, + "grad_norm": 2.395345449447632, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6800887584686279, + "num_tokens": 48379772.0, + "step": 1929 + }, + { + "epoch": 0.21194816604436637, + "grad_norm": 2.290114402770996, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7086067199707031, + "num_tokens": 48404338.0, + "step": 1930 + }, + { + "epoch": 0.21205798374698, + "grad_norm": 2.408010244369507, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7336230278015137, + "num_tokens": 48425069.0, + "step": 1931 + }, + { + "epoch": 0.21216780144959368, + "grad_norm": 2.353843927383423, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6968588829040527, + "num_tokens": 48450400.0, + "step": 1932 + }, + { + "epoch": 0.21227761915220733, + "grad_norm": 2.687779188156128, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.698796272277832, + "num_tokens": 48469967.0, + "step": 1933 + }, + { + "epoch": 0.212387436854821, + "grad_norm": 2.08579158782959, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6902489066123962, + "num_tokens": 48501646.0, + "step": 1934 + }, + { + "epoch": 0.21249725455743465, + "grad_norm": 2.4702534675598145, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7306920289993286, + "num_tokens": 48520893.0, + "step": 1935 + }, + { + "epoch": 0.21260707226004832, + "grad_norm": 2.4453930854797363, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6988340616226196, + "num_tokens": 48542540.0, + "step": 1936 + }, + { + "epoch": 0.21271688996266197, + "grad_norm": 2.054527759552002, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6957428455352783, + "num_tokens": 48574003.0, + "step": 1937 + }, + { + "epoch": 0.21282670766527564, + "grad_norm": 2.151275157928467, + "learning_rate": 1e-06, + "loss": 1.1079, + "mean_token_accuracy": 0.67302405834198, + "num_tokens": 48603807.0, + "step": 1938 + }, + { + "epoch": 0.2129365253678893, + "grad_norm": 1.9336066246032715, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6888704299926758, + "num_tokens": 48638572.0, + "step": 1939 + }, + { + "epoch": 0.21304634307050296, + "grad_norm": 2.4795517921447754, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6842936873435974, + "num_tokens": 48659732.0, + "step": 1940 + }, + { + "epoch": 0.21315616077311664, + "grad_norm": 2.118434190750122, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6954622268676758, + "num_tokens": 48689512.0, + "step": 1941 + }, + { + "epoch": 0.21326597847573028, + "grad_norm": 2.0713744163513184, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6908653974533081, + "num_tokens": 48721169.0, + "step": 1942 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 2.348896026611328, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7018659114837646, + "num_tokens": 48745369.0, + "step": 1943 + }, + { + "epoch": 0.2134856138809576, + "grad_norm": 2.3579185009002686, + "learning_rate": 1e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.6829320788383484, + "num_tokens": 48769576.0, + "step": 1944 + }, + { + "epoch": 0.21359543158357128, + "grad_norm": 2.310774087905884, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6810587644577026, + "num_tokens": 48797027.0, + "step": 1945 + }, + { + "epoch": 0.21370524928618492, + "grad_norm": 2.490070104598999, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7102826833724976, + "num_tokens": 48819249.0, + "step": 1946 + }, + { + "epoch": 0.2138150669887986, + "grad_norm": 2.1678314208984375, + "learning_rate": 1e-06, + "loss": 1.1215, + "mean_token_accuracy": 0.6634421348571777, + "num_tokens": 48849966.0, + "step": 1947 + }, + { + "epoch": 0.21392488469141224, + "grad_norm": 2.0310916900634766, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7147698402404785, + "num_tokens": 48879147.0, + "step": 1948 + }, + { + "epoch": 0.21403470239402592, + "grad_norm": 2.389446973800659, + "learning_rate": 1e-06, + "loss": 1.1202, + "mean_token_accuracy": 0.6657068729400635, + "num_tokens": 48902180.0, + "step": 1949 + }, + { + "epoch": 0.2141445200966396, + "grad_norm": 2.3548996448516846, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.712221622467041, + "num_tokens": 48928364.0, + "step": 1950 + }, + { + "epoch": 0.21425433779925324, + "grad_norm": 2.203036069869995, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6895725727081299, + "num_tokens": 48954898.0, + "step": 1951 + }, + { + "epoch": 0.2143641555018669, + "grad_norm": 2.376739978790283, + "learning_rate": 1e-06, + "loss": 1.0822, + "mean_token_accuracy": 0.6755008697509766, + "num_tokens": 48979143.0, + "step": 1952 + }, + { + "epoch": 0.21447397320448056, + "grad_norm": 2.1966681480407715, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6942586302757263, + "num_tokens": 49004722.0, + "step": 1953 + }, + { + "epoch": 0.21458379090709423, + "grad_norm": 2.624897003173828, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.702076256275177, + "num_tokens": 49023120.0, + "step": 1954 + }, + { + "epoch": 0.21469360860970788, + "grad_norm": 2.708130359649658, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.712844729423523, + "num_tokens": 49042658.0, + "step": 1955 + }, + { + "epoch": 0.21480342631232155, + "grad_norm": 2.305084228515625, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7205597162246704, + "num_tokens": 49067993.0, + "step": 1956 + }, + { + "epoch": 0.2149132440149352, + "grad_norm": 2.128232717514038, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6766038537025452, + "num_tokens": 49096742.0, + "step": 1957 + }, + { + "epoch": 0.21502306171754887, + "grad_norm": 2.349188804626465, + "learning_rate": 1e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.666028618812561, + "num_tokens": 49120855.0, + "step": 1958 + }, + { + "epoch": 0.21513287942016254, + "grad_norm": 2.5341224670410156, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7030916213989258, + "num_tokens": 49142381.0, + "step": 1959 + }, + { + "epoch": 0.2152426971227762, + "grad_norm": 2.1680052280426025, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6821233034133911, + "num_tokens": 49168908.0, + "step": 1960 + }, + { + "epoch": 0.21535251482538986, + "grad_norm": 2.1557204723358154, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6806443929672241, + "num_tokens": 49197256.0, + "step": 1961 + }, + { + "epoch": 0.2154623325280035, + "grad_norm": 2.139876365661621, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6792997121810913, + "num_tokens": 49226118.0, + "step": 1962 + }, + { + "epoch": 0.21557215023061718, + "grad_norm": 2.5066516399383545, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6863616704940796, + "num_tokens": 49249637.0, + "step": 1963 + }, + { + "epoch": 0.21568196793323083, + "grad_norm": 2.057610034942627, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7092069387435913, + "num_tokens": 49277460.0, + "step": 1964 + }, + { + "epoch": 0.2157917856358445, + "grad_norm": 2.1661696434020996, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6983680725097656, + "num_tokens": 49303204.0, + "step": 1965 + }, + { + "epoch": 0.21590160333845815, + "grad_norm": 2.1186633110046387, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6963036060333252, + "num_tokens": 49330079.0, + "step": 1966 + }, + { + "epoch": 0.21601142104107182, + "grad_norm": 2.3340978622436523, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7027629613876343, + "num_tokens": 49353345.0, + "step": 1967 + }, + { + "epoch": 0.2161212387436855, + "grad_norm": 2.545502185821533, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7034558653831482, + "num_tokens": 49374369.0, + "step": 1968 + }, + { + "epoch": 0.21623105644629914, + "grad_norm": 2.232269287109375, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6968855857849121, + "num_tokens": 49400820.0, + "step": 1969 + }, + { + "epoch": 0.21634087414891282, + "grad_norm": 2.6126644611358643, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7089946866035461, + "num_tokens": 49420351.0, + "step": 1970 + }, + { + "epoch": 0.21645069185152646, + "grad_norm": 2.40936541557312, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6932113766670227, + "num_tokens": 49442905.0, + "step": 1971 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 2.242255926132202, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.690008282661438, + "num_tokens": 49468624.0, + "step": 1972 + }, + { + "epoch": 0.21667032725675378, + "grad_norm": 2.611855983734131, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7097361087799072, + "num_tokens": 49488084.0, + "step": 1973 + }, + { + "epoch": 0.21678014495936745, + "grad_norm": 2.1969997882843018, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6766388416290283, + "num_tokens": 49514818.0, + "step": 1974 + }, + { + "epoch": 0.2168899626619811, + "grad_norm": 2.1913628578186035, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6884396076202393, + "num_tokens": 49540215.0, + "step": 1975 + }, + { + "epoch": 0.21699978036459477, + "grad_norm": 2.0929999351501465, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7143265008926392, + "num_tokens": 49565741.0, + "step": 1976 + }, + { + "epoch": 0.21710959806720842, + "grad_norm": 1.979981541633606, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6835556030273438, + "num_tokens": 49600189.0, + "step": 1977 + }, + { + "epoch": 0.2172194157698221, + "grad_norm": 2.0769314765930176, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6879042983055115, + "num_tokens": 49628182.0, + "step": 1978 + }, + { + "epoch": 0.21732923347243577, + "grad_norm": 2.2049560546875, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6813538670539856, + "num_tokens": 49656826.0, + "step": 1979 + }, + { + "epoch": 0.2174390511750494, + "grad_norm": 2.2294209003448486, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7004747986793518, + "num_tokens": 49681257.0, + "step": 1980 + }, + { + "epoch": 0.2175488688776631, + "grad_norm": 2.3420441150665283, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6901834011077881, + "num_tokens": 49706238.0, + "step": 1981 + }, + { + "epoch": 0.21765868658027673, + "grad_norm": 2.279134511947632, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6732751727104187, + "num_tokens": 49732167.0, + "step": 1982 + }, + { + "epoch": 0.2177685042828904, + "grad_norm": 2.11598801612854, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.719692587852478, + "num_tokens": 49761406.0, + "step": 1983 + }, + { + "epoch": 0.21787832198550405, + "grad_norm": 2.4811277389526367, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6982370018959045, + "num_tokens": 49784110.0, + "step": 1984 + }, + { + "epoch": 0.21798813968811773, + "grad_norm": 1.930038332939148, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6901267766952515, + "num_tokens": 49817382.0, + "step": 1985 + }, + { + "epoch": 0.21809795739073137, + "grad_norm": 2.7158219814300537, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7159138321876526, + "num_tokens": 49834753.0, + "step": 1986 + }, + { + "epoch": 0.21820777509334505, + "grad_norm": 2.3230550289154053, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6806496381759644, + "num_tokens": 49859567.0, + "step": 1987 + }, + { + "epoch": 0.21831759279595872, + "grad_norm": 2.524127960205078, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6933224201202393, + "num_tokens": 49882938.0, + "step": 1988 + }, + { + "epoch": 0.21842741049857237, + "grad_norm": 2.3213388919830322, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6948152184486389, + "num_tokens": 49909698.0, + "step": 1989 + }, + { + "epoch": 0.21853722820118604, + "grad_norm": 2.7544174194335938, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7060317993164062, + "num_tokens": 49928910.0, + "step": 1990 + }, + { + "epoch": 0.21864704590379969, + "grad_norm": 2.015838861465454, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6940388083457947, + "num_tokens": 49960234.0, + "step": 1991 + }, + { + "epoch": 0.21875686360641336, + "grad_norm": 2.52349853515625, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6851421594619751, + "num_tokens": 49983500.0, + "step": 1992 + }, + { + "epoch": 0.218866681309027, + "grad_norm": 2.7516534328460693, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7165155410766602, + "num_tokens": 50001155.0, + "step": 1993 + }, + { + "epoch": 0.21897649901164068, + "grad_norm": 2.235231399536133, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7076936364173889, + "num_tokens": 50026297.0, + "step": 1994 + }, + { + "epoch": 0.21908631671425433, + "grad_norm": 2.0150651931762695, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.7004494071006775, + "num_tokens": 50057582.0, + "step": 1995 + }, + { + "epoch": 0.219196134416868, + "grad_norm": 2.432696580886841, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6798062920570374, + "num_tokens": 50081131.0, + "step": 1996 + }, + { + "epoch": 0.21930595211948167, + "grad_norm": 2.2677173614501953, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6889611482620239, + "num_tokens": 50107350.0, + "step": 1997 + }, + { + "epoch": 0.21941576982209532, + "grad_norm": 2.2869691848754883, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7147344946861267, + "num_tokens": 50132585.0, + "step": 1998 + }, + { + "epoch": 0.219525587524709, + "grad_norm": 2.5053203105926514, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7046397924423218, + "num_tokens": 50154506.0, + "step": 1999 + }, + { + "epoch": 0.21963540522732264, + "grad_norm": 1.940089464187622, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.727994441986084, + "num_tokens": 50185398.0, + "step": 2000 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 2.5057740211486816, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6873272657394409, + "num_tokens": 50207819.0, + "step": 2001 + }, + { + "epoch": 0.21985504063254996, + "grad_norm": 2.6871604919433594, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.7003942728042603, + "num_tokens": 50228212.0, + "step": 2002 + }, + { + "epoch": 0.21996485833516363, + "grad_norm": 2.199455976486206, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7132635116577148, + "num_tokens": 50254592.0, + "step": 2003 + }, + { + "epoch": 0.22007467603777728, + "grad_norm": 2.3842318058013916, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7083157300949097, + "num_tokens": 50277758.0, + "step": 2004 + }, + { + "epoch": 0.22018449374039095, + "grad_norm": 2.1762337684631348, + "learning_rate": 1e-06, + "loss": 1.1209, + "mean_token_accuracy": 0.6733766198158264, + "num_tokens": 50308294.0, + "step": 2005 + }, + { + "epoch": 0.22029431144300463, + "grad_norm": 2.645785093307495, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7162946462631226, + "num_tokens": 50329328.0, + "step": 2006 + }, + { + "epoch": 0.22040412914561827, + "grad_norm": 2.67124342918396, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7050783038139343, + "num_tokens": 50348419.0, + "step": 2007 + }, + { + "epoch": 0.22051394684823195, + "grad_norm": 2.3791110515594482, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.700188398361206, + "num_tokens": 50370377.0, + "step": 2008 + }, + { + "epoch": 0.2206237645508456, + "grad_norm": 2.3504703044891357, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.716996431350708, + "num_tokens": 50393550.0, + "step": 2009 + }, + { + "epoch": 0.22073358225345927, + "grad_norm": 2.5202250480651855, + "learning_rate": 1e-06, + "loss": 1.0765, + "mean_token_accuracy": 0.6844306588172913, + "num_tokens": 50414602.0, + "step": 2010 + }, + { + "epoch": 0.2208433999560729, + "grad_norm": 2.378375291824341, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6746969223022461, + "num_tokens": 50439593.0, + "step": 2011 + }, + { + "epoch": 0.22095321765868658, + "grad_norm": 2.2485828399658203, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7111291885375977, + "num_tokens": 50466377.0, + "step": 2012 + }, + { + "epoch": 0.22106303536130023, + "grad_norm": 2.4769883155822754, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6982402801513672, + "num_tokens": 50489122.0, + "step": 2013 + }, + { + "epoch": 0.2211728530639139, + "grad_norm": 1.9839837551116943, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7027971148490906, + "num_tokens": 50518964.0, + "step": 2014 + }, + { + "epoch": 0.22128267076652755, + "grad_norm": 2.010066509246826, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7000894546508789, + "num_tokens": 50551200.0, + "step": 2015 + }, + { + "epoch": 0.22139248846914122, + "grad_norm": 2.191150665283203, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6893037557601929, + "num_tokens": 50580387.0, + "step": 2016 + }, + { + "epoch": 0.2215023061717549, + "grad_norm": 2.3096935749053955, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.715101957321167, + "num_tokens": 50605708.0, + "step": 2017 + }, + { + "epoch": 0.22161212387436854, + "grad_norm": 2.5602402687072754, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6889089345932007, + "num_tokens": 50628579.0, + "step": 2018 + }, + { + "epoch": 0.22172194157698222, + "grad_norm": 2.445448160171509, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.7061271071434021, + "num_tokens": 50651015.0, + "step": 2019 + }, + { + "epoch": 0.22183175927959586, + "grad_norm": 2.1801950931549072, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6958354115486145, + "num_tokens": 50675320.0, + "step": 2020 + }, + { + "epoch": 0.22194157698220954, + "grad_norm": 2.3696212768554688, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6905204653739929, + "num_tokens": 50698643.0, + "step": 2021 + }, + { + "epoch": 0.22205139468482318, + "grad_norm": 2.509927272796631, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.702023983001709, + "num_tokens": 50720891.0, + "step": 2022 + }, + { + "epoch": 0.22216121238743686, + "grad_norm": 2.317652702331543, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7159804105758667, + "num_tokens": 50745897.0, + "step": 2023 + }, + { + "epoch": 0.2222710300900505, + "grad_norm": 2.528648853302002, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6849710941314697, + "num_tokens": 50767954.0, + "step": 2024 + }, + { + "epoch": 0.22238084779266418, + "grad_norm": 2.597830057144165, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6914074420928955, + "num_tokens": 50792706.0, + "step": 2025 + }, + { + "epoch": 0.22249066549527785, + "grad_norm": 2.3645195960998535, + "learning_rate": 1e-06, + "loss": 1.1203, + "mean_token_accuracy": 0.6768010854721069, + "num_tokens": 50817210.0, + "step": 2026 + }, + { + "epoch": 0.2226004831978915, + "grad_norm": 2.426093578338623, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7026334404945374, + "num_tokens": 50840237.0, + "step": 2027 + }, + { + "epoch": 0.22271030090050517, + "grad_norm": 2.380934000015259, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6897227764129639, + "num_tokens": 50865179.0, + "step": 2028 + }, + { + "epoch": 0.22282011860311882, + "grad_norm": 2.372898578643799, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7083902359008789, + "num_tokens": 50887349.0, + "step": 2029 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 2.676880359649658, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7211344242095947, + "num_tokens": 50905126.0, + "step": 2030 + }, + { + "epoch": 0.22303975400834614, + "grad_norm": 2.2432143688201904, + "learning_rate": 1e-06, + "loss": 1.1867, + "mean_token_accuracy": 0.6554964184761047, + "num_tokens": 50934450.0, + "step": 2031 + }, + { + "epoch": 0.2231495717109598, + "grad_norm": 2.2144110202789307, + "learning_rate": 1e-06, + "loss": 1.0907, + "mean_token_accuracy": 0.6707704067230225, + "num_tokens": 50963677.0, + "step": 2032 + }, + { + "epoch": 0.22325938941357346, + "grad_norm": 2.418710708618164, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6943677663803101, + "num_tokens": 50986550.0, + "step": 2033 + }, + { + "epoch": 0.22336920711618713, + "grad_norm": 2.1860036849975586, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6958105564117432, + "num_tokens": 51015032.0, + "step": 2034 + }, + { + "epoch": 0.2234790248188008, + "grad_norm": 2.0270488262176514, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7078748345375061, + "num_tokens": 51045900.0, + "step": 2035 + }, + { + "epoch": 0.22358884252141445, + "grad_norm": 2.1075127124786377, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7069454193115234, + "num_tokens": 51073380.0, + "step": 2036 + }, + { + "epoch": 0.22369866022402812, + "grad_norm": 2.2619574069976807, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7103492021560669, + "num_tokens": 51097946.0, + "step": 2037 + }, + { + "epoch": 0.22380847792664177, + "grad_norm": 2.1534974575042725, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.694696307182312, + "num_tokens": 51124719.0, + "step": 2038 + }, + { + "epoch": 0.22391829562925544, + "grad_norm": 2.0504953861236572, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7142018675804138, + "num_tokens": 51152797.0, + "step": 2039 + }, + { + "epoch": 0.2240281133318691, + "grad_norm": 2.295707941055298, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7212833762168884, + "num_tokens": 51177367.0, + "step": 2040 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 2.6426193714141846, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6791446805000305, + "num_tokens": 51196229.0, + "step": 2041 + }, + { + "epoch": 0.2242477487370964, + "grad_norm": 2.1236164569854736, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6985923051834106, + "num_tokens": 51224292.0, + "step": 2042 + }, + { + "epoch": 0.22435756643971008, + "grad_norm": 2.6443443298339844, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7318887114524841, + "num_tokens": 51246275.0, + "step": 2043 + }, + { + "epoch": 0.22446738414232376, + "grad_norm": 2.4481754302978516, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7097403407096863, + "num_tokens": 51269155.0, + "step": 2044 + }, + { + "epoch": 0.2245772018449374, + "grad_norm": 2.1876206398010254, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.704761266708374, + "num_tokens": 51295026.0, + "step": 2045 + }, + { + "epoch": 0.22468701954755108, + "grad_norm": 2.5040581226348877, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.700884222984314, + "num_tokens": 51317140.0, + "step": 2046 + }, + { + "epoch": 0.22479683725016472, + "grad_norm": 2.284451484680176, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7140674591064453, + "num_tokens": 51341199.0, + "step": 2047 + }, + { + "epoch": 0.2249066549527784, + "grad_norm": 2.387882709503174, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7092553377151489, + "num_tokens": 51363572.0, + "step": 2048 + }, + { + "epoch": 0.22501647265539204, + "grad_norm": 2.8633315563201904, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7149784564971924, + "num_tokens": 51380863.0, + "step": 2049 + }, + { + "epoch": 0.22512629035800572, + "grad_norm": 2.0694222450256348, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6806656122207642, + "num_tokens": 51413025.0, + "step": 2050 + }, + { + "epoch": 0.22523610806061936, + "grad_norm": 2.4454638957977295, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6939399838447571, + "num_tokens": 51434939.0, + "step": 2051 + }, + { + "epoch": 0.22534592576323303, + "grad_norm": 2.3640897274017334, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7120873928070068, + "num_tokens": 51456869.0, + "step": 2052 + }, + { + "epoch": 0.22545574346584668, + "grad_norm": 2.5780508518218994, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6829893589019775, + "num_tokens": 51477538.0, + "step": 2053 + }, + { + "epoch": 0.22556556116846035, + "grad_norm": 2.6993157863616943, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7055438756942749, + "num_tokens": 51496679.0, + "step": 2054 + }, + { + "epoch": 0.22567537887107403, + "grad_norm": 1.9583977460861206, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7251612544059753, + "num_tokens": 51525835.0, + "step": 2055 + }, + { + "epoch": 0.22578519657368767, + "grad_norm": 2.258171558380127, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6805970668792725, + "num_tokens": 51551933.0, + "step": 2056 + }, + { + "epoch": 0.22589501427630135, + "grad_norm": 2.4033565521240234, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6998000144958496, + "num_tokens": 51573335.0, + "step": 2057 + }, + { + "epoch": 0.226004831978915, + "grad_norm": 2.545753002166748, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6959037780761719, + "num_tokens": 51595397.0, + "step": 2058 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 2.184626579284668, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6975451707839966, + "num_tokens": 51623853.0, + "step": 2059 + }, + { + "epoch": 0.2262244673841423, + "grad_norm": 2.70933198928833, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6887143850326538, + "num_tokens": 51645011.0, + "step": 2060 + }, + { + "epoch": 0.226334285086756, + "grad_norm": 2.520411729812622, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7160496711730957, + "num_tokens": 51665920.0, + "step": 2061 + }, + { + "epoch": 0.22644410278936963, + "grad_norm": 2.199763059616089, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6917074918746948, + "num_tokens": 51694859.0, + "step": 2062 + }, + { + "epoch": 0.2265539204919833, + "grad_norm": 2.2931783199310303, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6969590187072754, + "num_tokens": 51720031.0, + "step": 2063 + }, + { + "epoch": 0.22666373819459698, + "grad_norm": 2.1366329193115234, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6887529492378235, + "num_tokens": 51748216.0, + "step": 2064 + }, + { + "epoch": 0.22677355589721063, + "grad_norm": 2.31781005859375, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6891043186187744, + "num_tokens": 51773289.0, + "step": 2065 + }, + { + "epoch": 0.2268833735998243, + "grad_norm": 2.1775104999542236, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7096447348594666, + "num_tokens": 51800581.0, + "step": 2066 + }, + { + "epoch": 0.22699319130243795, + "grad_norm": 2.5901505947113037, + "learning_rate": 1e-06, + "loss": 1.0629, + "mean_token_accuracy": 0.6834144592285156, + "num_tokens": 51823416.0, + "step": 2067 + }, + { + "epoch": 0.22710300900505162, + "grad_norm": 2.5391170978546143, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6981245279312134, + "num_tokens": 51846211.0, + "step": 2068 + }, + { + "epoch": 0.22721282670766527, + "grad_norm": 2.1071319580078125, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6860238909721375, + "num_tokens": 51874508.0, + "step": 2069 + }, + { + "epoch": 0.22732264441027894, + "grad_norm": 2.2868900299072266, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6921585202217102, + "num_tokens": 51900432.0, + "step": 2070 + }, + { + "epoch": 0.22743246211289259, + "grad_norm": 2.4704973697662354, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6994470357894897, + "num_tokens": 51919576.0, + "step": 2071 + }, + { + "epoch": 0.22754227981550626, + "grad_norm": 2.3935346603393555, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7019305229187012, + "num_tokens": 51942777.0, + "step": 2072 + }, + { + "epoch": 0.22765209751811993, + "grad_norm": 2.3889811038970947, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7247104644775391, + "num_tokens": 51964189.0, + "step": 2073 + }, + { + "epoch": 0.22776191522073358, + "grad_norm": 2.184126615524292, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6942825317382812, + "num_tokens": 51990998.0, + "step": 2074 + }, + { + "epoch": 0.22787173292334725, + "grad_norm": 2.127875566482544, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7135657072067261, + "num_tokens": 52021466.0, + "step": 2075 + }, + { + "epoch": 0.2279815506259609, + "grad_norm": 2.184844732284546, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7016189098358154, + "num_tokens": 52048641.0, + "step": 2076 + }, + { + "epoch": 0.22809136832857457, + "grad_norm": 2.354790449142456, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6914466619491577, + "num_tokens": 52074554.0, + "step": 2077 + }, + { + "epoch": 0.22820118603118822, + "grad_norm": 2.9360809326171875, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7270165681838989, + "num_tokens": 52092050.0, + "step": 2078 + }, + { + "epoch": 0.2283110037338019, + "grad_norm": 2.573380947113037, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6865663528442383, + "num_tokens": 52113250.0, + "step": 2079 + }, + { + "epoch": 0.22842082143641554, + "grad_norm": 2.459172248840332, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6857914924621582, + "num_tokens": 52135791.0, + "step": 2080 + }, + { + "epoch": 0.2285306391390292, + "grad_norm": 2.3290514945983887, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6985888481140137, + "num_tokens": 52159205.0, + "step": 2081 + }, + { + "epoch": 0.2286404568416429, + "grad_norm": 2.0746707916259766, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7090997695922852, + "num_tokens": 52189505.0, + "step": 2082 + }, + { + "epoch": 0.22875027454425653, + "grad_norm": 2.3944649696350098, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.7060106992721558, + "num_tokens": 52211636.0, + "step": 2083 + }, + { + "epoch": 0.2288600922468702, + "grad_norm": 2.3956377506256104, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6959816217422485, + "num_tokens": 52233437.0, + "step": 2084 + }, + { + "epoch": 0.22896990994948385, + "grad_norm": 2.4471724033355713, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.722902774810791, + "num_tokens": 52256214.0, + "step": 2085 + }, + { + "epoch": 0.22907972765209753, + "grad_norm": 2.5069854259490967, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7060369253158569, + "num_tokens": 52276030.0, + "step": 2086 + }, + { + "epoch": 0.22918954535471117, + "grad_norm": 2.443938732147217, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7176094055175781, + "num_tokens": 52297930.0, + "step": 2087 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 2.4547953605651855, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6931083798408508, + "num_tokens": 52319745.0, + "step": 2088 + }, + { + "epoch": 0.2294091807599385, + "grad_norm": 2.496005058288574, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7027993202209473, + "num_tokens": 52342572.0, + "step": 2089 + }, + { + "epoch": 0.22951899846255217, + "grad_norm": 2.0361106395721436, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6945194005966187, + "num_tokens": 52373653.0, + "step": 2090 + }, + { + "epoch": 0.2296288161651658, + "grad_norm": 2.481017589569092, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7070038318634033, + "num_tokens": 52396727.0, + "step": 2091 + }, + { + "epoch": 0.22973863386777948, + "grad_norm": 2.1870334148406982, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6782371401786804, + "num_tokens": 52425350.0, + "step": 2092 + }, + { + "epoch": 0.22984845157039316, + "grad_norm": 2.0951859951019287, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6824585795402527, + "num_tokens": 52453003.0, + "step": 2093 + }, + { + "epoch": 0.2299582692730068, + "grad_norm": 2.3898062705993652, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7257846593856812, + "num_tokens": 52474291.0, + "step": 2094 + }, + { + "epoch": 0.23006808697562048, + "grad_norm": 2.2269299030303955, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6993821859359741, + "num_tokens": 52500520.0, + "step": 2095 + }, + { + "epoch": 0.23017790467823412, + "grad_norm": 2.7268216609954834, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6943409442901611, + "num_tokens": 52519700.0, + "step": 2096 + }, + { + "epoch": 0.2302877223808478, + "grad_norm": 2.0787675380706787, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7445120811462402, + "num_tokens": 52545246.0, + "step": 2097 + }, + { + "epoch": 0.23039754008346144, + "grad_norm": 2.8324410915374756, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7157039642333984, + "num_tokens": 52562485.0, + "step": 2098 + }, + { + "epoch": 0.23050735778607512, + "grad_norm": 2.0606532096862793, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.700843095779419, + "num_tokens": 52590689.0, + "step": 2099 + }, + { + "epoch": 0.23061717548868876, + "grad_norm": 2.1649837493896484, + "learning_rate": 1e-06, + "loss": 1.0592, + "mean_token_accuracy": 0.6937814354896545, + "num_tokens": 52621113.0, + "step": 2100 + }, + { + "epoch": 0.23072699319130244, + "grad_norm": 2.447878837585449, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6882309913635254, + "num_tokens": 52646011.0, + "step": 2101 + }, + { + "epoch": 0.2308368108939161, + "grad_norm": 2.120190382003784, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.695233941078186, + "num_tokens": 52674458.0, + "step": 2102 + }, + { + "epoch": 0.23094662859652976, + "grad_norm": 2.724052667617798, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6928972601890564, + "num_tokens": 52694684.0, + "step": 2103 + }, + { + "epoch": 0.23105644629914343, + "grad_norm": 2.19449782371521, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6875168681144714, + "num_tokens": 52721010.0, + "step": 2104 + }, + { + "epoch": 0.23116626400175708, + "grad_norm": 2.53340744972229, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6974496245384216, + "num_tokens": 52741112.0, + "step": 2105 + }, + { + "epoch": 0.23127608170437075, + "grad_norm": 2.0844907760620117, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6936134099960327, + "num_tokens": 52768202.0, + "step": 2106 + }, + { + "epoch": 0.2313858994069844, + "grad_norm": 2.2239813804626465, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6917840838432312, + "num_tokens": 52795859.0, + "step": 2107 + }, + { + "epoch": 0.23149571710959807, + "grad_norm": 2.229133129119873, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6802245378494263, + "num_tokens": 52824795.0, + "step": 2108 + }, + { + "epoch": 0.23160553481221172, + "grad_norm": 2.486297369003296, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6906791925430298, + "num_tokens": 52845214.0, + "step": 2109 + }, + { + "epoch": 0.2317153525148254, + "grad_norm": 2.3000833988189697, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7106555700302124, + "num_tokens": 52867683.0, + "step": 2110 + }, + { + "epoch": 0.23182517021743906, + "grad_norm": 2.6710619926452637, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7059906721115112, + "num_tokens": 52886693.0, + "step": 2111 + }, + { + "epoch": 0.2319349879200527, + "grad_norm": 2.4650447368621826, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7140516042709351, + "num_tokens": 52910598.0, + "step": 2112 + }, + { + "epoch": 0.23204480562266638, + "grad_norm": 2.403285503387451, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7026346325874329, + "num_tokens": 52933300.0, + "step": 2113 + }, + { + "epoch": 0.23215462332528003, + "grad_norm": 2.56233549118042, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6932389736175537, + "num_tokens": 52954671.0, + "step": 2114 + }, + { + "epoch": 0.2322644410278937, + "grad_norm": 2.3558926582336426, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7012897729873657, + "num_tokens": 52979983.0, + "step": 2115 + }, + { + "epoch": 0.23237425873050735, + "grad_norm": 2.2540910243988037, + "learning_rate": 1e-06, + "loss": 1.0735, + "mean_token_accuracy": 0.6841545104980469, + "num_tokens": 53007352.0, + "step": 2116 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 2.0986077785491943, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7088843584060669, + "num_tokens": 53035843.0, + "step": 2117 + }, + { + "epoch": 0.23259389413573467, + "grad_norm": 2.4121248722076416, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6910326480865479, + "num_tokens": 53060692.0, + "step": 2118 + }, + { + "epoch": 0.23270371183834834, + "grad_norm": 2.333758592605591, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6831828355789185, + "num_tokens": 53086114.0, + "step": 2119 + }, + { + "epoch": 0.23281352954096202, + "grad_norm": 2.7486634254455566, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6926670074462891, + "num_tokens": 53104200.0, + "step": 2120 + }, + { + "epoch": 0.23292334724357566, + "grad_norm": 2.2791712284088135, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7311355471611023, + "num_tokens": 53125864.0, + "step": 2121 + }, + { + "epoch": 0.23303316494618934, + "grad_norm": 2.4183082580566406, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7000659108161926, + "num_tokens": 53150051.0, + "step": 2122 + }, + { + "epoch": 0.23314298264880298, + "grad_norm": 2.5637052059173584, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.698306143283844, + "num_tokens": 53172638.0, + "step": 2123 + }, + { + "epoch": 0.23325280035141666, + "grad_norm": 2.542924404144287, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.692052960395813, + "num_tokens": 53192332.0, + "step": 2124 + }, + { + "epoch": 0.2333626180540303, + "grad_norm": 2.3852505683898926, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6948674321174622, + "num_tokens": 53217067.0, + "step": 2125 + }, + { + "epoch": 0.23347243575664398, + "grad_norm": 2.190833568572998, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7013134956359863, + "num_tokens": 53244197.0, + "step": 2126 + }, + { + "epoch": 0.23358225345925762, + "grad_norm": 2.4634363651275635, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6958402395248413, + "num_tokens": 53265938.0, + "step": 2127 + }, + { + "epoch": 0.2336920711618713, + "grad_norm": 2.12113881111145, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6890814304351807, + "num_tokens": 53296020.0, + "step": 2128 + }, + { + "epoch": 0.23380188886448494, + "grad_norm": 1.9999794960021973, + "learning_rate": 1e-06, + "loss": 1.1344, + "mean_token_accuracy": 0.6615080237388611, + "num_tokens": 53330279.0, + "step": 2129 + }, + { + "epoch": 0.23391170656709862, + "grad_norm": 2.1242423057556152, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7023950219154358, + "num_tokens": 53360239.0, + "step": 2130 + }, + { + "epoch": 0.2340215242697123, + "grad_norm": 2.42228627204895, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7077174186706543, + "num_tokens": 53382913.0, + "step": 2131 + }, + { + "epoch": 0.23413134197232593, + "grad_norm": 2.3368191719055176, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7013564109802246, + "num_tokens": 53407603.0, + "step": 2132 + }, + { + "epoch": 0.2342411596749396, + "grad_norm": 2.246002197265625, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7040500640869141, + "num_tokens": 53433349.0, + "step": 2133 + }, + { + "epoch": 0.23435097737755325, + "grad_norm": 2.3326385021209717, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6964331865310669, + "num_tokens": 53459849.0, + "step": 2134 + }, + { + "epoch": 0.23446079508016693, + "grad_norm": 2.452592372894287, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6996350288391113, + "num_tokens": 53482451.0, + "step": 2135 + }, + { + "epoch": 0.23457061278278057, + "grad_norm": 2.810370922088623, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.713752031326294, + "num_tokens": 53498851.0, + "step": 2136 + }, + { + "epoch": 0.23468043048539425, + "grad_norm": 2.2007884979248047, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6859409809112549, + "num_tokens": 53525089.0, + "step": 2137 + }, + { + "epoch": 0.2347902481880079, + "grad_norm": 2.4660370349884033, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6784908175468445, + "num_tokens": 53549043.0, + "step": 2138 + }, + { + "epoch": 0.23490006589062157, + "grad_norm": 2.296847105026245, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.674788236618042, + "num_tokens": 53576317.0, + "step": 2139 + }, + { + "epoch": 0.23500988359323524, + "grad_norm": 2.401860237121582, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7226161956787109, + "num_tokens": 53597073.0, + "step": 2140 + }, + { + "epoch": 0.2351197012958489, + "grad_norm": 2.0700738430023193, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7003140449523926, + "num_tokens": 53627385.0, + "step": 2141 + }, + { + "epoch": 0.23522951899846256, + "grad_norm": 2.309361219406128, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6831755638122559, + "num_tokens": 53653768.0, + "step": 2142 + }, + { + "epoch": 0.2353393367010762, + "grad_norm": 1.9236923456192017, + "learning_rate": 1e-06, + "loss": 1.1001, + "mean_token_accuracy": 0.6781266927719116, + "num_tokens": 53688207.0, + "step": 2143 + }, + { + "epoch": 0.23544915440368988, + "grad_norm": 2.0370442867279053, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7200933694839478, + "num_tokens": 53717940.0, + "step": 2144 + }, + { + "epoch": 0.23555897210630353, + "grad_norm": 2.3525450229644775, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.70854651927948, + "num_tokens": 53740606.0, + "step": 2145 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 2.3417325019836426, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7020746469497681, + "num_tokens": 53765389.0, + "step": 2146 + }, + { + "epoch": 0.23577860751153085, + "grad_norm": 2.3222780227661133, + "learning_rate": 1e-06, + "loss": 1.1123, + "mean_token_accuracy": 0.6723393201828003, + "num_tokens": 53791293.0, + "step": 2147 + }, + { + "epoch": 0.23588842521414452, + "grad_norm": 2.024125099182129, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7083848714828491, + "num_tokens": 53821287.0, + "step": 2148 + }, + { + "epoch": 0.2359982429167582, + "grad_norm": 2.5120465755462646, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.719558835029602, + "num_tokens": 53841123.0, + "step": 2149 + }, + { + "epoch": 0.23610806061937184, + "grad_norm": 2.2672934532165527, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6976075172424316, + "num_tokens": 53866150.0, + "step": 2150 + }, + { + "epoch": 0.2362178783219855, + "grad_norm": 2.2465481758117676, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6997976303100586, + "num_tokens": 53891923.0, + "step": 2151 + }, + { + "epoch": 0.23632769602459916, + "grad_norm": 2.272167205810547, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6839277744293213, + "num_tokens": 53919502.0, + "step": 2152 + }, + { + "epoch": 0.23643751372721283, + "grad_norm": 2.319746255874634, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7208089828491211, + "num_tokens": 53942898.0, + "step": 2153 + }, + { + "epoch": 0.23654733142982648, + "grad_norm": 2.2831826210021973, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.6744091510772705, + "num_tokens": 53968656.0, + "step": 2154 + }, + { + "epoch": 0.23665714913244015, + "grad_norm": 2.0659923553466797, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7175933122634888, + "num_tokens": 53998047.0, + "step": 2155 + }, + { + "epoch": 0.2367669668350538, + "grad_norm": 2.3613317012786865, + "learning_rate": 1e-06, + "loss": 1.0758, + "mean_token_accuracy": 0.6825302839279175, + "num_tokens": 54023165.0, + "step": 2156 + }, + { + "epoch": 0.23687678453766747, + "grad_norm": 2.652832508087158, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7211991548538208, + "num_tokens": 54042588.0, + "step": 2157 + }, + { + "epoch": 0.23698660224028115, + "grad_norm": 2.306910753250122, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7087893486022949, + "num_tokens": 54066970.0, + "step": 2158 + }, + { + "epoch": 0.2370964199428948, + "grad_norm": 2.2997336387634277, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7060544490814209, + "num_tokens": 54090730.0, + "step": 2159 + }, + { + "epoch": 0.23720623764550847, + "grad_norm": 2.454244375228882, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.705160915851593, + "num_tokens": 54114526.0, + "step": 2160 + }, + { + "epoch": 0.2373160553481221, + "grad_norm": 2.459596872329712, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7078981995582581, + "num_tokens": 54135759.0, + "step": 2161 + }, + { + "epoch": 0.2374258730507358, + "grad_norm": 2.247793436050415, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7010906934738159, + "num_tokens": 54161932.0, + "step": 2162 + }, + { + "epoch": 0.23753569075334943, + "grad_norm": 2.4895267486572266, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7054243683815002, + "num_tokens": 54184151.0, + "step": 2163 + }, + { + "epoch": 0.2376455084559631, + "grad_norm": 2.4919981956481934, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7127299904823303, + "num_tokens": 54206190.0, + "step": 2164 + }, + { + "epoch": 0.23775532615857675, + "grad_norm": 2.075037956237793, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7151407599449158, + "num_tokens": 54234166.0, + "step": 2165 + }, + { + "epoch": 0.23786514386119043, + "grad_norm": 2.230233669281006, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6941642761230469, + "num_tokens": 54259926.0, + "step": 2166 + }, + { + "epoch": 0.23797496156380407, + "grad_norm": 2.1698083877563477, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.689561128616333, + "num_tokens": 54288138.0, + "step": 2167 + }, + { + "epoch": 0.23808477926641775, + "grad_norm": 2.286726474761963, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7093920707702637, + "num_tokens": 54312941.0, + "step": 2168 + }, + { + "epoch": 0.23819459696903142, + "grad_norm": 2.209197521209717, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6868960857391357, + "num_tokens": 54338700.0, + "step": 2169 + }, + { + "epoch": 0.23830441467164507, + "grad_norm": 2.3037617206573486, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7114900946617126, + "num_tokens": 54364073.0, + "step": 2170 + }, + { + "epoch": 0.23841423237425874, + "grad_norm": 2.2878406047821045, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6982672214508057, + "num_tokens": 54390406.0, + "step": 2171 + }, + { + "epoch": 0.23852405007687238, + "grad_norm": 2.43160343170166, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7141396403312683, + "num_tokens": 54411837.0, + "step": 2172 + }, + { + "epoch": 0.23863386777948606, + "grad_norm": 2.174582004547119, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7001290917396545, + "num_tokens": 54437818.0, + "step": 2173 + }, + { + "epoch": 0.2387436854820997, + "grad_norm": 1.98055899143219, + "learning_rate": 1e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.67918860912323, + "num_tokens": 54471540.0, + "step": 2174 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 2.2151694297790527, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7010001540184021, + "num_tokens": 54498588.0, + "step": 2175 + }, + { + "epoch": 0.23896332088732702, + "grad_norm": 2.0449650287628174, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6839678287506104, + "num_tokens": 54531287.0, + "step": 2176 + }, + { + "epoch": 0.2390731385899407, + "grad_norm": 2.83857798576355, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7072524428367615, + "num_tokens": 54547406.0, + "step": 2177 + }, + { + "epoch": 0.23918295629255437, + "grad_norm": 2.64516282081604, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7108851671218872, + "num_tokens": 54566319.0, + "step": 2178 + }, + { + "epoch": 0.23929277399516802, + "grad_norm": 2.0059964656829834, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6792912483215332, + "num_tokens": 54597346.0, + "step": 2179 + }, + { + "epoch": 0.2394025916977817, + "grad_norm": 2.3424997329711914, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6931673288345337, + "num_tokens": 54621640.0, + "step": 2180 + }, + { + "epoch": 0.23951240940039534, + "grad_norm": 2.3562910556793213, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6994212865829468, + "num_tokens": 54645926.0, + "step": 2181 + }, + { + "epoch": 0.239622227103009, + "grad_norm": 2.261333703994751, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7012801766395569, + "num_tokens": 54670526.0, + "step": 2182 + }, + { + "epoch": 0.23973204480562266, + "grad_norm": 2.261070966720581, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6840909123420715, + "num_tokens": 54696833.0, + "step": 2183 + }, + { + "epoch": 0.23984186250823633, + "grad_norm": 2.1392080783843994, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7016711831092834, + "num_tokens": 54721845.0, + "step": 2184 + }, + { + "epoch": 0.23995168021084998, + "grad_norm": 2.228024959564209, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7053178548812866, + "num_tokens": 54746458.0, + "step": 2185 + }, + { + "epoch": 0.24006149791346365, + "grad_norm": 2.3531484603881836, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7198030948638916, + "num_tokens": 54770799.0, + "step": 2186 + }, + { + "epoch": 0.24017131561607732, + "grad_norm": 2.44783616065979, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6931156516075134, + "num_tokens": 54792844.0, + "step": 2187 + }, + { + "epoch": 0.24028113331869097, + "grad_norm": 1.9029831886291504, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6908220648765564, + "num_tokens": 54826976.0, + "step": 2188 + }, + { + "epoch": 0.24039095102130464, + "grad_norm": 2.253389596939087, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7180881500244141, + "num_tokens": 54852958.0, + "step": 2189 + }, + { + "epoch": 0.2405007687239183, + "grad_norm": 2.2974565029144287, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6908708810806274, + "num_tokens": 54878375.0, + "step": 2190 + }, + { + "epoch": 0.24061058642653196, + "grad_norm": 2.1534805297851562, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7087513208389282, + "num_tokens": 54905246.0, + "step": 2191 + }, + { + "epoch": 0.2407204041291456, + "grad_norm": 2.438325881958008, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6802206039428711, + "num_tokens": 54928833.0, + "step": 2192 + }, + { + "epoch": 0.24083022183175928, + "grad_norm": 2.270162343978882, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.712309718132019, + "num_tokens": 54953089.0, + "step": 2193 + }, + { + "epoch": 0.24094003953437293, + "grad_norm": 2.481508731842041, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.708834171295166, + "num_tokens": 54977222.0, + "step": 2194 + }, + { + "epoch": 0.2410498572369866, + "grad_norm": 2.407217264175415, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7055652141571045, + "num_tokens": 55000210.0, + "step": 2195 + }, + { + "epoch": 0.24115967493960028, + "grad_norm": 2.199925184249878, + "learning_rate": 1e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6788982152938843, + "num_tokens": 55028980.0, + "step": 2196 + }, + { + "epoch": 0.24126949264221392, + "grad_norm": 2.1820693016052246, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6999232769012451, + "num_tokens": 55060763.0, + "step": 2197 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 2.19387149810791, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6932852864265442, + "num_tokens": 55087622.0, + "step": 2198 + }, + { + "epoch": 0.24148912804744124, + "grad_norm": 2.2260565757751465, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7008246779441833, + "num_tokens": 55112821.0, + "step": 2199 + }, + { + "epoch": 0.24159894575005492, + "grad_norm": 2.6499927043914795, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6928337812423706, + "num_tokens": 55135469.0, + "step": 2200 + }, + { + "epoch": 0.24170876345266856, + "grad_norm": 2.285099983215332, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7298503518104553, + "num_tokens": 55158515.0, + "step": 2201 + }, + { + "epoch": 0.24181858115528224, + "grad_norm": 2.044408082962036, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6848468780517578, + "num_tokens": 55187667.0, + "step": 2202 + }, + { + "epoch": 0.24192839885789588, + "grad_norm": 2.2216670513153076, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6886370182037354, + "num_tokens": 55213421.0, + "step": 2203 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 2.141083240509033, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.677223801612854, + "num_tokens": 55242778.0, + "step": 2204 + }, + { + "epoch": 0.2421480342631232, + "grad_norm": 2.305391788482666, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6935300827026367, + "num_tokens": 55268362.0, + "step": 2205 + }, + { + "epoch": 0.24225785196573688, + "grad_norm": 2.756364107131958, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7059392929077148, + "num_tokens": 55287771.0, + "step": 2206 + }, + { + "epoch": 0.24236766966835055, + "grad_norm": 2.3489151000976562, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6728085875511169, + "num_tokens": 55313057.0, + "step": 2207 + }, + { + "epoch": 0.2424774873709642, + "grad_norm": 2.085066795349121, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7157887816429138, + "num_tokens": 55341783.0, + "step": 2208 + }, + { + "epoch": 0.24258730507357787, + "grad_norm": 2.1321325302124023, + "learning_rate": 1e-06, + "loss": 1.0819, + "mean_token_accuracy": 0.6865017414093018, + "num_tokens": 55371828.0, + "step": 2209 + }, + { + "epoch": 0.24269712277619152, + "grad_norm": 2.5187182426452637, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6921807527542114, + "num_tokens": 55393830.0, + "step": 2210 + }, + { + "epoch": 0.2428069404788052, + "grad_norm": 2.2520902156829834, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7084015011787415, + "num_tokens": 55419410.0, + "step": 2211 + }, + { + "epoch": 0.24291675818141883, + "grad_norm": 2.2130978107452393, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7074581384658813, + "num_tokens": 55445270.0, + "step": 2212 + }, + { + "epoch": 0.2430265758840325, + "grad_norm": 2.293128728866577, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6868546009063721, + "num_tokens": 55470087.0, + "step": 2213 + }, + { + "epoch": 0.24313639358664615, + "grad_norm": 2.072028636932373, + "learning_rate": 1e-06, + "loss": 1.1063, + "mean_token_accuracy": 0.6699376106262207, + "num_tokens": 55500110.0, + "step": 2214 + }, + { + "epoch": 0.24324621128925983, + "grad_norm": 2.094315528869629, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7032214999198914, + "num_tokens": 55528536.0, + "step": 2215 + }, + { + "epoch": 0.2433560289918735, + "grad_norm": 2.1427969932556152, + "learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6841525435447693, + "num_tokens": 55558606.0, + "step": 2216 + }, + { + "epoch": 0.24346584669448715, + "grad_norm": 2.2037243843078613, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7108256816864014, + "num_tokens": 55586009.0, + "step": 2217 + }, + { + "epoch": 0.24357566439710082, + "grad_norm": 2.361876964569092, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6932226419448853, + "num_tokens": 55610379.0, + "step": 2218 + }, + { + "epoch": 0.24368548209971447, + "grad_norm": 2.5694527626037598, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6759200692176819, + "num_tokens": 55632460.0, + "step": 2219 + }, + { + "epoch": 0.24379529980232814, + "grad_norm": 2.279524087905884, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.689179003238678, + "num_tokens": 55662958.0, + "step": 2220 + }, + { + "epoch": 0.2439051175049418, + "grad_norm": 2.3447694778442383, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7172666788101196, + "num_tokens": 55685955.0, + "step": 2221 + }, + { + "epoch": 0.24401493520755546, + "grad_norm": 2.055109739303589, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6863175630569458, + "num_tokens": 55715946.0, + "step": 2222 + }, + { + "epoch": 0.2441247529101691, + "grad_norm": 2.3360207080841064, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7143417000770569, + "num_tokens": 55738166.0, + "step": 2223 + }, + { + "epoch": 0.24423457061278278, + "grad_norm": 2.013967752456665, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6992328763008118, + "num_tokens": 55767398.0, + "step": 2224 + }, + { + "epoch": 0.24434438831539645, + "grad_norm": 2.0326855182647705, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6992563009262085, + "num_tokens": 55796781.0, + "step": 2225 + }, + { + "epoch": 0.2444542060180101, + "grad_norm": 2.411806106567383, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7152561545372009, + "num_tokens": 55819970.0, + "step": 2226 + }, + { + "epoch": 0.24456402372062377, + "grad_norm": 2.3603053092956543, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7023006677627563, + "num_tokens": 55842670.0, + "step": 2227 + }, + { + "epoch": 0.24467384142323742, + "grad_norm": 2.459271192550659, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7144325971603394, + "num_tokens": 55863014.0, + "step": 2228 + }, + { + "epoch": 0.2447836591258511, + "grad_norm": 2.222429037094116, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6838517189025879, + "num_tokens": 55891963.0, + "step": 2229 + }, + { + "epoch": 0.24489347682846474, + "grad_norm": 2.290710210800171, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6987911462783813, + "num_tokens": 55915697.0, + "step": 2230 + }, + { + "epoch": 0.2450032945310784, + "grad_norm": 2.082772731781006, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.682680606842041, + "num_tokens": 55946793.0, + "step": 2231 + }, + { + "epoch": 0.24511311223369206, + "grad_norm": 2.580874443054199, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6875319480895996, + "num_tokens": 55970045.0, + "step": 2232 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 2.4083313941955566, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6878783702850342, + "num_tokens": 55994213.0, + "step": 2233 + }, + { + "epoch": 0.2453327476389194, + "grad_norm": 2.667670488357544, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6843585968017578, + "num_tokens": 56014854.0, + "step": 2234 + }, + { + "epoch": 0.24544256534153305, + "grad_norm": 2.5924339294433594, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7110022306442261, + "num_tokens": 56034332.0, + "step": 2235 + }, + { + "epoch": 0.24555238304414673, + "grad_norm": 2.3045566082000732, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7108986377716064, + "num_tokens": 56058002.0, + "step": 2236 + }, + { + "epoch": 0.24566220074676037, + "grad_norm": 2.1723830699920654, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.696861743927002, + "num_tokens": 56085654.0, + "step": 2237 + }, + { + "epoch": 0.24577201844937405, + "grad_norm": 2.245964288711548, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6921136975288391, + "num_tokens": 56111275.0, + "step": 2238 + }, + { + "epoch": 0.2458818361519877, + "grad_norm": 2.3466174602508545, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7020145058631897, + "num_tokens": 56136111.0, + "step": 2239 + }, + { + "epoch": 0.24599165385460137, + "grad_norm": 2.2164504528045654, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.687516987323761, + "num_tokens": 56165394.0, + "step": 2240 + }, + { + "epoch": 0.246101471557215, + "grad_norm": 2.3545124530792236, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6875824332237244, + "num_tokens": 56188652.0, + "step": 2241 + }, + { + "epoch": 0.2462112892598287, + "grad_norm": 2.4943885803222656, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.71702641248703, + "num_tokens": 56207755.0, + "step": 2242 + }, + { + "epoch": 0.24632110696244233, + "grad_norm": 2.2304177284240723, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6732792854309082, + "num_tokens": 56234148.0, + "step": 2243 + }, + { + "epoch": 0.246430924665056, + "grad_norm": 2.409116744995117, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7045580148696899, + "num_tokens": 56259523.0, + "step": 2244 + }, + { + "epoch": 0.24654074236766968, + "grad_norm": 2.7628135681152344, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7276840209960938, + "num_tokens": 56277746.0, + "step": 2245 + }, + { + "epoch": 0.24665056007028333, + "grad_norm": 2.3077375888824463, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.722140908241272, + "num_tokens": 56301859.0, + "step": 2246 + }, + { + "epoch": 0.246760377772897, + "grad_norm": 1.9514501094818115, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7198535203933716, + "num_tokens": 56334247.0, + "step": 2247 + }, + { + "epoch": 0.24687019547551065, + "grad_norm": 2.7356996536254883, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7233626842498779, + "num_tokens": 56353104.0, + "step": 2248 + }, + { + "epoch": 0.24698001317812432, + "grad_norm": 2.364551067352295, + "learning_rate": 1e-06, + "loss": 1.0503, + "mean_token_accuracy": 0.6989774703979492, + "num_tokens": 56376343.0, + "step": 2249 + }, + { + "epoch": 0.24708983088073797, + "grad_norm": 2.0806150436401367, + "learning_rate": 1e-06, + "loss": 1.0646, + "mean_token_accuracy": 0.679277777671814, + "num_tokens": 56405407.0, + "step": 2250 + }, + { + "epoch": 0.24719964858335164, + "grad_norm": 2.222090244293213, + "learning_rate": 1e-06, + "loss": 1.1002, + "mean_token_accuracy": 0.6770938634872437, + "num_tokens": 56435128.0, + "step": 2251 + }, + { + "epoch": 0.24730946628596528, + "grad_norm": 2.2614355087280273, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7179849147796631, + "num_tokens": 56461129.0, + "step": 2252 + }, + { + "epoch": 0.24741928398857896, + "grad_norm": 2.3517277240753174, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7174947261810303, + "num_tokens": 56485025.0, + "step": 2253 + }, + { + "epoch": 0.24752910169119263, + "grad_norm": 2.354328155517578, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7136640548706055, + "num_tokens": 56507967.0, + "step": 2254 + }, + { + "epoch": 0.24763891939380628, + "grad_norm": 2.2232072353363037, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6993747353553772, + "num_tokens": 56533159.0, + "step": 2255 + }, + { + "epoch": 0.24774873709641995, + "grad_norm": 2.1904375553131104, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.695348858833313, + "num_tokens": 56559355.0, + "step": 2256 + }, + { + "epoch": 0.2478585547990336, + "grad_norm": 1.9250191450119019, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6830404996871948, + "num_tokens": 56595342.0, + "step": 2257 + }, + { + "epoch": 0.24796837250164727, + "grad_norm": 2.1372971534729004, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6960657835006714, + "num_tokens": 56621292.0, + "step": 2258 + }, + { + "epoch": 0.24807819020426092, + "grad_norm": 2.260880708694458, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6730825901031494, + "num_tokens": 56647510.0, + "step": 2259 + }, + { + "epoch": 0.2481880079068746, + "grad_norm": 2.2136642932891846, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.704102635383606, + "num_tokens": 56673034.0, + "step": 2260 + }, + { + "epoch": 0.24829782560948824, + "grad_norm": 2.289445400238037, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7085651159286499, + "num_tokens": 56696972.0, + "step": 2261 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 2.3017072677612305, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7004365921020508, + "num_tokens": 56721507.0, + "step": 2262 + }, + { + "epoch": 0.24851746101471558, + "grad_norm": 2.4199256896972656, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7247629165649414, + "num_tokens": 56743138.0, + "step": 2263 + }, + { + "epoch": 0.24862727871732923, + "grad_norm": 2.080195665359497, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6821410059928894, + "num_tokens": 56774951.0, + "step": 2264 + }, + { + "epoch": 0.2487370964199429, + "grad_norm": 2.378607988357544, + "learning_rate": 1e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6884815692901611, + "num_tokens": 56799530.0, + "step": 2265 + }, + { + "epoch": 0.24884691412255655, + "grad_norm": 2.53483247756958, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6970760822296143, + "num_tokens": 56819793.0, + "step": 2266 + }, + { + "epoch": 0.24895673182517022, + "grad_norm": 2.4775660037994385, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6946864128112793, + "num_tokens": 56841605.0, + "step": 2267 + }, + { + "epoch": 0.24906654952778387, + "grad_norm": 2.2643790245056152, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6834423542022705, + "num_tokens": 56867245.0, + "step": 2268 + }, + { + "epoch": 0.24917636723039754, + "grad_norm": 2.216951370239258, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6993522644042969, + "num_tokens": 56891880.0, + "step": 2269 + }, + { + "epoch": 0.2492861849330112, + "grad_norm": 2.4641666412353516, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7139345407485962, + "num_tokens": 56912605.0, + "step": 2270 + }, + { + "epoch": 0.24939600263562486, + "grad_norm": 2.1163742542266846, + "learning_rate": 1e-06, + "loss": 1.1069, + "mean_token_accuracy": 0.6726564168930054, + "num_tokens": 56943850.0, + "step": 2271 + }, + { + "epoch": 0.24950582033823854, + "grad_norm": 2.1547420024871826, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6950972080230713, + "num_tokens": 56971148.0, + "step": 2272 + }, + { + "epoch": 0.24961563804085218, + "grad_norm": 2.243476390838623, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7228764295578003, + "num_tokens": 56996128.0, + "step": 2273 + }, + { + "epoch": 0.24972545574346586, + "grad_norm": 2.348362922668457, + "learning_rate": 1e-06, + "loss": 1.1172, + "mean_token_accuracy": 0.6841399073600769, + "num_tokens": 57020714.0, + "step": 2274 + }, + { + "epoch": 0.2498352734460795, + "grad_norm": 2.4374589920043945, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.680006206035614, + "num_tokens": 57043704.0, + "step": 2275 + }, + { + "epoch": 0.24994509114869318, + "grad_norm": 2.4176251888275146, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6781783103942871, + "num_tokens": 57068173.0, + "step": 2276 + }, + { + "epoch": 0.2500549088513068, + "grad_norm": 2.2425880432128906, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6928881406784058, + "num_tokens": 57094151.0, + "step": 2277 + }, + { + "epoch": 0.25016472655392047, + "grad_norm": 2.19305157661438, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7246783971786499, + "num_tokens": 57118427.0, + "step": 2278 + }, + { + "epoch": 0.25027454425653417, + "grad_norm": 2.367328405380249, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7137735486030579, + "num_tokens": 57140208.0, + "step": 2279 + }, + { + "epoch": 0.2503843619591478, + "grad_norm": 2.2386677265167236, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6927756667137146, + "num_tokens": 57166002.0, + "step": 2280 + }, + { + "epoch": 0.25049417966176146, + "grad_norm": 2.286402702331543, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6982611417770386, + "num_tokens": 57191040.0, + "step": 2281 + }, + { + "epoch": 0.25060399736437516, + "grad_norm": 2.389519214630127, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7090258598327637, + "num_tokens": 57213920.0, + "step": 2282 + }, + { + "epoch": 0.2507138150669888, + "grad_norm": 2.4502387046813965, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6670807003974915, + "num_tokens": 57237431.0, + "step": 2283 + }, + { + "epoch": 0.25082363276960246, + "grad_norm": 2.2036123275756836, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7048891186714172, + "num_tokens": 57264176.0, + "step": 2284 + }, + { + "epoch": 0.2509334504722161, + "grad_norm": 2.1678318977355957, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7177921533584595, + "num_tokens": 57290544.0, + "step": 2285 + }, + { + "epoch": 0.2510432681748298, + "grad_norm": 2.2602906227111816, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7050250172615051, + "num_tokens": 57313121.0, + "step": 2286 + }, + { + "epoch": 0.25115308587744345, + "grad_norm": 2.139960765838623, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7082589864730835, + "num_tokens": 57338674.0, + "step": 2287 + }, + { + "epoch": 0.2512629035800571, + "grad_norm": 2.185657262802124, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.685559093952179, + "num_tokens": 57366815.0, + "step": 2288 + }, + { + "epoch": 0.25137272128267074, + "grad_norm": 2.007230281829834, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6944547295570374, + "num_tokens": 57398557.0, + "step": 2289 + }, + { + "epoch": 0.25148253898528444, + "grad_norm": 2.270738124847412, + "learning_rate": 1e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.6753993630409241, + "num_tokens": 57424589.0, + "step": 2290 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 2.2320690155029297, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6835250854492188, + "num_tokens": 57450825.0, + "step": 2291 + }, + { + "epoch": 0.25170217439051173, + "grad_norm": 2.34358811378479, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.704991340637207, + "num_tokens": 57473564.0, + "step": 2292 + }, + { + "epoch": 0.25181199209312544, + "grad_norm": 2.4583327770233154, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7173857688903809, + "num_tokens": 57493146.0, + "step": 2293 + }, + { + "epoch": 0.2519218097957391, + "grad_norm": 2.520352840423584, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6975372433662415, + "num_tokens": 57514388.0, + "step": 2294 + }, + { + "epoch": 0.25203162749835273, + "grad_norm": 2.6012766361236572, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7082028985023499, + "num_tokens": 57534903.0, + "step": 2295 + }, + { + "epoch": 0.2521414452009664, + "grad_norm": 2.2291622161865234, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6982734799385071, + "num_tokens": 57561818.0, + "step": 2296 + }, + { + "epoch": 0.2522512629035801, + "grad_norm": 2.342378616333008, + "learning_rate": 1e-06, + "loss": 1.1165, + "mean_token_accuracy": 0.6689097881317139, + "num_tokens": 57590453.0, + "step": 2297 + }, + { + "epoch": 0.2523610806061937, + "grad_norm": 2.0566020011901855, + "learning_rate": 1e-06, + "loss": 1.128, + "mean_token_accuracy": 0.6617453098297119, + "num_tokens": 57621792.0, + "step": 2298 + }, + { + "epoch": 0.25247089830880737, + "grad_norm": 2.2615160942077637, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6878585815429688, + "num_tokens": 57650019.0, + "step": 2299 + }, + { + "epoch": 0.252580716011421, + "grad_norm": 2.1713156700134277, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6888002157211304, + "num_tokens": 57679156.0, + "step": 2300 + }, + { + "epoch": 0.2526905337140347, + "grad_norm": 2.325190305709839, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6901025772094727, + "num_tokens": 57702304.0, + "step": 2301 + }, + { + "epoch": 0.25280035141664836, + "grad_norm": 2.0821213722229004, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7099452018737793, + "num_tokens": 57729753.0, + "step": 2302 + }, + { + "epoch": 0.252910169119262, + "grad_norm": 2.2697479724884033, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7313730716705322, + "num_tokens": 57754370.0, + "step": 2303 + }, + { + "epoch": 0.2530199868218757, + "grad_norm": 1.964987874031067, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7010881900787354, + "num_tokens": 57784784.0, + "step": 2304 + }, + { + "epoch": 0.25312980452448935, + "grad_norm": 2.158510684967041, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7010293006896973, + "num_tokens": 57810014.0, + "step": 2305 + }, + { + "epoch": 0.253239622227103, + "grad_norm": 2.245326519012451, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.677336573600769, + "num_tokens": 57837001.0, + "step": 2306 + }, + { + "epoch": 0.25334943992971665, + "grad_norm": 2.415476083755493, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6889524459838867, + "num_tokens": 57859296.0, + "step": 2307 + }, + { + "epoch": 0.25345925763233035, + "grad_norm": 2.2194714546203613, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6959572434425354, + "num_tokens": 57887245.0, + "step": 2308 + }, + { + "epoch": 0.253569075334944, + "grad_norm": 2.1702404022216797, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6990101933479309, + "num_tokens": 57914977.0, + "step": 2309 + }, + { + "epoch": 0.25367889303755764, + "grad_norm": 1.9905441999435425, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7071446180343628, + "num_tokens": 57944185.0, + "step": 2310 + }, + { + "epoch": 0.25378871074017134, + "grad_norm": 2.2631571292877197, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7024877667427063, + "num_tokens": 57970811.0, + "step": 2311 + }, + { + "epoch": 0.253898528442785, + "grad_norm": 2.1177468299865723, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7095629572868347, + "num_tokens": 57996928.0, + "step": 2312 + }, + { + "epoch": 0.25400834614539863, + "grad_norm": 2.00762939453125, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6962759494781494, + "num_tokens": 58027686.0, + "step": 2313 + }, + { + "epoch": 0.2541181638480123, + "grad_norm": 2.4049153327941895, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.714331865310669, + "num_tokens": 58050128.0, + "step": 2314 + }, + { + "epoch": 0.254227981550626, + "grad_norm": 2.198662757873535, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6897940039634705, + "num_tokens": 58076085.0, + "step": 2315 + }, + { + "epoch": 0.2543377992532396, + "grad_norm": 2.322127103805542, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7009643316268921, + "num_tokens": 58100596.0, + "step": 2316 + }, + { + "epoch": 0.2544476169558533, + "grad_norm": 2.450732469558716, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6916511058807373, + "num_tokens": 58122706.0, + "step": 2317 + }, + { + "epoch": 0.2545574346584669, + "grad_norm": 2.2753803730010986, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.701714277267456, + "num_tokens": 58146969.0, + "step": 2318 + }, + { + "epoch": 0.2546672523610806, + "grad_norm": 2.5261244773864746, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7340977191925049, + "num_tokens": 58167360.0, + "step": 2319 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 2.171570062637329, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6993474960327148, + "num_tokens": 58192474.0, + "step": 2320 + }, + { + "epoch": 0.2548868877663079, + "grad_norm": 2.251303195953369, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.702957034111023, + "num_tokens": 58217139.0, + "step": 2321 + }, + { + "epoch": 0.2549967054689216, + "grad_norm": 2.4802417755126953, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6992917060852051, + "num_tokens": 58237986.0, + "step": 2322 + }, + { + "epoch": 0.25510652317153526, + "grad_norm": 2.4090030193328857, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6905257105827332, + "num_tokens": 58260333.0, + "step": 2323 + }, + { + "epoch": 0.2552163408741489, + "grad_norm": 2.2356438636779785, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7034382224082947, + "num_tokens": 58284812.0, + "step": 2324 + }, + { + "epoch": 0.25532615857676255, + "grad_norm": 2.338575601577759, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6935467720031738, + "num_tokens": 58307116.0, + "step": 2325 + }, + { + "epoch": 0.25543597627937625, + "grad_norm": 2.40580153465271, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7224767804145813, + "num_tokens": 58327818.0, + "step": 2326 + }, + { + "epoch": 0.2555457939819899, + "grad_norm": 2.0791687965393066, + "learning_rate": 1e-06, + "loss": 1.1323, + "mean_token_accuracy": 0.6642634868621826, + "num_tokens": 58357051.0, + "step": 2327 + }, + { + "epoch": 0.25565561168460355, + "grad_norm": 2.2415997982025146, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6960346102714539, + "num_tokens": 58383878.0, + "step": 2328 + }, + { + "epoch": 0.25576542938721725, + "grad_norm": 2.167747735977173, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7254491448402405, + "num_tokens": 58408139.0, + "step": 2329 + }, + { + "epoch": 0.2558752470898309, + "grad_norm": 2.274874210357666, + "learning_rate": 1e-06, + "loss": 1.0791, + "mean_token_accuracy": 0.6749849319458008, + "num_tokens": 58434033.0, + "step": 2330 + }, + { + "epoch": 0.25598506479244454, + "grad_norm": 2.3580265045166016, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.709058403968811, + "num_tokens": 58458398.0, + "step": 2331 + }, + { + "epoch": 0.2560948824950582, + "grad_norm": 2.027194023132324, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6822545528411865, + "num_tokens": 58488871.0, + "step": 2332 + }, + { + "epoch": 0.2562047001976719, + "grad_norm": 2.4703874588012695, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7177497148513794, + "num_tokens": 58509842.0, + "step": 2333 + }, + { + "epoch": 0.25631451790028553, + "grad_norm": 2.4205873012542725, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6897634267807007, + "num_tokens": 58533488.0, + "step": 2334 + }, + { + "epoch": 0.2564243356028992, + "grad_norm": 2.3107144832611084, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6857017278671265, + "num_tokens": 58559176.0, + "step": 2335 + }, + { + "epoch": 0.2565341533055128, + "grad_norm": 2.1786468029022217, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7042940258979797, + "num_tokens": 58588335.0, + "step": 2336 + }, + { + "epoch": 0.2566439710081265, + "grad_norm": 2.4586148262023926, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7152809500694275, + "num_tokens": 58610013.0, + "step": 2337 + }, + { + "epoch": 0.25675378871074017, + "grad_norm": 2.175455331802368, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6787580251693726, + "num_tokens": 58639914.0, + "step": 2338 + }, + { + "epoch": 0.2568636064133538, + "grad_norm": 2.234778642654419, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7069146633148193, + "num_tokens": 58665527.0, + "step": 2339 + }, + { + "epoch": 0.2569734241159675, + "grad_norm": 2.3221333026885986, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6886132955551147, + "num_tokens": 58689146.0, + "step": 2340 + }, + { + "epoch": 0.25708324181858117, + "grad_norm": 2.117697238922119, + "learning_rate": 1e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.6724463701248169, + "num_tokens": 58717501.0, + "step": 2341 + }, + { + "epoch": 0.2571930595211948, + "grad_norm": 2.301027297973633, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6876522302627563, + "num_tokens": 58741640.0, + "step": 2342 + }, + { + "epoch": 0.25730287722380846, + "grad_norm": 2.2057831287384033, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7121996879577637, + "num_tokens": 58765864.0, + "step": 2343 + }, + { + "epoch": 0.25741269492642216, + "grad_norm": 2.321655750274658, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6834640502929688, + "num_tokens": 58790055.0, + "step": 2344 + }, + { + "epoch": 0.2575225126290358, + "grad_norm": 2.274822473526001, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7078422904014587, + "num_tokens": 58815639.0, + "step": 2345 + }, + { + "epoch": 0.25763233033164945, + "grad_norm": 2.3623368740081787, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6849917769432068, + "num_tokens": 58838179.0, + "step": 2346 + }, + { + "epoch": 0.2577421480342631, + "grad_norm": 2.6068458557128906, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7102800011634827, + "num_tokens": 58857358.0, + "step": 2347 + }, + { + "epoch": 0.2578519657368768, + "grad_norm": 2.3330349922180176, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.686215877532959, + "num_tokens": 58881605.0, + "step": 2348 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 2.075890302658081, + "learning_rate": 1e-06, + "loss": 1.0954, + "mean_token_accuracy": 0.672709584236145, + "num_tokens": 58911400.0, + "step": 2349 + }, + { + "epoch": 0.2580716011421041, + "grad_norm": 2.329364538192749, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6992545127868652, + "num_tokens": 58934929.0, + "step": 2350 + }, + { + "epoch": 0.2581814188447178, + "grad_norm": 2.3533530235290527, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7019728422164917, + "num_tokens": 58959005.0, + "step": 2351 + }, + { + "epoch": 0.25829123654733144, + "grad_norm": 2.258317708969116, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6999543309211731, + "num_tokens": 58985429.0, + "step": 2352 + }, + { + "epoch": 0.2584010542499451, + "grad_norm": 2.170435905456543, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.707789421081543, + "num_tokens": 59014813.0, + "step": 2353 + }, + { + "epoch": 0.25851087195255873, + "grad_norm": 2.6004183292388916, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7348724603652954, + "num_tokens": 59033284.0, + "step": 2354 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 2.200889825820923, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6997339725494385, + "num_tokens": 59059845.0, + "step": 2355 + }, + { + "epoch": 0.2587305073577861, + "grad_norm": 2.431972026824951, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6945947408676147, + "num_tokens": 59080881.0, + "step": 2356 + }, + { + "epoch": 0.2588403250603997, + "grad_norm": 2.598799705505371, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7137249708175659, + "num_tokens": 59100414.0, + "step": 2357 + }, + { + "epoch": 0.2589501427630134, + "grad_norm": 2.1548166275024414, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6829513311386108, + "num_tokens": 59127257.0, + "step": 2358 + }, + { + "epoch": 0.25905996046562707, + "grad_norm": 2.1964309215545654, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6958140134811401, + "num_tokens": 59153797.0, + "step": 2359 + }, + { + "epoch": 0.2591697781682407, + "grad_norm": 1.9901597499847412, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6888079643249512, + "num_tokens": 59183493.0, + "step": 2360 + }, + { + "epoch": 0.25927959587085436, + "grad_norm": 2.6011409759521484, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7154374122619629, + "num_tokens": 59201748.0, + "step": 2361 + }, + { + "epoch": 0.25938941357346806, + "grad_norm": 2.2746758460998535, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.730492115020752, + "num_tokens": 59223973.0, + "step": 2362 + }, + { + "epoch": 0.2594992312760817, + "grad_norm": 2.340471029281616, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7027243971824646, + "num_tokens": 59247357.0, + "step": 2363 + }, + { + "epoch": 0.25960904897869536, + "grad_norm": 2.2559704780578613, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6933308839797974, + "num_tokens": 59273097.0, + "step": 2364 + }, + { + "epoch": 0.259718866681309, + "grad_norm": 2.280348300933838, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6726546287536621, + "num_tokens": 59301612.0, + "step": 2365 + }, + { + "epoch": 0.2598286843839227, + "grad_norm": 2.0853981971740723, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.684760570526123, + "num_tokens": 59330072.0, + "step": 2366 + }, + { + "epoch": 0.25993850208653635, + "grad_norm": 2.3239166736602783, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7012547254562378, + "num_tokens": 59353980.0, + "step": 2367 + }, + { + "epoch": 0.26004831978915, + "grad_norm": 2.361237049102783, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6822795867919922, + "num_tokens": 59379021.0, + "step": 2368 + }, + { + "epoch": 0.2601581374917637, + "grad_norm": 2.220665216445923, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6949104070663452, + "num_tokens": 59403805.0, + "step": 2369 + }, + { + "epoch": 0.26026795519437734, + "grad_norm": 2.5008246898651123, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6908233165740967, + "num_tokens": 59426752.0, + "step": 2370 + }, + { + "epoch": 0.260377772896991, + "grad_norm": 2.5554263591766357, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7091360688209534, + "num_tokens": 59447310.0, + "step": 2371 + }, + { + "epoch": 0.26048759059960463, + "grad_norm": 2.053528308868408, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6885552406311035, + "num_tokens": 59477182.0, + "step": 2372 + }, + { + "epoch": 0.26059740830221834, + "grad_norm": 2.1621618270874023, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7230116128921509, + "num_tokens": 59500714.0, + "step": 2373 + }, + { + "epoch": 0.260707226004832, + "grad_norm": 2.380284070968628, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.692312479019165, + "num_tokens": 59524127.0, + "step": 2374 + }, + { + "epoch": 0.26081704370744563, + "grad_norm": 2.362389326095581, + "learning_rate": 1e-06, + "loss": 1.114, + "mean_token_accuracy": 0.6679636240005493, + "num_tokens": 59548011.0, + "step": 2375 + }, + { + "epoch": 0.2609268614100593, + "grad_norm": 2.4529120922088623, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.713462233543396, + "num_tokens": 59568848.0, + "step": 2376 + }, + { + "epoch": 0.261036679112673, + "grad_norm": 2.2348971366882324, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.6998203992843628, + "num_tokens": 59594905.0, + "step": 2377 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 2.541412591934204, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7092748880386353, + "num_tokens": 59614960.0, + "step": 2378 + }, + { + "epoch": 0.26125631451790027, + "grad_norm": 2.2460947036743164, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6821126937866211, + "num_tokens": 59641423.0, + "step": 2379 + }, + { + "epoch": 0.26136613222051397, + "grad_norm": 2.2951271533966064, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6896048784255981, + "num_tokens": 59668144.0, + "step": 2380 + }, + { + "epoch": 0.2614759499231276, + "grad_norm": 2.5554163455963135, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7068201303482056, + "num_tokens": 59688882.0, + "step": 2381 + }, + { + "epoch": 0.26158576762574126, + "grad_norm": 2.3282365798950195, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7069978713989258, + "num_tokens": 59715850.0, + "step": 2382 + }, + { + "epoch": 0.2616955853283549, + "grad_norm": 2.6122796535491943, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7007828950881958, + "num_tokens": 59736177.0, + "step": 2383 + }, + { + "epoch": 0.2618054030309686, + "grad_norm": 1.9894771575927734, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6998103260993958, + "num_tokens": 59769343.0, + "step": 2384 + }, + { + "epoch": 0.26191522073358225, + "grad_norm": 2.1954524517059326, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6864808797836304, + "num_tokens": 59795130.0, + "step": 2385 + }, + { + "epoch": 0.2620250384361959, + "grad_norm": 2.2664499282836914, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7097809910774231, + "num_tokens": 59821189.0, + "step": 2386 + }, + { + "epoch": 0.2621348561388096, + "grad_norm": 2.6058380603790283, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.708228588104248, + "num_tokens": 59840184.0, + "step": 2387 + }, + { + "epoch": 0.26224467384142325, + "grad_norm": 2.332838773727417, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6903822422027588, + "num_tokens": 59863658.0, + "step": 2388 + }, + { + "epoch": 0.2623544915440369, + "grad_norm": 2.0145318508148193, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6909716129302979, + "num_tokens": 59895240.0, + "step": 2389 + }, + { + "epoch": 0.26246430924665054, + "grad_norm": 2.617191791534424, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6960828900337219, + "num_tokens": 59915979.0, + "step": 2390 + }, + { + "epoch": 0.26257412694926424, + "grad_norm": 2.505464792251587, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6991589069366455, + "num_tokens": 59937249.0, + "step": 2391 + }, + { + "epoch": 0.2626839446518779, + "grad_norm": 2.1265735626220703, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6857718825340271, + "num_tokens": 59966230.0, + "step": 2392 + }, + { + "epoch": 0.26279376235449153, + "grad_norm": 2.3314483165740967, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6857200860977173, + "num_tokens": 59990296.0, + "step": 2393 + }, + { + "epoch": 0.2629035800571052, + "grad_norm": 2.1966335773468018, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6985326409339905, + "num_tokens": 60017123.0, + "step": 2394 + }, + { + "epoch": 0.2630133977597189, + "grad_norm": 2.2811808586120605, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7125267386436462, + "num_tokens": 60040291.0, + "step": 2395 + }, + { + "epoch": 0.2631232154623325, + "grad_norm": 1.914067268371582, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6928346753120422, + "num_tokens": 60073187.0, + "step": 2396 + }, + { + "epoch": 0.2632330331649462, + "grad_norm": 2.136155128479004, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.687915563583374, + "num_tokens": 60101406.0, + "step": 2397 + }, + { + "epoch": 0.2633428508675599, + "grad_norm": 2.3115410804748535, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7048318386077881, + "num_tokens": 60125932.0, + "step": 2398 + }, + { + "epoch": 0.2634526685701735, + "grad_norm": 2.197382688522339, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.69352787733078, + "num_tokens": 60153850.0, + "step": 2399 + }, + { + "epoch": 0.26356248627278717, + "grad_norm": 2.083789825439453, + "learning_rate": 1e-06, + "loss": 1.092, + "mean_token_accuracy": 0.6723850965499878, + "num_tokens": 60185601.0, + "step": 2400 + }, + { + "epoch": 0.2636723039754008, + "grad_norm": 2.056438446044922, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6766033172607422, + "num_tokens": 60215454.0, + "step": 2401 + }, + { + "epoch": 0.2637821216780145, + "grad_norm": 2.4986116886138916, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6939884424209595, + "num_tokens": 60242888.0, + "step": 2402 + }, + { + "epoch": 0.26389193938062816, + "grad_norm": 2.09041166305542, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7164173126220703, + "num_tokens": 60270495.0, + "step": 2403 + }, + { + "epoch": 0.2640017570832418, + "grad_norm": 2.7663168907165527, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.712695837020874, + "num_tokens": 60287565.0, + "step": 2404 + }, + { + "epoch": 0.2641115747858555, + "grad_norm": 2.4439303874969482, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7150720357894897, + "num_tokens": 60309536.0, + "step": 2405 + }, + { + "epoch": 0.26422139248846915, + "grad_norm": 2.3315141201019287, + "learning_rate": 1e-06, + "loss": 1.0509, + "mean_token_accuracy": 0.6928130388259888, + "num_tokens": 60334667.0, + "step": 2406 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 2.6183714866638184, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7036001086235046, + "num_tokens": 60357408.0, + "step": 2407 + }, + { + "epoch": 0.26444102789369645, + "grad_norm": 2.2472705841064453, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7220659255981445, + "num_tokens": 60381922.0, + "step": 2408 + }, + { + "epoch": 0.26455084559631015, + "grad_norm": 2.2576820850372314, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7036177515983582, + "num_tokens": 60407493.0, + "step": 2409 + }, + { + "epoch": 0.2646606632989238, + "grad_norm": 2.086888313293457, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7138566970825195, + "num_tokens": 60436194.0, + "step": 2410 + }, + { + "epoch": 0.26477048100153744, + "grad_norm": 2.356846809387207, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6933295130729675, + "num_tokens": 60461270.0, + "step": 2411 + }, + { + "epoch": 0.2648802987041511, + "grad_norm": 2.7196786403656006, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.688651442527771, + "num_tokens": 60481394.0, + "step": 2412 + }, + { + "epoch": 0.2649901164067648, + "grad_norm": 2.4706106185913086, + "learning_rate": 1e-06, + "loss": 1.1085, + "mean_token_accuracy": 0.6667851209640503, + "num_tokens": 60507477.0, + "step": 2413 + }, + { + "epoch": 0.26509993410937843, + "grad_norm": 2.4875454902648926, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6969110369682312, + "num_tokens": 60529829.0, + "step": 2414 + }, + { + "epoch": 0.2652097518119921, + "grad_norm": 2.754398822784424, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6957380175590515, + "num_tokens": 60548062.0, + "step": 2415 + }, + { + "epoch": 0.2653195695146058, + "grad_norm": 2.3701586723327637, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7056009769439697, + "num_tokens": 60572238.0, + "step": 2416 + }, + { + "epoch": 0.2654293872172194, + "grad_norm": 2.1069893836975098, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6755213737487793, + "num_tokens": 60601618.0, + "step": 2417 + }, + { + "epoch": 0.26553920491983307, + "grad_norm": 1.9040690660476685, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.693946361541748, + "num_tokens": 60636490.0, + "step": 2418 + }, + { + "epoch": 0.2656490226224467, + "grad_norm": 2.48952317237854, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7074488401412964, + "num_tokens": 60657935.0, + "step": 2419 + }, + { + "epoch": 0.2657588403250604, + "grad_norm": 2.2856600284576416, + "learning_rate": 1e-06, + "loss": 1.1666, + "mean_token_accuracy": 0.6655049324035645, + "num_tokens": 60684294.0, + "step": 2420 + }, + { + "epoch": 0.26586865802767407, + "grad_norm": 2.155263662338257, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6986866593360901, + "num_tokens": 60710148.0, + "step": 2421 + }, + { + "epoch": 0.2659784757302877, + "grad_norm": 2.2273919582366943, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7023705840110779, + "num_tokens": 60737874.0, + "step": 2422 + }, + { + "epoch": 0.26608829343290136, + "grad_norm": 2.849334955215454, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7305083870887756, + "num_tokens": 60753983.0, + "step": 2423 + }, + { + "epoch": 0.26619811113551506, + "grad_norm": 2.240037679672241, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6858116984367371, + "num_tokens": 60779550.0, + "step": 2424 + }, + { + "epoch": 0.2663079288381287, + "grad_norm": 2.5447349548339844, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7163957357406616, + "num_tokens": 60800647.0, + "step": 2425 + }, + { + "epoch": 0.26641774654074235, + "grad_norm": 2.108607292175293, + "learning_rate": 1e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6748974323272705, + "num_tokens": 60832679.0, + "step": 2426 + }, + { + "epoch": 0.26652756424335605, + "grad_norm": 2.307910442352295, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7141090631484985, + "num_tokens": 60856992.0, + "step": 2427 + }, + { + "epoch": 0.2666373819459697, + "grad_norm": 1.9511383771896362, + "learning_rate": 1e-06, + "loss": 1.1141, + "mean_token_accuracy": 0.6732539534568787, + "num_tokens": 60892419.0, + "step": 2428 + }, + { + "epoch": 0.26674719964858334, + "grad_norm": 2.2981557846069336, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6996240019798279, + "num_tokens": 60915650.0, + "step": 2429 + }, + { + "epoch": 0.266857017351197, + "grad_norm": 2.1633050441741943, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6911614537239075, + "num_tokens": 60940419.0, + "step": 2430 + }, + { + "epoch": 0.2669668350538107, + "grad_norm": 2.368324041366577, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7004201412200928, + "num_tokens": 60964940.0, + "step": 2431 + }, + { + "epoch": 0.26707665275642434, + "grad_norm": 2.123544216156006, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7058326601982117, + "num_tokens": 60990249.0, + "step": 2432 + }, + { + "epoch": 0.267186470459038, + "grad_norm": 2.3740336894989014, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.73189777135849, + "num_tokens": 61010687.0, + "step": 2433 + }, + { + "epoch": 0.2672962881616517, + "grad_norm": 2.0444085597991943, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6869415640830994, + "num_tokens": 61043421.0, + "step": 2434 + }, + { + "epoch": 0.26740610586426533, + "grad_norm": 2.275592565536499, + "learning_rate": 1e-06, + "loss": 1.1144, + "mean_token_accuracy": 0.6691839694976807, + "num_tokens": 61071812.0, + "step": 2435 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 2.4302597045898438, + "learning_rate": 1e-06, + "loss": 1.0529, + "mean_token_accuracy": 0.6851370334625244, + "num_tokens": 61096870.0, + "step": 2436 + }, + { + "epoch": 0.2676257412694926, + "grad_norm": 2.4854912757873535, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6848863363265991, + "num_tokens": 61117165.0, + "step": 2437 + }, + { + "epoch": 0.2677355589721063, + "grad_norm": 2.1535332202911377, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6883349418640137, + "num_tokens": 61144872.0, + "step": 2438 + }, + { + "epoch": 0.26784537667471997, + "grad_norm": 2.335434675216675, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7164572477340698, + "num_tokens": 61168677.0, + "step": 2439 + }, + { + "epoch": 0.2679551943773336, + "grad_norm": 2.523993968963623, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7058366537094116, + "num_tokens": 61189793.0, + "step": 2440 + }, + { + "epoch": 0.26806501207994726, + "grad_norm": 2.2860019207000732, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6872304677963257, + "num_tokens": 61215197.0, + "step": 2441 + }, + { + "epoch": 0.26817482978256096, + "grad_norm": 2.0824379920959473, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7103132009506226, + "num_tokens": 61242925.0, + "step": 2442 + }, + { + "epoch": 0.2682846474851746, + "grad_norm": 2.1219592094421387, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7009303569793701, + "num_tokens": 61268911.0, + "step": 2443 + }, + { + "epoch": 0.26839446518778826, + "grad_norm": 2.3624701499938965, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7095326781272888, + "num_tokens": 61292919.0, + "step": 2444 + }, + { + "epoch": 0.26850428289040196, + "grad_norm": 2.1495449542999268, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6885532140731812, + "num_tokens": 61321090.0, + "step": 2445 + }, + { + "epoch": 0.2686141005930156, + "grad_norm": 2.684499740600586, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7236500978469849, + "num_tokens": 61339956.0, + "step": 2446 + }, + { + "epoch": 0.26872391829562925, + "grad_norm": 2.0316405296325684, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.696110725402832, + "num_tokens": 61369812.0, + "step": 2447 + }, + { + "epoch": 0.2688337359982429, + "grad_norm": 2.1309814453125, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6955292224884033, + "num_tokens": 61396559.0, + "step": 2448 + }, + { + "epoch": 0.2689435537008566, + "grad_norm": 2.4346108436584473, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7094835638999939, + "num_tokens": 61418976.0, + "step": 2449 + }, + { + "epoch": 0.26905337140347024, + "grad_norm": 2.357142210006714, + "learning_rate": 1e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6803935766220093, + "num_tokens": 61443801.0, + "step": 2450 + }, + { + "epoch": 0.2691631891060839, + "grad_norm": 2.2812881469726562, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7042874693870544, + "num_tokens": 61469731.0, + "step": 2451 + }, + { + "epoch": 0.26927300680869753, + "grad_norm": 2.1820926666259766, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6995038986206055, + "num_tokens": 61497085.0, + "step": 2452 + }, + { + "epoch": 0.26938282451131124, + "grad_norm": 2.221925735473633, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7023053765296936, + "num_tokens": 61523926.0, + "step": 2453 + }, + { + "epoch": 0.2694926422139249, + "grad_norm": 2.1765875816345215, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7067254781723022, + "num_tokens": 61549287.0, + "step": 2454 + }, + { + "epoch": 0.26960245991653853, + "grad_norm": 2.597806215286255, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7063827514648438, + "num_tokens": 61569054.0, + "step": 2455 + }, + { + "epoch": 0.26971227761915223, + "grad_norm": 2.045008420944214, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6986579298973083, + "num_tokens": 61599318.0, + "step": 2456 + }, + { + "epoch": 0.2698220953217659, + "grad_norm": 2.434720993041992, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7003452777862549, + "num_tokens": 61622317.0, + "step": 2457 + }, + { + "epoch": 0.2699319130243795, + "grad_norm": 2.3761608600616455, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7139534950256348, + "num_tokens": 61643932.0, + "step": 2458 + }, + { + "epoch": 0.27004173072699317, + "grad_norm": 2.5468857288360596, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.703689455986023, + "num_tokens": 61662944.0, + "step": 2459 + }, + { + "epoch": 0.27015154842960687, + "grad_norm": 1.9280118942260742, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6929923295974731, + "num_tokens": 61694104.0, + "step": 2460 + }, + { + "epoch": 0.2702613661322205, + "grad_norm": 2.273470640182495, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6867133378982544, + "num_tokens": 61720273.0, + "step": 2461 + }, + { + "epoch": 0.27037118383483416, + "grad_norm": 2.2145354747772217, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7052028775215149, + "num_tokens": 61745521.0, + "step": 2462 + }, + { + "epoch": 0.27048100153744786, + "grad_norm": 2.2057721614837646, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6912694573402405, + "num_tokens": 61770849.0, + "step": 2463 + }, + { + "epoch": 0.2705908192400615, + "grad_norm": 2.2443575859069824, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.691333532333374, + "num_tokens": 61797096.0, + "step": 2464 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 2.4234776496887207, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6892536878585815, + "num_tokens": 61818230.0, + "step": 2465 + }, + { + "epoch": 0.2708104546452888, + "grad_norm": 2.5401759147644043, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6982040405273438, + "num_tokens": 61838280.0, + "step": 2466 + }, + { + "epoch": 0.2709202723479025, + "grad_norm": 2.471249580383301, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6990326642990112, + "num_tokens": 61861938.0, + "step": 2467 + }, + { + "epoch": 0.27103009005051615, + "grad_norm": 2.2479798793792725, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6947829723358154, + "num_tokens": 61889162.0, + "step": 2468 + }, + { + "epoch": 0.2711399077531298, + "grad_norm": 2.28642201423645, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7117027044296265, + "num_tokens": 61912569.0, + "step": 2469 + }, + { + "epoch": 0.27124972545574344, + "grad_norm": 2.1015548706054688, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6879932880401611, + "num_tokens": 61942165.0, + "step": 2470 + }, + { + "epoch": 0.27135954315835714, + "grad_norm": 2.4078238010406494, + "learning_rate": 1e-06, + "loss": 1.0969, + "mean_token_accuracy": 0.6660821437835693, + "num_tokens": 61966093.0, + "step": 2471 + }, + { + "epoch": 0.2714693608609708, + "grad_norm": 2.261979103088379, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7103223204612732, + "num_tokens": 61992021.0, + "step": 2472 + }, + { + "epoch": 0.27157917856358443, + "grad_norm": 2.0263211727142334, + "learning_rate": 1e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.6655168533325195, + "num_tokens": 62025149.0, + "step": 2473 + }, + { + "epoch": 0.27168899626619814, + "grad_norm": 2.4211158752441406, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6930672526359558, + "num_tokens": 62048774.0, + "step": 2474 + }, + { + "epoch": 0.2717988139688118, + "grad_norm": 2.0174977779388428, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.709476113319397, + "num_tokens": 62078522.0, + "step": 2475 + }, + { + "epoch": 0.2719086316714254, + "grad_norm": 2.4970877170562744, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6865700483322144, + "num_tokens": 62100739.0, + "step": 2476 + }, + { + "epoch": 0.2720184493740391, + "grad_norm": 2.1913487911224365, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7025334239006042, + "num_tokens": 62128220.0, + "step": 2477 + }, + { + "epoch": 0.2721282670766528, + "grad_norm": 2.276386022567749, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7047961950302124, + "num_tokens": 62151652.0, + "step": 2478 + }, + { + "epoch": 0.2722380847792664, + "grad_norm": 2.309285879135132, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6988701820373535, + "num_tokens": 62177514.0, + "step": 2479 + }, + { + "epoch": 0.27234790248188007, + "grad_norm": 2.3571834564208984, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6991921067237854, + "num_tokens": 62201423.0, + "step": 2480 + }, + { + "epoch": 0.27245772018449377, + "grad_norm": 2.193791627883911, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.698291540145874, + "num_tokens": 62229535.0, + "step": 2481 + }, + { + "epoch": 0.2725675378871074, + "grad_norm": 1.9403948783874512, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6958374977111816, + "num_tokens": 62262678.0, + "step": 2482 + }, + { + "epoch": 0.27267735558972106, + "grad_norm": 2.4647884368896484, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6948331594467163, + "num_tokens": 62284605.0, + "step": 2483 + }, + { + "epoch": 0.2727871732923347, + "grad_norm": 2.024097204208374, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6967242956161499, + "num_tokens": 62314813.0, + "step": 2484 + }, + { + "epoch": 0.2728969909949484, + "grad_norm": 2.4663281440734863, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7123273611068726, + "num_tokens": 62335446.0, + "step": 2485 + }, + { + "epoch": 0.27300680869756205, + "grad_norm": 2.2162063121795654, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6841152906417847, + "num_tokens": 62360487.0, + "step": 2486 + }, + { + "epoch": 0.2731166264001757, + "grad_norm": 1.9265360832214355, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.67647385597229, + "num_tokens": 62395263.0, + "step": 2487 + }, + { + "epoch": 0.27322644410278935, + "grad_norm": 2.2819254398345947, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.685857892036438, + "num_tokens": 62422383.0, + "step": 2488 + }, + { + "epoch": 0.27333626180540305, + "grad_norm": 2.120673179626465, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7033573389053345, + "num_tokens": 62450750.0, + "step": 2489 + }, + { + "epoch": 0.2734460795080167, + "grad_norm": 2.103928565979004, + "learning_rate": 1e-06, + "loss": 1.0856, + "mean_token_accuracy": 0.681942343711853, + "num_tokens": 62479463.0, + "step": 2490 + }, + { + "epoch": 0.27355589721063034, + "grad_norm": 2.579594850540161, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6724303960800171, + "num_tokens": 62499917.0, + "step": 2491 + }, + { + "epoch": 0.27366571491324404, + "grad_norm": 2.3850722312927246, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7316300272941589, + "num_tokens": 62522238.0, + "step": 2492 + }, + { + "epoch": 0.2737755326158577, + "grad_norm": 2.6655433177948, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.6979357600212097, + "num_tokens": 62542940.0, + "step": 2493 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 2.231487989425659, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7274277210235596, + "num_tokens": 62567904.0, + "step": 2494 + }, + { + "epoch": 0.273995168021085, + "grad_norm": 2.5633621215820312, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7220959067344666, + "num_tokens": 62587941.0, + "step": 2495 + }, + { + "epoch": 0.2741049857236987, + "grad_norm": 2.181283950805664, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6876240968704224, + "num_tokens": 62617692.0, + "step": 2496 + }, + { + "epoch": 0.2742148034263123, + "grad_norm": 2.3910915851593018, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7115260362625122, + "num_tokens": 62641766.0, + "step": 2497 + }, + { + "epoch": 0.27432462112892597, + "grad_norm": 2.077923059463501, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7019668817520142, + "num_tokens": 62672857.0, + "step": 2498 + }, + { + "epoch": 0.2744344388315396, + "grad_norm": 2.285283327102661, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6879576444625854, + "num_tokens": 62700371.0, + "step": 2499 + }, + { + "epoch": 0.2745442565341533, + "grad_norm": 2.3403897285461426, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7034189105033875, + "num_tokens": 62724578.0, + "step": 2500 + }, + { + "epoch": 0.27465407423676697, + "grad_norm": 2.3436129093170166, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7048070430755615, + "num_tokens": 62750275.0, + "step": 2501 + }, + { + "epoch": 0.2747638919393806, + "grad_norm": 2.597609281539917, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7173845171928406, + "num_tokens": 62769568.0, + "step": 2502 + }, + { + "epoch": 0.2748737096419943, + "grad_norm": 2.4946484565734863, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6842644214630127, + "num_tokens": 62793016.0, + "step": 2503 + }, + { + "epoch": 0.27498352734460796, + "grad_norm": 2.1969239711761475, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6910836696624756, + "num_tokens": 62820006.0, + "step": 2504 + }, + { + "epoch": 0.2750933450472216, + "grad_norm": 2.166902780532837, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7026019096374512, + "num_tokens": 62845789.0, + "step": 2505 + }, + { + "epoch": 0.27520316274983525, + "grad_norm": 2.2259750366210938, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7030452489852905, + "num_tokens": 62873953.0, + "step": 2506 + }, + { + "epoch": 0.27531298045244895, + "grad_norm": 2.3172948360443115, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7038744688034058, + "num_tokens": 62898314.0, + "step": 2507 + }, + { + "epoch": 0.2754227981550626, + "grad_norm": 2.5378551483154297, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.701029896736145, + "num_tokens": 62919220.0, + "step": 2508 + }, + { + "epoch": 0.27553261585767624, + "grad_norm": 2.348896026611328, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6991622447967529, + "num_tokens": 62942923.0, + "step": 2509 + }, + { + "epoch": 0.27564243356028995, + "grad_norm": 2.443913698196411, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6742026805877686, + "num_tokens": 62964774.0, + "step": 2510 + }, + { + "epoch": 0.2757522512629036, + "grad_norm": 1.9516301155090332, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6814855933189392, + "num_tokens": 62996460.0, + "step": 2511 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.3135221004486084, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.697135329246521, + "num_tokens": 63020758.0, + "step": 2512 + }, + { + "epoch": 0.2759718866681309, + "grad_norm": 2.335789918899536, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.7017309665679932, + "num_tokens": 63044936.0, + "step": 2513 + }, + { + "epoch": 0.2760817043707446, + "grad_norm": 2.4278125762939453, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7034146785736084, + "num_tokens": 63068720.0, + "step": 2514 + }, + { + "epoch": 0.27619152207335823, + "grad_norm": 2.557513952255249, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7208883166313171, + "num_tokens": 63089878.0, + "step": 2515 + }, + { + "epoch": 0.2763013397759719, + "grad_norm": 2.236833333969116, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6923531889915466, + "num_tokens": 63116820.0, + "step": 2516 + }, + { + "epoch": 0.2764111574785855, + "grad_norm": 2.5296123027801514, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6960722804069519, + "num_tokens": 63139628.0, + "step": 2517 + }, + { + "epoch": 0.2765209751811992, + "grad_norm": 1.9560940265655518, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6990483999252319, + "num_tokens": 63173823.0, + "step": 2518 + }, + { + "epoch": 0.27663079288381287, + "grad_norm": 2.4367868900299072, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.707902193069458, + "num_tokens": 63195793.0, + "step": 2519 + }, + { + "epoch": 0.2767406105864265, + "grad_norm": 2.197220802307129, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6898152828216553, + "num_tokens": 63224023.0, + "step": 2520 + }, + { + "epoch": 0.2768504282890402, + "grad_norm": 2.3439853191375732, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6944984197616577, + "num_tokens": 63248175.0, + "step": 2521 + }, + { + "epoch": 0.27696024599165386, + "grad_norm": 2.4113590717315674, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7130087614059448, + "num_tokens": 63270235.0, + "step": 2522 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 2.1962039470672607, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6964988112449646, + "num_tokens": 63296997.0, + "step": 2523 + }, + { + "epoch": 0.27717988139688116, + "grad_norm": 2.550428628921509, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7040120363235474, + "num_tokens": 63317531.0, + "step": 2524 + }, + { + "epoch": 0.27728969909949486, + "grad_norm": 2.2830238342285156, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7080936431884766, + "num_tokens": 63341424.0, + "step": 2525 + }, + { + "epoch": 0.2773995168021085, + "grad_norm": 2.6233012676239014, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6936851739883423, + "num_tokens": 63360485.0, + "step": 2526 + }, + { + "epoch": 0.27750933450472215, + "grad_norm": 2.0465052127838135, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6970464587211609, + "num_tokens": 63392385.0, + "step": 2527 + }, + { + "epoch": 0.2776191522073358, + "grad_norm": 2.0906693935394287, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7026702761650085, + "num_tokens": 63422767.0, + "step": 2528 + }, + { + "epoch": 0.2777289699099495, + "grad_norm": 2.208416223526001, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7059550881385803, + "num_tokens": 63450116.0, + "step": 2529 + }, + { + "epoch": 0.27783878761256314, + "grad_norm": 2.333980083465576, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7219762206077576, + "num_tokens": 63471967.0, + "step": 2530 + }, + { + "epoch": 0.2779486053151768, + "grad_norm": 2.5617241859436035, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7127008438110352, + "num_tokens": 63490999.0, + "step": 2531 + }, + { + "epoch": 0.2780584230177905, + "grad_norm": 2.1656904220581055, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7038822174072266, + "num_tokens": 63517178.0, + "step": 2532 + }, + { + "epoch": 0.27816824072040414, + "grad_norm": 2.377002239227295, + "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6651405096054077, + "num_tokens": 63542336.0, + "step": 2533 + }, + { + "epoch": 0.2782780584230178, + "grad_norm": 2.558804988861084, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7084808349609375, + "num_tokens": 63565143.0, + "step": 2534 + }, + { + "epoch": 0.27838787612563143, + "grad_norm": 2.148761034011841, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.734700083732605, + "num_tokens": 63589473.0, + "step": 2535 + }, + { + "epoch": 0.27849769382824513, + "grad_norm": 2.2105863094329834, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.720360517501831, + "num_tokens": 63614237.0, + "step": 2536 + }, + { + "epoch": 0.2786075115308588, + "grad_norm": 2.3818414211273193, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7125314474105835, + "num_tokens": 63635754.0, + "step": 2537 + }, + { + "epoch": 0.2787173292334724, + "grad_norm": 2.3299436569213867, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7028939723968506, + "num_tokens": 63660279.0, + "step": 2538 + }, + { + "epoch": 0.2788271469360861, + "grad_norm": 2.0269172191619873, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6890572905540466, + "num_tokens": 63692201.0, + "step": 2539 + }, + { + "epoch": 0.27893696463869977, + "grad_norm": 2.383056640625, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6951316595077515, + "num_tokens": 63715306.0, + "step": 2540 + }, + { + "epoch": 0.2790467823413134, + "grad_norm": 2.039186716079712, + "learning_rate": 1e-06, + "loss": 1.0773, + "mean_token_accuracy": 0.6848583817481995, + "num_tokens": 63745027.0, + "step": 2541 + }, + { + "epoch": 0.27915660004392706, + "grad_norm": 2.320502996444702, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6835550665855408, + "num_tokens": 63769129.0, + "step": 2542 + }, + { + "epoch": 0.27926641774654076, + "grad_norm": 2.0814411640167236, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7078903913497925, + "num_tokens": 63796524.0, + "step": 2543 + }, + { + "epoch": 0.2793762354491544, + "grad_norm": 2.404895067214966, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6890077590942383, + "num_tokens": 63819820.0, + "step": 2544 + }, + { + "epoch": 0.27948605315176805, + "grad_norm": 2.1051113605499268, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6979395151138306, + "num_tokens": 63848573.0, + "step": 2545 + }, + { + "epoch": 0.2795958708543817, + "grad_norm": 2.462646961212158, + "learning_rate": 1e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6803231239318848, + "num_tokens": 63870211.0, + "step": 2546 + }, + { + "epoch": 0.2797056885569954, + "grad_norm": 2.344879150390625, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6855939626693726, + "num_tokens": 63893650.0, + "step": 2547 + }, + { + "epoch": 0.27981550625960905, + "grad_norm": 2.1582677364349365, + "learning_rate": 1e-06, + "loss": 1.1139, + "mean_token_accuracy": 0.6662383675575256, + "num_tokens": 63924111.0, + "step": 2548 + }, + { + "epoch": 0.2799253239622227, + "grad_norm": 2.243905544281006, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7079119682312012, + "num_tokens": 63950048.0, + "step": 2549 + }, + { + "epoch": 0.2800351416648364, + "grad_norm": 2.4060771465301514, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.7028037309646606, + "num_tokens": 63973585.0, + "step": 2550 + }, + { + "epoch": 0.28014495936745004, + "grad_norm": 2.5836894512176514, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6899716854095459, + "num_tokens": 63993742.0, + "step": 2551 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 2.3346285820007324, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6898322105407715, + "num_tokens": 64016513.0, + "step": 2552 + }, + { + "epoch": 0.28036459477267733, + "grad_norm": 2.1381287574768066, + "learning_rate": 1e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.6732533574104309, + "num_tokens": 64045492.0, + "step": 2553 + }, + { + "epoch": 0.28047441247529104, + "grad_norm": 2.031233787536621, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6860792636871338, + "num_tokens": 64076354.0, + "step": 2554 + }, + { + "epoch": 0.2805842301779047, + "grad_norm": 2.4678192138671875, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7098268270492554, + "num_tokens": 64097796.0, + "step": 2555 + }, + { + "epoch": 0.2806940478805183, + "grad_norm": 2.391295909881592, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6924676895141602, + "num_tokens": 64120305.0, + "step": 2556 + }, + { + "epoch": 0.28080386558313203, + "grad_norm": 2.223444700241089, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.69622802734375, + "num_tokens": 64147228.0, + "step": 2557 + }, + { + "epoch": 0.2809136832857457, + "grad_norm": 2.469614028930664, + "learning_rate": 1e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6738846302032471, + "num_tokens": 64170147.0, + "step": 2558 + }, + { + "epoch": 0.2810235009883593, + "grad_norm": 2.2788491249084473, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7229791879653931, + "num_tokens": 64192940.0, + "step": 2559 + }, + { + "epoch": 0.28113331869097297, + "grad_norm": 2.4038939476013184, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7118465900421143, + "num_tokens": 64215554.0, + "step": 2560 + }, + { + "epoch": 0.28124313639358667, + "grad_norm": 2.202118158340454, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7119928598403931, + "num_tokens": 64240176.0, + "step": 2561 + }, + { + "epoch": 0.2813529540962003, + "grad_norm": 2.1798300743103027, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6828758716583252, + "num_tokens": 64267281.0, + "step": 2562 + }, + { + "epoch": 0.28146277179881396, + "grad_norm": 2.674238681793213, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7110265493392944, + "num_tokens": 64286428.0, + "step": 2563 + }, + { + "epoch": 0.2815725895014276, + "grad_norm": 2.1746344566345215, + "learning_rate": 1e-06, + "loss": 1.063, + "mean_token_accuracy": 0.6795057058334351, + "num_tokens": 64315845.0, + "step": 2564 + }, + { + "epoch": 0.2816824072040413, + "grad_norm": 2.9913434982299805, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7090224027633667, + "num_tokens": 64338551.0, + "step": 2565 + }, + { + "epoch": 0.28179222490665495, + "grad_norm": 2.4214537143707275, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7151033878326416, + "num_tokens": 64360251.0, + "step": 2566 + }, + { + "epoch": 0.2819020426092686, + "grad_norm": 2.4725632667541504, + "learning_rate": 1e-06, + "loss": 1.0929, + "mean_token_accuracy": 0.6798514723777771, + "num_tokens": 64382487.0, + "step": 2567 + }, + { + "epoch": 0.2820118603118823, + "grad_norm": 2.124889373779297, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6856669783592224, + "num_tokens": 64408212.0, + "step": 2568 + }, + { + "epoch": 0.28212167801449595, + "grad_norm": 2.454655885696411, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7149534821510315, + "num_tokens": 64429131.0, + "step": 2569 + }, + { + "epoch": 0.2822314957171096, + "grad_norm": 2.2646260261535645, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.696403980255127, + "num_tokens": 64453607.0, + "step": 2570 + }, + { + "epoch": 0.28234131341972324, + "grad_norm": 2.3986077308654785, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7080837488174438, + "num_tokens": 64475357.0, + "step": 2571 + }, + { + "epoch": 0.28245113112233694, + "grad_norm": 2.420870065689087, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7089701890945435, + "num_tokens": 64498150.0, + "step": 2572 + }, + { + "epoch": 0.2825609488249506, + "grad_norm": 2.217012643814087, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7292789816856384, + "num_tokens": 64521043.0, + "step": 2573 + }, + { + "epoch": 0.28267076652756423, + "grad_norm": 2.5395119190216064, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6822984218597412, + "num_tokens": 64543043.0, + "step": 2574 + }, + { + "epoch": 0.2827805842301779, + "grad_norm": 2.1989312171936035, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.687515377998352, + "num_tokens": 64568288.0, + "step": 2575 + }, + { + "epoch": 0.2828904019327916, + "grad_norm": 2.3473453521728516, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7030456066131592, + "num_tokens": 64591352.0, + "step": 2576 + }, + { + "epoch": 0.2830002196354052, + "grad_norm": 2.489729166030884, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7064014673233032, + "num_tokens": 64613184.0, + "step": 2577 + }, + { + "epoch": 0.28311003733801887, + "grad_norm": 2.1905713081359863, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.7015383243560791, + "num_tokens": 64639913.0, + "step": 2578 + }, + { + "epoch": 0.2832198550406326, + "grad_norm": 2.3770928382873535, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7038884162902832, + "num_tokens": 64663938.0, + "step": 2579 + }, + { + "epoch": 0.2833296727432462, + "grad_norm": 2.3090548515319824, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6827492713928223, + "num_tokens": 64690611.0, + "step": 2580 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 2.1336240768432617, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.685678243637085, + "num_tokens": 64717362.0, + "step": 2581 + }, + { + "epoch": 0.2835493081484735, + "grad_norm": 2.119955062866211, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7085974216461182, + "num_tokens": 64746999.0, + "step": 2582 + }, + { + "epoch": 0.2836591258510872, + "grad_norm": 2.4354562759399414, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7122130990028381, + "num_tokens": 64768888.0, + "step": 2583 + }, + { + "epoch": 0.28376894355370086, + "grad_norm": 2.167799949645996, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7166332602500916, + "num_tokens": 64793809.0, + "step": 2584 + }, + { + "epoch": 0.2838787612563145, + "grad_norm": 2.5741164684295654, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.693329393863678, + "num_tokens": 64814563.0, + "step": 2585 + }, + { + "epoch": 0.2839885789589282, + "grad_norm": 2.1654882431030273, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.713374674320221, + "num_tokens": 64842644.0, + "step": 2586 + }, + { + "epoch": 0.28409839666154185, + "grad_norm": 1.8533285856246948, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7089892625808716, + "num_tokens": 64875877.0, + "step": 2587 + }, + { + "epoch": 0.2842082143641555, + "grad_norm": 2.285005807876587, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.698514461517334, + "num_tokens": 64900171.0, + "step": 2588 + }, + { + "epoch": 0.28431803206676914, + "grad_norm": 2.4235172271728516, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7001399993896484, + "num_tokens": 64922250.0, + "step": 2589 + }, + { + "epoch": 0.28442784976938285, + "grad_norm": 2.1235740184783936, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7021373510360718, + "num_tokens": 64950920.0, + "step": 2590 + }, + { + "epoch": 0.2845376674719965, + "grad_norm": 2.184873580932617, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6969253420829773, + "num_tokens": 64975935.0, + "step": 2591 + }, + { + "epoch": 0.28464748517461014, + "grad_norm": 2.1849496364593506, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6849024295806885, + "num_tokens": 65002205.0, + "step": 2592 + }, + { + "epoch": 0.2847573028772238, + "grad_norm": 2.2101762294769287, + "learning_rate": 1e-06, + "loss": 1.0964, + "mean_token_accuracy": 0.6743543744087219, + "num_tokens": 65031795.0, + "step": 2593 + }, + { + "epoch": 0.2848671205798375, + "grad_norm": 2.3503379821777344, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6965600848197937, + "num_tokens": 65055055.0, + "step": 2594 + }, + { + "epoch": 0.28497693828245113, + "grad_norm": 2.4512462615966797, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6924482583999634, + "num_tokens": 65078306.0, + "step": 2595 + }, + { + "epoch": 0.2850867559850648, + "grad_norm": 2.344289779663086, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6972854733467102, + "num_tokens": 65101759.0, + "step": 2596 + }, + { + "epoch": 0.2851965736876785, + "grad_norm": 2.2754669189453125, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7119957804679871, + "num_tokens": 65127028.0, + "step": 2597 + }, + { + "epoch": 0.2853063913902921, + "grad_norm": 2.11063814163208, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6995940208435059, + "num_tokens": 65155175.0, + "step": 2598 + }, + { + "epoch": 0.28541620909290577, + "grad_norm": 2.3382699489593506, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7136101722717285, + "num_tokens": 65178831.0, + "step": 2599 + }, + { + "epoch": 0.2855260267955194, + "grad_norm": 2.20278263092041, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6918178796768188, + "num_tokens": 65206690.0, + "step": 2600 + }, + { + "epoch": 0.2856358444981331, + "grad_norm": 2.1995935440063477, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7085243463516235, + "num_tokens": 65232863.0, + "step": 2601 + }, + { + "epoch": 0.28574566220074676, + "grad_norm": 2.265028238296509, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7019176483154297, + "num_tokens": 65256517.0, + "step": 2602 + }, + { + "epoch": 0.2858554799033604, + "grad_norm": 2.252054452896118, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7209088802337646, + "num_tokens": 65279968.0, + "step": 2603 + }, + { + "epoch": 0.28596529760597406, + "grad_norm": 2.4655256271362305, + "learning_rate": 1e-06, + "loss": 1.0961, + "mean_token_accuracy": 0.6785789728164673, + "num_tokens": 65302581.0, + "step": 2604 + }, + { + "epoch": 0.28607511530858776, + "grad_norm": 2.1282336711883545, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7020725607872009, + "num_tokens": 65330361.0, + "step": 2605 + }, + { + "epoch": 0.2861849330112014, + "grad_norm": 2.465792655944824, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7049086093902588, + "num_tokens": 65355890.0, + "step": 2606 + }, + { + "epoch": 0.28629475071381505, + "grad_norm": 2.2424614429473877, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.711144745349884, + "num_tokens": 65378983.0, + "step": 2607 + }, + { + "epoch": 0.28640456841642875, + "grad_norm": 2.3165502548217773, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6909145712852478, + "num_tokens": 65404013.0, + "step": 2608 + }, + { + "epoch": 0.2865143861190424, + "grad_norm": 2.4346954822540283, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7191210985183716, + "num_tokens": 65425494.0, + "step": 2609 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 2.476713180541992, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7067176103591919, + "num_tokens": 65446396.0, + "step": 2610 + }, + { + "epoch": 0.2867340215242697, + "grad_norm": 2.1347005367279053, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6946839690208435, + "num_tokens": 65474844.0, + "step": 2611 + }, + { + "epoch": 0.2868438392268834, + "grad_norm": 2.0819661617279053, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.708479642868042, + "num_tokens": 65504189.0, + "step": 2612 + }, + { + "epoch": 0.28695365692949704, + "grad_norm": 2.5333504676818848, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7075803279876709, + "num_tokens": 65525338.0, + "step": 2613 + }, + { + "epoch": 0.2870634746321107, + "grad_norm": 2.3749587535858154, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6904021501541138, + "num_tokens": 65549169.0, + "step": 2614 + }, + { + "epoch": 0.2871732923347244, + "grad_norm": 2.3666512966156006, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7064148187637329, + "num_tokens": 65573913.0, + "step": 2615 + }, + { + "epoch": 0.28728311003733803, + "grad_norm": 2.3421475887298584, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6789764165878296, + "num_tokens": 65598759.0, + "step": 2616 + }, + { + "epoch": 0.2873929277399517, + "grad_norm": 2.5616683959960938, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6877020001411438, + "num_tokens": 65618801.0, + "step": 2617 + }, + { + "epoch": 0.2875027454425653, + "grad_norm": 2.4831807613372803, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6897768974304199, + "num_tokens": 65640308.0, + "step": 2618 + }, + { + "epoch": 0.287612563145179, + "grad_norm": 2.604853630065918, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6943041682243347, + "num_tokens": 65662990.0, + "step": 2619 + }, + { + "epoch": 0.28772238084779267, + "grad_norm": 2.231606960296631, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6872241497039795, + "num_tokens": 65687815.0, + "step": 2620 + }, + { + "epoch": 0.2878321985504063, + "grad_norm": 2.3980460166931152, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7207512855529785, + "num_tokens": 65710248.0, + "step": 2621 + }, + { + "epoch": 0.28794201625301996, + "grad_norm": 2.3978188037872314, + "learning_rate": 1e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.6790677309036255, + "num_tokens": 65736365.0, + "step": 2622 + }, + { + "epoch": 0.28805183395563366, + "grad_norm": 2.4127891063690186, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7282241582870483, + "num_tokens": 65757697.0, + "step": 2623 + }, + { + "epoch": 0.2881616516582473, + "grad_norm": 2.5034945011138916, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6922196745872498, + "num_tokens": 65780730.0, + "step": 2624 + }, + { + "epoch": 0.28827146936086095, + "grad_norm": 2.256675958633423, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7054839134216309, + "num_tokens": 65805051.0, + "step": 2625 + }, + { + "epoch": 0.28838128706347466, + "grad_norm": 2.2367687225341797, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6932440996170044, + "num_tokens": 65830867.0, + "step": 2626 + }, + { + "epoch": 0.2884911047660883, + "grad_norm": 2.3624820709228516, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7069351673126221, + "num_tokens": 65853967.0, + "step": 2627 + }, + { + "epoch": 0.28860092246870195, + "grad_norm": 2.2050647735595703, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7039642930030823, + "num_tokens": 65879365.0, + "step": 2628 + }, + { + "epoch": 0.2887107401713156, + "grad_norm": 2.1581225395202637, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6859931349754333, + "num_tokens": 65906854.0, + "step": 2629 + }, + { + "epoch": 0.2888205578739293, + "grad_norm": 2.457965612411499, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7154684662818909, + "num_tokens": 65927570.0, + "step": 2630 + }, + { + "epoch": 0.28893037557654294, + "grad_norm": 2.518930673599243, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6812528967857361, + "num_tokens": 65953707.0, + "step": 2631 + }, + { + "epoch": 0.2890401932791566, + "grad_norm": 2.239863395690918, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6951415538787842, + "num_tokens": 65980632.0, + "step": 2632 + }, + { + "epoch": 0.2891500109817703, + "grad_norm": 2.2482709884643555, + "learning_rate": 1e-06, + "loss": 1.1091, + "mean_token_accuracy": 0.6754545569419861, + "num_tokens": 66008518.0, + "step": 2633 + }, + { + "epoch": 0.28925982868438394, + "grad_norm": 2.187464475631714, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7017143964767456, + "num_tokens": 66037041.0, + "step": 2634 + }, + { + "epoch": 0.2893696463869976, + "grad_norm": 2.2581465244293213, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6992254257202148, + "num_tokens": 66062326.0, + "step": 2635 + }, + { + "epoch": 0.2894794640896112, + "grad_norm": 2.3590211868286133, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.696553111076355, + "num_tokens": 66085354.0, + "step": 2636 + }, + { + "epoch": 0.28958928179222493, + "grad_norm": 2.3231475353240967, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.7014757394790649, + "num_tokens": 66109592.0, + "step": 2637 + }, + { + "epoch": 0.2896990994948386, + "grad_norm": 2.39042067527771, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7004157304763794, + "num_tokens": 66133184.0, + "step": 2638 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 2.2122738361358643, + "learning_rate": 1e-06, + "loss": 1.0746, + "mean_token_accuracy": 0.6803731918334961, + "num_tokens": 66160970.0, + "step": 2639 + }, + { + "epoch": 0.28991873490006587, + "grad_norm": 2.2309272289276123, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7010499238967896, + "num_tokens": 66186945.0, + "step": 2640 + }, + { + "epoch": 0.29002855260267957, + "grad_norm": 2.225118637084961, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7004371881484985, + "num_tokens": 66212306.0, + "step": 2641 + }, + { + "epoch": 0.2901383703052932, + "grad_norm": 1.9306273460388184, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7205968499183655, + "num_tokens": 66244578.0, + "step": 2642 + }, + { + "epoch": 0.29024818800790686, + "grad_norm": 2.2134547233581543, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7179558873176575, + "num_tokens": 66269741.0, + "step": 2643 + }, + { + "epoch": 0.29035800571052056, + "grad_norm": 2.434492588043213, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.715821385383606, + "num_tokens": 66290569.0, + "step": 2644 + }, + { + "epoch": 0.2904678234131342, + "grad_norm": 2.326568126678467, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6969257593154907, + "num_tokens": 66315596.0, + "step": 2645 + }, + { + "epoch": 0.29057764111574785, + "grad_norm": 2.3479044437408447, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6818238496780396, + "num_tokens": 66339256.0, + "step": 2646 + }, + { + "epoch": 0.2906874588183615, + "grad_norm": 2.0553290843963623, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6867220401763916, + "num_tokens": 66370156.0, + "step": 2647 + }, + { + "epoch": 0.2907972765209752, + "grad_norm": 2.1579487323760986, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7085933685302734, + "num_tokens": 66395984.0, + "step": 2648 + }, + { + "epoch": 0.29090709422358885, + "grad_norm": 2.8181040287017822, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7114730477333069, + "num_tokens": 66415515.0, + "step": 2649 + }, + { + "epoch": 0.2910169119262025, + "grad_norm": 2.1413261890411377, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6938838958740234, + "num_tokens": 66443794.0, + "step": 2650 + }, + { + "epoch": 0.29112672962881614, + "grad_norm": 2.3844780921936035, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6885999441146851, + "num_tokens": 66469294.0, + "step": 2651 + }, + { + "epoch": 0.29123654733142984, + "grad_norm": 2.401057720184326, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7027080059051514, + "num_tokens": 66493423.0, + "step": 2652 + }, + { + "epoch": 0.2913463650340435, + "grad_norm": 2.439764976501465, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7044031620025635, + "num_tokens": 66514333.0, + "step": 2653 + }, + { + "epoch": 0.29145618273665713, + "grad_norm": 2.5753912925720215, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7225775718688965, + "num_tokens": 66534450.0, + "step": 2654 + }, + { + "epoch": 0.29156600043927083, + "grad_norm": 2.210381031036377, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7037526965141296, + "num_tokens": 66560602.0, + "step": 2655 + }, + { + "epoch": 0.2916758181418845, + "grad_norm": 2.6624956130981445, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7047781944274902, + "num_tokens": 66580777.0, + "step": 2656 + }, + { + "epoch": 0.2917856358444981, + "grad_norm": 2.3015336990356445, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6922491788864136, + "num_tokens": 66606170.0, + "step": 2657 + }, + { + "epoch": 0.29189545354711177, + "grad_norm": 2.421741008758545, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7018309831619263, + "num_tokens": 66628261.0, + "step": 2658 + }, + { + "epoch": 0.2920052712497255, + "grad_norm": 2.3852336406707764, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.696081280708313, + "num_tokens": 66650930.0, + "step": 2659 + }, + { + "epoch": 0.2921150889523391, + "grad_norm": 2.39554762840271, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7192145586013794, + "num_tokens": 66672131.0, + "step": 2660 + }, + { + "epoch": 0.29222490665495277, + "grad_norm": 2.1778080463409424, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.693168044090271, + "num_tokens": 66696149.0, + "step": 2661 + }, + { + "epoch": 0.29233472435756647, + "grad_norm": 2.306645154953003, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7121438980102539, + "num_tokens": 66721123.0, + "step": 2662 + }, + { + "epoch": 0.2924445420601801, + "grad_norm": 2.332044839859009, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6905420422554016, + "num_tokens": 66745119.0, + "step": 2663 + }, + { + "epoch": 0.29255435976279376, + "grad_norm": 2.2164647579193115, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7239142656326294, + "num_tokens": 66770136.0, + "step": 2664 + }, + { + "epoch": 0.2926641774654074, + "grad_norm": 2.4978251457214355, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7029572129249573, + "num_tokens": 66791538.0, + "step": 2665 + }, + { + "epoch": 0.2927739951680211, + "grad_norm": 2.378448009490967, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7120785117149353, + "num_tokens": 66812807.0, + "step": 2666 + }, + { + "epoch": 0.29288381287063475, + "grad_norm": 2.1537225246429443, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6890493631362915, + "num_tokens": 66838895.0, + "step": 2667 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 2.1922714710235596, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.6917937397956848, + "num_tokens": 66866313.0, + "step": 2668 + }, + { + "epoch": 0.29310344827586204, + "grad_norm": 2.0925605297088623, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6951127648353577, + "num_tokens": 66894653.0, + "step": 2669 + }, + { + "epoch": 0.29321326597847575, + "grad_norm": 2.711207151412964, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7095353603363037, + "num_tokens": 66912493.0, + "step": 2670 + }, + { + "epoch": 0.2933230836810894, + "grad_norm": 2.1809284687042236, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7045791149139404, + "num_tokens": 66937985.0, + "step": 2671 + }, + { + "epoch": 0.29343290138370304, + "grad_norm": 2.136561393737793, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6861788630485535, + "num_tokens": 66966178.0, + "step": 2672 + }, + { + "epoch": 0.29354271908631674, + "grad_norm": 2.589411973953247, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7026921510696411, + "num_tokens": 66984534.0, + "step": 2673 + }, + { + "epoch": 0.2936525367889304, + "grad_norm": 2.149893283843994, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7092668414115906, + "num_tokens": 67011256.0, + "step": 2674 + }, + { + "epoch": 0.29376235449154403, + "grad_norm": 2.2825844287872314, + "learning_rate": 1e-06, + "loss": 1.1126, + "mean_token_accuracy": 0.6721392869949341, + "num_tokens": 67037817.0, + "step": 2675 + }, + { + "epoch": 0.2938721721941577, + "grad_norm": 2.4725005626678467, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.695055365562439, + "num_tokens": 67060216.0, + "step": 2676 + }, + { + "epoch": 0.2939819898967714, + "grad_norm": 2.19077205657959, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6906362771987915, + "num_tokens": 67089815.0, + "step": 2677 + }, + { + "epoch": 0.294091807599385, + "grad_norm": 2.5438392162323, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7054920196533203, + "num_tokens": 67110331.0, + "step": 2678 + }, + { + "epoch": 0.29420162530199867, + "grad_norm": 2.5531303882598877, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7082390785217285, + "num_tokens": 67129368.0, + "step": 2679 + }, + { + "epoch": 0.2943114430046123, + "grad_norm": 2.8322925567626953, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6942650079727173, + "num_tokens": 67147485.0, + "step": 2680 + }, + { + "epoch": 0.294421260707226, + "grad_norm": 2.181244134902954, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6888774633407593, + "num_tokens": 67175008.0, + "step": 2681 + }, + { + "epoch": 0.29453107840983966, + "grad_norm": 2.471189022064209, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6913225054740906, + "num_tokens": 67194955.0, + "step": 2682 + }, + { + "epoch": 0.2946408961124533, + "grad_norm": 2.3977882862091064, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7141401767730713, + "num_tokens": 67217930.0, + "step": 2683 + }, + { + "epoch": 0.294750713815067, + "grad_norm": 2.2737481594085693, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6945769786834717, + "num_tokens": 67242228.0, + "step": 2684 + }, + { + "epoch": 0.29486053151768066, + "grad_norm": 2.364610433578491, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6757358908653259, + "num_tokens": 67264237.0, + "step": 2685 + }, + { + "epoch": 0.2949703492202943, + "grad_norm": 2.362149238586426, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7209352254867554, + "num_tokens": 67287008.0, + "step": 2686 + }, + { + "epoch": 0.29508016692290795, + "grad_norm": 2.394469738006592, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7046526670455933, + "num_tokens": 67310138.0, + "step": 2687 + }, + { + "epoch": 0.29518998462552165, + "grad_norm": 2.2292022705078125, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6968231201171875, + "num_tokens": 67335809.0, + "step": 2688 + }, + { + "epoch": 0.2952998023281353, + "grad_norm": 2.127657413482666, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7018745541572571, + "num_tokens": 67362413.0, + "step": 2689 + }, + { + "epoch": 0.29540962003074894, + "grad_norm": 2.7772247791290283, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6965017318725586, + "num_tokens": 67379935.0, + "step": 2690 + }, + { + "epoch": 0.29551943773336264, + "grad_norm": 2.3062877655029297, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6870512962341309, + "num_tokens": 67405331.0, + "step": 2691 + }, + { + "epoch": 0.2956292554359763, + "grad_norm": 2.276045560836792, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7073605060577393, + "num_tokens": 67429646.0, + "step": 2692 + }, + { + "epoch": 0.29573907313858994, + "grad_norm": 2.1313869953155518, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6831750869750977, + "num_tokens": 67458805.0, + "step": 2693 + }, + { + "epoch": 0.2958488908412036, + "grad_norm": 2.1938633918762207, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7005568742752075, + "num_tokens": 67485589.0, + "step": 2694 + }, + { + "epoch": 0.2959587085438173, + "grad_norm": 2.350989818572998, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7027561664581299, + "num_tokens": 67508440.0, + "step": 2695 + }, + { + "epoch": 0.29606852624643093, + "grad_norm": 2.2218332290649414, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.733148455619812, + "num_tokens": 67533730.0, + "step": 2696 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 2.2419323921203613, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6997493505477905, + "num_tokens": 67559002.0, + "step": 2697 + }, + { + "epoch": 0.2962881616516582, + "grad_norm": 2.040300130844116, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6942190527915955, + "num_tokens": 67588023.0, + "step": 2698 + }, + { + "epoch": 0.2963979793542719, + "grad_norm": 2.7294669151306152, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.685848593711853, + "num_tokens": 67607760.0, + "step": 2699 + }, + { + "epoch": 0.29650779705688557, + "grad_norm": 2.2063779830932617, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6969576478004456, + "num_tokens": 67632897.0, + "step": 2700 + }, + { + "epoch": 0.2966176147594992, + "grad_norm": 2.1378872394561768, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6867492198944092, + "num_tokens": 67660160.0, + "step": 2701 + }, + { + "epoch": 0.2967274324621129, + "grad_norm": 2.2658133506774902, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7209278345108032, + "num_tokens": 67684084.0, + "step": 2702 + }, + { + "epoch": 0.29683725016472656, + "grad_norm": 3.2194347381591797, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7246819138526917, + "num_tokens": 67697924.0, + "step": 2703 + }, + { + "epoch": 0.2969470678673402, + "grad_norm": 2.0932111740112305, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7220280170440674, + "num_tokens": 67723038.0, + "step": 2704 + }, + { + "epoch": 0.29705688556995385, + "grad_norm": 2.5271453857421875, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6923626661300659, + "num_tokens": 67744223.0, + "step": 2705 + }, + { + "epoch": 0.29716670327256756, + "grad_norm": 2.5791642665863037, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6887451410293579, + "num_tokens": 67765161.0, + "step": 2706 + }, + { + "epoch": 0.2972765209751812, + "grad_norm": 2.368595600128174, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6956331729888916, + "num_tokens": 67788722.0, + "step": 2707 + }, + { + "epoch": 0.29738633867779485, + "grad_norm": 2.370229721069336, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7092419862747192, + "num_tokens": 67812841.0, + "step": 2708 + }, + { + "epoch": 0.29749615638040855, + "grad_norm": 2.448674201965332, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.710774302482605, + "num_tokens": 67833755.0, + "step": 2709 + }, + { + "epoch": 0.2976059740830222, + "grad_norm": 2.223965644836426, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.687526285648346, + "num_tokens": 67861111.0, + "step": 2710 + }, + { + "epoch": 0.29771579178563584, + "grad_norm": 2.2156121730804443, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6958503127098083, + "num_tokens": 67889330.0, + "step": 2711 + }, + { + "epoch": 0.2978256094882495, + "grad_norm": 2.2682552337646484, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.692184567451477, + "num_tokens": 67914034.0, + "step": 2712 + }, + { + "epoch": 0.2979354271908632, + "grad_norm": 2.7312333583831787, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7222653031349182, + "num_tokens": 67931199.0, + "step": 2713 + }, + { + "epoch": 0.29804524489347684, + "grad_norm": 2.445038318634033, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6821227669715881, + "num_tokens": 67953462.0, + "step": 2714 + }, + { + "epoch": 0.2981550625960905, + "grad_norm": 2.1310832500457764, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7064141631126404, + "num_tokens": 67980436.0, + "step": 2715 + }, + { + "epoch": 0.2982648802987041, + "grad_norm": 2.5232038497924805, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6815791130065918, + "num_tokens": 68004176.0, + "step": 2716 + }, + { + "epoch": 0.29837469800131783, + "grad_norm": 2.1182539463043213, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6906511187553406, + "num_tokens": 68033903.0, + "step": 2717 + }, + { + "epoch": 0.2984845157039315, + "grad_norm": 2.431736469268799, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7233327627182007, + "num_tokens": 68054415.0, + "step": 2718 + }, + { + "epoch": 0.2985943334065451, + "grad_norm": 2.5549607276916504, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.712724506855011, + "num_tokens": 68075519.0, + "step": 2719 + }, + { + "epoch": 0.2987041511091588, + "grad_norm": 2.55461049079895, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.680061399936676, + "num_tokens": 68097879.0, + "step": 2720 + }, + { + "epoch": 0.29881396881177247, + "grad_norm": 2.2564656734466553, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6966542601585388, + "num_tokens": 68125585.0, + "step": 2721 + }, + { + "epoch": 0.2989237865143861, + "grad_norm": 2.409306764602661, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6916247606277466, + "num_tokens": 68150019.0, + "step": 2722 + }, + { + "epoch": 0.29903360421699976, + "grad_norm": 2.3614940643310547, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7227889895439148, + "num_tokens": 68171519.0, + "step": 2723 + }, + { + "epoch": 0.29914342191961346, + "grad_norm": 2.246910572052002, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7168936729431152, + "num_tokens": 68197238.0, + "step": 2724 + }, + { + "epoch": 0.2992532396222271, + "grad_norm": 2.2294812202453613, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7000558376312256, + "num_tokens": 68224762.0, + "step": 2725 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 2.4369826316833496, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.700455904006958, + "num_tokens": 68246932.0, + "step": 2726 + }, + { + "epoch": 0.2994728750274544, + "grad_norm": 2.991464138031006, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7194613218307495, + "num_tokens": 68262951.0, + "step": 2727 + }, + { + "epoch": 0.2995826927300681, + "grad_norm": 2.5418968200683594, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6943435072898865, + "num_tokens": 68284729.0, + "step": 2728 + }, + { + "epoch": 0.29969251043268175, + "grad_norm": 2.5524702072143555, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6967340707778931, + "num_tokens": 68305729.0, + "step": 2729 + }, + { + "epoch": 0.2998023281352954, + "grad_norm": 2.4173848628997803, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7002255916595459, + "num_tokens": 68328568.0, + "step": 2730 + }, + { + "epoch": 0.2999121458379091, + "grad_norm": 2.383155107498169, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7073850035667419, + "num_tokens": 68353646.0, + "step": 2731 + }, + { + "epoch": 0.30002196354052274, + "grad_norm": 2.4452548027038574, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7138469219207764, + "num_tokens": 68376418.0, + "step": 2732 + }, + { + "epoch": 0.3001317812431364, + "grad_norm": 2.435415744781494, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6969718933105469, + "num_tokens": 68399825.0, + "step": 2733 + }, + { + "epoch": 0.30024159894575003, + "grad_norm": 2.517657995223999, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.697035551071167, + "num_tokens": 68424099.0, + "step": 2734 + }, + { + "epoch": 0.30035141664836373, + "grad_norm": 1.8591731786727905, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.70363849401474, + "num_tokens": 68461229.0, + "step": 2735 + }, + { + "epoch": 0.3004612343509774, + "grad_norm": 2.485687017440796, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6902984380722046, + "num_tokens": 68482688.0, + "step": 2736 + }, + { + "epoch": 0.300571052053591, + "grad_norm": 2.0735692977905273, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7290298342704773, + "num_tokens": 68511233.0, + "step": 2737 + }, + { + "epoch": 0.3006808697562047, + "grad_norm": 2.324470043182373, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7318836450576782, + "num_tokens": 68533937.0, + "step": 2738 + }, + { + "epoch": 0.3007906874588184, + "grad_norm": 2.2662272453308105, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7036498785018921, + "num_tokens": 68556568.0, + "step": 2739 + }, + { + "epoch": 0.300900505161432, + "grad_norm": 2.1423768997192383, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6929228901863098, + "num_tokens": 68584565.0, + "step": 2740 + }, + { + "epoch": 0.30101032286404567, + "grad_norm": 2.2035179138183594, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6811034083366394, + "num_tokens": 68613298.0, + "step": 2741 + }, + { + "epoch": 0.30112014056665937, + "grad_norm": 2.2625818252563477, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6717825531959534, + "num_tokens": 68638987.0, + "step": 2742 + }, + { + "epoch": 0.301229958269273, + "grad_norm": 2.1135315895080566, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6878383755683899, + "num_tokens": 68666649.0, + "step": 2743 + }, + { + "epoch": 0.30133977597188666, + "grad_norm": 2.3987741470336914, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7051478028297424, + "num_tokens": 68689478.0, + "step": 2744 + }, + { + "epoch": 0.3014495936745003, + "grad_norm": 1.9969033002853394, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6875211596488953, + "num_tokens": 68718938.0, + "step": 2745 + }, + { + "epoch": 0.301559411377114, + "grad_norm": 2.592144727706909, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.7180184721946716, + "num_tokens": 68739070.0, + "step": 2746 + }, + { + "epoch": 0.30166922907972765, + "grad_norm": 2.6944174766540527, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7075576782226562, + "num_tokens": 68758280.0, + "step": 2747 + }, + { + "epoch": 0.3017790467823413, + "grad_norm": 2.738628387451172, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6863722801208496, + "num_tokens": 68778085.0, + "step": 2748 + }, + { + "epoch": 0.301888864484955, + "grad_norm": 2.6558804512023926, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.690193772315979, + "num_tokens": 68797959.0, + "step": 2749 + }, + { + "epoch": 0.30199868218756865, + "grad_norm": 2.181304931640625, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6962058544158936, + "num_tokens": 68826706.0, + "step": 2750 + }, + { + "epoch": 0.3021084998901823, + "grad_norm": 2.303450584411621, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.691292405128479, + "num_tokens": 68850756.0, + "step": 2751 + }, + { + "epoch": 0.30221831759279594, + "grad_norm": 2.40521502494812, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6830555200576782, + "num_tokens": 68873410.0, + "step": 2752 + }, + { + "epoch": 0.30232813529540964, + "grad_norm": 2.0625107288360596, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6927454471588135, + "num_tokens": 68905032.0, + "step": 2753 + }, + { + "epoch": 0.3024379529980233, + "grad_norm": 2.1882007122039795, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7121859788894653, + "num_tokens": 68931043.0, + "step": 2754 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 2.0472519397735596, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7171920537948608, + "num_tokens": 68960547.0, + "step": 2755 + }, + { + "epoch": 0.3026575884032506, + "grad_norm": 2.2655975818634033, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.693300187587738, + "num_tokens": 68986869.0, + "step": 2756 + }, + { + "epoch": 0.3027674061058643, + "grad_norm": 2.2953100204467773, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6959196329116821, + "num_tokens": 69010609.0, + "step": 2757 + }, + { + "epoch": 0.3028772238084779, + "grad_norm": 1.9595000743865967, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7024140954017639, + "num_tokens": 69040589.0, + "step": 2758 + }, + { + "epoch": 0.30298704151109157, + "grad_norm": 2.154820680618286, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7181090712547302, + "num_tokens": 69069163.0, + "step": 2759 + }, + { + "epoch": 0.30309685921370527, + "grad_norm": 2.7378089427948, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7192243933677673, + "num_tokens": 69086106.0, + "step": 2760 + }, + { + "epoch": 0.3032066769163189, + "grad_norm": 2.233635425567627, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7101458311080933, + "num_tokens": 69111909.0, + "step": 2761 + }, + { + "epoch": 0.30331649461893256, + "grad_norm": 2.3100311756134033, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6891780495643616, + "num_tokens": 69137394.0, + "step": 2762 + }, + { + "epoch": 0.3034263123215462, + "grad_norm": 2.3039515018463135, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.711958646774292, + "num_tokens": 69159509.0, + "step": 2763 + }, + { + "epoch": 0.3035361300241599, + "grad_norm": 2.3797264099121094, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6839953660964966, + "num_tokens": 69182165.0, + "step": 2764 + }, + { + "epoch": 0.30364594772677356, + "grad_norm": 2.1731224060058594, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7132811546325684, + "num_tokens": 69207707.0, + "step": 2765 + }, + { + "epoch": 0.3037557654293872, + "grad_norm": 2.371798038482666, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6899542212486267, + "num_tokens": 69232486.0, + "step": 2766 + }, + { + "epoch": 0.3038655831320009, + "grad_norm": 2.1656906604766846, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6966619491577148, + "num_tokens": 69261820.0, + "step": 2767 + }, + { + "epoch": 0.30397540083461455, + "grad_norm": 2.3164734840393066, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7137872576713562, + "num_tokens": 69286982.0, + "step": 2768 + }, + { + "epoch": 0.3040852185372282, + "grad_norm": 2.2970314025878906, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6938478946685791, + "num_tokens": 69312978.0, + "step": 2769 + }, + { + "epoch": 0.30419503623984184, + "grad_norm": 2.458552360534668, + "learning_rate": 1e-06, + "loss": 1.0719, + "mean_token_accuracy": 0.6838402152061462, + "num_tokens": 69335250.0, + "step": 2770 + }, + { + "epoch": 0.30430485394245554, + "grad_norm": 2.5860435962677, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7013519406318665, + "num_tokens": 69356543.0, + "step": 2771 + }, + { + "epoch": 0.3044146716450692, + "grad_norm": 2.530940294265747, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6877623796463013, + "num_tokens": 69379054.0, + "step": 2772 + }, + { + "epoch": 0.30452448934768284, + "grad_norm": 2.208130359649658, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6966323852539062, + "num_tokens": 69405703.0, + "step": 2773 + }, + { + "epoch": 0.3046343070502965, + "grad_norm": 2.521078586578369, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7081297039985657, + "num_tokens": 69427817.0, + "step": 2774 + }, + { + "epoch": 0.3047441247529102, + "grad_norm": 2.1810503005981445, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6847140789031982, + "num_tokens": 69455463.0, + "step": 2775 + }, + { + "epoch": 0.30485394245552383, + "grad_norm": 2.040998935699463, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6875073909759521, + "num_tokens": 69485342.0, + "step": 2776 + }, + { + "epoch": 0.3049637601581375, + "grad_norm": 2.762852191925049, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7321227788925171, + "num_tokens": 69503943.0, + "step": 2777 + }, + { + "epoch": 0.3050735778607512, + "grad_norm": 2.275935173034668, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7095895409584045, + "num_tokens": 69528230.0, + "step": 2778 + }, + { + "epoch": 0.3051833955633648, + "grad_norm": 2.279902935028076, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7149168848991394, + "num_tokens": 69551144.0, + "step": 2779 + }, + { + "epoch": 0.30529321326597847, + "grad_norm": 1.9061052799224854, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6810001134872437, + "num_tokens": 69585110.0, + "step": 2780 + }, + { + "epoch": 0.3054030309685921, + "grad_norm": 2.3046531677246094, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6993091702461243, + "num_tokens": 69608484.0, + "step": 2781 + }, + { + "epoch": 0.3055128486712058, + "grad_norm": 2.1924166679382324, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6948450803756714, + "num_tokens": 69635818.0, + "step": 2782 + }, + { + "epoch": 0.30562266637381946, + "grad_norm": 2.336097240447998, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6847411394119263, + "num_tokens": 69659518.0, + "step": 2783 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 2.0276339054107666, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7140428423881531, + "num_tokens": 69685971.0, + "step": 2784 + }, + { + "epoch": 0.30584230177904675, + "grad_norm": 2.0131587982177734, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7104512453079224, + "num_tokens": 69714919.0, + "step": 2785 + }, + { + "epoch": 0.30595211948166046, + "grad_norm": 1.9710873365402222, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7070653438568115, + "num_tokens": 69748629.0, + "step": 2786 + }, + { + "epoch": 0.3060619371842741, + "grad_norm": 2.252570390701294, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6924868226051331, + "num_tokens": 69774448.0, + "step": 2787 + }, + { + "epoch": 0.30617175488688775, + "grad_norm": 2.186279535293579, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7264171838760376, + "num_tokens": 69800504.0, + "step": 2788 + }, + { + "epoch": 0.30628157258950145, + "grad_norm": 2.4880919456481934, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6968026161193848, + "num_tokens": 69822858.0, + "step": 2789 + }, + { + "epoch": 0.3063913902921151, + "grad_norm": 2.259188413619995, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7266660928726196, + "num_tokens": 69846463.0, + "step": 2790 + }, + { + "epoch": 0.30650120799472874, + "grad_norm": 2.5344555377960205, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.728328287601471, + "num_tokens": 69867384.0, + "step": 2791 + }, + { + "epoch": 0.3066110256973424, + "grad_norm": 2.2026114463806152, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6956763863563538, + "num_tokens": 69892318.0, + "step": 2792 + }, + { + "epoch": 0.3067208433999561, + "grad_norm": 2.2193257808685303, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6899890303611755, + "num_tokens": 69919187.0, + "step": 2793 + }, + { + "epoch": 0.30683066110256974, + "grad_norm": 2.4170475006103516, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7119475603103638, + "num_tokens": 69940188.0, + "step": 2794 + }, + { + "epoch": 0.3069404788051834, + "grad_norm": 2.441783905029297, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.704259991645813, + "num_tokens": 69962624.0, + "step": 2795 + }, + { + "epoch": 0.3070502965077971, + "grad_norm": 2.3663721084594727, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7096410393714905, + "num_tokens": 69987372.0, + "step": 2796 + }, + { + "epoch": 0.30716011421041073, + "grad_norm": 2.245946168899536, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6811997890472412, + "num_tokens": 70014275.0, + "step": 2797 + }, + { + "epoch": 0.3072699319130244, + "grad_norm": 2.227748394012451, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7097737789154053, + "num_tokens": 70042110.0, + "step": 2798 + }, + { + "epoch": 0.307379749615638, + "grad_norm": 2.4616310596466064, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7026979923248291, + "num_tokens": 70064766.0, + "step": 2799 + }, + { + "epoch": 0.3074895673182517, + "grad_norm": 2.191697359085083, + "learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6900368332862854, + "num_tokens": 70093422.0, + "step": 2800 + }, + { + "epoch": 0.30759938502086537, + "grad_norm": 2.0634870529174805, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6974340081214905, + "num_tokens": 70123312.0, + "step": 2801 + }, + { + "epoch": 0.307709202723479, + "grad_norm": 2.137789011001587, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6777770519256592, + "num_tokens": 70151487.0, + "step": 2802 + }, + { + "epoch": 0.30781902042609266, + "grad_norm": 2.264871597290039, + "learning_rate": 1e-06, + "loss": 1.1331, + "mean_token_accuracy": 0.6669421195983887, + "num_tokens": 70178165.0, + "step": 2803 + }, + { + "epoch": 0.30792883812870636, + "grad_norm": 2.0295448303222656, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6858149170875549, + "num_tokens": 70209184.0, + "step": 2804 + }, + { + "epoch": 0.30803865583132, + "grad_norm": 2.5431838035583496, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.706157922744751, + "num_tokens": 70229031.0, + "step": 2805 + }, + { + "epoch": 0.30814847353393365, + "grad_norm": 2.075228452682495, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6969740986824036, + "num_tokens": 70258286.0, + "step": 2806 + }, + { + "epoch": 0.30825829123654735, + "grad_norm": 2.369658946990967, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7230404615402222, + "num_tokens": 70280393.0, + "step": 2807 + }, + { + "epoch": 0.308368108939161, + "grad_norm": 2.051769971847534, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7270159721374512, + "num_tokens": 70307282.0, + "step": 2808 + }, + { + "epoch": 0.30847792664177465, + "grad_norm": 2.467329740524292, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.71509850025177, + "num_tokens": 70327429.0, + "step": 2809 + }, + { + "epoch": 0.3085877443443883, + "grad_norm": 2.2894160747528076, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7028428912162781, + "num_tokens": 70351452.0, + "step": 2810 + }, + { + "epoch": 0.308697562047002, + "grad_norm": 2.1148154735565186, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6888074278831482, + "num_tokens": 70379020.0, + "step": 2811 + }, + { + "epoch": 0.30880737974961564, + "grad_norm": 2.237565040588379, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7068415284156799, + "num_tokens": 70405242.0, + "step": 2812 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 2.1158783435821533, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6723496317863464, + "num_tokens": 70436082.0, + "step": 2813 + }, + { + "epoch": 0.309027015154843, + "grad_norm": 1.9302278757095337, + "learning_rate": 1e-06, + "loss": 1.0931, + "mean_token_accuracy": 0.6741809844970703, + "num_tokens": 70471209.0, + "step": 2814 + }, + { + "epoch": 0.30913683285745663, + "grad_norm": 2.322683095932007, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6922943592071533, + "num_tokens": 70494208.0, + "step": 2815 + }, + { + "epoch": 0.3092466505600703, + "grad_norm": 2.37764573097229, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7260339260101318, + "num_tokens": 70518731.0, + "step": 2816 + }, + { + "epoch": 0.3093564682626839, + "grad_norm": 2.192819595336914, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7015330791473389, + "num_tokens": 70546403.0, + "step": 2817 + }, + { + "epoch": 0.3094662859652976, + "grad_norm": 2.4766061305999756, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7011719942092896, + "num_tokens": 70567453.0, + "step": 2818 + }, + { + "epoch": 0.3095761036679113, + "grad_norm": 2.375206708908081, + "learning_rate": 1e-06, + "loss": 1.0775, + "mean_token_accuracy": 0.681805431842804, + "num_tokens": 70591835.0, + "step": 2819 + }, + { + "epoch": 0.3096859213705249, + "grad_norm": 2.32926082611084, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6938426494598389, + "num_tokens": 70615813.0, + "step": 2820 + }, + { + "epoch": 0.30979573907313857, + "grad_norm": 1.9622621536254883, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6871434450149536, + "num_tokens": 70649477.0, + "step": 2821 + }, + { + "epoch": 0.30990555677575227, + "grad_norm": 2.2412710189819336, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6770405769348145, + "num_tokens": 70676025.0, + "step": 2822 + }, + { + "epoch": 0.3100153744783659, + "grad_norm": 2.337214708328247, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.713345468044281, + "num_tokens": 70698622.0, + "step": 2823 + }, + { + "epoch": 0.31012519218097956, + "grad_norm": 2.168181896209717, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6955653429031372, + "num_tokens": 70724298.0, + "step": 2824 + }, + { + "epoch": 0.31023500988359326, + "grad_norm": 2.1666104793548584, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7067257761955261, + "num_tokens": 70752360.0, + "step": 2825 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 2.0100245475769043, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6736149787902832, + "num_tokens": 70784198.0, + "step": 2826 + }, + { + "epoch": 0.31045464528882055, + "grad_norm": 2.585210084915161, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7211642265319824, + "num_tokens": 70800937.0, + "step": 2827 + }, + { + "epoch": 0.3105644629914342, + "grad_norm": 2.1617431640625, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6854134798049927, + "num_tokens": 70828846.0, + "step": 2828 + }, + { + "epoch": 0.3106742806940479, + "grad_norm": 2.2272837162017822, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6929121017456055, + "num_tokens": 70853283.0, + "step": 2829 + }, + { + "epoch": 0.31078409839666155, + "grad_norm": 2.086466073989868, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6809527277946472, + "num_tokens": 70882140.0, + "step": 2830 + }, + { + "epoch": 0.3108939160992752, + "grad_norm": 2.269113540649414, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7127141356468201, + "num_tokens": 70907799.0, + "step": 2831 + }, + { + "epoch": 0.31100373380188884, + "grad_norm": 2.199235200881958, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.7097405195236206, + "num_tokens": 70933530.0, + "step": 2832 + }, + { + "epoch": 0.31111355150450254, + "grad_norm": 2.4918618202209473, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7034350633621216, + "num_tokens": 70955737.0, + "step": 2833 + }, + { + "epoch": 0.3112233692071162, + "grad_norm": 2.342261791229248, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6870627403259277, + "num_tokens": 70980549.0, + "step": 2834 + }, + { + "epoch": 0.31133318690972983, + "grad_norm": 2.2495038509368896, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6783296465873718, + "num_tokens": 71005378.0, + "step": 2835 + }, + { + "epoch": 0.31144300461234353, + "grad_norm": 2.159658193588257, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7250315546989441, + "num_tokens": 71032656.0, + "step": 2836 + }, + { + "epoch": 0.3115528223149572, + "grad_norm": 2.3245456218719482, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6986116766929626, + "num_tokens": 71057606.0, + "step": 2837 + }, + { + "epoch": 0.3116626400175708, + "grad_norm": 2.2717878818511963, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.691699743270874, + "num_tokens": 71082600.0, + "step": 2838 + }, + { + "epoch": 0.31177245772018447, + "grad_norm": 2.2115628719329834, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6955332159996033, + "num_tokens": 71108289.0, + "step": 2839 + }, + { + "epoch": 0.31188227542279817, + "grad_norm": 2.333617925643921, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6964449882507324, + "num_tokens": 71130526.0, + "step": 2840 + }, + { + "epoch": 0.3119920931254118, + "grad_norm": 2.2634129524230957, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7083942294120789, + "num_tokens": 71156148.0, + "step": 2841 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 2.2423746585845947, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6997050046920776, + "num_tokens": 71181578.0, + "step": 2842 + }, + { + "epoch": 0.31221172853063917, + "grad_norm": 1.94818115234375, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.687334418296814, + "num_tokens": 71211494.0, + "step": 2843 + }, + { + "epoch": 0.3123215462332528, + "grad_norm": 2.4842517375946045, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7010422348976135, + "num_tokens": 71233180.0, + "step": 2844 + }, + { + "epoch": 0.31243136393586646, + "grad_norm": 2.1578145027160645, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6920774579048157, + "num_tokens": 71261480.0, + "step": 2845 + }, + { + "epoch": 0.3125411816384801, + "grad_norm": 2.2063119411468506, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7122024297714233, + "num_tokens": 71285790.0, + "step": 2846 + }, + { + "epoch": 0.3126509993410938, + "grad_norm": 2.4682579040527344, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.7027899622917175, + "num_tokens": 71306554.0, + "step": 2847 + }, + { + "epoch": 0.31276081704370745, + "grad_norm": 2.1579678058624268, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7066461443901062, + "num_tokens": 71332610.0, + "step": 2848 + }, + { + "epoch": 0.3128706347463211, + "grad_norm": 2.296278953552246, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6859191656112671, + "num_tokens": 71358089.0, + "step": 2849 + }, + { + "epoch": 0.31298045244893474, + "grad_norm": 2.350297451019287, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6953012943267822, + "num_tokens": 71380916.0, + "step": 2850 + }, + { + "epoch": 0.31309027015154844, + "grad_norm": 2.3011202812194824, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7061247825622559, + "num_tokens": 71404922.0, + "step": 2851 + }, + { + "epoch": 0.3132000878541621, + "grad_norm": 1.991977572441101, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7012805938720703, + "num_tokens": 71433130.0, + "step": 2852 + }, + { + "epoch": 0.31330990555677574, + "grad_norm": 2.3674168586730957, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6862058043479919, + "num_tokens": 71456832.0, + "step": 2853 + }, + { + "epoch": 0.31341972325938944, + "grad_norm": 2.132629632949829, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6910241842269897, + "num_tokens": 71485664.0, + "step": 2854 + }, + { + "epoch": 0.3135295409620031, + "grad_norm": 2.055847644805908, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7148337364196777, + "num_tokens": 71515586.0, + "step": 2855 + }, + { + "epoch": 0.31363935866461673, + "grad_norm": 2.1065561771392822, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6997768878936768, + "num_tokens": 71545630.0, + "step": 2856 + }, + { + "epoch": 0.3137491763672304, + "grad_norm": 2.2732508182525635, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6934725046157837, + "num_tokens": 71570309.0, + "step": 2857 + }, + { + "epoch": 0.3138589940698441, + "grad_norm": 1.947659969329834, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7191480994224548, + "num_tokens": 71599938.0, + "step": 2858 + }, + { + "epoch": 0.3139688117724577, + "grad_norm": 2.4072303771972656, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7245810031890869, + "num_tokens": 71623710.0, + "step": 2859 + }, + { + "epoch": 0.31407862947507137, + "grad_norm": 2.3397715091705322, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7083238363265991, + "num_tokens": 71646563.0, + "step": 2860 + }, + { + "epoch": 0.314188447177685, + "grad_norm": 2.1896016597747803, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6994103193283081, + "num_tokens": 71673521.0, + "step": 2861 + }, + { + "epoch": 0.3142982648802987, + "grad_norm": 2.3173770904541016, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7046921849250793, + "num_tokens": 71696461.0, + "step": 2862 + }, + { + "epoch": 0.31440808258291236, + "grad_norm": 2.273104667663574, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6962964534759521, + "num_tokens": 71721591.0, + "step": 2863 + }, + { + "epoch": 0.314517900285526, + "grad_norm": 2.3074793815612793, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6875376105308533, + "num_tokens": 71746351.0, + "step": 2864 + }, + { + "epoch": 0.3146277179881397, + "grad_norm": 2.3247554302215576, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.715417742729187, + "num_tokens": 71768824.0, + "step": 2865 + }, + { + "epoch": 0.31473753569075336, + "grad_norm": 2.280513048171997, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6822264194488525, + "num_tokens": 71793788.0, + "step": 2866 + }, + { + "epoch": 0.314847353393367, + "grad_norm": 2.3297033309936523, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7159509062767029, + "num_tokens": 71817116.0, + "step": 2867 + }, + { + "epoch": 0.31495717109598065, + "grad_norm": 2.2253096103668213, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6967380046844482, + "num_tokens": 71840420.0, + "step": 2868 + }, + { + "epoch": 0.31506698879859435, + "grad_norm": 2.559328317642212, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7032983303070068, + "num_tokens": 71861739.0, + "step": 2869 + }, + { + "epoch": 0.315176806501208, + "grad_norm": 2.248262643814087, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6895182132720947, + "num_tokens": 71885052.0, + "step": 2870 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 1.8796619176864624, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6782538294792175, + "num_tokens": 71919698.0, + "step": 2871 + }, + { + "epoch": 0.31539644190643534, + "grad_norm": 2.2210097312927246, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6886249780654907, + "num_tokens": 71945811.0, + "step": 2872 + }, + { + "epoch": 0.315506259609049, + "grad_norm": 2.3472702503204346, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7108286023139954, + "num_tokens": 71968483.0, + "step": 2873 + }, + { + "epoch": 0.31561607731166264, + "grad_norm": 2.0593719482421875, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.718119740486145, + "num_tokens": 71996341.0, + "step": 2874 + }, + { + "epoch": 0.3157258950142763, + "grad_norm": 2.3780624866485596, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7055763006210327, + "num_tokens": 72018854.0, + "step": 2875 + }, + { + "epoch": 0.31583571271689, + "grad_norm": 2.2522695064544678, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6981592178344727, + "num_tokens": 72043589.0, + "step": 2876 + }, + { + "epoch": 0.31594553041950363, + "grad_norm": 2.3604304790496826, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6962360143661499, + "num_tokens": 72067448.0, + "step": 2877 + }, + { + "epoch": 0.3160553481221173, + "grad_norm": 2.1619410514831543, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6895667910575867, + "num_tokens": 72095449.0, + "step": 2878 + }, + { + "epoch": 0.3161651658247309, + "grad_norm": 2.5057408809661865, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7189348936080933, + "num_tokens": 72116078.0, + "step": 2879 + }, + { + "epoch": 0.3162749835273446, + "grad_norm": 2.26604962348938, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7291837930679321, + "num_tokens": 72137982.0, + "step": 2880 + }, + { + "epoch": 0.31638480122995827, + "grad_norm": 2.182218074798584, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7170381546020508, + "num_tokens": 72163114.0, + "step": 2881 + }, + { + "epoch": 0.3164946189325719, + "grad_norm": 2.1135149002075195, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6752192974090576, + "num_tokens": 72192855.0, + "step": 2882 + }, + { + "epoch": 0.3166044366351856, + "grad_norm": 2.323136329650879, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6740801334381104, + "num_tokens": 72217379.0, + "step": 2883 + }, + { + "epoch": 0.31671425433779926, + "grad_norm": 2.3164937496185303, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7167616486549377, + "num_tokens": 72240995.0, + "step": 2884 + }, + { + "epoch": 0.3168240720404129, + "grad_norm": 2.3307950496673584, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7209033966064453, + "num_tokens": 72262798.0, + "step": 2885 + }, + { + "epoch": 0.31693388974302655, + "grad_norm": 2.0706262588500977, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6920404434204102, + "num_tokens": 72294153.0, + "step": 2886 + }, + { + "epoch": 0.31704370744564025, + "grad_norm": 2.244412422180176, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7017608284950256, + "num_tokens": 72319622.0, + "step": 2887 + }, + { + "epoch": 0.3171535251482539, + "grad_norm": 2.3304479122161865, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7081759572029114, + "num_tokens": 72344483.0, + "step": 2888 + }, + { + "epoch": 0.31726334285086755, + "grad_norm": 2.4663267135620117, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6953478455543518, + "num_tokens": 72365624.0, + "step": 2889 + }, + { + "epoch": 0.31737316055348125, + "grad_norm": 2.079822063446045, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6946593523025513, + "num_tokens": 72395173.0, + "step": 2890 + }, + { + "epoch": 0.3174829782560949, + "grad_norm": 2.1008219718933105, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6999655961990356, + "num_tokens": 72421460.0, + "step": 2891 + }, + { + "epoch": 0.31759279595870854, + "grad_norm": 2.6267812252044678, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7068893313407898, + "num_tokens": 72441761.0, + "step": 2892 + }, + { + "epoch": 0.3177026136613222, + "grad_norm": 2.332521915435791, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6863990426063538, + "num_tokens": 72468854.0, + "step": 2893 + }, + { + "epoch": 0.3178124313639359, + "grad_norm": 2.1403541564941406, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.7063851952552795, + "num_tokens": 72494739.0, + "step": 2894 + }, + { + "epoch": 0.31792224906654953, + "grad_norm": 2.459404945373535, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6876442432403564, + "num_tokens": 72519429.0, + "step": 2895 + }, + { + "epoch": 0.3180320667691632, + "grad_norm": 2.384491443634033, + "learning_rate": 1e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6770012378692627, + "num_tokens": 72544727.0, + "step": 2896 + }, + { + "epoch": 0.3181418844717768, + "grad_norm": 2.2912752628326416, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6950458884239197, + "num_tokens": 72569063.0, + "step": 2897 + }, + { + "epoch": 0.3182517021743905, + "grad_norm": 2.157780647277832, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6965917944908142, + "num_tokens": 72594115.0, + "step": 2898 + }, + { + "epoch": 0.3183615198770042, + "grad_norm": 2.3194522857666016, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7103464007377625, + "num_tokens": 72616454.0, + "step": 2899 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 2.478466033935547, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7157623171806335, + "num_tokens": 72637314.0, + "step": 2900 + }, + { + "epoch": 0.3185811552822315, + "grad_norm": 2.3216776847839355, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7047907710075378, + "num_tokens": 72663090.0, + "step": 2901 + }, + { + "epoch": 0.31869097298484517, + "grad_norm": 2.2400643825531006, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6838049292564392, + "num_tokens": 72689385.0, + "step": 2902 + }, + { + "epoch": 0.3188007906874588, + "grad_norm": 2.243567705154419, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6942533254623413, + "num_tokens": 72714865.0, + "step": 2903 + }, + { + "epoch": 0.31891060839007246, + "grad_norm": 2.8682966232299805, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6901955008506775, + "num_tokens": 72733192.0, + "step": 2904 + }, + { + "epoch": 0.31902042609268616, + "grad_norm": 2.5123438835144043, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.723242998123169, + "num_tokens": 72755059.0, + "step": 2905 + }, + { + "epoch": 0.3191302437952998, + "grad_norm": 2.0804150104522705, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7097764015197754, + "num_tokens": 72781977.0, + "step": 2906 + }, + { + "epoch": 0.31924006149791345, + "grad_norm": 2.2428090572357178, + "learning_rate": 1e-06, + "loss": 1.1062, + "mean_token_accuracy": 0.6652040481567383, + "num_tokens": 72810144.0, + "step": 2907 + }, + { + "epoch": 0.3193498792005271, + "grad_norm": 2.3383517265319824, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7215047478675842, + "num_tokens": 72834186.0, + "step": 2908 + }, + { + "epoch": 0.3194596969031408, + "grad_norm": 2.470050096511841, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7038372159004211, + "num_tokens": 72855984.0, + "step": 2909 + }, + { + "epoch": 0.31956951460575445, + "grad_norm": 2.3846170902252197, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.697582483291626, + "num_tokens": 72879466.0, + "step": 2910 + }, + { + "epoch": 0.3196793323083681, + "grad_norm": 2.330967664718628, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.712256133556366, + "num_tokens": 72900913.0, + "step": 2911 + }, + { + "epoch": 0.3197891500109818, + "grad_norm": 2.3820133209228516, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7084693312644958, + "num_tokens": 72923986.0, + "step": 2912 + }, + { + "epoch": 0.31989896771359544, + "grad_norm": 2.2558507919311523, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7073965072631836, + "num_tokens": 72948931.0, + "step": 2913 + }, + { + "epoch": 0.3200087854162091, + "grad_norm": 2.2326908111572266, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6994366645812988, + "num_tokens": 72977104.0, + "step": 2914 + }, + { + "epoch": 0.32011860311882273, + "grad_norm": 2.0339443683624268, + "learning_rate": 1e-06, + "loss": 1.094, + "mean_token_accuracy": 0.673501193523407, + "num_tokens": 73006901.0, + "step": 2915 + }, + { + "epoch": 0.32022842082143643, + "grad_norm": 2.5009632110595703, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7173742651939392, + "num_tokens": 73027468.0, + "step": 2916 + }, + { + "epoch": 0.3203382385240501, + "grad_norm": 2.334407329559326, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7095928192138672, + "num_tokens": 73051177.0, + "step": 2917 + }, + { + "epoch": 0.3204480562266637, + "grad_norm": 2.1485321521759033, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7035230398178101, + "num_tokens": 73078733.0, + "step": 2918 + }, + { + "epoch": 0.3205578739292774, + "grad_norm": 2.1113548278808594, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7308701872825623, + "num_tokens": 73104379.0, + "step": 2919 + }, + { + "epoch": 0.32066769163189107, + "grad_norm": 2.1660735607147217, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6903093457221985, + "num_tokens": 73130122.0, + "step": 2920 + }, + { + "epoch": 0.3207775093345047, + "grad_norm": 2.490053176879883, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7035736441612244, + "num_tokens": 73150512.0, + "step": 2921 + }, + { + "epoch": 0.32088732703711836, + "grad_norm": 1.9787880182266235, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7039259672164917, + "num_tokens": 73179763.0, + "step": 2922 + }, + { + "epoch": 0.32099714473973207, + "grad_norm": 1.9686845541000366, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6956841945648193, + "num_tokens": 73211443.0, + "step": 2923 + }, + { + "epoch": 0.3211069624423457, + "grad_norm": 2.3684628009796143, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.697742223739624, + "num_tokens": 73235100.0, + "step": 2924 + }, + { + "epoch": 0.32121678014495936, + "grad_norm": 2.0710747241973877, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7049511671066284, + "num_tokens": 73264345.0, + "step": 2925 + }, + { + "epoch": 0.321326597847573, + "grad_norm": 2.3706881999969482, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6803296208381653, + "num_tokens": 73287901.0, + "step": 2926 + }, + { + "epoch": 0.3214364155501867, + "grad_norm": 2.7608277797698975, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7038674354553223, + "num_tokens": 73304826.0, + "step": 2927 + }, + { + "epoch": 0.32154623325280035, + "grad_norm": 2.1207735538482666, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6794369220733643, + "num_tokens": 73336557.0, + "step": 2928 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 2.4172203540802, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7257446050643921, + "num_tokens": 73359904.0, + "step": 2929 + }, + { + "epoch": 0.3217658686580277, + "grad_norm": 2.410313606262207, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7079759836196899, + "num_tokens": 73383606.0, + "step": 2930 + }, + { + "epoch": 0.32187568636064134, + "grad_norm": 2.294475793838501, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6896761655807495, + "num_tokens": 73408491.0, + "step": 2931 + }, + { + "epoch": 0.321985504063255, + "grad_norm": 2.4794986248016357, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6990487575531006, + "num_tokens": 73429078.0, + "step": 2932 + }, + { + "epoch": 0.32209532176586864, + "grad_norm": 2.7313952445983887, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7090896368026733, + "num_tokens": 73446384.0, + "step": 2933 + }, + { + "epoch": 0.32220513946848234, + "grad_norm": 2.3131489753723145, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7087386846542358, + "num_tokens": 73469963.0, + "step": 2934 + }, + { + "epoch": 0.322314957171096, + "grad_norm": 2.438664197921753, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7101086378097534, + "num_tokens": 73491160.0, + "step": 2935 + }, + { + "epoch": 0.32242477487370963, + "grad_norm": 2.3341445922851562, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.7009682655334473, + "num_tokens": 73519031.0, + "step": 2936 + }, + { + "epoch": 0.3225345925763233, + "grad_norm": 2.094015121459961, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6885330677032471, + "num_tokens": 73552068.0, + "step": 2937 + }, + { + "epoch": 0.322644410278937, + "grad_norm": 2.15751051902771, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6968297958374023, + "num_tokens": 73578759.0, + "step": 2938 + }, + { + "epoch": 0.3227542279815506, + "grad_norm": 2.1855108737945557, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6826428771018982, + "num_tokens": 73608480.0, + "step": 2939 + }, + { + "epoch": 0.32286404568416427, + "grad_norm": 2.2305808067321777, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7198253870010376, + "num_tokens": 73632725.0, + "step": 2940 + }, + { + "epoch": 0.32297386338677797, + "grad_norm": 2.4059038162231445, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7210924029350281, + "num_tokens": 73655087.0, + "step": 2941 + }, + { + "epoch": 0.3230836810893916, + "grad_norm": 2.128676176071167, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6851949095726013, + "num_tokens": 73682988.0, + "step": 2942 + }, + { + "epoch": 0.32319349879200526, + "grad_norm": 2.1369118690490723, + "learning_rate": 1e-06, + "loss": 1.1237, + "mean_token_accuracy": 0.6654784679412842, + "num_tokens": 73714121.0, + "step": 2943 + }, + { + "epoch": 0.3233033164946189, + "grad_norm": 2.5638492107391357, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7040918469429016, + "num_tokens": 73735222.0, + "step": 2944 + }, + { + "epoch": 0.3234131341972326, + "grad_norm": 2.506108522415161, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6939563751220703, + "num_tokens": 73756852.0, + "step": 2945 + }, + { + "epoch": 0.32352295189984626, + "grad_norm": 2.112614393234253, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6917384266853333, + "num_tokens": 73782527.0, + "step": 2946 + }, + { + "epoch": 0.3236327696024599, + "grad_norm": 2.271916151046753, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7004293203353882, + "num_tokens": 73809382.0, + "step": 2947 + }, + { + "epoch": 0.3237425873050736, + "grad_norm": 2.58524489402771, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7049785852432251, + "num_tokens": 73829395.0, + "step": 2948 + }, + { + "epoch": 0.32385240500768725, + "grad_norm": 2.469712972640991, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7038061618804932, + "num_tokens": 73851574.0, + "step": 2949 + }, + { + "epoch": 0.3239622227103009, + "grad_norm": 2.3857669830322266, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6909569501876831, + "num_tokens": 73876493.0, + "step": 2950 + }, + { + "epoch": 0.32407204041291454, + "grad_norm": 2.584416627883911, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.696976900100708, + "num_tokens": 73896623.0, + "step": 2951 + }, + { + "epoch": 0.32418185811552824, + "grad_norm": 2.5554885864257812, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6952165961265564, + "num_tokens": 73917107.0, + "step": 2952 + }, + { + "epoch": 0.3242916758181419, + "grad_norm": 2.0902559757232666, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7065001130104065, + "num_tokens": 73946855.0, + "step": 2953 + }, + { + "epoch": 0.32440149352075554, + "grad_norm": 2.207146406173706, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6801649928092957, + "num_tokens": 73974260.0, + "step": 2954 + }, + { + "epoch": 0.3245113112233692, + "grad_norm": 2.0883424282073975, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7060848474502563, + "num_tokens": 74003957.0, + "step": 2955 + }, + { + "epoch": 0.3246211289259829, + "grad_norm": 2.1080615520477295, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.708778977394104, + "num_tokens": 74031164.0, + "step": 2956 + }, + { + "epoch": 0.32473094662859653, + "grad_norm": 1.947084903717041, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6897280812263489, + "num_tokens": 74060827.0, + "step": 2957 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 2.1471545696258545, + "learning_rate": 1e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.6775027513504028, + "num_tokens": 74089789.0, + "step": 2958 + }, + { + "epoch": 0.3249505820338239, + "grad_norm": 2.516744375228882, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7040389776229858, + "num_tokens": 74109631.0, + "step": 2959 + }, + { + "epoch": 0.3250603997364375, + "grad_norm": 2.3172407150268555, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7068605422973633, + "num_tokens": 74133463.0, + "step": 2960 + }, + { + "epoch": 0.32517021743905117, + "grad_norm": 2.578199863433838, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7016241550445557, + "num_tokens": 74153412.0, + "step": 2961 + }, + { + "epoch": 0.3252800351416648, + "grad_norm": 1.955952525138855, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6973380446434021, + "num_tokens": 74183087.0, + "step": 2962 + }, + { + "epoch": 0.3253898528442785, + "grad_norm": 2.2056610584259033, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7062594890594482, + "num_tokens": 74207603.0, + "step": 2963 + }, + { + "epoch": 0.32549967054689216, + "grad_norm": 2.0706160068511963, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6891146898269653, + "num_tokens": 74235864.0, + "step": 2964 + }, + { + "epoch": 0.3256094882495058, + "grad_norm": 2.2350869178771973, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6843549013137817, + "num_tokens": 74261825.0, + "step": 2965 + }, + { + "epoch": 0.3257193059521195, + "grad_norm": 2.031919002532959, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6863551735877991, + "num_tokens": 74292979.0, + "step": 2966 + }, + { + "epoch": 0.32582912365473315, + "grad_norm": 2.2218024730682373, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.730394721031189, + "num_tokens": 74317222.0, + "step": 2967 + }, + { + "epoch": 0.3259389413573468, + "grad_norm": 2.2465813159942627, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7390322089195251, + "num_tokens": 74341871.0, + "step": 2968 + }, + { + "epoch": 0.32604875905996045, + "grad_norm": 2.2181711196899414, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.717011570930481, + "num_tokens": 74366774.0, + "step": 2969 + }, + { + "epoch": 0.32615857676257415, + "grad_norm": 2.1659133434295654, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7090367078781128, + "num_tokens": 74393534.0, + "step": 2970 + }, + { + "epoch": 0.3262683944651878, + "grad_norm": 2.260908365249634, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7196274995803833, + "num_tokens": 74415726.0, + "step": 2971 + }, + { + "epoch": 0.32637821216780144, + "grad_norm": 2.5546388626098633, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7086753845214844, + "num_tokens": 74436359.0, + "step": 2972 + }, + { + "epoch": 0.3264880298704151, + "grad_norm": 2.1461336612701416, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.706210732460022, + "num_tokens": 74461920.0, + "step": 2973 + }, + { + "epoch": 0.3265978475730288, + "grad_norm": 2.3134665489196777, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6973539590835571, + "num_tokens": 74483732.0, + "step": 2974 + }, + { + "epoch": 0.32670766527564243, + "grad_norm": 2.177798271179199, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6967241764068604, + "num_tokens": 74509548.0, + "step": 2975 + }, + { + "epoch": 0.3268174829782561, + "grad_norm": 2.7325048446655273, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6906472444534302, + "num_tokens": 74529780.0, + "step": 2976 + }, + { + "epoch": 0.3269273006808698, + "grad_norm": 2.54762864112854, + "learning_rate": 1e-06, + "loss": 1.0753, + "mean_token_accuracy": 0.6813514232635498, + "num_tokens": 74550972.0, + "step": 2977 + }, + { + "epoch": 0.3270371183834834, + "grad_norm": 2.282374143600464, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.7030631303787231, + "num_tokens": 74577248.0, + "step": 2978 + }, + { + "epoch": 0.3271469360860971, + "grad_norm": 2.345564603805542, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7022331953048706, + "num_tokens": 74599128.0, + "step": 2979 + }, + { + "epoch": 0.3272567537887107, + "grad_norm": 2.6191608905792236, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7008241415023804, + "num_tokens": 74618889.0, + "step": 2980 + }, + { + "epoch": 0.3273665714913244, + "grad_norm": 2.2018280029296875, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6940439939498901, + "num_tokens": 74643071.0, + "step": 2981 + }, + { + "epoch": 0.32747638919393807, + "grad_norm": 2.2264726161956787, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7022860050201416, + "num_tokens": 74668807.0, + "step": 2982 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 2.0213518142700195, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6858378648757935, + "num_tokens": 74698964.0, + "step": 2983 + }, + { + "epoch": 0.32769602459916536, + "grad_norm": 2.433992624282837, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7025749683380127, + "num_tokens": 74720286.0, + "step": 2984 + }, + { + "epoch": 0.32780584230177906, + "grad_norm": 2.2667593955993652, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6833125352859497, + "num_tokens": 74744520.0, + "step": 2985 + }, + { + "epoch": 0.3279156600043927, + "grad_norm": 2.3081576824188232, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7156839370727539, + "num_tokens": 74766555.0, + "step": 2986 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 2.0553979873657227, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6839867830276489, + "num_tokens": 74797419.0, + "step": 2987 + }, + { + "epoch": 0.32813529540962005, + "grad_norm": 2.3194258213043213, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7169265747070312, + "num_tokens": 74819523.0, + "step": 2988 + }, + { + "epoch": 0.3282451131122337, + "grad_norm": 2.409527063369751, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6892420053482056, + "num_tokens": 74842722.0, + "step": 2989 + }, + { + "epoch": 0.32835493081484735, + "grad_norm": 2.1911773681640625, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6866880655288696, + "num_tokens": 74867248.0, + "step": 2990 + }, + { + "epoch": 0.328464748517461, + "grad_norm": 2.2949931621551514, + "learning_rate": 1e-06, + "loss": 1.0852, + "mean_token_accuracy": 0.6782314777374268, + "num_tokens": 74891212.0, + "step": 2991 + }, + { + "epoch": 0.3285745662200747, + "grad_norm": 2.2168095111846924, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.705459713935852, + "num_tokens": 74915709.0, + "step": 2992 + }, + { + "epoch": 0.32868438392268834, + "grad_norm": 2.334139585494995, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7258045673370361, + "num_tokens": 74936471.0, + "step": 2993 + }, + { + "epoch": 0.328794201625302, + "grad_norm": 2.2138774394989014, + "learning_rate": 1e-06, + "loss": 1.0797, + "mean_token_accuracy": 0.6791913509368896, + "num_tokens": 74964182.0, + "step": 2994 + }, + { + "epoch": 0.3289040193279157, + "grad_norm": 2.2279345989227295, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7073628902435303, + "num_tokens": 74989596.0, + "step": 2995 + }, + { + "epoch": 0.32901383703052933, + "grad_norm": 2.188723087310791, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6916297674179077, + "num_tokens": 75016931.0, + "step": 2996 + }, + { + "epoch": 0.329123654733143, + "grad_norm": 2.2053604125976562, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7011465430259705, + "num_tokens": 75042603.0, + "step": 2997 + }, + { + "epoch": 0.3292334724357566, + "grad_norm": 2.2873716354370117, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.689667820930481, + "num_tokens": 75065877.0, + "step": 2998 + }, + { + "epoch": 0.3293432901383703, + "grad_norm": 2.544736862182617, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6954851150512695, + "num_tokens": 75088056.0, + "step": 2999 + }, + { + "epoch": 0.32945310784098397, + "grad_norm": 2.769721508026123, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7096113562583923, + "num_tokens": 75106802.0, + "step": 3000 + }, + { + "epoch": 0.3295629255435976, + "grad_norm": 2.284433364868164, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7060224413871765, + "num_tokens": 75131050.0, + "step": 3001 + }, + { + "epoch": 0.32967274324621126, + "grad_norm": 2.0458240509033203, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6852530241012573, + "num_tokens": 75160496.0, + "step": 3002 + }, + { + "epoch": 0.32978256094882497, + "grad_norm": 2.28151273727417, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7140482664108276, + "num_tokens": 75186467.0, + "step": 3003 + }, + { + "epoch": 0.3298923786514386, + "grad_norm": 2.267049789428711, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6883939504623413, + "num_tokens": 75210983.0, + "step": 3004 + }, + { + "epoch": 0.33000219635405226, + "grad_norm": 2.049762725830078, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.702145516872406, + "num_tokens": 75239231.0, + "step": 3005 + }, + { + "epoch": 0.33011201405666596, + "grad_norm": 2.1868791580200195, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6977659463882446, + "num_tokens": 75266800.0, + "step": 3006 + }, + { + "epoch": 0.3302218317592796, + "grad_norm": 2.2529351711273193, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7041444778442383, + "num_tokens": 75289997.0, + "step": 3007 + }, + { + "epoch": 0.33033164946189325, + "grad_norm": 2.19393253326416, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.7100399732589722, + "num_tokens": 75317170.0, + "step": 3008 + }, + { + "epoch": 0.3304414671645069, + "grad_norm": 2.475019693374634, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7054290771484375, + "num_tokens": 75338693.0, + "step": 3009 + }, + { + "epoch": 0.3305512848671206, + "grad_norm": 2.0606236457824707, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.695810854434967, + "num_tokens": 75367314.0, + "step": 3010 + }, + { + "epoch": 0.33066110256973424, + "grad_norm": 2.3342106342315674, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6951581239700317, + "num_tokens": 75390692.0, + "step": 3011 + }, + { + "epoch": 0.3307709202723479, + "grad_norm": 2.2826757431030273, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6980502009391785, + "num_tokens": 75416947.0, + "step": 3012 + }, + { + "epoch": 0.33088073797496154, + "grad_norm": 2.597003221511841, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6943209171295166, + "num_tokens": 75435714.0, + "step": 3013 + }, + { + "epoch": 0.33099055567757524, + "grad_norm": 2.481168746948242, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7031466960906982, + "num_tokens": 75457363.0, + "step": 3014 + }, + { + "epoch": 0.3311003733801889, + "grad_norm": 2.2104012966156006, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7338559627532959, + "num_tokens": 75480301.0, + "step": 3015 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 2.2371106147766113, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.695826530456543, + "num_tokens": 75506590.0, + "step": 3016 + }, + { + "epoch": 0.33132000878541623, + "grad_norm": 2.537437677383423, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7229313254356384, + "num_tokens": 75529207.0, + "step": 3017 + }, + { + "epoch": 0.3314298264880299, + "grad_norm": 2.2463128566741943, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7054284811019897, + "num_tokens": 75555113.0, + "step": 3018 + }, + { + "epoch": 0.3315396441906435, + "grad_norm": 2.1577210426330566, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7112736701965332, + "num_tokens": 75582798.0, + "step": 3019 + }, + { + "epoch": 0.33164946189325717, + "grad_norm": 2.183520793914795, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6931103467941284, + "num_tokens": 75610639.0, + "step": 3020 + }, + { + "epoch": 0.33175927959587087, + "grad_norm": 2.1905922889709473, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6944757699966431, + "num_tokens": 75637911.0, + "step": 3021 + }, + { + "epoch": 0.3318690972984845, + "grad_norm": 2.4341745376586914, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7180788516998291, + "num_tokens": 75659360.0, + "step": 3022 + }, + { + "epoch": 0.33197891500109816, + "grad_norm": 2.287120819091797, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6889586448669434, + "num_tokens": 75684789.0, + "step": 3023 + }, + { + "epoch": 0.33208873270371186, + "grad_norm": 2.3642640113830566, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7043179273605347, + "num_tokens": 75710314.0, + "step": 3024 + }, + { + "epoch": 0.3321985504063255, + "grad_norm": 2.1710264682769775, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6843595504760742, + "num_tokens": 75738657.0, + "step": 3025 + }, + { + "epoch": 0.33230836810893916, + "grad_norm": 2.0508604049682617, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7018318176269531, + "num_tokens": 75765608.0, + "step": 3026 + }, + { + "epoch": 0.3324181858115528, + "grad_norm": 2.368730306625366, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.6996975541114807, + "num_tokens": 75787999.0, + "step": 3027 + }, + { + "epoch": 0.3325280035141665, + "grad_norm": 2.221672534942627, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7155554294586182, + "num_tokens": 75813567.0, + "step": 3028 + }, + { + "epoch": 0.33263782121678015, + "grad_norm": 1.8850492238998413, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7019352316856384, + "num_tokens": 75847501.0, + "step": 3029 + }, + { + "epoch": 0.3327476389193938, + "grad_norm": 2.2003772258758545, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6852837800979614, + "num_tokens": 75874488.0, + "step": 3030 + }, + { + "epoch": 0.33285745662200744, + "grad_norm": 2.297973394393921, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6993355751037598, + "num_tokens": 75897970.0, + "step": 3031 + }, + { + "epoch": 0.33296727432462114, + "grad_norm": 2.148024320602417, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.686521053314209, + "num_tokens": 75925843.0, + "step": 3032 + }, + { + "epoch": 0.3330770920272348, + "grad_norm": 2.1286826133728027, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7002531886100769, + "num_tokens": 75953900.0, + "step": 3033 + }, + { + "epoch": 0.33318690972984844, + "grad_norm": 2.277010440826416, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.698784351348877, + "num_tokens": 75979103.0, + "step": 3034 + }, + { + "epoch": 0.33329672743246214, + "grad_norm": 2.082742929458618, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.695032000541687, + "num_tokens": 76007030.0, + "step": 3035 + }, + { + "epoch": 0.3334065451350758, + "grad_norm": 2.1231653690338135, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7130285501480103, + "num_tokens": 76035013.0, + "step": 3036 + }, + { + "epoch": 0.33351636283768943, + "grad_norm": 2.2668404579162598, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7041823863983154, + "num_tokens": 76059720.0, + "step": 3037 + }, + { + "epoch": 0.3336261805403031, + "grad_norm": 2.4528636932373047, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6994823813438416, + "num_tokens": 76082746.0, + "step": 3038 + }, + { + "epoch": 0.3337359982429168, + "grad_norm": 2.5374341011047363, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7013593912124634, + "num_tokens": 76103644.0, + "step": 3039 + }, + { + "epoch": 0.3338458159455304, + "grad_norm": 2.1495325565338135, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.701055645942688, + "num_tokens": 76133838.0, + "step": 3040 + }, + { + "epoch": 0.33395563364814407, + "grad_norm": 1.972292184829712, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.699805736541748, + "num_tokens": 76165705.0, + "step": 3041 + }, + { + "epoch": 0.33406545135075777, + "grad_norm": 2.073221206665039, + "learning_rate": 1e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.6719375848770142, + "num_tokens": 76194150.0, + "step": 3042 + }, + { + "epoch": 0.3341752690533714, + "grad_norm": 2.3003342151641846, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6871165037155151, + "num_tokens": 76221460.0, + "step": 3043 + }, + { + "epoch": 0.33428508675598506, + "grad_norm": 2.0024726390838623, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6870518922805786, + "num_tokens": 76253386.0, + "step": 3044 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 2.3414556980133057, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6978222131729126, + "num_tokens": 76276727.0, + "step": 3045 + }, + { + "epoch": 0.3345047221612124, + "grad_norm": 2.745060682296753, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.697786808013916, + "num_tokens": 76295472.0, + "step": 3046 + }, + { + "epoch": 0.33461453986382605, + "grad_norm": 2.2849645614624023, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6921485066413879, + "num_tokens": 76324601.0, + "step": 3047 + }, + { + "epoch": 0.3347243575664397, + "grad_norm": 2.4091720581054688, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7249608635902405, + "num_tokens": 76346930.0, + "step": 3048 + }, + { + "epoch": 0.33483417526905335, + "grad_norm": 2.278247833251953, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.681655764579773, + "num_tokens": 76371020.0, + "step": 3049 + }, + { + "epoch": 0.33494399297166705, + "grad_norm": 1.9890556335449219, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6873205304145813, + "num_tokens": 76401929.0, + "step": 3050 + }, + { + "epoch": 0.3350538106742807, + "grad_norm": 2.3938987255096436, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.724305272102356, + "num_tokens": 76423455.0, + "step": 3051 + }, + { + "epoch": 0.33516362837689434, + "grad_norm": 2.307492971420288, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6895259618759155, + "num_tokens": 76447197.0, + "step": 3052 + }, + { + "epoch": 0.33527344607950804, + "grad_norm": 2.128183364868164, + "learning_rate": 1e-06, + "loss": 1.0739, + "mean_token_accuracy": 0.6861408948898315, + "num_tokens": 76475067.0, + "step": 3053 + }, + { + "epoch": 0.3353832637821217, + "grad_norm": 2.286635160446167, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.698039710521698, + "num_tokens": 76498238.0, + "step": 3054 + }, + { + "epoch": 0.33549308148473533, + "grad_norm": 2.1362884044647217, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7142905592918396, + "num_tokens": 76524007.0, + "step": 3055 + }, + { + "epoch": 0.335602899187349, + "grad_norm": 2.5003113746643066, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7267391681671143, + "num_tokens": 76548315.0, + "step": 3056 + }, + { + "epoch": 0.3357127168899627, + "grad_norm": 2.2711868286132812, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7159162759780884, + "num_tokens": 76572050.0, + "step": 3057 + }, + { + "epoch": 0.3358225345925763, + "grad_norm": 2.105062484741211, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6914249658584595, + "num_tokens": 76599908.0, + "step": 3058 + }, + { + "epoch": 0.33593235229519, + "grad_norm": 2.3726978302001953, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7090759873390198, + "num_tokens": 76623132.0, + "step": 3059 + }, + { + "epoch": 0.3360421699978036, + "grad_norm": 2.2279515266418457, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.691007137298584, + "num_tokens": 76647819.0, + "step": 3060 + }, + { + "epoch": 0.3361519877004173, + "grad_norm": 2.590198516845703, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7167460322380066, + "num_tokens": 76668475.0, + "step": 3061 + }, + { + "epoch": 0.33626180540303097, + "grad_norm": 2.000514030456543, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7086700201034546, + "num_tokens": 76697888.0, + "step": 3062 + }, + { + "epoch": 0.3363716231056446, + "grad_norm": 2.366553544998169, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6839649677276611, + "num_tokens": 76721777.0, + "step": 3063 + }, + { + "epoch": 0.3364814408082583, + "grad_norm": 2.278055191040039, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6869425177574158, + "num_tokens": 76746167.0, + "step": 3064 + }, + { + "epoch": 0.33659125851087196, + "grad_norm": 1.9370942115783691, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6839830875396729, + "num_tokens": 76776962.0, + "step": 3065 + }, + { + "epoch": 0.3367010762134856, + "grad_norm": 2.020775318145752, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6959196925163269, + "num_tokens": 76806805.0, + "step": 3066 + }, + { + "epoch": 0.33681089391609925, + "grad_norm": 2.2071151733398438, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6872817873954773, + "num_tokens": 76831717.0, + "step": 3067 + }, + { + "epoch": 0.33692071161871295, + "grad_norm": 2.4280548095703125, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7113538980484009, + "num_tokens": 76852841.0, + "step": 3068 + }, + { + "epoch": 0.3370305293213266, + "grad_norm": 2.1753392219543457, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7188268303871155, + "num_tokens": 76879494.0, + "step": 3069 + }, + { + "epoch": 0.33714034702394025, + "grad_norm": 2.3239145278930664, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.725403368473053, + "num_tokens": 76902172.0, + "step": 3070 + }, + { + "epoch": 0.33725016472655395, + "grad_norm": 2.363028049468994, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6999474763870239, + "num_tokens": 76925298.0, + "step": 3071 + }, + { + "epoch": 0.3373599824291676, + "grad_norm": 2.054922103881836, + "learning_rate": 1e-06, + "loss": 1.1033, + "mean_token_accuracy": 0.6695451140403748, + "num_tokens": 76955590.0, + "step": 3072 + }, + { + "epoch": 0.33746980013178124, + "grad_norm": 2.1735830307006836, + "learning_rate": 1e-06, + "loss": 1.0999, + "mean_token_accuracy": 0.6787438988685608, + "num_tokens": 76982833.0, + "step": 3073 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 2.6912083625793457, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7213954925537109, + "num_tokens": 77003503.0, + "step": 3074 + }, + { + "epoch": 0.3376894355370086, + "grad_norm": 1.9835985898971558, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7195903062820435, + "num_tokens": 77032836.0, + "step": 3075 + }, + { + "epoch": 0.33779925323962223, + "grad_norm": 2.177246332168579, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6980757713317871, + "num_tokens": 77059266.0, + "step": 3076 + }, + { + "epoch": 0.3379090709422359, + "grad_norm": 2.304344892501831, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6939724683761597, + "num_tokens": 77087160.0, + "step": 3077 + }, + { + "epoch": 0.3380188886448495, + "grad_norm": 2.226402521133423, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.6976647973060608, + "num_tokens": 77113884.0, + "step": 3078 + }, + { + "epoch": 0.3381287063474632, + "grad_norm": 2.2962610721588135, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7204523682594299, + "num_tokens": 77136213.0, + "step": 3079 + }, + { + "epoch": 0.33823852405007687, + "grad_norm": 2.3460726737976074, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7002080678939819, + "num_tokens": 77158892.0, + "step": 3080 + }, + { + "epoch": 0.3383483417526905, + "grad_norm": 2.278244733810425, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.690584659576416, + "num_tokens": 77183603.0, + "step": 3081 + }, + { + "epoch": 0.3384581594553042, + "grad_norm": 2.57724928855896, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6936483383178711, + "num_tokens": 77203579.0, + "step": 3082 + }, + { + "epoch": 0.33856797715791787, + "grad_norm": 2.0959622859954834, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7118368148803711, + "num_tokens": 77233257.0, + "step": 3083 + }, + { + "epoch": 0.3386777948605315, + "grad_norm": 2.0038511753082275, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.707578182220459, + "num_tokens": 77263428.0, + "step": 3084 + }, + { + "epoch": 0.33878761256314516, + "grad_norm": 2.305074691772461, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7035129070281982, + "num_tokens": 77286444.0, + "step": 3085 + }, + { + "epoch": 0.33889743026575886, + "grad_norm": 2.1947364807128906, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7014855146408081, + "num_tokens": 77313523.0, + "step": 3086 + }, + { + "epoch": 0.3390072479683725, + "grad_norm": 2.4259819984436035, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6829413175582886, + "num_tokens": 77336162.0, + "step": 3087 + }, + { + "epoch": 0.33911706567098615, + "grad_norm": 2.230548858642578, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7179463505744934, + "num_tokens": 77361815.0, + "step": 3088 + }, + { + "epoch": 0.3392268833735998, + "grad_norm": 2.271817684173584, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7124239206314087, + "num_tokens": 77386912.0, + "step": 3089 + }, + { + "epoch": 0.3393367010762135, + "grad_norm": 2.451040506362915, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6916939616203308, + "num_tokens": 77407206.0, + "step": 3090 + }, + { + "epoch": 0.33944651877882714, + "grad_norm": 2.2336018085479736, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7147220373153687, + "num_tokens": 77433192.0, + "step": 3091 + }, + { + "epoch": 0.3395563364814408, + "grad_norm": 2.248497247695923, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.6989794969558716, + "num_tokens": 77456823.0, + "step": 3092 + }, + { + "epoch": 0.3396661541840545, + "grad_norm": 2.2733893394470215, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6929094791412354, + "num_tokens": 77482452.0, + "step": 3093 + }, + { + "epoch": 0.33977597188666814, + "grad_norm": 2.0793142318725586, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7090432643890381, + "num_tokens": 77509581.0, + "step": 3094 + }, + { + "epoch": 0.3398857895892818, + "grad_norm": 2.449127435684204, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7014924883842468, + "num_tokens": 77530102.0, + "step": 3095 + }, + { + "epoch": 0.33999560729189543, + "grad_norm": 2.2288148403167725, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.702221691608429, + "num_tokens": 77555287.0, + "step": 3096 + }, + { + "epoch": 0.34010542499450913, + "grad_norm": 2.460559129714966, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.696794867515564, + "num_tokens": 77579031.0, + "step": 3097 + }, + { + "epoch": 0.3402152426971228, + "grad_norm": 2.44433331489563, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6807880401611328, + "num_tokens": 77601205.0, + "step": 3098 + }, + { + "epoch": 0.3403250603997364, + "grad_norm": 1.9723572731018066, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7061023712158203, + "num_tokens": 77634607.0, + "step": 3099 + }, + { + "epoch": 0.3404348781023501, + "grad_norm": 1.993650197982788, + "learning_rate": 1e-06, + "loss": 1.1482, + "mean_token_accuracy": 0.6565191149711609, + "num_tokens": 77667474.0, + "step": 3100 + }, + { + "epoch": 0.34054469580496377, + "grad_norm": 2.535703182220459, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.712968647480011, + "num_tokens": 77687510.0, + "step": 3101 + }, + { + "epoch": 0.3406545135075774, + "grad_norm": 2.3062829971313477, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6827282905578613, + "num_tokens": 77710822.0, + "step": 3102 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 2.25734543800354, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6924116611480713, + "num_tokens": 77735833.0, + "step": 3103 + }, + { + "epoch": 0.34087414891280476, + "grad_norm": 2.2972805500030518, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6968593597412109, + "num_tokens": 77758111.0, + "step": 3104 + }, + { + "epoch": 0.3409839666154184, + "grad_norm": 2.2458698749542236, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.694861650466919, + "num_tokens": 77783837.0, + "step": 3105 + }, + { + "epoch": 0.34109378431803206, + "grad_norm": 2.5431997776031494, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7430185079574585, + "num_tokens": 77802295.0, + "step": 3106 + }, + { + "epoch": 0.3412036020206457, + "grad_norm": 2.3359856605529785, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6970300674438477, + "num_tokens": 77826307.0, + "step": 3107 + }, + { + "epoch": 0.3413134197232594, + "grad_norm": 2.316404104232788, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6844540238380432, + "num_tokens": 77853095.0, + "step": 3108 + }, + { + "epoch": 0.34142323742587305, + "grad_norm": 2.289114236831665, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6901057362556458, + "num_tokens": 77878890.0, + "step": 3109 + }, + { + "epoch": 0.3415330551284867, + "grad_norm": 2.005664825439453, + "learning_rate": 1e-06, + "loss": 1.1198, + "mean_token_accuracy": 0.6655786037445068, + "num_tokens": 77910429.0, + "step": 3110 + }, + { + "epoch": 0.3416428728311004, + "grad_norm": 2.3156745433807373, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.669121503829956, + "num_tokens": 77934275.0, + "step": 3111 + }, + { + "epoch": 0.34175269053371404, + "grad_norm": 2.357370138168335, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7271111011505127, + "num_tokens": 77956012.0, + "step": 3112 + }, + { + "epoch": 0.3418625082363277, + "grad_norm": 2.14288592338562, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6884469985961914, + "num_tokens": 77984876.0, + "step": 3113 + }, + { + "epoch": 0.34197232593894134, + "grad_norm": 2.551643133163452, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7235839366912842, + "num_tokens": 78003410.0, + "step": 3114 + }, + { + "epoch": 0.34208214364155504, + "grad_norm": 2.3171935081481934, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.709844708442688, + "num_tokens": 78024489.0, + "step": 3115 + }, + { + "epoch": 0.3421919613441687, + "grad_norm": 2.1068153381347656, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6895747780799866, + "num_tokens": 78051632.0, + "step": 3116 + }, + { + "epoch": 0.34230177904678233, + "grad_norm": 2.113130807876587, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6915782690048218, + "num_tokens": 78078162.0, + "step": 3117 + }, + { + "epoch": 0.34241159674939603, + "grad_norm": 2.1658616065979004, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7085094451904297, + "num_tokens": 78103501.0, + "step": 3118 + }, + { + "epoch": 0.3425214144520097, + "grad_norm": 2.159144878387451, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7000445127487183, + "num_tokens": 78128884.0, + "step": 3119 + }, + { + "epoch": 0.3426312321546233, + "grad_norm": 2.2096502780914307, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.6805700063705444, + "num_tokens": 78155837.0, + "step": 3120 + }, + { + "epoch": 0.34274104985723697, + "grad_norm": 1.978917121887207, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7084450125694275, + "num_tokens": 78185955.0, + "step": 3121 + }, + { + "epoch": 0.34285086755985067, + "grad_norm": 2.307756185531616, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6968733072280884, + "num_tokens": 78211129.0, + "step": 3122 + }, + { + "epoch": 0.3429606852624643, + "grad_norm": 2.119208335876465, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6841023564338684, + "num_tokens": 78239772.0, + "step": 3123 + }, + { + "epoch": 0.34307050296507796, + "grad_norm": 2.461005210876465, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7110101580619812, + "num_tokens": 78262386.0, + "step": 3124 + }, + { + "epoch": 0.3431803206676916, + "grad_norm": 2.0877349376678467, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.699550211429596, + "num_tokens": 78291451.0, + "step": 3125 + }, + { + "epoch": 0.3432901383703053, + "grad_norm": 2.418456792831421, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7108381390571594, + "num_tokens": 78314490.0, + "step": 3126 + }, + { + "epoch": 0.34339995607291895, + "grad_norm": 2.5865979194641113, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7130255699157715, + "num_tokens": 78334720.0, + "step": 3127 + }, + { + "epoch": 0.3435097737755326, + "grad_norm": 2.0710954666137695, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6945911645889282, + "num_tokens": 78363724.0, + "step": 3128 + }, + { + "epoch": 0.3436195914781463, + "grad_norm": 2.5526111125946045, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7012906074523926, + "num_tokens": 78383885.0, + "step": 3129 + }, + { + "epoch": 0.34372940918075995, + "grad_norm": 2.057556629180908, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6962827444076538, + "num_tokens": 78411379.0, + "step": 3130 + }, + { + "epoch": 0.3438392268833736, + "grad_norm": 2.4739136695861816, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6745278835296631, + "num_tokens": 78434752.0, + "step": 3131 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 2.302014112472534, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6874390244483948, + "num_tokens": 78460129.0, + "step": 3132 + }, + { + "epoch": 0.34405886228860094, + "grad_norm": 2.280097484588623, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7030203342437744, + "num_tokens": 78483533.0, + "step": 3133 + }, + { + "epoch": 0.3441686799912146, + "grad_norm": 2.3597872257232666, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6888983249664307, + "num_tokens": 78506523.0, + "step": 3134 + }, + { + "epoch": 0.34427849769382823, + "grad_norm": 2.492103338241577, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.730605959892273, + "num_tokens": 78525965.0, + "step": 3135 + }, + { + "epoch": 0.3443883153964419, + "grad_norm": 2.440225124359131, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6983622312545776, + "num_tokens": 78548061.0, + "step": 3136 + }, + { + "epoch": 0.3444981330990556, + "grad_norm": 2.8090977668762207, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7024685740470886, + "num_tokens": 78566047.0, + "step": 3137 + }, + { + "epoch": 0.3446079508016692, + "grad_norm": 2.477003812789917, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7088640928268433, + "num_tokens": 78586664.0, + "step": 3138 + }, + { + "epoch": 0.3447177685042829, + "grad_norm": 2.029933452606201, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6900694370269775, + "num_tokens": 78614985.0, + "step": 3139 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 2.0817747116088867, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7051661610603333, + "num_tokens": 78643020.0, + "step": 3140 + }, + { + "epoch": 0.3449374039095102, + "grad_norm": 2.4162745475769043, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6875821352005005, + "num_tokens": 78666087.0, + "step": 3141 + }, + { + "epoch": 0.34504722161212387, + "grad_norm": 2.192293882369995, + "learning_rate": 1e-06, + "loss": 1.109, + "mean_token_accuracy": 0.6649249196052551, + "num_tokens": 78693733.0, + "step": 3142 + }, + { + "epoch": 0.3451570393147375, + "grad_norm": 2.082866668701172, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6882797479629517, + "num_tokens": 78721370.0, + "step": 3143 + }, + { + "epoch": 0.3452668570173512, + "grad_norm": 2.4064230918884277, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.730721652507782, + "num_tokens": 78742532.0, + "step": 3144 + }, + { + "epoch": 0.34537667471996486, + "grad_norm": 2.400979995727539, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7164108753204346, + "num_tokens": 78763681.0, + "step": 3145 + }, + { + "epoch": 0.3454864924225785, + "grad_norm": 2.2116539478302, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7244230508804321, + "num_tokens": 78788465.0, + "step": 3146 + }, + { + "epoch": 0.3455963101251922, + "grad_norm": 2.1803269386291504, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7202484607696533, + "num_tokens": 78813488.0, + "step": 3147 + }, + { + "epoch": 0.34570612782780585, + "grad_norm": 2.5058434009552, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7342002987861633, + "num_tokens": 78832731.0, + "step": 3148 + }, + { + "epoch": 0.3458159455304195, + "grad_norm": 2.1552810668945312, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7009338140487671, + "num_tokens": 78859517.0, + "step": 3149 + }, + { + "epoch": 0.34592576323303315, + "grad_norm": 2.2417469024658203, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7033494114875793, + "num_tokens": 78884676.0, + "step": 3150 + }, + { + "epoch": 0.34603558093564685, + "grad_norm": 2.38124418258667, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6974214911460876, + "num_tokens": 78908385.0, + "step": 3151 + }, + { + "epoch": 0.3461453986382605, + "grad_norm": 2.2527849674224854, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7019573450088501, + "num_tokens": 78934417.0, + "step": 3152 + }, + { + "epoch": 0.34625521634087414, + "grad_norm": 1.9998974800109863, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6902715563774109, + "num_tokens": 78964925.0, + "step": 3153 + }, + { + "epoch": 0.3463650340434878, + "grad_norm": 2.4140207767486572, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.717513918876648, + "num_tokens": 78987742.0, + "step": 3154 + }, + { + "epoch": 0.3464748517461015, + "grad_norm": 2.2363533973693848, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7252939939498901, + "num_tokens": 79013013.0, + "step": 3155 + }, + { + "epoch": 0.34658466944871513, + "grad_norm": 2.3018338680267334, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6827914118766785, + "num_tokens": 79037637.0, + "step": 3156 + }, + { + "epoch": 0.3466944871513288, + "grad_norm": 2.0064189434051514, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.715900182723999, + "num_tokens": 79066643.0, + "step": 3157 + }, + { + "epoch": 0.3468043048539425, + "grad_norm": 2.6212923526763916, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6936173439025879, + "num_tokens": 79086457.0, + "step": 3158 + }, + { + "epoch": 0.3469141225565561, + "grad_norm": 2.146303653717041, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6976803541183472, + "num_tokens": 79114259.0, + "step": 3159 + }, + { + "epoch": 0.34702394025916977, + "grad_norm": 2.0976145267486572, + "learning_rate": 1e-06, + "loss": 1.0729, + "mean_token_accuracy": 0.6800761222839355, + "num_tokens": 79143439.0, + "step": 3160 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 2.150733232498169, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7038818597793579, + "num_tokens": 79170869.0, + "step": 3161 + }, + { + "epoch": 0.3472435756643971, + "grad_norm": 2.1397202014923096, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6752893924713135, + "num_tokens": 79201512.0, + "step": 3162 + }, + { + "epoch": 0.34735339336701077, + "grad_norm": 1.9861035346984863, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6935129165649414, + "num_tokens": 79233623.0, + "step": 3163 + }, + { + "epoch": 0.3474632110696244, + "grad_norm": 2.285856008529663, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7174594402313232, + "num_tokens": 79256870.0, + "step": 3164 + }, + { + "epoch": 0.34757302877223806, + "grad_norm": 2.0380451679229736, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6877444982528687, + "num_tokens": 79287021.0, + "step": 3165 + }, + { + "epoch": 0.34768284647485176, + "grad_norm": 2.1827304363250732, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7038339972496033, + "num_tokens": 79312225.0, + "step": 3166 + }, + { + "epoch": 0.3477926641774654, + "grad_norm": 2.23150372505188, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6864535808563232, + "num_tokens": 79339935.0, + "step": 3167 + }, + { + "epoch": 0.34790248188007905, + "grad_norm": 2.4827756881713867, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7153890132904053, + "num_tokens": 79360741.0, + "step": 3168 + }, + { + "epoch": 0.34801229958269275, + "grad_norm": 2.0190117359161377, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7143215537071228, + "num_tokens": 79387472.0, + "step": 3169 + }, + { + "epoch": 0.3481221172853064, + "grad_norm": 2.5949301719665527, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7103433609008789, + "num_tokens": 79406397.0, + "step": 3170 + }, + { + "epoch": 0.34823193498792004, + "grad_norm": 2.313129425048828, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.704842209815979, + "num_tokens": 79428612.0, + "step": 3171 + }, + { + "epoch": 0.3483417526905337, + "grad_norm": 2.3256072998046875, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7139852643013, + "num_tokens": 79452079.0, + "step": 3172 + }, + { + "epoch": 0.3484515703931474, + "grad_norm": 2.146620988845825, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7231378555297852, + "num_tokens": 79479066.0, + "step": 3173 + }, + { + "epoch": 0.34856138809576104, + "grad_norm": 2.4427294731140137, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7036925554275513, + "num_tokens": 79499810.0, + "step": 3174 + }, + { + "epoch": 0.3486712057983747, + "grad_norm": 2.282748222351074, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7007082104682922, + "num_tokens": 79524300.0, + "step": 3175 + }, + { + "epoch": 0.3487810235009884, + "grad_norm": 2.2474753856658936, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7067047357559204, + "num_tokens": 79547228.0, + "step": 3176 + }, + { + "epoch": 0.34889084120360203, + "grad_norm": 2.416964292526245, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6945360898971558, + "num_tokens": 79569272.0, + "step": 3177 + }, + { + "epoch": 0.3490006589062157, + "grad_norm": 2.5221331119537354, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7021256685256958, + "num_tokens": 79594082.0, + "step": 3178 + }, + { + "epoch": 0.3491104766088293, + "grad_norm": 2.5069615840911865, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6861357092857361, + "num_tokens": 79615981.0, + "step": 3179 + }, + { + "epoch": 0.349220294311443, + "grad_norm": 2.498124837875366, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6915891170501709, + "num_tokens": 79637595.0, + "step": 3180 + }, + { + "epoch": 0.34933011201405667, + "grad_norm": 2.4392638206481934, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7030370831489563, + "num_tokens": 79659101.0, + "step": 3181 + }, + { + "epoch": 0.3494399297166703, + "grad_norm": 2.3146631717681885, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.683318555355072, + "num_tokens": 79685321.0, + "step": 3182 + }, + { + "epoch": 0.34954974741928396, + "grad_norm": 2.2429192066192627, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6924718618392944, + "num_tokens": 79711727.0, + "step": 3183 + }, + { + "epoch": 0.34965956512189766, + "grad_norm": 1.9136179685592651, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6945315003395081, + "num_tokens": 79744367.0, + "step": 3184 + }, + { + "epoch": 0.3497693828245113, + "grad_norm": 2.379894256591797, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6977515816688538, + "num_tokens": 79768035.0, + "step": 3185 + }, + { + "epoch": 0.34987920052712496, + "grad_norm": 2.6609857082366943, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7126593589782715, + "num_tokens": 79786842.0, + "step": 3186 + }, + { + "epoch": 0.34998901822973866, + "grad_norm": 2.3304758071899414, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.7044721841812134, + "num_tokens": 79811388.0, + "step": 3187 + }, + { + "epoch": 0.3500988359323523, + "grad_norm": 2.5107779502868652, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7040523886680603, + "num_tokens": 79833100.0, + "step": 3188 + }, + { + "epoch": 0.35020865363496595, + "grad_norm": 2.2933948040008545, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.715526819229126, + "num_tokens": 79855557.0, + "step": 3189 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 2.714381694793701, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7096668481826782, + "num_tokens": 79874517.0, + "step": 3190 + }, + { + "epoch": 0.3504282890401933, + "grad_norm": 2.0983269214630127, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7089118957519531, + "num_tokens": 79901974.0, + "step": 3191 + }, + { + "epoch": 0.35053810674280694, + "grad_norm": 2.287919044494629, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6952999830245972, + "num_tokens": 79925450.0, + "step": 3192 + }, + { + "epoch": 0.3506479244454206, + "grad_norm": 2.3728840351104736, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.701576828956604, + "num_tokens": 79949361.0, + "step": 3193 + }, + { + "epoch": 0.3507577421480343, + "grad_norm": 2.3440024852752686, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6987071633338928, + "num_tokens": 79975422.0, + "step": 3194 + }, + { + "epoch": 0.35086755985064794, + "grad_norm": 2.1825873851776123, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6913858652114868, + "num_tokens": 80003761.0, + "step": 3195 + }, + { + "epoch": 0.3509773775532616, + "grad_norm": 2.9309048652648926, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7279092669487, + "num_tokens": 80018684.0, + "step": 3196 + }, + { + "epoch": 0.35108719525587523, + "grad_norm": 2.394838809967041, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7038289308547974, + "num_tokens": 80041577.0, + "step": 3197 + }, + { + "epoch": 0.35119701295848893, + "grad_norm": 2.2442972660064697, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.699181318283081, + "num_tokens": 80067170.0, + "step": 3198 + }, + { + "epoch": 0.3513068306611026, + "grad_norm": 2.183218240737915, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.690258264541626, + "num_tokens": 80092316.0, + "step": 3199 + }, + { + "epoch": 0.3514166483637162, + "grad_norm": 2.1272571086883545, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.690105676651001, + "num_tokens": 80119847.0, + "step": 3200 + }, + { + "epoch": 0.35152646606632987, + "grad_norm": 2.7870097160339355, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7142302393913269, + "num_tokens": 80138064.0, + "step": 3201 + }, + { + "epoch": 0.35163628376894357, + "grad_norm": 2.1645097732543945, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7097558379173279, + "num_tokens": 80164930.0, + "step": 3202 + }, + { + "epoch": 0.3517461014715572, + "grad_norm": 2.28821063041687, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6929575204849243, + "num_tokens": 80189866.0, + "step": 3203 + }, + { + "epoch": 0.35185591917417086, + "grad_norm": 2.228515148162842, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6910666227340698, + "num_tokens": 80215526.0, + "step": 3204 + }, + { + "epoch": 0.35196573687678456, + "grad_norm": 2.316912889480591, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7086707353591919, + "num_tokens": 80240005.0, + "step": 3205 + }, + { + "epoch": 0.3520755545793982, + "grad_norm": 1.8865083456039429, + "learning_rate": 1e-06, + "loss": 1.0778, + "mean_token_accuracy": 0.6792082786560059, + "num_tokens": 80275963.0, + "step": 3206 + }, + { + "epoch": 0.35218537228201185, + "grad_norm": 2.2387359142303467, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7053894996643066, + "num_tokens": 80300195.0, + "step": 3207 + }, + { + "epoch": 0.3522951899846255, + "grad_norm": 2.1888535022735596, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6852113008499146, + "num_tokens": 80326916.0, + "step": 3208 + }, + { + "epoch": 0.3524050076872392, + "grad_norm": 1.973340392112732, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6825863122940063, + "num_tokens": 80357646.0, + "step": 3209 + }, + { + "epoch": 0.35251482538985285, + "grad_norm": 2.3606529235839844, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7244585752487183, + "num_tokens": 80378384.0, + "step": 3210 + }, + { + "epoch": 0.3526246430924665, + "grad_norm": 2.3491127490997314, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7088994979858398, + "num_tokens": 80401167.0, + "step": 3211 + }, + { + "epoch": 0.35273446079508014, + "grad_norm": 2.1757407188415527, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7099143862724304, + "num_tokens": 80426644.0, + "step": 3212 + }, + { + "epoch": 0.35284427849769384, + "grad_norm": 1.943490743637085, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7018389105796814, + "num_tokens": 80456996.0, + "step": 3213 + }, + { + "epoch": 0.3529540962003075, + "grad_norm": 2.378732919692993, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6851288676261902, + "num_tokens": 80481452.0, + "step": 3214 + }, + { + "epoch": 0.35306391390292113, + "grad_norm": 2.341271162033081, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6964932680130005, + "num_tokens": 80505060.0, + "step": 3215 + }, + { + "epoch": 0.35317373160553484, + "grad_norm": 2.026750087738037, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6828125715255737, + "num_tokens": 80536724.0, + "step": 3216 + }, + { + "epoch": 0.3532835493081485, + "grad_norm": 2.288872718811035, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7117650508880615, + "num_tokens": 80562015.0, + "step": 3217 + }, + { + "epoch": 0.3533933670107621, + "grad_norm": 2.271040201187134, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6976196765899658, + "num_tokens": 80586367.0, + "step": 3218 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 2.2826404571533203, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7026311755180359, + "num_tokens": 80609151.0, + "step": 3219 + }, + { + "epoch": 0.3536130024159895, + "grad_norm": 2.033267021179199, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6860412359237671, + "num_tokens": 80639285.0, + "step": 3220 + }, + { + "epoch": 0.3537228201186031, + "grad_norm": 2.2872326374053955, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.690822958946228, + "num_tokens": 80664724.0, + "step": 3221 + }, + { + "epoch": 0.35383263782121677, + "grad_norm": 2.1002140045166016, + "learning_rate": 1e-06, + "loss": 1.1036, + "mean_token_accuracy": 0.6715649366378784, + "num_tokens": 80694508.0, + "step": 3222 + }, + { + "epoch": 0.35394245552383047, + "grad_norm": 2.3703598976135254, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6942092180252075, + "num_tokens": 80716147.0, + "step": 3223 + }, + { + "epoch": 0.3540522732264441, + "grad_norm": 2.369260549545288, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6796271204948425, + "num_tokens": 80740970.0, + "step": 3224 + }, + { + "epoch": 0.35416209092905776, + "grad_norm": 2.5071418285369873, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7270183563232422, + "num_tokens": 80762412.0, + "step": 3225 + }, + { + "epoch": 0.3542719086316714, + "grad_norm": 2.402336597442627, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.722247838973999, + "num_tokens": 80783620.0, + "step": 3226 + }, + { + "epoch": 0.3543817263342851, + "grad_norm": 1.985074758529663, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6862183809280396, + "num_tokens": 80815165.0, + "step": 3227 + }, + { + "epoch": 0.35449154403689875, + "grad_norm": 2.3095223903656006, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7182173132896423, + "num_tokens": 80837698.0, + "step": 3228 + }, + { + "epoch": 0.3546013617395124, + "grad_norm": 2.3419322967529297, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7428361177444458, + "num_tokens": 80858714.0, + "step": 3229 + }, + { + "epoch": 0.35471117944212605, + "grad_norm": 2.2582623958587646, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7071202993392944, + "num_tokens": 80882076.0, + "step": 3230 + }, + { + "epoch": 0.35482099714473975, + "grad_norm": 2.610276222229004, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7025033831596375, + "num_tokens": 80902807.0, + "step": 3231 + }, + { + "epoch": 0.3549308148473534, + "grad_norm": 2.1284196376800537, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.685285210609436, + "num_tokens": 80932230.0, + "step": 3232 + }, + { + "epoch": 0.35504063254996704, + "grad_norm": 2.2828426361083984, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7203123569488525, + "num_tokens": 80956062.0, + "step": 3233 + }, + { + "epoch": 0.35515045025258074, + "grad_norm": 2.45043683052063, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6958741545677185, + "num_tokens": 80979345.0, + "step": 3234 + }, + { + "epoch": 0.3552602679551944, + "grad_norm": 2.242417097091675, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7155447006225586, + "num_tokens": 81003805.0, + "step": 3235 + }, + { + "epoch": 0.35537008565780803, + "grad_norm": 2.114345073699951, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6798794865608215, + "num_tokens": 81031402.0, + "step": 3236 + }, + { + "epoch": 0.3554799033604217, + "grad_norm": 2.2105016708374023, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6993916630744934, + "num_tokens": 81059216.0, + "step": 3237 + }, + { + "epoch": 0.3555897210630354, + "grad_norm": 2.0793983936309814, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7216026782989502, + "num_tokens": 81085804.0, + "step": 3238 + }, + { + "epoch": 0.355699538765649, + "grad_norm": 2.465273857116699, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7103824615478516, + "num_tokens": 81106441.0, + "step": 3239 + }, + { + "epoch": 0.35580935646826267, + "grad_norm": 2.013406753540039, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7098554372787476, + "num_tokens": 81137246.0, + "step": 3240 + }, + { + "epoch": 0.3559191741708763, + "grad_norm": 1.8559143543243408, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7156403064727783, + "num_tokens": 81168756.0, + "step": 3241 + }, + { + "epoch": 0.35602899187349, + "grad_norm": 2.412926435470581, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7157203555107117, + "num_tokens": 81189510.0, + "step": 3242 + }, + { + "epoch": 0.35613880957610367, + "grad_norm": 2.547064781188965, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7102880477905273, + "num_tokens": 81210410.0, + "step": 3243 + }, + { + "epoch": 0.3562486272787173, + "grad_norm": 2.4436757564544678, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7041743993759155, + "num_tokens": 81232751.0, + "step": 3244 + }, + { + "epoch": 0.356358444981331, + "grad_norm": 2.0283539295196533, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6979387402534485, + "num_tokens": 81261624.0, + "step": 3245 + }, + { + "epoch": 0.35646826268394466, + "grad_norm": 2.664885997772217, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6993781924247742, + "num_tokens": 81281841.0, + "step": 3246 + }, + { + "epoch": 0.3565780803865583, + "grad_norm": 2.003082752227783, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7127345204353333, + "num_tokens": 81311588.0, + "step": 3247 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 2.311325788497925, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7209712266921997, + "num_tokens": 81332710.0, + "step": 3248 + }, + { + "epoch": 0.35679771579178565, + "grad_norm": 2.017456531524658, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7009567022323608, + "num_tokens": 81362924.0, + "step": 3249 + }, + { + "epoch": 0.3569075334943993, + "grad_norm": 2.23374605178833, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6989290118217468, + "num_tokens": 81389405.0, + "step": 3250 + }, + { + "epoch": 0.35701735119701294, + "grad_norm": 2.5195441246032715, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6965632438659668, + "num_tokens": 81410062.0, + "step": 3251 + }, + { + "epoch": 0.35712716889962665, + "grad_norm": 1.9721101522445679, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7130076885223389, + "num_tokens": 81439896.0, + "step": 3252 + }, + { + "epoch": 0.3572369866022403, + "grad_norm": 2.205436944961548, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6985971331596375, + "num_tokens": 81464474.0, + "step": 3253 + }, + { + "epoch": 0.35734680430485394, + "grad_norm": 2.2904961109161377, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6904290914535522, + "num_tokens": 81487217.0, + "step": 3254 + }, + { + "epoch": 0.3574566220074676, + "grad_norm": 2.2908143997192383, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7064390182495117, + "num_tokens": 81512943.0, + "step": 3255 + }, + { + "epoch": 0.3575664397100813, + "grad_norm": 2.175368070602417, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7046191692352295, + "num_tokens": 81538774.0, + "step": 3256 + }, + { + "epoch": 0.35767625741269493, + "grad_norm": 2.1746864318847656, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6942054629325867, + "num_tokens": 81567464.0, + "step": 3257 + }, + { + "epoch": 0.3577860751153086, + "grad_norm": 2.2743728160858154, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6996872425079346, + "num_tokens": 81591798.0, + "step": 3258 + }, + { + "epoch": 0.3578958928179222, + "grad_norm": 2.590625524520874, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7060887217521667, + "num_tokens": 81611278.0, + "step": 3259 + }, + { + "epoch": 0.3580057105205359, + "grad_norm": 2.0049474239349365, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7121435403823853, + "num_tokens": 81640424.0, + "step": 3260 + }, + { + "epoch": 0.35811552822314957, + "grad_norm": 2.400834798812866, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7042505741119385, + "num_tokens": 81662131.0, + "step": 3261 + }, + { + "epoch": 0.3582253459257632, + "grad_norm": 2.2935092449188232, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.698272705078125, + "num_tokens": 81686123.0, + "step": 3262 + }, + { + "epoch": 0.3583351636283769, + "grad_norm": 2.3531203269958496, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7007632255554199, + "num_tokens": 81710570.0, + "step": 3263 + }, + { + "epoch": 0.35844498133099056, + "grad_norm": 2.539903402328491, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7173497080802917, + "num_tokens": 81731918.0, + "step": 3264 + }, + { + "epoch": 0.3585547990336042, + "grad_norm": 2.182900905609131, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6797943115234375, + "num_tokens": 81758681.0, + "step": 3265 + }, + { + "epoch": 0.35866461673621786, + "grad_norm": 2.068514585494995, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.684036374092102, + "num_tokens": 81789898.0, + "step": 3266 + }, + { + "epoch": 0.35877443443883156, + "grad_norm": 2.481494665145874, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6815901398658752, + "num_tokens": 81811830.0, + "step": 3267 + }, + { + "epoch": 0.3588842521414452, + "grad_norm": 2.2127721309661865, + "learning_rate": 1e-06, + "loss": 1.0655, + "mean_token_accuracy": 0.6780750155448914, + "num_tokens": 81840456.0, + "step": 3268 + }, + { + "epoch": 0.35899406984405885, + "grad_norm": 2.288693428039551, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7037818431854248, + "num_tokens": 81864771.0, + "step": 3269 + }, + { + "epoch": 0.35910388754667255, + "grad_norm": 2.056504249572754, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6992390751838684, + "num_tokens": 81894345.0, + "step": 3270 + }, + { + "epoch": 0.3592137052492862, + "grad_norm": 2.5149970054626465, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6913489103317261, + "num_tokens": 81915603.0, + "step": 3271 + }, + { + "epoch": 0.35932352295189984, + "grad_norm": 2.2261316776275635, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6961964964866638, + "num_tokens": 81940845.0, + "step": 3272 + }, + { + "epoch": 0.3594333406545135, + "grad_norm": 2.569671392440796, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7098344564437866, + "num_tokens": 81960357.0, + "step": 3273 + }, + { + "epoch": 0.3595431583571272, + "grad_norm": 2.1623291969299316, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6851410269737244, + "num_tokens": 81987377.0, + "step": 3274 + }, + { + "epoch": 0.35965297605974084, + "grad_norm": 2.400646209716797, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6972746849060059, + "num_tokens": 82010399.0, + "step": 3275 + }, + { + "epoch": 0.3597627937623545, + "grad_norm": 2.326984167098999, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6800026297569275, + "num_tokens": 82036211.0, + "step": 3276 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 2.4535727500915527, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6954019069671631, + "num_tokens": 82057809.0, + "step": 3277 + }, + { + "epoch": 0.35998242916758183, + "grad_norm": 2.1219234466552734, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6952033042907715, + "num_tokens": 82084096.0, + "step": 3278 + }, + { + "epoch": 0.3600922468701955, + "grad_norm": 2.2036566734313965, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6817020773887634, + "num_tokens": 82111596.0, + "step": 3279 + }, + { + "epoch": 0.3602020645728091, + "grad_norm": 2.6703040599823, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7081030607223511, + "num_tokens": 82129807.0, + "step": 3280 + }, + { + "epoch": 0.3603118822754228, + "grad_norm": 2.3461861610412598, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7078815698623657, + "num_tokens": 82152037.0, + "step": 3281 + }, + { + "epoch": 0.36042169997803647, + "grad_norm": 2.4177000522613525, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7098023891448975, + "num_tokens": 82174241.0, + "step": 3282 + }, + { + "epoch": 0.3605315176806501, + "grad_norm": 2.169682741165161, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7219758629798889, + "num_tokens": 82199552.0, + "step": 3283 + }, + { + "epoch": 0.36064133538326376, + "grad_norm": 2.1220755577087402, + "learning_rate": 1e-06, + "loss": 1.1222, + "mean_token_accuracy": 0.66566401720047, + "num_tokens": 82228294.0, + "step": 3284 + }, + { + "epoch": 0.36075115308587746, + "grad_norm": 2.5848021507263184, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7203925848007202, + "num_tokens": 82247754.0, + "step": 3285 + }, + { + "epoch": 0.3608609707884911, + "grad_norm": 2.6717960834503174, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7066470384597778, + "num_tokens": 82265477.0, + "step": 3286 + }, + { + "epoch": 0.36097078849110475, + "grad_norm": 2.727382183074951, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7095166444778442, + "num_tokens": 82282768.0, + "step": 3287 + }, + { + "epoch": 0.3610806061937184, + "grad_norm": 1.986745834350586, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6866360902786255, + "num_tokens": 82312360.0, + "step": 3288 + }, + { + "epoch": 0.3611904238963321, + "grad_norm": 2.171010732650757, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7102432250976562, + "num_tokens": 82337257.0, + "step": 3289 + }, + { + "epoch": 0.36130024159894575, + "grad_norm": 2.3179752826690674, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7284116744995117, + "num_tokens": 82358276.0, + "step": 3290 + }, + { + "epoch": 0.3614100593015594, + "grad_norm": 2.268670082092285, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.714572548866272, + "num_tokens": 82382468.0, + "step": 3291 + }, + { + "epoch": 0.3615198770041731, + "grad_norm": 2.246034860610962, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.680097758769989, + "num_tokens": 82408469.0, + "step": 3292 + }, + { + "epoch": 0.36162969470678674, + "grad_norm": 2.4668166637420654, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.706895112991333, + "num_tokens": 82431739.0, + "step": 3293 + }, + { + "epoch": 0.3617395124094004, + "grad_norm": 2.1749258041381836, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7005516886711121, + "num_tokens": 82459629.0, + "step": 3294 + }, + { + "epoch": 0.36184933011201403, + "grad_norm": 2.248089075088501, + "learning_rate": 1e-06, + "loss": 1.1102, + "mean_token_accuracy": 0.6695804595947266, + "num_tokens": 82488718.0, + "step": 3295 + }, + { + "epoch": 0.36195914781462774, + "grad_norm": 2.065365791320801, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7027722597122192, + "num_tokens": 82518993.0, + "step": 3296 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 1.8957760334014893, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6818975210189819, + "num_tokens": 82555434.0, + "step": 3297 + }, + { + "epoch": 0.362178783219855, + "grad_norm": 2.186253070831299, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7077110409736633, + "num_tokens": 82580072.0, + "step": 3298 + }, + { + "epoch": 0.36228860092246873, + "grad_norm": 1.8841986656188965, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6896819472312927, + "num_tokens": 82613693.0, + "step": 3299 + }, + { + "epoch": 0.3623984186250824, + "grad_norm": 2.229788064956665, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7062064409255981, + "num_tokens": 82639219.0, + "step": 3300 + }, + { + "epoch": 0.362508236327696, + "grad_norm": 2.210911750793457, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6764522790908813, + "num_tokens": 82669686.0, + "step": 3301 + }, + { + "epoch": 0.36261805403030967, + "grad_norm": 2.5889933109283447, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7254045009613037, + "num_tokens": 82689496.0, + "step": 3302 + }, + { + "epoch": 0.36272787173292337, + "grad_norm": 2.2619919776916504, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7166351675987244, + "num_tokens": 82714206.0, + "step": 3303 + }, + { + "epoch": 0.362837689435537, + "grad_norm": 2.284379243850708, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.7089313268661499, + "num_tokens": 82740213.0, + "step": 3304 + }, + { + "epoch": 0.36294750713815066, + "grad_norm": 2.2367167472839355, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7163628339767456, + "num_tokens": 82762968.0, + "step": 3305 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 2.4690041542053223, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6887422204017639, + "num_tokens": 82784762.0, + "step": 3306 + }, + { + "epoch": 0.363167142543378, + "grad_norm": 2.5993261337280273, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6967032551765442, + "num_tokens": 82808371.0, + "step": 3307 + }, + { + "epoch": 0.36327696024599165, + "grad_norm": 2.4899890422821045, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7024694681167603, + "num_tokens": 82831343.0, + "step": 3308 + }, + { + "epoch": 0.3633867779486053, + "grad_norm": 2.520214796066284, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6896830797195435, + "num_tokens": 82851994.0, + "step": 3309 + }, + { + "epoch": 0.363496595651219, + "grad_norm": 2.44136381149292, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6980093717575073, + "num_tokens": 82875625.0, + "step": 3310 + }, + { + "epoch": 0.36360641335383265, + "grad_norm": 2.2086620330810547, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7042126655578613, + "num_tokens": 82900763.0, + "step": 3311 + }, + { + "epoch": 0.3637162310564463, + "grad_norm": 2.1864728927612305, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7088949084281921, + "num_tokens": 82928326.0, + "step": 3312 + }, + { + "epoch": 0.36382604875905994, + "grad_norm": 2.1629087924957275, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6993813514709473, + "num_tokens": 82956392.0, + "step": 3313 + }, + { + "epoch": 0.36393586646167364, + "grad_norm": 2.0302093029022217, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6884803771972656, + "num_tokens": 82988552.0, + "step": 3314 + }, + { + "epoch": 0.3640456841642873, + "grad_norm": 1.9429458379745483, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6873498558998108, + "num_tokens": 83020052.0, + "step": 3315 + }, + { + "epoch": 0.36415550186690093, + "grad_norm": 2.3886055946350098, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7040843367576599, + "num_tokens": 83042394.0, + "step": 3316 + }, + { + "epoch": 0.3642653195695146, + "grad_norm": 2.352337121963501, + "learning_rate": 1e-06, + "loss": 1.0983, + "mean_token_accuracy": 0.671812891960144, + "num_tokens": 83067670.0, + "step": 3317 + }, + { + "epoch": 0.3643751372721283, + "grad_norm": 2.035095453262329, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6824586391448975, + "num_tokens": 83096941.0, + "step": 3318 + }, + { + "epoch": 0.3644849549747419, + "grad_norm": 1.9454947710037231, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6807079315185547, + "num_tokens": 83127927.0, + "step": 3319 + }, + { + "epoch": 0.36459477267735557, + "grad_norm": 2.3189446926116943, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7261127233505249, + "num_tokens": 83151669.0, + "step": 3320 + }, + { + "epoch": 0.3647045903799693, + "grad_norm": 2.3581202030181885, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7051733136177063, + "num_tokens": 83176543.0, + "step": 3321 + }, + { + "epoch": 0.3648144080825829, + "grad_norm": 1.9661035537719727, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.696596622467041, + "num_tokens": 83208974.0, + "step": 3322 + }, + { + "epoch": 0.36492422578519657, + "grad_norm": 2.5017459392547607, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7020148038864136, + "num_tokens": 83229831.0, + "step": 3323 + }, + { + "epoch": 0.3650340434878102, + "grad_norm": 2.2258236408233643, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7069681882858276, + "num_tokens": 83255750.0, + "step": 3324 + }, + { + "epoch": 0.3651438611904239, + "grad_norm": 2.1025335788726807, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7100000381469727, + "num_tokens": 83282650.0, + "step": 3325 + }, + { + "epoch": 0.36525367889303756, + "grad_norm": 2.5204315185546875, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7020930647850037, + "num_tokens": 83303707.0, + "step": 3326 + }, + { + "epoch": 0.3653634965956512, + "grad_norm": 2.733837127685547, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7215439081192017, + "num_tokens": 83321735.0, + "step": 3327 + }, + { + "epoch": 0.3654733142982649, + "grad_norm": 2.2254390716552734, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7095651626586914, + "num_tokens": 83348530.0, + "step": 3328 + }, + { + "epoch": 0.36558313200087855, + "grad_norm": 2.375091314315796, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7086485028266907, + "num_tokens": 83371743.0, + "step": 3329 + }, + { + "epoch": 0.3656929497034922, + "grad_norm": 2.009817123413086, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7109978795051575, + "num_tokens": 83402246.0, + "step": 3330 + }, + { + "epoch": 0.36580276740610584, + "grad_norm": 2.078373908996582, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6933351159095764, + "num_tokens": 83430350.0, + "step": 3331 + }, + { + "epoch": 0.36591258510871955, + "grad_norm": 2.6514008045196533, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7153775691986084, + "num_tokens": 83448539.0, + "step": 3332 + }, + { + "epoch": 0.3660224028113332, + "grad_norm": 2.6489927768707275, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6947072744369507, + "num_tokens": 83468360.0, + "step": 3333 + }, + { + "epoch": 0.36613222051394684, + "grad_norm": 2.062490224838257, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7263995409011841, + "num_tokens": 83496177.0, + "step": 3334 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 2.221327066421509, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7316834926605225, + "num_tokens": 83520337.0, + "step": 3335 + }, + { + "epoch": 0.3663518559191742, + "grad_norm": 2.3345468044281006, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6986494064331055, + "num_tokens": 83544611.0, + "step": 3336 + }, + { + "epoch": 0.36646167362178783, + "grad_norm": 2.376373529434204, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6894465684890747, + "num_tokens": 83568520.0, + "step": 3337 + }, + { + "epoch": 0.3665714913244015, + "grad_norm": 2.299964427947998, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7188029289245605, + "num_tokens": 83591869.0, + "step": 3338 + }, + { + "epoch": 0.3666813090270152, + "grad_norm": 2.2044975757598877, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6969521045684814, + "num_tokens": 83616564.0, + "step": 3339 + }, + { + "epoch": 0.3667911267296288, + "grad_norm": 2.384326934814453, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7034364342689514, + "num_tokens": 83639266.0, + "step": 3340 + }, + { + "epoch": 0.36690094443224247, + "grad_norm": 2.332878589630127, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6924534440040588, + "num_tokens": 83663589.0, + "step": 3341 + }, + { + "epoch": 0.3670107621348561, + "grad_norm": 2.0269358158111572, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6894775629043579, + "num_tokens": 83693068.0, + "step": 3342 + }, + { + "epoch": 0.3671205798374698, + "grad_norm": 2.9203381538391113, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7409142255783081, + "num_tokens": 83708279.0, + "step": 3343 + }, + { + "epoch": 0.36723039754008346, + "grad_norm": 2.4814107418060303, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.6970362067222595, + "num_tokens": 83730257.0, + "step": 3344 + }, + { + "epoch": 0.3673402152426971, + "grad_norm": 2.229890823364258, + "learning_rate": 1e-06, + "loss": 1.0887, + "mean_token_accuracy": 0.6707103848457336, + "num_tokens": 83758140.0, + "step": 3345 + }, + { + "epoch": 0.3674500329453108, + "grad_norm": 2.0953192710876465, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7073631286621094, + "num_tokens": 83784609.0, + "step": 3346 + }, + { + "epoch": 0.36755985064792446, + "grad_norm": 2.2383382320404053, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7030393481254578, + "num_tokens": 83811419.0, + "step": 3347 + }, + { + "epoch": 0.3676696683505381, + "grad_norm": 2.477815628051758, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7070448398590088, + "num_tokens": 83832930.0, + "step": 3348 + }, + { + "epoch": 0.36777948605315175, + "grad_norm": 2.489959239959717, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.692846417427063, + "num_tokens": 83854025.0, + "step": 3349 + }, + { + "epoch": 0.36788930375576545, + "grad_norm": 2.4500114917755127, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7048712372779846, + "num_tokens": 83875787.0, + "step": 3350 + }, + { + "epoch": 0.3679991214583791, + "grad_norm": 2.2533156871795654, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6976943016052246, + "num_tokens": 83899855.0, + "step": 3351 + }, + { + "epoch": 0.36810893916099274, + "grad_norm": 2.2677245140075684, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7133972644805908, + "num_tokens": 83925220.0, + "step": 3352 + }, + { + "epoch": 0.3682187568636064, + "grad_norm": 2.5321648120880127, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7076835632324219, + "num_tokens": 83945644.0, + "step": 3353 + }, + { + "epoch": 0.3683285745662201, + "grad_norm": 2.0975985527038574, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6912321448326111, + "num_tokens": 83975047.0, + "step": 3354 + }, + { + "epoch": 0.36843839226883374, + "grad_norm": 2.291882276535034, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7016101479530334, + "num_tokens": 83998332.0, + "step": 3355 + }, + { + "epoch": 0.3685482099714474, + "grad_norm": 2.1557013988494873, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7060913443565369, + "num_tokens": 84026972.0, + "step": 3356 + }, + { + "epoch": 0.3686580276740611, + "grad_norm": 2.2153103351593018, + "learning_rate": 1e-06, + "loss": 1.1169, + "mean_token_accuracy": 0.6688650250434875, + "num_tokens": 84056142.0, + "step": 3357 + }, + { + "epoch": 0.36876784537667473, + "grad_norm": 2.1142749786376953, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7111209630966187, + "num_tokens": 84083125.0, + "step": 3358 + }, + { + "epoch": 0.3688776630792884, + "grad_norm": 2.446566581726074, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6865413188934326, + "num_tokens": 84104069.0, + "step": 3359 + }, + { + "epoch": 0.368987480781902, + "grad_norm": 2.3419950008392334, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6894301176071167, + "num_tokens": 84127380.0, + "step": 3360 + }, + { + "epoch": 0.3690972984845157, + "grad_norm": 2.531428813934326, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6976878643035889, + "num_tokens": 84147602.0, + "step": 3361 + }, + { + "epoch": 0.36920711618712937, + "grad_norm": 2.000013589859009, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6994678974151611, + "num_tokens": 84176865.0, + "step": 3362 + }, + { + "epoch": 0.369316933889743, + "grad_norm": 2.2497127056121826, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.686496913433075, + "num_tokens": 84204385.0, + "step": 3363 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 2.5639243125915527, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7045544385910034, + "num_tokens": 84224768.0, + "step": 3364 + }, + { + "epoch": 0.36953656929497036, + "grad_norm": 2.44071102142334, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6996303796768188, + "num_tokens": 84248457.0, + "step": 3365 + }, + { + "epoch": 0.369646386997584, + "grad_norm": 2.5648934841156006, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6856188774108887, + "num_tokens": 84270143.0, + "step": 3366 + }, + { + "epoch": 0.36975620470019765, + "grad_norm": 2.173496961593628, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.720761239528656, + "num_tokens": 84297193.0, + "step": 3367 + }, + { + "epoch": 0.36986602240281136, + "grad_norm": 2.082148790359497, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6941083669662476, + "num_tokens": 84328217.0, + "step": 3368 + }, + { + "epoch": 0.369975840105425, + "grad_norm": 2.325655221939087, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.6990062594413757, + "num_tokens": 84351622.0, + "step": 3369 + }, + { + "epoch": 0.37008565780803865, + "grad_norm": 2.165879726409912, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7011584639549255, + "num_tokens": 84377798.0, + "step": 3370 + }, + { + "epoch": 0.3701954755106523, + "grad_norm": 2.2501280307769775, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7149348855018616, + "num_tokens": 84405122.0, + "step": 3371 + }, + { + "epoch": 0.370305293213266, + "grad_norm": 2.2220406532287598, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7032080888748169, + "num_tokens": 84430912.0, + "step": 3372 + }, + { + "epoch": 0.37041511091587964, + "grad_norm": 2.143101453781128, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6902898550033569, + "num_tokens": 84459751.0, + "step": 3373 + }, + { + "epoch": 0.3705249286184933, + "grad_norm": 2.184753894805908, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.698209822177887, + "num_tokens": 84487395.0, + "step": 3374 + }, + { + "epoch": 0.370634746321107, + "grad_norm": 2.2135169506073, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7003975510597229, + "num_tokens": 84512807.0, + "step": 3375 + }, + { + "epoch": 0.37074456402372064, + "grad_norm": 2.4618566036224365, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.707004189491272, + "num_tokens": 84533392.0, + "step": 3376 + }, + { + "epoch": 0.3708543817263343, + "grad_norm": 2.039641857147217, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7197458744049072, + "num_tokens": 84561256.0, + "step": 3377 + }, + { + "epoch": 0.3709641994289479, + "grad_norm": 2.3723740577697754, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7248044013977051, + "num_tokens": 84581879.0, + "step": 3378 + }, + { + "epoch": 0.37107401713156163, + "grad_norm": 2.5131337642669678, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.721786618232727, + "num_tokens": 84602191.0, + "step": 3379 + }, + { + "epoch": 0.3711838348341753, + "grad_norm": 1.9841259717941284, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6940762996673584, + "num_tokens": 84632662.0, + "step": 3380 + }, + { + "epoch": 0.3712936525367889, + "grad_norm": 2.3768084049224854, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7138838171958923, + "num_tokens": 84655602.0, + "step": 3381 + }, + { + "epoch": 0.37140347023940257, + "grad_norm": 2.3350327014923096, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7076845169067383, + "num_tokens": 84679019.0, + "step": 3382 + }, + { + "epoch": 0.37151328794201627, + "grad_norm": 2.0916450023651123, + "learning_rate": 1e-06, + "loss": 0.7592, + "mean_token_accuracy": 0.7595877051353455, + "num_tokens": 84702913.0, + "step": 3383 + }, + { + "epoch": 0.3716231056446299, + "grad_norm": 2.2017767429351807, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6989507675170898, + "num_tokens": 84728579.0, + "step": 3384 + }, + { + "epoch": 0.37173292334724356, + "grad_norm": 2.4797089099884033, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.696797251701355, + "num_tokens": 84752281.0, + "step": 3385 + }, + { + "epoch": 0.37184274104985726, + "grad_norm": 1.9757494926452637, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6766464710235596, + "num_tokens": 84786331.0, + "step": 3386 + }, + { + "epoch": 0.3719525587524709, + "grad_norm": 2.49562406539917, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7091652750968933, + "num_tokens": 84810169.0, + "step": 3387 + }, + { + "epoch": 0.37206237645508455, + "grad_norm": 2.6825852394104004, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7015616297721863, + "num_tokens": 84828982.0, + "step": 3388 + }, + { + "epoch": 0.3721721941576982, + "grad_norm": 2.4828414916992188, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7030903100967407, + "num_tokens": 84849541.0, + "step": 3389 + }, + { + "epoch": 0.3722820118603119, + "grad_norm": 2.241762161254883, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7043509483337402, + "num_tokens": 84873566.0, + "step": 3390 + }, + { + "epoch": 0.37239182956292555, + "grad_norm": 2.385133743286133, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7085897922515869, + "num_tokens": 84895993.0, + "step": 3391 + }, + { + "epoch": 0.3725016472655392, + "grad_norm": 2.294837713241577, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.700234055519104, + "num_tokens": 84923430.0, + "step": 3392 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 2.3483686447143555, + "learning_rate": 1e-06, + "loss": 1.0959, + "mean_token_accuracy": 0.6783639788627625, + "num_tokens": 84946932.0, + "step": 3393 + }, + { + "epoch": 0.37272128267076654, + "grad_norm": 2.6102442741394043, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.70524001121521, + "num_tokens": 84965111.0, + "step": 3394 + }, + { + "epoch": 0.3728311003733802, + "grad_norm": 2.033639430999756, + "learning_rate": 1e-06, + "loss": 1.0937, + "mean_token_accuracy": 0.6757999658584595, + "num_tokens": 84995369.0, + "step": 3395 + }, + { + "epoch": 0.37294091807599383, + "grad_norm": 2.2210776805877686, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6966357231140137, + "num_tokens": 85020937.0, + "step": 3396 + }, + { + "epoch": 0.37305073577860753, + "grad_norm": 2.0195374488830566, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6909162402153015, + "num_tokens": 85049801.0, + "step": 3397 + }, + { + "epoch": 0.3731605534812212, + "grad_norm": 2.103555917739868, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7105348110198975, + "num_tokens": 85075717.0, + "step": 3398 + }, + { + "epoch": 0.3732703711838348, + "grad_norm": 2.23846435546875, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7069376707077026, + "num_tokens": 85100492.0, + "step": 3399 + }, + { + "epoch": 0.37338018888644847, + "grad_norm": 2.210439443588257, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.698380708694458, + "num_tokens": 85126906.0, + "step": 3400 + }, + { + "epoch": 0.3734900065890622, + "grad_norm": 2.0153582096099854, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.7011933326721191, + "num_tokens": 85158571.0, + "step": 3401 + }, + { + "epoch": 0.3735998242916758, + "grad_norm": 2.3327629566192627, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7286058664321899, + "num_tokens": 85181315.0, + "step": 3402 + }, + { + "epoch": 0.37370964199428947, + "grad_norm": 2.144954204559326, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7209638357162476, + "num_tokens": 85205680.0, + "step": 3403 + }, + { + "epoch": 0.37381945969690317, + "grad_norm": 2.2546916007995605, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6966073513031006, + "num_tokens": 85231279.0, + "step": 3404 + }, + { + "epoch": 0.3739292773995168, + "grad_norm": 2.1588380336761475, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7204234600067139, + "num_tokens": 85259255.0, + "step": 3405 + }, + { + "epoch": 0.37403909510213046, + "grad_norm": 2.469453811645508, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7101526260375977, + "num_tokens": 85280068.0, + "step": 3406 + }, + { + "epoch": 0.3741489128047441, + "grad_norm": 2.1745171546936035, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.675288200378418, + "num_tokens": 85310671.0, + "step": 3407 + }, + { + "epoch": 0.3742587305073578, + "grad_norm": 2.0249297618865967, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6998422741889954, + "num_tokens": 85341038.0, + "step": 3408 + }, + { + "epoch": 0.37436854820997145, + "grad_norm": 2.062382459640503, + "learning_rate": 1e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6749157905578613, + "num_tokens": 85373156.0, + "step": 3409 + }, + { + "epoch": 0.3744783659125851, + "grad_norm": 1.955092191696167, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6922453045845032, + "num_tokens": 85407246.0, + "step": 3410 + }, + { + "epoch": 0.37458818361519874, + "grad_norm": 2.514043092727661, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.699580729007721, + "num_tokens": 85427819.0, + "step": 3411 + }, + { + "epoch": 0.37469800131781245, + "grad_norm": 2.3674099445343018, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7047716379165649, + "num_tokens": 85451065.0, + "step": 3412 + }, + { + "epoch": 0.3748078190204261, + "grad_norm": 2.006617784500122, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7208311557769775, + "num_tokens": 85480477.0, + "step": 3413 + }, + { + "epoch": 0.37491763672303974, + "grad_norm": 2.015634775161743, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7006106376647949, + "num_tokens": 85511485.0, + "step": 3414 + }, + { + "epoch": 0.37502745442565344, + "grad_norm": 2.310359477996826, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7126803994178772, + "num_tokens": 85535162.0, + "step": 3415 + }, + { + "epoch": 0.3751372721282671, + "grad_norm": 2.7554023265838623, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.704484224319458, + "num_tokens": 85555766.0, + "step": 3416 + }, + { + "epoch": 0.37524708983088073, + "grad_norm": 2.196650743484497, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.714819073677063, + "num_tokens": 85579910.0, + "step": 3417 + }, + { + "epoch": 0.3753569075334944, + "grad_norm": 2.455547332763672, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.694233238697052, + "num_tokens": 85603996.0, + "step": 3418 + }, + { + "epoch": 0.3754667252361081, + "grad_norm": 2.4686222076416016, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6862568259239197, + "num_tokens": 85627986.0, + "step": 3419 + }, + { + "epoch": 0.3755765429387217, + "grad_norm": 2.3252711296081543, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7007930278778076, + "num_tokens": 85651254.0, + "step": 3420 + }, + { + "epoch": 0.37568636064133537, + "grad_norm": 2.194033145904541, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6879570484161377, + "num_tokens": 85676947.0, + "step": 3421 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 2.3755996227264404, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7151434421539307, + "num_tokens": 85701307.0, + "step": 3422 + }, + { + "epoch": 0.3759059960465627, + "grad_norm": 2.075467109680176, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.710776686668396, + "num_tokens": 85729862.0, + "step": 3423 + }, + { + "epoch": 0.37601581374917636, + "grad_norm": 1.938077449798584, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.703752875328064, + "num_tokens": 85759761.0, + "step": 3424 + }, + { + "epoch": 0.37612563145179, + "grad_norm": 2.000408411026001, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6933680772781372, + "num_tokens": 85790223.0, + "step": 3425 + }, + { + "epoch": 0.3762354491544037, + "grad_norm": 2.2499783039093018, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6862032413482666, + "num_tokens": 85816918.0, + "step": 3426 + }, + { + "epoch": 0.37634526685701736, + "grad_norm": 2.297868013381958, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6947777271270752, + "num_tokens": 85842669.0, + "step": 3427 + }, + { + "epoch": 0.376455084559631, + "grad_norm": 2.1659324169158936, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.7016342878341675, + "num_tokens": 85869909.0, + "step": 3428 + }, + { + "epoch": 0.37656490226224465, + "grad_norm": 2.406681776046753, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7214151620864868, + "num_tokens": 85891472.0, + "step": 3429 + }, + { + "epoch": 0.37667471996485835, + "grad_norm": 2.084409475326538, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6832225322723389, + "num_tokens": 85921514.0, + "step": 3430 + }, + { + "epoch": 0.376784537667472, + "grad_norm": 2.0178990364074707, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7124786376953125, + "num_tokens": 85951371.0, + "step": 3431 + }, + { + "epoch": 0.37689435537008564, + "grad_norm": 2.6308116912841797, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7116819620132446, + "num_tokens": 85970358.0, + "step": 3432 + }, + { + "epoch": 0.37700417307269934, + "grad_norm": 2.165142059326172, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6860784292221069, + "num_tokens": 85997416.0, + "step": 3433 + }, + { + "epoch": 0.377113990775313, + "grad_norm": 2.30973482131958, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6987895965576172, + "num_tokens": 86020551.0, + "step": 3434 + }, + { + "epoch": 0.37722380847792664, + "grad_norm": 2.4224135875701904, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7095736265182495, + "num_tokens": 86042021.0, + "step": 3435 + }, + { + "epoch": 0.3773336261805403, + "grad_norm": 2.333688497543335, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6937949061393738, + "num_tokens": 86064727.0, + "step": 3436 + }, + { + "epoch": 0.377443443883154, + "grad_norm": 2.306260824203491, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7160417437553406, + "num_tokens": 86087899.0, + "step": 3437 + }, + { + "epoch": 0.37755326158576763, + "grad_norm": 2.0310826301574707, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7082754969596863, + "num_tokens": 86118361.0, + "step": 3438 + }, + { + "epoch": 0.3776630792883813, + "grad_norm": 2.439180612564087, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.718519926071167, + "num_tokens": 86141048.0, + "step": 3439 + }, + { + "epoch": 0.3777728969909949, + "grad_norm": 1.8725663423538208, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6877067685127258, + "num_tokens": 86173511.0, + "step": 3440 + }, + { + "epoch": 0.3778827146936086, + "grad_norm": 2.077498435974121, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6946092247962952, + "num_tokens": 86201776.0, + "step": 3441 + }, + { + "epoch": 0.37799253239622227, + "grad_norm": 2.4478137493133545, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6985965371131897, + "num_tokens": 86224623.0, + "step": 3442 + }, + { + "epoch": 0.3781023500988359, + "grad_norm": 2.291006565093994, + "learning_rate": 1e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.6895729899406433, + "num_tokens": 86250506.0, + "step": 3443 + }, + { + "epoch": 0.3782121678014496, + "grad_norm": 2.3836913108825684, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6943544149398804, + "num_tokens": 86273423.0, + "step": 3444 + }, + { + "epoch": 0.37832198550406326, + "grad_norm": 2.1437485218048096, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7023847699165344, + "num_tokens": 86298777.0, + "step": 3445 + }, + { + "epoch": 0.3784318032066769, + "grad_norm": 2.1790452003479004, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6801364421844482, + "num_tokens": 86326710.0, + "step": 3446 + }, + { + "epoch": 0.37854162090929055, + "grad_norm": 2.3424415588378906, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7110670804977417, + "num_tokens": 86348782.0, + "step": 3447 + }, + { + "epoch": 0.37865143861190426, + "grad_norm": 2.268700361251831, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6970305442810059, + "num_tokens": 86374736.0, + "step": 3448 + }, + { + "epoch": 0.3787612563145179, + "grad_norm": 2.632289409637451, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7395522594451904, + "num_tokens": 86395188.0, + "step": 3449 + }, + { + "epoch": 0.37887107401713155, + "grad_norm": 2.459212064743042, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7251771688461304, + "num_tokens": 86415205.0, + "step": 3450 + }, + { + "epoch": 0.37898089171974525, + "grad_norm": 2.257789373397827, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6863130331039429, + "num_tokens": 86438666.0, + "step": 3451 + }, + { + "epoch": 0.3790907094223589, + "grad_norm": 2.0438122749328613, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7092737555503845, + "num_tokens": 86467166.0, + "step": 3452 + }, + { + "epoch": 0.37920052712497254, + "grad_norm": 1.987926959991455, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6772412061691284, + "num_tokens": 86498083.0, + "step": 3453 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 2.6945018768310547, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7102683782577515, + "num_tokens": 86518279.0, + "step": 3454 + }, + { + "epoch": 0.3794201625301999, + "grad_norm": 2.3883039951324463, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7130049467086792, + "num_tokens": 86540144.0, + "step": 3455 + }, + { + "epoch": 0.37952998023281354, + "grad_norm": 2.41518497467041, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7366747260093689, + "num_tokens": 86560974.0, + "step": 3456 + }, + { + "epoch": 0.3796397979354272, + "grad_norm": 2.5238358974456787, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7112115621566772, + "num_tokens": 86581824.0, + "step": 3457 + }, + { + "epoch": 0.3797496156380408, + "grad_norm": 2.358509063720703, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.702025294303894, + "num_tokens": 86605590.0, + "step": 3458 + }, + { + "epoch": 0.37985943334065453, + "grad_norm": 2.580157995223999, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7061592936515808, + "num_tokens": 86627688.0, + "step": 3459 + }, + { + "epoch": 0.3799692510432682, + "grad_norm": 2.514810562133789, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7178533673286438, + "num_tokens": 86649864.0, + "step": 3460 + }, + { + "epoch": 0.3800790687458818, + "grad_norm": 2.304126024246216, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6879024505615234, + "num_tokens": 86676328.0, + "step": 3461 + }, + { + "epoch": 0.3801888864484955, + "grad_norm": 1.9979215860366821, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6797142624855042, + "num_tokens": 86708501.0, + "step": 3462 + }, + { + "epoch": 0.38029870415110917, + "grad_norm": 2.3386881351470947, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7246363162994385, + "num_tokens": 86732896.0, + "step": 3463 + }, + { + "epoch": 0.3804085218537228, + "grad_norm": 2.6989762783050537, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7191345691680908, + "num_tokens": 86752305.0, + "step": 3464 + }, + { + "epoch": 0.38051833955633646, + "grad_norm": 2.2957887649536133, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7263491749763489, + "num_tokens": 86777367.0, + "step": 3465 + }, + { + "epoch": 0.38062815725895016, + "grad_norm": 2.2771763801574707, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7236793041229248, + "num_tokens": 86800414.0, + "step": 3466 + }, + { + "epoch": 0.3807379749615638, + "grad_norm": 2.1122374534606934, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6878437399864197, + "num_tokens": 86830010.0, + "step": 3467 + }, + { + "epoch": 0.38084779266417745, + "grad_norm": 2.474533796310425, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7056173086166382, + "num_tokens": 86853415.0, + "step": 3468 + }, + { + "epoch": 0.3809576103667911, + "grad_norm": 2.3255462646484375, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7257133722305298, + "num_tokens": 86875802.0, + "step": 3469 + }, + { + "epoch": 0.3810674280694048, + "grad_norm": 1.9654064178466797, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7465920448303223, + "num_tokens": 86903711.0, + "step": 3470 + }, + { + "epoch": 0.38117724577201845, + "grad_norm": 2.2386577129364014, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7137280702590942, + "num_tokens": 86927308.0, + "step": 3471 + }, + { + "epoch": 0.3812870634746321, + "grad_norm": 2.341059446334839, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6936735510826111, + "num_tokens": 86951777.0, + "step": 3472 + }, + { + "epoch": 0.3813968811772458, + "grad_norm": 2.2367196083068848, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7190853357315063, + "num_tokens": 86974766.0, + "step": 3473 + }, + { + "epoch": 0.38150669887985944, + "grad_norm": 2.4949047565460205, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.711294949054718, + "num_tokens": 86997727.0, + "step": 3474 + }, + { + "epoch": 0.3816165165824731, + "grad_norm": 2.1132376194000244, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6857038736343384, + "num_tokens": 87026798.0, + "step": 3475 + }, + { + "epoch": 0.38172633428508673, + "grad_norm": 2.1337084770202637, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.687698483467102, + "num_tokens": 87054858.0, + "step": 3476 + }, + { + "epoch": 0.38183615198770043, + "grad_norm": 2.6057322025299072, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7083183526992798, + "num_tokens": 87074123.0, + "step": 3477 + }, + { + "epoch": 0.3819459696903141, + "grad_norm": 2.230914831161499, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7230886220932007, + "num_tokens": 87099220.0, + "step": 3478 + }, + { + "epoch": 0.3820557873929277, + "grad_norm": 2.3711888790130615, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.723152756690979, + "num_tokens": 87120934.0, + "step": 3479 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 2.2161574363708496, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6894378662109375, + "num_tokens": 87146277.0, + "step": 3480 + }, + { + "epoch": 0.3822754227981551, + "grad_norm": 2.116692304611206, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7132955193519592, + "num_tokens": 87173443.0, + "step": 3481 + }, + { + "epoch": 0.3823852405007687, + "grad_norm": 2.425250291824341, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7086767554283142, + "num_tokens": 87194369.0, + "step": 3482 + }, + { + "epoch": 0.38249505820338237, + "grad_norm": 2.4123454093933105, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6796115636825562, + "num_tokens": 87219090.0, + "step": 3483 + }, + { + "epoch": 0.38260487590599607, + "grad_norm": 2.16886830329895, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7047826051712036, + "num_tokens": 87244075.0, + "step": 3484 + }, + { + "epoch": 0.3827146936086097, + "grad_norm": 2.2852330207824707, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7219371795654297, + "num_tokens": 87266954.0, + "step": 3485 + }, + { + "epoch": 0.38282451131122336, + "grad_norm": 2.049206495285034, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7004718780517578, + "num_tokens": 87295473.0, + "step": 3486 + }, + { + "epoch": 0.382934329013837, + "grad_norm": 2.071303129196167, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.715137779712677, + "num_tokens": 87323613.0, + "step": 3487 + }, + { + "epoch": 0.3830441467164507, + "grad_norm": 2.049137830734253, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6912093758583069, + "num_tokens": 87353941.0, + "step": 3488 + }, + { + "epoch": 0.38315396441906435, + "grad_norm": 2.2152795791625977, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7002813816070557, + "num_tokens": 87380495.0, + "step": 3489 + }, + { + "epoch": 0.383263782121678, + "grad_norm": 2.1761255264282227, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7040549516677856, + "num_tokens": 87405757.0, + "step": 3490 + }, + { + "epoch": 0.3833735998242917, + "grad_norm": 2.446774482727051, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7247803211212158, + "num_tokens": 87425276.0, + "step": 3491 + }, + { + "epoch": 0.38348341752690535, + "grad_norm": 2.084350347518921, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7354158759117126, + "num_tokens": 87452965.0, + "step": 3492 + }, + { + "epoch": 0.383593235229519, + "grad_norm": 2.288112163543701, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7193750739097595, + "num_tokens": 87476681.0, + "step": 3493 + }, + { + "epoch": 0.38370305293213264, + "grad_norm": 2.1764769554138184, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7091759443283081, + "num_tokens": 87503515.0, + "step": 3494 + }, + { + "epoch": 0.38381287063474634, + "grad_norm": 2.1218090057373047, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6794509887695312, + "num_tokens": 87532728.0, + "step": 3495 + }, + { + "epoch": 0.38392268833736, + "grad_norm": 2.1539602279663086, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6934172511100769, + "num_tokens": 87558253.0, + "step": 3496 + }, + { + "epoch": 0.38403250603997363, + "grad_norm": 2.354736328125, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7031154632568359, + "num_tokens": 87581050.0, + "step": 3497 + }, + { + "epoch": 0.38414232374258733, + "grad_norm": 2.3972222805023193, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7307723760604858, + "num_tokens": 87603556.0, + "step": 3498 + }, + { + "epoch": 0.384252141445201, + "grad_norm": 2.8012712001800537, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7245933413505554, + "num_tokens": 87619595.0, + "step": 3499 + }, + { + "epoch": 0.3843619591478146, + "grad_norm": 2.243084669113159, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7015296816825867, + "num_tokens": 87645510.0, + "step": 3500 + }, + { + "epoch": 0.38447177685042827, + "grad_norm": 2.0697109699249268, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.714200496673584, + "num_tokens": 87673843.0, + "step": 3501 + }, + { + "epoch": 0.38458159455304197, + "grad_norm": 2.4021198749542236, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6812270879745483, + "num_tokens": 87697151.0, + "step": 3502 + }, + { + "epoch": 0.3846914122556556, + "grad_norm": 2.142062187194824, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7185038924217224, + "num_tokens": 87722933.0, + "step": 3503 + }, + { + "epoch": 0.38480122995826926, + "grad_norm": 2.0029468536376953, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6746878027915955, + "num_tokens": 87756010.0, + "step": 3504 + }, + { + "epoch": 0.3849110476608829, + "grad_norm": 2.2833735942840576, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7206050753593445, + "num_tokens": 87779192.0, + "step": 3505 + }, + { + "epoch": 0.3850208653634966, + "grad_norm": 2.2604119777679443, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7079412341117859, + "num_tokens": 87804359.0, + "step": 3506 + }, + { + "epoch": 0.38513068306611026, + "grad_norm": 2.6289267539978027, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7199063301086426, + "num_tokens": 87825164.0, + "step": 3507 + }, + { + "epoch": 0.3852405007687239, + "grad_norm": 2.090000867843628, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7119307518005371, + "num_tokens": 87853230.0, + "step": 3508 + }, + { + "epoch": 0.3853503184713376, + "grad_norm": 2.639472246170044, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7435700297355652, + "num_tokens": 87873744.0, + "step": 3509 + }, + { + "epoch": 0.38546013617395125, + "grad_norm": 2.725520610809326, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7187784910202026, + "num_tokens": 87891256.0, + "step": 3510 + }, + { + "epoch": 0.3855699538765649, + "grad_norm": 2.562366485595703, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7011152505874634, + "num_tokens": 87911049.0, + "step": 3511 + }, + { + "epoch": 0.38567977157917854, + "grad_norm": 2.472090721130371, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7071189880371094, + "num_tokens": 87931217.0, + "step": 3512 + }, + { + "epoch": 0.38578958928179224, + "grad_norm": 2.3031203746795654, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7104699611663818, + "num_tokens": 87953316.0, + "step": 3513 + }, + { + "epoch": 0.3858994069844059, + "grad_norm": 2.192852735519409, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6960769891738892, + "num_tokens": 87979018.0, + "step": 3514 + }, + { + "epoch": 0.38600922468701954, + "grad_norm": 2.2320339679718018, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7059298753738403, + "num_tokens": 88007129.0, + "step": 3515 + }, + { + "epoch": 0.3861190423896332, + "grad_norm": 2.325409412384033, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7132893800735474, + "num_tokens": 88030443.0, + "step": 3516 + }, + { + "epoch": 0.3862288600922469, + "grad_norm": 2.664334535598755, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7148346304893494, + "num_tokens": 88049217.0, + "step": 3517 + }, + { + "epoch": 0.38633867779486053, + "grad_norm": 2.0950582027435303, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6949785947799683, + "num_tokens": 88078930.0, + "step": 3518 + }, + { + "epoch": 0.3864484954974742, + "grad_norm": 2.580703020095825, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7135980129241943, + "num_tokens": 88097455.0, + "step": 3519 + }, + { + "epoch": 0.3865583132000879, + "grad_norm": 2.4283180236816406, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7335070967674255, + "num_tokens": 88118257.0, + "step": 3520 + }, + { + "epoch": 0.3866681309027015, + "grad_norm": 1.8970168828964233, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7216023802757263, + "num_tokens": 88152350.0, + "step": 3521 + }, + { + "epoch": 0.38677794860531517, + "grad_norm": 2.4248006343841553, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6971036791801453, + "num_tokens": 88175304.0, + "step": 3522 + }, + { + "epoch": 0.3868877663079288, + "grad_norm": 2.4505155086517334, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6890934705734253, + "num_tokens": 88197592.0, + "step": 3523 + }, + { + "epoch": 0.3869975840105425, + "grad_norm": 2.0545499324798584, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7077726125717163, + "num_tokens": 88228266.0, + "step": 3524 + }, + { + "epoch": 0.38710740171315616, + "grad_norm": 2.2685911655426025, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7039619088172913, + "num_tokens": 88250315.0, + "step": 3525 + }, + { + "epoch": 0.3872172194157698, + "grad_norm": 1.9586056470870972, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7009581923484802, + "num_tokens": 88281565.0, + "step": 3526 + }, + { + "epoch": 0.3873270371183835, + "grad_norm": 2.029029130935669, + "learning_rate": 1e-06, + "loss": 1.1388, + "mean_token_accuracy": 0.6622763276100159, + "num_tokens": 88312136.0, + "step": 3527 + }, + { + "epoch": 0.38743685482099716, + "grad_norm": 2.5254476070404053, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6886007785797119, + "num_tokens": 88333267.0, + "step": 3528 + }, + { + "epoch": 0.3875466725236108, + "grad_norm": 2.5549004077911377, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.69319087266922, + "num_tokens": 88355386.0, + "step": 3529 + }, + { + "epoch": 0.38765649022622445, + "grad_norm": 2.3447437286376953, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7080103158950806, + "num_tokens": 88379759.0, + "step": 3530 + }, + { + "epoch": 0.38776630792883815, + "grad_norm": 2.2181055545806885, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7219215631484985, + "num_tokens": 88405927.0, + "step": 3531 + }, + { + "epoch": 0.3878761256314518, + "grad_norm": 2.5106801986694336, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6879677176475525, + "num_tokens": 88429217.0, + "step": 3532 + }, + { + "epoch": 0.38798594333406544, + "grad_norm": 2.0235047340393066, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6849937438964844, + "num_tokens": 88460283.0, + "step": 3533 + }, + { + "epoch": 0.3880957610366791, + "grad_norm": 2.3655471801757812, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7253575921058655, + "num_tokens": 88482411.0, + "step": 3534 + }, + { + "epoch": 0.3882055787392928, + "grad_norm": 2.486201047897339, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.725013017654419, + "num_tokens": 88502574.0, + "step": 3535 + }, + { + "epoch": 0.38831539644190644, + "grad_norm": 2.218278646469116, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7019760608673096, + "num_tokens": 88529384.0, + "step": 3536 + }, + { + "epoch": 0.3884252141445201, + "grad_norm": 2.2568705081939697, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7035583257675171, + "num_tokens": 88552479.0, + "step": 3537 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 2.4308831691741943, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7014613151550293, + "num_tokens": 88574727.0, + "step": 3538 + }, + { + "epoch": 0.38864484954974743, + "grad_norm": 2.322023868560791, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6938230991363525, + "num_tokens": 88598694.0, + "step": 3539 + }, + { + "epoch": 0.3887546672523611, + "grad_norm": 1.907755732536316, + "learning_rate": 1e-06, + "loss": 1.1025, + "mean_token_accuracy": 0.6740255355834961, + "num_tokens": 88635225.0, + "step": 3540 + }, + { + "epoch": 0.3888644849549747, + "grad_norm": 2.172530174255371, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7094935178756714, + "num_tokens": 88662080.0, + "step": 3541 + }, + { + "epoch": 0.3889743026575884, + "grad_norm": 1.8299813270568848, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6902689933776855, + "num_tokens": 88694917.0, + "step": 3542 + }, + { + "epoch": 0.38908412036020207, + "grad_norm": 2.47135329246521, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7209764719009399, + "num_tokens": 88714943.0, + "step": 3543 + }, + { + "epoch": 0.3891939380628157, + "grad_norm": 2.183260917663574, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6919776201248169, + "num_tokens": 88739688.0, + "step": 3544 + }, + { + "epoch": 0.38930375576542936, + "grad_norm": 2.0774435997009277, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6889858245849609, + "num_tokens": 88767891.0, + "step": 3545 + }, + { + "epoch": 0.38941357346804306, + "grad_norm": 2.238345146179199, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7129472494125366, + "num_tokens": 88792453.0, + "step": 3546 + }, + { + "epoch": 0.3895233911706567, + "grad_norm": 2.0817525386810303, + "learning_rate": 1e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6789898872375488, + "num_tokens": 88821337.0, + "step": 3547 + }, + { + "epoch": 0.38963320887327035, + "grad_norm": 2.275726556777954, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7225686311721802, + "num_tokens": 88844689.0, + "step": 3548 + }, + { + "epoch": 0.38974302657588406, + "grad_norm": 2.1066715717315674, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6868062615394592, + "num_tokens": 88873944.0, + "step": 3549 + }, + { + "epoch": 0.3898528442784977, + "grad_norm": 2.2050445079803467, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.7023323178291321, + "num_tokens": 88900575.0, + "step": 3550 + }, + { + "epoch": 0.38996266198111135, + "grad_norm": 1.990606427192688, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6949153542518616, + "num_tokens": 88930218.0, + "step": 3551 + }, + { + "epoch": 0.390072479683725, + "grad_norm": 2.115079164505005, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6988444924354553, + "num_tokens": 88957671.0, + "step": 3552 + }, + { + "epoch": 0.3901822973863387, + "grad_norm": 2.5573015213012695, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7210830450057983, + "num_tokens": 88976423.0, + "step": 3553 + }, + { + "epoch": 0.39029211508895234, + "grad_norm": 2.599712610244751, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7379215955734253, + "num_tokens": 88995057.0, + "step": 3554 + }, + { + "epoch": 0.390401932791566, + "grad_norm": 2.186032295227051, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.717146635055542, + "num_tokens": 89019816.0, + "step": 3555 + }, + { + "epoch": 0.3905117504941797, + "grad_norm": 2.2751624584198, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7199966907501221, + "num_tokens": 89042989.0, + "step": 3556 + }, + { + "epoch": 0.39062156819679333, + "grad_norm": 2.2847275733947754, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6947203874588013, + "num_tokens": 89066908.0, + "step": 3557 + }, + { + "epoch": 0.390731385899407, + "grad_norm": 2.346641778945923, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6997180581092834, + "num_tokens": 89089649.0, + "step": 3558 + }, + { + "epoch": 0.3908412036020206, + "grad_norm": 2.0246429443359375, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6852691173553467, + "num_tokens": 89119762.0, + "step": 3559 + }, + { + "epoch": 0.3909510213046343, + "grad_norm": 2.135200023651123, + "learning_rate": 1e-06, + "loss": 1.0717, + "mean_token_accuracy": 0.6774201393127441, + "num_tokens": 89148033.0, + "step": 3560 + }, + { + "epoch": 0.391060839007248, + "grad_norm": 2.956019639968872, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7316281795501709, + "num_tokens": 89163381.0, + "step": 3561 + }, + { + "epoch": 0.3911706567098616, + "grad_norm": 2.2907748222351074, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7034557461738586, + "num_tokens": 89186247.0, + "step": 3562 + }, + { + "epoch": 0.39128047441247527, + "grad_norm": 2.3146166801452637, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7111318707466125, + "num_tokens": 89210194.0, + "step": 3563 + }, + { + "epoch": 0.39139029211508897, + "grad_norm": 2.3295650482177734, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.724490761756897, + "num_tokens": 89233308.0, + "step": 3564 + }, + { + "epoch": 0.3915001098177026, + "grad_norm": 2.4472899436950684, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.707377552986145, + "num_tokens": 89254904.0, + "step": 3565 + }, + { + "epoch": 0.39160992752031626, + "grad_norm": 2.0956780910491943, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6919884085655212, + "num_tokens": 89283446.0, + "step": 3566 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 2.1437861919403076, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7105902433395386, + "num_tokens": 89310347.0, + "step": 3567 + }, + { + "epoch": 0.3918295629255436, + "grad_norm": 2.0303592681884766, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6870043873786926, + "num_tokens": 89342194.0, + "step": 3568 + }, + { + "epoch": 0.39193938062815725, + "grad_norm": 2.28708553314209, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7216976284980774, + "num_tokens": 89365447.0, + "step": 3569 + }, + { + "epoch": 0.3920491983307709, + "grad_norm": 2.3817198276519775, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6988998651504517, + "num_tokens": 89387188.0, + "step": 3570 + }, + { + "epoch": 0.3921590160333846, + "grad_norm": 2.6427273750305176, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7125971913337708, + "num_tokens": 89406195.0, + "step": 3571 + }, + { + "epoch": 0.39226883373599825, + "grad_norm": 2.3395047187805176, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7052634954452515, + "num_tokens": 89430204.0, + "step": 3572 + }, + { + "epoch": 0.3923786514386119, + "grad_norm": 2.5021936893463135, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7051493525505066, + "num_tokens": 89452358.0, + "step": 3573 + }, + { + "epoch": 0.3924884691412256, + "grad_norm": 2.5122790336608887, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7117361426353455, + "num_tokens": 89471624.0, + "step": 3574 + }, + { + "epoch": 0.39259828684383924, + "grad_norm": 2.345690965652466, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.702502965927124, + "num_tokens": 89495062.0, + "step": 3575 + }, + { + "epoch": 0.3927081045464529, + "grad_norm": 2.5967323780059814, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7296324968338013, + "num_tokens": 89514024.0, + "step": 3576 + }, + { + "epoch": 0.39281792224906653, + "grad_norm": 2.485549211502075, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6990818381309509, + "num_tokens": 89536174.0, + "step": 3577 + }, + { + "epoch": 0.39292773995168023, + "grad_norm": 2.287684917449951, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6953425407409668, + "num_tokens": 89563044.0, + "step": 3578 + }, + { + "epoch": 0.3930375576542939, + "grad_norm": 1.9914757013320923, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.6988518238067627, + "num_tokens": 89592238.0, + "step": 3579 + }, + { + "epoch": 0.3931473753569075, + "grad_norm": 2.092689037322998, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6965837478637695, + "num_tokens": 89621327.0, + "step": 3580 + }, + { + "epoch": 0.39325719305952117, + "grad_norm": 2.225159168243408, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.708682656288147, + "num_tokens": 89647083.0, + "step": 3581 + }, + { + "epoch": 0.39336701076213487, + "grad_norm": 2.3399085998535156, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6990151405334473, + "num_tokens": 89670892.0, + "step": 3582 + }, + { + "epoch": 0.3934768284647485, + "grad_norm": 2.245300769805908, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.701515257358551, + "num_tokens": 89694068.0, + "step": 3583 + }, + { + "epoch": 0.39358664616736216, + "grad_norm": 2.1491951942443848, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6932315826416016, + "num_tokens": 89720314.0, + "step": 3584 + }, + { + "epoch": 0.39369646386997587, + "grad_norm": 2.1706628799438477, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6819562911987305, + "num_tokens": 89747390.0, + "step": 3585 + }, + { + "epoch": 0.3938062815725895, + "grad_norm": 2.319702625274658, + "learning_rate": 1e-06, + "loss": 1.0913, + "mean_token_accuracy": 0.6723882555961609, + "num_tokens": 89772332.0, + "step": 3586 + }, + { + "epoch": 0.39391609927520316, + "grad_norm": 2.0663321018218994, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7031862735748291, + "num_tokens": 89800699.0, + "step": 3587 + }, + { + "epoch": 0.3940259169778168, + "grad_norm": 2.1600773334503174, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.6971991062164307, + "num_tokens": 89829055.0, + "step": 3588 + }, + { + "epoch": 0.3941357346804305, + "grad_norm": 2.0757155418395996, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7082184553146362, + "num_tokens": 89856878.0, + "step": 3589 + }, + { + "epoch": 0.39424555238304415, + "grad_norm": 2.03973388671875, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6910749673843384, + "num_tokens": 89883716.0, + "step": 3590 + }, + { + "epoch": 0.3943553700856578, + "grad_norm": 2.217176675796509, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7042052745819092, + "num_tokens": 89909098.0, + "step": 3591 + }, + { + "epoch": 0.39446518778827144, + "grad_norm": 2.0147204399108887, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7042882442474365, + "num_tokens": 89939790.0, + "step": 3592 + }, + { + "epoch": 0.39457500549088514, + "grad_norm": 2.272138833999634, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6924116611480713, + "num_tokens": 89962469.0, + "step": 3593 + }, + { + "epoch": 0.3946848231934988, + "grad_norm": 2.257512092590332, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6986710429191589, + "num_tokens": 89989264.0, + "step": 3594 + }, + { + "epoch": 0.39479464089611244, + "grad_norm": 2.3358821868896484, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6931182146072388, + "num_tokens": 90013931.0, + "step": 3595 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 2.031935453414917, + "learning_rate": 1e-06, + "loss": 1.1093, + "mean_token_accuracy": 0.6656721234321594, + "num_tokens": 90046881.0, + "step": 3596 + }, + { + "epoch": 0.3950142763013398, + "grad_norm": 2.060176372528076, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7003160715103149, + "num_tokens": 90074670.0, + "step": 3597 + }, + { + "epoch": 0.39512409400395343, + "grad_norm": 1.9530024528503418, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6858229637145996, + "num_tokens": 90105176.0, + "step": 3598 + }, + { + "epoch": 0.3952339117065671, + "grad_norm": 2.230128526687622, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.701020359992981, + "num_tokens": 90129861.0, + "step": 3599 + }, + { + "epoch": 0.3953437294091808, + "grad_norm": 2.19805645942688, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6802563667297363, + "num_tokens": 90157950.0, + "step": 3600 + }, + { + "epoch": 0.3954535471117944, + "grad_norm": 2.38529372215271, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7221088409423828, + "num_tokens": 90181300.0, + "step": 3601 + }, + { + "epoch": 0.39556336481440807, + "grad_norm": 2.313498020172119, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6873897314071655, + "num_tokens": 90208160.0, + "step": 3602 + }, + { + "epoch": 0.39567318251702177, + "grad_norm": 2.224088430404663, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.697360634803772, + "num_tokens": 90233908.0, + "step": 3603 + }, + { + "epoch": 0.3957830002196354, + "grad_norm": 2.402703285217285, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6900330781936646, + "num_tokens": 90257010.0, + "step": 3604 + }, + { + "epoch": 0.39589281792224906, + "grad_norm": 2.1078381538391113, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6942426562309265, + "num_tokens": 90285894.0, + "step": 3605 + }, + { + "epoch": 0.3960026356248627, + "grad_norm": 2.2546029090881348, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6934189796447754, + "num_tokens": 90310087.0, + "step": 3606 + }, + { + "epoch": 0.3961124533274764, + "grad_norm": 2.4767558574676514, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6851688027381897, + "num_tokens": 90330866.0, + "step": 3607 + }, + { + "epoch": 0.39622227103009006, + "grad_norm": 2.4124655723571777, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.6926333904266357, + "num_tokens": 90351561.0, + "step": 3608 + }, + { + "epoch": 0.3963320887327037, + "grad_norm": 2.314866304397583, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7148938179016113, + "num_tokens": 90375485.0, + "step": 3609 + }, + { + "epoch": 0.39644190643531735, + "grad_norm": 2.21140718460083, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6837534308433533, + "num_tokens": 90401048.0, + "step": 3610 + }, + { + "epoch": 0.39655172413793105, + "grad_norm": 2.518348455429077, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7047430276870728, + "num_tokens": 90420798.0, + "step": 3611 + }, + { + "epoch": 0.3966615418405447, + "grad_norm": 2.081486463546753, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7079404592514038, + "num_tokens": 90446548.0, + "step": 3612 + }, + { + "epoch": 0.39677135954315834, + "grad_norm": 2.0813801288604736, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.69523024559021, + "num_tokens": 90475510.0, + "step": 3613 + }, + { + "epoch": 0.39688117724577204, + "grad_norm": 2.2020339965820312, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.6834295988082886, + "num_tokens": 90504221.0, + "step": 3614 + }, + { + "epoch": 0.3969909949483857, + "grad_norm": 1.9632505178451538, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.683046281337738, + "num_tokens": 90537030.0, + "step": 3615 + }, + { + "epoch": 0.39710081265099934, + "grad_norm": 2.2653756141662598, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6860969066619873, + "num_tokens": 90563030.0, + "step": 3616 + }, + { + "epoch": 0.397210630353613, + "grad_norm": 2.0290064811706543, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7041440010070801, + "num_tokens": 90594044.0, + "step": 3617 + }, + { + "epoch": 0.3973204480562267, + "grad_norm": 2.2409613132476807, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6822632551193237, + "num_tokens": 90620397.0, + "step": 3618 + }, + { + "epoch": 0.39743026575884033, + "grad_norm": 2.1114633083343506, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7192097306251526, + "num_tokens": 90645756.0, + "step": 3619 + }, + { + "epoch": 0.397540083461454, + "grad_norm": 2.1085832118988037, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6908564567565918, + "num_tokens": 90674156.0, + "step": 3620 + }, + { + "epoch": 0.3976499011640676, + "grad_norm": 2.2889626026153564, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7233233451843262, + "num_tokens": 90694404.0, + "step": 3621 + }, + { + "epoch": 0.3977597188666813, + "grad_norm": 2.390692949295044, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7108330726623535, + "num_tokens": 90716128.0, + "step": 3622 + }, + { + "epoch": 0.39786953656929497, + "grad_norm": 2.283496379852295, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7097069025039673, + "num_tokens": 90739600.0, + "step": 3623 + }, + { + "epoch": 0.3979793542719086, + "grad_norm": 2.41507887840271, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7033332586288452, + "num_tokens": 90762203.0, + "step": 3624 + }, + { + "epoch": 0.3980891719745223, + "grad_norm": 2.017897129058838, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6990053653717041, + "num_tokens": 90793847.0, + "step": 3625 + }, + { + "epoch": 0.39819898967713596, + "grad_norm": 1.9637067317962646, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6894716024398804, + "num_tokens": 90825988.0, + "step": 3626 + }, + { + "epoch": 0.3983088073797496, + "grad_norm": 2.080721139907837, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7168350219726562, + "num_tokens": 90852901.0, + "step": 3627 + }, + { + "epoch": 0.39841862508236325, + "grad_norm": 2.2326180934906006, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7415605783462524, + "num_tokens": 90876120.0, + "step": 3628 + }, + { + "epoch": 0.39852844278497696, + "grad_norm": 2.163161039352417, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7033727169036865, + "num_tokens": 90901168.0, + "step": 3629 + }, + { + "epoch": 0.3986382604875906, + "grad_norm": 2.1040475368499756, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7133005857467651, + "num_tokens": 90928815.0, + "step": 3630 + }, + { + "epoch": 0.39874807819020425, + "grad_norm": 2.217111110687256, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.69974684715271, + "num_tokens": 90955312.0, + "step": 3631 + }, + { + "epoch": 0.39885789589281795, + "grad_norm": 2.1528124809265137, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6978704929351807, + "num_tokens": 90981870.0, + "step": 3632 + }, + { + "epoch": 0.3989677135954316, + "grad_norm": 2.617885112762451, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6985872983932495, + "num_tokens": 91001470.0, + "step": 3633 + }, + { + "epoch": 0.39907753129804524, + "grad_norm": 2.19445538520813, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.707462728023529, + "num_tokens": 91024565.0, + "step": 3634 + }, + { + "epoch": 0.3991873490006589, + "grad_norm": 1.9842228889465332, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6993818283081055, + "num_tokens": 91054164.0, + "step": 3635 + }, + { + "epoch": 0.3992971667032726, + "grad_norm": 2.495295524597168, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.72694993019104, + "num_tokens": 91073935.0, + "step": 3636 + }, + { + "epoch": 0.39940698440588623, + "grad_norm": 2.032057762145996, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6905230283737183, + "num_tokens": 91107406.0, + "step": 3637 + }, + { + "epoch": 0.3995168021084999, + "grad_norm": 2.1086249351501465, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6906449794769287, + "num_tokens": 91137056.0, + "step": 3638 + }, + { + "epoch": 0.3996266198111135, + "grad_norm": 2.399146318435669, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7118256092071533, + "num_tokens": 91158046.0, + "step": 3639 + }, + { + "epoch": 0.3997364375137272, + "grad_norm": 2.058009624481201, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.696689248085022, + "num_tokens": 91186881.0, + "step": 3640 + }, + { + "epoch": 0.3998462552163409, + "grad_norm": 2.0890719890594482, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7123230695724487, + "num_tokens": 91214748.0, + "step": 3641 + }, + { + "epoch": 0.3999560729189545, + "grad_norm": 2.2539687156677246, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.717808723449707, + "num_tokens": 91238811.0, + "step": 3642 + }, + { + "epoch": 0.4000658906215682, + "grad_norm": 2.193338632583618, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7307515144348145, + "num_tokens": 91262529.0, + "step": 3643 + }, + { + "epoch": 0.40017570832418187, + "grad_norm": 2.097059965133667, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6992063522338867, + "num_tokens": 91291150.0, + "step": 3644 + }, + { + "epoch": 0.4002855260267955, + "grad_norm": 1.8944435119628906, + "learning_rate": 1e-06, + "loss": 1.1297, + "mean_token_accuracy": 0.6701223254203796, + "num_tokens": 91326158.0, + "step": 3645 + }, + { + "epoch": 0.40039534372940916, + "grad_norm": 2.417634963989258, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7101256847381592, + "num_tokens": 91347590.0, + "step": 3646 + }, + { + "epoch": 0.40050516143202286, + "grad_norm": 2.255254030227661, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7110612988471985, + "num_tokens": 91371849.0, + "step": 3647 + }, + { + "epoch": 0.4006149791346365, + "grad_norm": 2.0754573345184326, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6860812902450562, + "num_tokens": 91399804.0, + "step": 3648 + }, + { + "epoch": 0.40072479683725015, + "grad_norm": 2.2053847312927246, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6842848062515259, + "num_tokens": 91425512.0, + "step": 3649 + }, + { + "epoch": 0.4008346145398638, + "grad_norm": 2.5306787490844727, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6915208101272583, + "num_tokens": 91445342.0, + "step": 3650 + }, + { + "epoch": 0.4009444322424775, + "grad_norm": 2.283648729324341, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6850943565368652, + "num_tokens": 91468811.0, + "step": 3651 + }, + { + "epoch": 0.40105424994509115, + "grad_norm": 2.0789265632629395, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7002333402633667, + "num_tokens": 91496176.0, + "step": 3652 + }, + { + "epoch": 0.4011640676477048, + "grad_norm": 2.160879611968994, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7236464023590088, + "num_tokens": 91522024.0, + "step": 3653 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 2.1799569129943848, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7028111219406128, + "num_tokens": 91551055.0, + "step": 3654 + }, + { + "epoch": 0.40138370305293214, + "grad_norm": 2.2590980529785156, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.6950137615203857, + "num_tokens": 91575165.0, + "step": 3655 + }, + { + "epoch": 0.4014935207555458, + "grad_norm": 2.2383289337158203, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7078429460525513, + "num_tokens": 91599034.0, + "step": 3656 + }, + { + "epoch": 0.40160333845815943, + "grad_norm": 2.202146053314209, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.715713381767273, + "num_tokens": 91624280.0, + "step": 3657 + }, + { + "epoch": 0.40171315616077313, + "grad_norm": 2.1100351810455322, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6840723752975464, + "num_tokens": 91652644.0, + "step": 3658 + }, + { + "epoch": 0.4018229738633868, + "grad_norm": 2.306264877319336, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7141884565353394, + "num_tokens": 91677458.0, + "step": 3659 + }, + { + "epoch": 0.4019327915660004, + "grad_norm": 2.354736566543579, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7088366746902466, + "num_tokens": 91703011.0, + "step": 3660 + }, + { + "epoch": 0.4020426092686141, + "grad_norm": 1.9396157264709473, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6988064050674438, + "num_tokens": 91734321.0, + "step": 3661 + }, + { + "epoch": 0.40215242697122777, + "grad_norm": 2.774970293045044, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7247952818870544, + "num_tokens": 91751213.0, + "step": 3662 + }, + { + "epoch": 0.4022622446738414, + "grad_norm": 2.5584492683410645, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7235798239707947, + "num_tokens": 91770269.0, + "step": 3663 + }, + { + "epoch": 0.40237206237645506, + "grad_norm": 2.4763553142547607, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.705060601234436, + "num_tokens": 91790283.0, + "step": 3664 + }, + { + "epoch": 0.40248188007906877, + "grad_norm": 2.617687702178955, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7130836248397827, + "num_tokens": 91809642.0, + "step": 3665 + }, + { + "epoch": 0.4025916977816824, + "grad_norm": 2.0343761444091797, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6929374933242798, + "num_tokens": 91842315.0, + "step": 3666 + }, + { + "epoch": 0.40270151548429606, + "grad_norm": 2.7438859939575195, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6934192776679993, + "num_tokens": 91861128.0, + "step": 3667 + }, + { + "epoch": 0.4028113331869097, + "grad_norm": 2.07293701171875, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6966837048530579, + "num_tokens": 91890810.0, + "step": 3668 + }, + { + "epoch": 0.4029211508895234, + "grad_norm": 2.1418001651763916, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6883181929588318, + "num_tokens": 91916809.0, + "step": 3669 + }, + { + "epoch": 0.40303096859213705, + "grad_norm": 1.892320156097412, + "learning_rate": 1e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.6713727712631226, + "num_tokens": 91949833.0, + "step": 3670 + }, + { + "epoch": 0.4031407862947507, + "grad_norm": 2.2372372150421143, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7221790552139282, + "num_tokens": 91973226.0, + "step": 3671 + }, + { + "epoch": 0.4032506039973644, + "grad_norm": 2.174600601196289, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6977093815803528, + "num_tokens": 92000560.0, + "step": 3672 + }, + { + "epoch": 0.40336042169997804, + "grad_norm": 2.191868543624878, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6851986646652222, + "num_tokens": 92029095.0, + "step": 3673 + }, + { + "epoch": 0.4034702394025917, + "grad_norm": 2.12856125831604, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.70890212059021, + "num_tokens": 92055402.0, + "step": 3674 + }, + { + "epoch": 0.40358005710520534, + "grad_norm": 2.49383544921875, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6954747438430786, + "num_tokens": 92076789.0, + "step": 3675 + }, + { + "epoch": 0.40368987480781904, + "grad_norm": 2.4598898887634277, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7019292712211609, + "num_tokens": 92100542.0, + "step": 3676 + }, + { + "epoch": 0.4037996925104327, + "grad_norm": 2.3192455768585205, + "learning_rate": 1e-06, + "loss": 1.1138, + "mean_token_accuracy": 0.6708842515945435, + "num_tokens": 92123987.0, + "step": 3677 + }, + { + "epoch": 0.40390951021304633, + "grad_norm": 2.4460599422454834, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7229627370834351, + "num_tokens": 92145287.0, + "step": 3678 + }, + { + "epoch": 0.40401932791566003, + "grad_norm": 1.949344277381897, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6865791082382202, + "num_tokens": 92177888.0, + "step": 3679 + }, + { + "epoch": 0.4041291456182737, + "grad_norm": 2.1306099891662598, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6848803162574768, + "num_tokens": 92207000.0, + "step": 3680 + }, + { + "epoch": 0.4042389633208873, + "grad_norm": 2.2471792697906494, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7053182721138, + "num_tokens": 92230219.0, + "step": 3681 + }, + { + "epoch": 0.40434878102350097, + "grad_norm": 2.222559690475464, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7227510213851929, + "num_tokens": 92253778.0, + "step": 3682 + }, + { + "epoch": 0.40445859872611467, + "grad_norm": 2.175894260406494, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6956795454025269, + "num_tokens": 92280028.0, + "step": 3683 + }, + { + "epoch": 0.4045684164287283, + "grad_norm": 2.3167595863342285, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7073547840118408, + "num_tokens": 92302492.0, + "step": 3684 + }, + { + "epoch": 0.40467823413134196, + "grad_norm": 1.9656418561935425, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6950777769088745, + "num_tokens": 92335100.0, + "step": 3685 + }, + { + "epoch": 0.4047880518339556, + "grad_norm": 2.4937615394592285, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6902996301651001, + "num_tokens": 92357449.0, + "step": 3686 + }, + { + "epoch": 0.4048978695365693, + "grad_norm": 2.4267427921295166, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7026916742324829, + "num_tokens": 92379606.0, + "step": 3687 + }, + { + "epoch": 0.40500768723918296, + "grad_norm": 2.1438896656036377, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6936085820198059, + "num_tokens": 92405964.0, + "step": 3688 + }, + { + "epoch": 0.4051175049417966, + "grad_norm": 2.299670696258545, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7389679551124573, + "num_tokens": 92429129.0, + "step": 3689 + }, + { + "epoch": 0.4052273226444103, + "grad_norm": 2.328805685043335, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7018681764602661, + "num_tokens": 92452110.0, + "step": 3690 + }, + { + "epoch": 0.40533714034702395, + "grad_norm": 2.261033535003662, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7218024134635925, + "num_tokens": 92476268.0, + "step": 3691 + }, + { + "epoch": 0.4054469580496376, + "grad_norm": 2.8588311672210693, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7227822542190552, + "num_tokens": 92491819.0, + "step": 3692 + }, + { + "epoch": 0.40555677575225124, + "grad_norm": 2.019029378890991, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6929227113723755, + "num_tokens": 92522030.0, + "step": 3693 + }, + { + "epoch": 0.40566659345486494, + "grad_norm": 2.4054832458496094, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7251594066619873, + "num_tokens": 92543992.0, + "step": 3694 + }, + { + "epoch": 0.4057764111574786, + "grad_norm": 2.156399965286255, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.691124439239502, + "num_tokens": 92569782.0, + "step": 3695 + }, + { + "epoch": 0.40588622886009224, + "grad_norm": 2.2393546104431152, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7093316316604614, + "num_tokens": 92593193.0, + "step": 3696 + }, + { + "epoch": 0.4059960465627059, + "grad_norm": 2.281919002532959, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6980247497558594, + "num_tokens": 92617276.0, + "step": 3697 + }, + { + "epoch": 0.4061058642653196, + "grad_norm": 2.3341031074523926, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6962101459503174, + "num_tokens": 92642108.0, + "step": 3698 + }, + { + "epoch": 0.40621568196793323, + "grad_norm": 2.4242265224456787, + "learning_rate": 1e-06, + "loss": 1.1023, + "mean_token_accuracy": 0.6790748834609985, + "num_tokens": 92667168.0, + "step": 3699 + }, + { + "epoch": 0.4063254996705469, + "grad_norm": 2.2449841499328613, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7157009840011597, + "num_tokens": 92692401.0, + "step": 3700 + }, + { + "epoch": 0.4064353173731606, + "grad_norm": 2.286834955215454, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6806609630584717, + "num_tokens": 92718388.0, + "step": 3701 + }, + { + "epoch": 0.4065451350757742, + "grad_norm": 2.298308849334717, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.727490246295929, + "num_tokens": 92741189.0, + "step": 3702 + }, + { + "epoch": 0.40665495277838787, + "grad_norm": 2.431178569793701, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.7006034851074219, + "num_tokens": 92763179.0, + "step": 3703 + }, + { + "epoch": 0.4067647704810015, + "grad_norm": 2.046475887298584, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7114042639732361, + "num_tokens": 92792275.0, + "step": 3704 + }, + { + "epoch": 0.4068745881836152, + "grad_norm": 2.515117883682251, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7022086381912231, + "num_tokens": 92813684.0, + "step": 3705 + }, + { + "epoch": 0.40698440588622886, + "grad_norm": 2.588489055633545, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7294472455978394, + "num_tokens": 92833399.0, + "step": 3706 + }, + { + "epoch": 0.4070942235888425, + "grad_norm": 2.3230085372924805, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7123717665672302, + "num_tokens": 92855274.0, + "step": 3707 + }, + { + "epoch": 0.4072040412914562, + "grad_norm": 1.967727541923523, + "learning_rate": 1e-06, + "loss": 1.0794, + "mean_token_accuracy": 0.6883715987205505, + "num_tokens": 92886554.0, + "step": 3708 + }, + { + "epoch": 0.40731385899406986, + "grad_norm": 2.386866807937622, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7001955509185791, + "num_tokens": 92910103.0, + "step": 3709 + }, + { + "epoch": 0.4074236766966835, + "grad_norm": 2.3833374977111816, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7065582871437073, + "num_tokens": 92932717.0, + "step": 3710 + }, + { + "epoch": 0.40753349439929715, + "grad_norm": 2.3692989349365234, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7065575122833252, + "num_tokens": 92956205.0, + "step": 3711 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 2.402970790863037, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7155601978302002, + "num_tokens": 92978015.0, + "step": 3712 + }, + { + "epoch": 0.4077531298045245, + "grad_norm": 2.448199510574341, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.685521125793457, + "num_tokens": 93000810.0, + "step": 3713 + }, + { + "epoch": 0.40786294750713814, + "grad_norm": 2.5080368518829346, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.709625244140625, + "num_tokens": 93020414.0, + "step": 3714 + }, + { + "epoch": 0.4079727652097518, + "grad_norm": 2.159817934036255, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7106919884681702, + "num_tokens": 93046220.0, + "step": 3715 + }, + { + "epoch": 0.4080825829123655, + "grad_norm": 2.320643663406372, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7007908225059509, + "num_tokens": 93068073.0, + "step": 3716 + }, + { + "epoch": 0.40819240061497913, + "grad_norm": 2.5005056858062744, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7230991721153259, + "num_tokens": 93087955.0, + "step": 3717 + }, + { + "epoch": 0.4083022183175928, + "grad_norm": 2.0991361141204834, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6965385675430298, + "num_tokens": 93115355.0, + "step": 3718 + }, + { + "epoch": 0.4084120360202065, + "grad_norm": 2.2380025386810303, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.69172203540802, + "num_tokens": 93140524.0, + "step": 3719 + }, + { + "epoch": 0.4085218537228201, + "grad_norm": 2.181851625442505, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6835492849349976, + "num_tokens": 93165898.0, + "step": 3720 + }, + { + "epoch": 0.4086316714254338, + "grad_norm": 2.242628574371338, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7155368328094482, + "num_tokens": 93189673.0, + "step": 3721 + }, + { + "epoch": 0.4087414891280474, + "grad_norm": 2.090924024581909, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6998549103736877, + "num_tokens": 93220096.0, + "step": 3722 + }, + { + "epoch": 0.4088513068306611, + "grad_norm": 2.2932229042053223, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6986817121505737, + "num_tokens": 93243477.0, + "step": 3723 + }, + { + "epoch": 0.40896112453327477, + "grad_norm": 2.194061279296875, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6844099164009094, + "num_tokens": 93270836.0, + "step": 3724 + }, + { + "epoch": 0.4090709422358884, + "grad_norm": 2.186633825302124, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.717730700969696, + "num_tokens": 93296473.0, + "step": 3725 + }, + { + "epoch": 0.40918075993850206, + "grad_norm": 2.2634661197662354, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6929255723953247, + "num_tokens": 93321956.0, + "step": 3726 + }, + { + "epoch": 0.40929057764111576, + "grad_norm": 2.438037157058716, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.685713529586792, + "num_tokens": 93346262.0, + "step": 3727 + }, + { + "epoch": 0.4094003953437294, + "grad_norm": 2.332102060317993, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.702422022819519, + "num_tokens": 93369235.0, + "step": 3728 + }, + { + "epoch": 0.40951021304634305, + "grad_norm": 2.1116151809692383, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7013605833053589, + "num_tokens": 93394532.0, + "step": 3729 + }, + { + "epoch": 0.40962003074895675, + "grad_norm": 2.211423635482788, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6936860084533691, + "num_tokens": 93420412.0, + "step": 3730 + }, + { + "epoch": 0.4097298484515704, + "grad_norm": 2.298074245452881, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6942095160484314, + "num_tokens": 93444797.0, + "step": 3731 + }, + { + "epoch": 0.40983966615418405, + "grad_norm": 2.3507678508758545, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7065094709396362, + "num_tokens": 93469227.0, + "step": 3732 + }, + { + "epoch": 0.4099494838567977, + "grad_norm": 2.3757927417755127, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7316365242004395, + "num_tokens": 93489928.0, + "step": 3733 + }, + { + "epoch": 0.4100593015594114, + "grad_norm": 2.5344767570495605, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7353658676147461, + "num_tokens": 93509530.0, + "step": 3734 + }, + { + "epoch": 0.41016911926202504, + "grad_norm": 2.47226881980896, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.720086932182312, + "num_tokens": 93530811.0, + "step": 3735 + }, + { + "epoch": 0.4102789369646387, + "grad_norm": 2.370267152786255, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6994115114212036, + "num_tokens": 93553479.0, + "step": 3736 + }, + { + "epoch": 0.4103887546672524, + "grad_norm": 2.4020400047302246, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7154736518859863, + "num_tokens": 93574189.0, + "step": 3737 + }, + { + "epoch": 0.41049857236986603, + "grad_norm": 2.1367430686950684, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7299461960792542, + "num_tokens": 93599668.0, + "step": 3738 + }, + { + "epoch": 0.4106083900724797, + "grad_norm": 2.0960748195648193, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.684051513671875, + "num_tokens": 93628115.0, + "step": 3739 + }, + { + "epoch": 0.4107182077750933, + "grad_norm": 2.272254228591919, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.6999441385269165, + "num_tokens": 93651880.0, + "step": 3740 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 2.4619033336639404, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6924982070922852, + "num_tokens": 93672652.0, + "step": 3741 + }, + { + "epoch": 0.41093784318032067, + "grad_norm": 2.1676106452941895, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7042467594146729, + "num_tokens": 93698375.0, + "step": 3742 + }, + { + "epoch": 0.4110476608829343, + "grad_norm": 2.111658811569214, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.686951220035553, + "num_tokens": 93729155.0, + "step": 3743 + }, + { + "epoch": 0.41115747858554796, + "grad_norm": 2.214352607727051, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6869890689849854, + "num_tokens": 93757350.0, + "step": 3744 + }, + { + "epoch": 0.41126729628816167, + "grad_norm": 2.383673906326294, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7003772258758545, + "num_tokens": 93779375.0, + "step": 3745 + }, + { + "epoch": 0.4113771139907753, + "grad_norm": 2.1641321182250977, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7062934637069702, + "num_tokens": 93804178.0, + "step": 3746 + }, + { + "epoch": 0.41148693169338896, + "grad_norm": 2.074019193649292, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6978040933609009, + "num_tokens": 93832833.0, + "step": 3747 + }, + { + "epoch": 0.41159674939600266, + "grad_norm": 2.2415771484375, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6943745613098145, + "num_tokens": 93856872.0, + "step": 3748 + }, + { + "epoch": 0.4117065670986163, + "grad_norm": 1.9974604845046997, + "learning_rate": 1e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.67271888256073, + "num_tokens": 93888081.0, + "step": 3749 + }, + { + "epoch": 0.41181638480122995, + "grad_norm": 2.3706090450286865, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6990165710449219, + "num_tokens": 93911772.0, + "step": 3750 + }, + { + "epoch": 0.4119262025038436, + "grad_norm": 2.116832971572876, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6848965287208557, + "num_tokens": 93940890.0, + "step": 3751 + }, + { + "epoch": 0.4120360202064573, + "grad_norm": 2.0914840698242188, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7094230055809021, + "num_tokens": 93968276.0, + "step": 3752 + }, + { + "epoch": 0.41214583790907094, + "grad_norm": 2.242781400680542, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7192169427871704, + "num_tokens": 93993077.0, + "step": 3753 + }, + { + "epoch": 0.4122556556116846, + "grad_norm": 2.215834617614746, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.7042784690856934, + "num_tokens": 94018361.0, + "step": 3754 + }, + { + "epoch": 0.4123654733142983, + "grad_norm": 2.297348737716675, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7057779431343079, + "num_tokens": 94042963.0, + "step": 3755 + }, + { + "epoch": 0.41247529101691194, + "grad_norm": 2.0659546852111816, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7274718284606934, + "num_tokens": 94069034.0, + "step": 3756 + }, + { + "epoch": 0.4125851087195256, + "grad_norm": 2.4929425716400146, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7105906009674072, + "num_tokens": 94091043.0, + "step": 3757 + }, + { + "epoch": 0.41269492642213923, + "grad_norm": 2.2542226314544678, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6852992177009583, + "num_tokens": 94119677.0, + "step": 3758 + }, + { + "epoch": 0.41280474412475293, + "grad_norm": 1.9862046241760254, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6859689950942993, + "num_tokens": 94151234.0, + "step": 3759 + }, + { + "epoch": 0.4129145618273666, + "grad_norm": 2.262199640274048, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7060118913650513, + "num_tokens": 94176628.0, + "step": 3760 + }, + { + "epoch": 0.4130243795299802, + "grad_norm": 2.341165542602539, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.713445782661438, + "num_tokens": 94200923.0, + "step": 3761 + }, + { + "epoch": 0.41313419723259387, + "grad_norm": 2.5390427112579346, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7007749080657959, + "num_tokens": 94226692.0, + "step": 3762 + }, + { + "epoch": 0.41324401493520757, + "grad_norm": 2.16636061668396, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6765297651290894, + "num_tokens": 94255544.0, + "step": 3763 + }, + { + "epoch": 0.4133538326378212, + "grad_norm": 2.227306365966797, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7000287771224976, + "num_tokens": 94282896.0, + "step": 3764 + }, + { + "epoch": 0.41346365034043486, + "grad_norm": 2.3159096240997314, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7204152345657349, + "num_tokens": 94306671.0, + "step": 3765 + }, + { + "epoch": 0.41357346804304856, + "grad_norm": 2.328582286834717, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6858432292938232, + "num_tokens": 94329897.0, + "step": 3766 + }, + { + "epoch": 0.4136832857456622, + "grad_norm": 2.1742327213287354, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6913491487503052, + "num_tokens": 94356273.0, + "step": 3767 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.399336814880371, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7076154947280884, + "num_tokens": 94382095.0, + "step": 3768 + }, + { + "epoch": 0.4139029211508895, + "grad_norm": 2.4141721725463867, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7014434933662415, + "num_tokens": 94403009.0, + "step": 3769 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 2.363279104232788, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6903970241546631, + "num_tokens": 94428034.0, + "step": 3770 + }, + { + "epoch": 0.41412255655611685, + "grad_norm": 2.0874569416046143, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.699826180934906, + "num_tokens": 94455570.0, + "step": 3771 + }, + { + "epoch": 0.4142323742587305, + "grad_norm": 2.0932700634002686, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7206516265869141, + "num_tokens": 94483115.0, + "step": 3772 + }, + { + "epoch": 0.41434219196134414, + "grad_norm": 2.4191997051239014, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6942278146743774, + "num_tokens": 94504634.0, + "step": 3773 + }, + { + "epoch": 0.41445200966395784, + "grad_norm": 2.0886447429656982, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7056174874305725, + "num_tokens": 94534018.0, + "step": 3774 + }, + { + "epoch": 0.4145618273665715, + "grad_norm": 2.371162176132202, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7261828184127808, + "num_tokens": 94557490.0, + "step": 3775 + }, + { + "epoch": 0.41467164506918514, + "grad_norm": 2.216330051422119, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6927017569541931, + "num_tokens": 94585769.0, + "step": 3776 + }, + { + "epoch": 0.41478146277179884, + "grad_norm": 2.439758777618408, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7237242460250854, + "num_tokens": 94606083.0, + "step": 3777 + }, + { + "epoch": 0.4148912804744125, + "grad_norm": 2.0583598613739014, + "learning_rate": 1e-06, + "loss": 1.1042, + "mean_token_accuracy": 0.6698912382125854, + "num_tokens": 94636691.0, + "step": 3778 + }, + { + "epoch": 0.41500109817702613, + "grad_norm": 2.1537935733795166, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6987050175666809, + "num_tokens": 94663681.0, + "step": 3779 + }, + { + "epoch": 0.4151109158796398, + "grad_norm": 2.6461551189422607, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6990134716033936, + "num_tokens": 94683863.0, + "step": 3780 + }, + { + "epoch": 0.4152207335822535, + "grad_norm": 2.807530641555786, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7120852470397949, + "num_tokens": 94700477.0, + "step": 3781 + }, + { + "epoch": 0.4153305512848671, + "grad_norm": 2.1202006340026855, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6990560293197632, + "num_tokens": 94732390.0, + "step": 3782 + }, + { + "epoch": 0.41544036898748077, + "grad_norm": 2.5575168132781982, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7250755429267883, + "num_tokens": 94751450.0, + "step": 3783 + }, + { + "epoch": 0.41555018669009447, + "grad_norm": 2.2397162914276123, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6887564659118652, + "num_tokens": 94776313.0, + "step": 3784 + }, + { + "epoch": 0.4156600043927081, + "grad_norm": 2.4941420555114746, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6970046758651733, + "num_tokens": 94797577.0, + "step": 3785 + }, + { + "epoch": 0.41576982209532176, + "grad_norm": 2.1284520626068115, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6903355121612549, + "num_tokens": 94823213.0, + "step": 3786 + }, + { + "epoch": 0.4158796397979354, + "grad_norm": 1.9769577980041504, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7242193818092346, + "num_tokens": 94850600.0, + "step": 3787 + }, + { + "epoch": 0.4159894575005491, + "grad_norm": 2.213136672973633, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7025586366653442, + "num_tokens": 94876310.0, + "step": 3788 + }, + { + "epoch": 0.41609927520316276, + "grad_norm": 2.39729905128479, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.686299204826355, + "num_tokens": 94900340.0, + "step": 3789 + }, + { + "epoch": 0.4162090929057764, + "grad_norm": 2.3994176387786865, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7247607707977295, + "num_tokens": 94921097.0, + "step": 3790 + }, + { + "epoch": 0.41631891060839005, + "grad_norm": 2.2146735191345215, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7223014831542969, + "num_tokens": 94942553.0, + "step": 3791 + }, + { + "epoch": 0.41642872831100375, + "grad_norm": 2.063213348388672, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6750620603561401, + "num_tokens": 94972489.0, + "step": 3792 + }, + { + "epoch": 0.4165385460136174, + "grad_norm": 2.0367283821105957, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7091206312179565, + "num_tokens": 95000702.0, + "step": 3793 + }, + { + "epoch": 0.41664836371623104, + "grad_norm": 1.8910735845565796, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7110635042190552, + "num_tokens": 95033066.0, + "step": 3794 + }, + { + "epoch": 0.41675818141884474, + "grad_norm": 2.2148399353027344, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6928730010986328, + "num_tokens": 95057947.0, + "step": 3795 + }, + { + "epoch": 0.4168679991214584, + "grad_norm": 2.0639615058898926, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7065550088882446, + "num_tokens": 95085418.0, + "step": 3796 + }, + { + "epoch": 0.41697781682407203, + "grad_norm": 2.319451093673706, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6846637725830078, + "num_tokens": 95108749.0, + "step": 3797 + }, + { + "epoch": 0.4170876345266857, + "grad_norm": 2.485067129135132, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7173572182655334, + "num_tokens": 95129045.0, + "step": 3798 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 2.7510221004486084, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7215951681137085, + "num_tokens": 95146132.0, + "step": 3799 + }, + { + "epoch": 0.417307269931913, + "grad_norm": 2.235191822052002, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7030970454216003, + "num_tokens": 95172641.0, + "step": 3800 + }, + { + "epoch": 0.4174170876345267, + "grad_norm": 2.4959335327148438, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6946960687637329, + "num_tokens": 95193434.0, + "step": 3801 + }, + { + "epoch": 0.4175269053371403, + "grad_norm": 2.1233603954315186, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7046889662742615, + "num_tokens": 95221111.0, + "step": 3802 + }, + { + "epoch": 0.417636723039754, + "grad_norm": 2.3629164695739746, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7169309854507446, + "num_tokens": 95244325.0, + "step": 3803 + }, + { + "epoch": 0.41774654074236767, + "grad_norm": 2.2940845489501953, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7451192736625671, + "num_tokens": 95267510.0, + "step": 3804 + }, + { + "epoch": 0.4178563584449813, + "grad_norm": 2.1917591094970703, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.7093887329101562, + "num_tokens": 95293620.0, + "step": 3805 + }, + { + "epoch": 0.417966176147595, + "grad_norm": 2.3707268238067627, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7058482766151428, + "num_tokens": 95316236.0, + "step": 3806 + }, + { + "epoch": 0.41807599385020866, + "grad_norm": 1.9962824583053589, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6971932649612427, + "num_tokens": 95346770.0, + "step": 3807 + }, + { + "epoch": 0.4181858115528223, + "grad_norm": 2.2075114250183105, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7320120334625244, + "num_tokens": 95370861.0, + "step": 3808 + }, + { + "epoch": 0.41829562925543595, + "grad_norm": 2.466870069503784, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6961605548858643, + "num_tokens": 95392473.0, + "step": 3809 + }, + { + "epoch": 0.41840544695804965, + "grad_norm": 2.3298542499542236, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6885932087898254, + "num_tokens": 95417207.0, + "step": 3810 + }, + { + "epoch": 0.4185152646606633, + "grad_norm": 1.904468297958374, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7331650257110596, + "num_tokens": 95450126.0, + "step": 3811 + }, + { + "epoch": 0.41862508236327695, + "grad_norm": 2.275012493133545, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.700384795665741, + "num_tokens": 95474164.0, + "step": 3812 + }, + { + "epoch": 0.41873490006589065, + "grad_norm": 2.1803362369537354, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6823199987411499, + "num_tokens": 95501359.0, + "step": 3813 + }, + { + "epoch": 0.4188447177685043, + "grad_norm": 2.579432725906372, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7068515419960022, + "num_tokens": 95521110.0, + "step": 3814 + }, + { + "epoch": 0.41895453547111794, + "grad_norm": 1.8981049060821533, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6893430352210999, + "num_tokens": 95553561.0, + "step": 3815 + }, + { + "epoch": 0.4190643531737316, + "grad_norm": 2.3644583225250244, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7016236186027527, + "num_tokens": 95574848.0, + "step": 3816 + }, + { + "epoch": 0.4191741708763453, + "grad_norm": 2.3697569370269775, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7120632529258728, + "num_tokens": 95596934.0, + "step": 3817 + }, + { + "epoch": 0.41928398857895893, + "grad_norm": 2.1333839893341064, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.722732663154602, + "num_tokens": 95624145.0, + "step": 3818 + }, + { + "epoch": 0.4193938062815726, + "grad_norm": 2.545271873474121, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6925983428955078, + "num_tokens": 95645931.0, + "step": 3819 + }, + { + "epoch": 0.4195036239841862, + "grad_norm": 2.350233554840088, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.7038207650184631, + "num_tokens": 95671324.0, + "step": 3820 + }, + { + "epoch": 0.4196134416867999, + "grad_norm": 2.254098653793335, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7253270149230957, + "num_tokens": 95695206.0, + "step": 3821 + }, + { + "epoch": 0.41972325938941357, + "grad_norm": 2.45233154296875, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7265459299087524, + "num_tokens": 95715566.0, + "step": 3822 + }, + { + "epoch": 0.4198330770920272, + "grad_norm": 2.301640033721924, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7065142393112183, + "num_tokens": 95739172.0, + "step": 3823 + }, + { + "epoch": 0.4199428947946409, + "grad_norm": 2.3839738368988037, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6863119602203369, + "num_tokens": 95762368.0, + "step": 3824 + }, + { + "epoch": 0.42005271249725457, + "grad_norm": 2.4184842109680176, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7324402332305908, + "num_tokens": 95784340.0, + "step": 3825 + }, + { + "epoch": 0.4201625301998682, + "grad_norm": 2.3599324226379395, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7293345332145691, + "num_tokens": 95806073.0, + "step": 3826 + }, + { + "epoch": 0.42027234790248186, + "grad_norm": 2.3057680130004883, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.696556031703949, + "num_tokens": 95831587.0, + "step": 3827 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 2.5869503021240234, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7028414011001587, + "num_tokens": 95851647.0, + "step": 3828 + }, + { + "epoch": 0.4204919833077092, + "grad_norm": 2.199054479598999, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6972970962524414, + "num_tokens": 95878803.0, + "step": 3829 + }, + { + "epoch": 0.42060180101032285, + "grad_norm": 2.2941389083862305, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6814645528793335, + "num_tokens": 95902609.0, + "step": 3830 + }, + { + "epoch": 0.42071161871293655, + "grad_norm": 2.4981136322021484, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7183380126953125, + "num_tokens": 95923847.0, + "step": 3831 + }, + { + "epoch": 0.4208214364155502, + "grad_norm": 2.143813371658325, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6989564299583435, + "num_tokens": 95948408.0, + "step": 3832 + }, + { + "epoch": 0.42093125411816384, + "grad_norm": 2.2764430046081543, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7086935043334961, + "num_tokens": 95973215.0, + "step": 3833 + }, + { + "epoch": 0.4210410718207775, + "grad_norm": 2.359593629837036, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.689663290977478, + "num_tokens": 95997079.0, + "step": 3834 + }, + { + "epoch": 0.4211508895233912, + "grad_norm": 2.11716628074646, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7005875110626221, + "num_tokens": 96025472.0, + "step": 3835 + }, + { + "epoch": 0.42126070722600484, + "grad_norm": 2.1699137687683105, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7140035629272461, + "num_tokens": 96051806.0, + "step": 3836 + }, + { + "epoch": 0.4213705249286185, + "grad_norm": 2.2289037704467773, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6964460611343384, + "num_tokens": 96078009.0, + "step": 3837 + }, + { + "epoch": 0.42148034263123213, + "grad_norm": 2.1435658931732178, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7154507637023926, + "num_tokens": 96105860.0, + "step": 3838 + }, + { + "epoch": 0.42159016033384583, + "grad_norm": 1.955288052558899, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7105673551559448, + "num_tokens": 96135204.0, + "step": 3839 + }, + { + "epoch": 0.4216999780364595, + "grad_norm": 2.3913958072662354, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7033517956733704, + "num_tokens": 96158801.0, + "step": 3840 + }, + { + "epoch": 0.4218097957390731, + "grad_norm": 2.3108010292053223, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7068774700164795, + "num_tokens": 96181251.0, + "step": 3841 + }, + { + "epoch": 0.4219196134416868, + "grad_norm": 1.9335814714431763, + "learning_rate": 1e-06, + "loss": 1.0753, + "mean_token_accuracy": 0.6820018291473389, + "num_tokens": 96216313.0, + "step": 3842 + }, + { + "epoch": 0.42202943114430047, + "grad_norm": 2.037773370742798, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.721315860748291, + "num_tokens": 96244867.0, + "step": 3843 + }, + { + "epoch": 0.4221392488469141, + "grad_norm": 2.189728260040283, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7010083198547363, + "num_tokens": 96269601.0, + "step": 3844 + }, + { + "epoch": 0.42224906654952776, + "grad_norm": 2.03106951713562, + "learning_rate": 1e-06, + "loss": 1.0659, + "mean_token_accuracy": 0.6813324689865112, + "num_tokens": 96299106.0, + "step": 3845 + }, + { + "epoch": 0.42235888425214146, + "grad_norm": 2.242032527923584, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7019011974334717, + "num_tokens": 96325227.0, + "step": 3846 + }, + { + "epoch": 0.4224687019547551, + "grad_norm": 2.1714587211608887, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7169487476348877, + "num_tokens": 96349612.0, + "step": 3847 + }, + { + "epoch": 0.42257851965736876, + "grad_norm": 2.1465389728546143, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7086474299430847, + "num_tokens": 96376723.0, + "step": 3848 + }, + { + "epoch": 0.4226883373599824, + "grad_norm": 2.224616050720215, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6969009041786194, + "num_tokens": 96403764.0, + "step": 3849 + }, + { + "epoch": 0.4227981550625961, + "grad_norm": 2.092831611633301, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6994572877883911, + "num_tokens": 96431480.0, + "step": 3850 + }, + { + "epoch": 0.42290797276520975, + "grad_norm": 2.3772382736206055, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6895765066146851, + "num_tokens": 96455206.0, + "step": 3851 + }, + { + "epoch": 0.4230177904678234, + "grad_norm": 2.2920029163360596, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7227899432182312, + "num_tokens": 96477417.0, + "step": 3852 + }, + { + "epoch": 0.4231276081704371, + "grad_norm": 2.4683475494384766, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7026671767234802, + "num_tokens": 96498214.0, + "step": 3853 + }, + { + "epoch": 0.42323742587305074, + "grad_norm": 2.2185168266296387, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7059969305992126, + "num_tokens": 96523699.0, + "step": 3854 + }, + { + "epoch": 0.4233472435756644, + "grad_norm": 2.2785251140594482, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7084185481071472, + "num_tokens": 96547120.0, + "step": 3855 + }, + { + "epoch": 0.42345706127827804, + "grad_norm": 2.3805806636810303, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7051745653152466, + "num_tokens": 96570307.0, + "step": 3856 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 2.4265315532684326, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.724062442779541, + "num_tokens": 96591061.0, + "step": 3857 + }, + { + "epoch": 0.4236766966835054, + "grad_norm": 2.313096761703491, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6975085735321045, + "num_tokens": 96614856.0, + "step": 3858 + }, + { + "epoch": 0.42378651438611903, + "grad_norm": 2.2781295776367188, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7186737060546875, + "num_tokens": 96639614.0, + "step": 3859 + }, + { + "epoch": 0.42389633208873273, + "grad_norm": 2.0201432704925537, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6993017196655273, + "num_tokens": 96669449.0, + "step": 3860 + }, + { + "epoch": 0.4240061497913464, + "grad_norm": 2.1170694828033447, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7035205960273743, + "num_tokens": 96695419.0, + "step": 3861 + }, + { + "epoch": 0.42411596749396, + "grad_norm": 2.089561700820923, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7106238603591919, + "num_tokens": 96725363.0, + "step": 3862 + }, + { + "epoch": 0.42422578519657367, + "grad_norm": 2.7140181064605713, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7133504152297974, + "num_tokens": 96744335.0, + "step": 3863 + }, + { + "epoch": 0.42433560289918737, + "grad_norm": 2.10010027885437, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6920717358589172, + "num_tokens": 96772566.0, + "step": 3864 + }, + { + "epoch": 0.424445420601801, + "grad_norm": 2.4024062156677246, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7038243412971497, + "num_tokens": 96794447.0, + "step": 3865 + }, + { + "epoch": 0.42455523830441466, + "grad_norm": 2.092634677886963, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6919465065002441, + "num_tokens": 96824365.0, + "step": 3866 + }, + { + "epoch": 0.4246650560070283, + "grad_norm": 2.078502655029297, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6927473545074463, + "num_tokens": 96853841.0, + "step": 3867 + }, + { + "epoch": 0.424774873709642, + "grad_norm": 2.064570665359497, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7248777747154236, + "num_tokens": 96881687.0, + "step": 3868 + }, + { + "epoch": 0.42488469141225566, + "grad_norm": 2.4866256713867188, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7370439171791077, + "num_tokens": 96901793.0, + "step": 3869 + }, + { + "epoch": 0.4249945091148693, + "grad_norm": 2.2274060249328613, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.6950072050094604, + "num_tokens": 96927041.0, + "step": 3870 + }, + { + "epoch": 0.425104326817483, + "grad_norm": 2.4721086025238037, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7176433801651001, + "num_tokens": 96949528.0, + "step": 3871 + }, + { + "epoch": 0.42521414452009665, + "grad_norm": 2.0515975952148438, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.703312337398529, + "num_tokens": 96977697.0, + "step": 3872 + }, + { + "epoch": 0.4253239622227103, + "grad_norm": 2.202761650085449, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7039517164230347, + "num_tokens": 97003602.0, + "step": 3873 + }, + { + "epoch": 0.42543377992532394, + "grad_norm": 2.455524206161499, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.703938901424408, + "num_tokens": 97024830.0, + "step": 3874 + }, + { + "epoch": 0.42554359762793764, + "grad_norm": 2.1009650230407715, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6923033595085144, + "num_tokens": 97054680.0, + "step": 3875 + }, + { + "epoch": 0.4256534153305513, + "grad_norm": 1.9352531433105469, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6931316256523132, + "num_tokens": 97089699.0, + "step": 3876 + }, + { + "epoch": 0.42576323303316493, + "grad_norm": 2.139390468597412, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.70382159948349, + "num_tokens": 97114911.0, + "step": 3877 + }, + { + "epoch": 0.4258730507357786, + "grad_norm": 2.537414073944092, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.717075526714325, + "num_tokens": 97134124.0, + "step": 3878 + }, + { + "epoch": 0.4259828684383923, + "grad_norm": 2.3240854740142822, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6867645978927612, + "num_tokens": 97158710.0, + "step": 3879 + }, + { + "epoch": 0.4260926861410059, + "grad_norm": 2.5845537185668945, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6910914182662964, + "num_tokens": 97179233.0, + "step": 3880 + }, + { + "epoch": 0.4262025038436196, + "grad_norm": 2.119049549102783, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7147142887115479, + "num_tokens": 97205753.0, + "step": 3881 + }, + { + "epoch": 0.4263123215462333, + "grad_norm": 2.1041717529296875, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6958190202713013, + "num_tokens": 97232133.0, + "step": 3882 + }, + { + "epoch": 0.4264221392488469, + "grad_norm": 2.189818859100342, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6913946866989136, + "num_tokens": 97260122.0, + "step": 3883 + }, + { + "epoch": 0.42653195695146057, + "grad_norm": 2.4676594734191895, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.732020378112793, + "num_tokens": 97281245.0, + "step": 3884 + }, + { + "epoch": 0.4266417746540742, + "grad_norm": 2.4734511375427246, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6999056339263916, + "num_tokens": 97305384.0, + "step": 3885 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 2.038208246231079, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7149642109870911, + "num_tokens": 97331860.0, + "step": 3886 + }, + { + "epoch": 0.42686141005930156, + "grad_norm": 2.2195017337799072, + "learning_rate": 1e-06, + "loss": 1.1255, + "mean_token_accuracy": 0.6690105199813843, + "num_tokens": 97357886.0, + "step": 3887 + }, + { + "epoch": 0.4269712277619152, + "grad_norm": 2.351893186569214, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.705778181552887, + "num_tokens": 97380065.0, + "step": 3888 + }, + { + "epoch": 0.4270810454645289, + "grad_norm": 2.329969644546509, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6985087394714355, + "num_tokens": 97403744.0, + "step": 3889 + }, + { + "epoch": 0.42719086316714255, + "grad_norm": 2.246490478515625, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6920127272605896, + "num_tokens": 97429615.0, + "step": 3890 + }, + { + "epoch": 0.4273006808697562, + "grad_norm": 2.424114942550659, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7092134952545166, + "num_tokens": 97450982.0, + "step": 3891 + }, + { + "epoch": 0.42741049857236985, + "grad_norm": 2.4208271503448486, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7291240692138672, + "num_tokens": 97471913.0, + "step": 3892 + }, + { + "epoch": 0.42752031627498355, + "grad_norm": 2.4487385749816895, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7164164781570435, + "num_tokens": 97494600.0, + "step": 3893 + }, + { + "epoch": 0.4276301339775972, + "grad_norm": 1.9421330690383911, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6765837669372559, + "num_tokens": 97528416.0, + "step": 3894 + }, + { + "epoch": 0.42773995168021084, + "grad_norm": 1.9884388446807861, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6975222229957581, + "num_tokens": 97560624.0, + "step": 3895 + }, + { + "epoch": 0.4278497693828245, + "grad_norm": 2.5162086486816406, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7160419821739197, + "num_tokens": 97580295.0, + "step": 3896 + }, + { + "epoch": 0.4279595870854382, + "grad_norm": 2.672009229660034, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7103426456451416, + "num_tokens": 97599770.0, + "step": 3897 + }, + { + "epoch": 0.42806940478805183, + "grad_norm": 1.9640135765075684, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6853840947151184, + "num_tokens": 97632802.0, + "step": 3898 + }, + { + "epoch": 0.4281792224906655, + "grad_norm": 2.4111874103546143, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7128486633300781, + "num_tokens": 97655020.0, + "step": 3899 + }, + { + "epoch": 0.4282890401932792, + "grad_norm": 2.061398506164551, + "learning_rate": 1e-06, + "loss": 1.116, + "mean_token_accuracy": 0.6705924272537231, + "num_tokens": 97687692.0, + "step": 3900 + }, + { + "epoch": 0.4283988578958928, + "grad_norm": 2.651224136352539, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7265781164169312, + "num_tokens": 97705221.0, + "step": 3901 + }, + { + "epoch": 0.42850867559850647, + "grad_norm": 1.9491509199142456, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7177079319953918, + "num_tokens": 97736208.0, + "step": 3902 + }, + { + "epoch": 0.4286184933011201, + "grad_norm": 2.4022889137268066, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6981133222579956, + "num_tokens": 97759391.0, + "step": 3903 + }, + { + "epoch": 0.4287283110037338, + "grad_norm": 2.1966233253479004, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7299962639808655, + "num_tokens": 97784132.0, + "step": 3904 + }, + { + "epoch": 0.42883812870634747, + "grad_norm": 2.187843084335327, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6827268600463867, + "num_tokens": 97811386.0, + "step": 3905 + }, + { + "epoch": 0.4289479464089611, + "grad_norm": 2.0652413368225098, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6835666298866272, + "num_tokens": 97840647.0, + "step": 3906 + }, + { + "epoch": 0.4290577641115748, + "grad_norm": 1.9837114810943604, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7014944553375244, + "num_tokens": 97872506.0, + "step": 3907 + }, + { + "epoch": 0.42916758181418846, + "grad_norm": 2.668060302734375, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7219827771186829, + "num_tokens": 97891898.0, + "step": 3908 + }, + { + "epoch": 0.4292773995168021, + "grad_norm": 2.167524814605713, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6817704439163208, + "num_tokens": 97918981.0, + "step": 3909 + }, + { + "epoch": 0.42938721721941575, + "grad_norm": 2.2404706478118896, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7299867272377014, + "num_tokens": 97941996.0, + "step": 3910 + }, + { + "epoch": 0.42949703492202945, + "grad_norm": 2.4622349739074707, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7209364175796509, + "num_tokens": 97964058.0, + "step": 3911 + }, + { + "epoch": 0.4296068526246431, + "grad_norm": 2.380019187927246, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7141605019569397, + "num_tokens": 97987153.0, + "step": 3912 + }, + { + "epoch": 0.42971667032725674, + "grad_norm": 2.067814588546753, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.695094108581543, + "num_tokens": 98013248.0, + "step": 3913 + }, + { + "epoch": 0.4298264880298704, + "grad_norm": 2.6684348583221436, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.708802342414856, + "num_tokens": 98031501.0, + "step": 3914 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 1.9460666179656982, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6926804780960083, + "num_tokens": 98064491.0, + "step": 3915 + }, + { + "epoch": 0.43004612343509774, + "grad_norm": 2.353917360305786, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7147570252418518, + "num_tokens": 98087025.0, + "step": 3916 + }, + { + "epoch": 0.4301559411377114, + "grad_norm": 2.0504021644592285, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7145530581474304, + "num_tokens": 98114188.0, + "step": 3917 + }, + { + "epoch": 0.4302657588403251, + "grad_norm": 2.0253803730010986, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.694855809211731, + "num_tokens": 98144990.0, + "step": 3918 + }, + { + "epoch": 0.43037557654293873, + "grad_norm": 2.679750680923462, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7024405598640442, + "num_tokens": 98163001.0, + "step": 3919 + }, + { + "epoch": 0.4304853942455524, + "grad_norm": 2.262903928756714, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6997236013412476, + "num_tokens": 98188104.0, + "step": 3920 + }, + { + "epoch": 0.430595211948166, + "grad_norm": 2.7554140090942383, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6768679022789001, + "num_tokens": 98212853.0, + "step": 3921 + }, + { + "epoch": 0.4307050296507797, + "grad_norm": 2.3525092601776123, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7080258131027222, + "num_tokens": 98238194.0, + "step": 3922 + }, + { + "epoch": 0.43081484735339337, + "grad_norm": 2.10976505279541, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6993557214736938, + "num_tokens": 98265523.0, + "step": 3923 + }, + { + "epoch": 0.430924665056007, + "grad_norm": 2.097339630126953, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6983335018157959, + "num_tokens": 98292834.0, + "step": 3924 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 2.249206066131592, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7232635021209717, + "num_tokens": 98316462.0, + "step": 3925 + }, + { + "epoch": 0.43114430046123436, + "grad_norm": 2.004007577896118, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7062592506408691, + "num_tokens": 98347550.0, + "step": 3926 + }, + { + "epoch": 0.431254118163848, + "grad_norm": 2.2892005443573, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6944749355316162, + "num_tokens": 98372509.0, + "step": 3927 + }, + { + "epoch": 0.43136393586646166, + "grad_norm": 2.603360176086426, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6916981935501099, + "num_tokens": 98393772.0, + "step": 3928 + }, + { + "epoch": 0.43147375356907536, + "grad_norm": 2.3354854583740234, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6879475116729736, + "num_tokens": 98417128.0, + "step": 3929 + }, + { + "epoch": 0.431583571271689, + "grad_norm": 2.1308600902557373, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.696469783782959, + "num_tokens": 98444865.0, + "step": 3930 + }, + { + "epoch": 0.43169338897430265, + "grad_norm": 2.7874369621276855, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7233059406280518, + "num_tokens": 98461852.0, + "step": 3931 + }, + { + "epoch": 0.4318032066769163, + "grad_norm": 2.0297062397003174, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6815973520278931, + "num_tokens": 98492867.0, + "step": 3932 + }, + { + "epoch": 0.43191302437953, + "grad_norm": 2.5009005069732666, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7033908367156982, + "num_tokens": 98512531.0, + "step": 3933 + }, + { + "epoch": 0.43202284208214364, + "grad_norm": 2.321448802947998, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.7006096839904785, + "num_tokens": 98538279.0, + "step": 3934 + }, + { + "epoch": 0.4321326597847573, + "grad_norm": 2.0945029258728027, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7241345047950745, + "num_tokens": 98564946.0, + "step": 3935 + }, + { + "epoch": 0.432242477487371, + "grad_norm": 2.630476236343384, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7136790752410889, + "num_tokens": 98582757.0, + "step": 3936 + }, + { + "epoch": 0.43235229518998464, + "grad_norm": 2.4634110927581787, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6957147121429443, + "num_tokens": 98604409.0, + "step": 3937 + }, + { + "epoch": 0.4324621128925983, + "grad_norm": 2.297264575958252, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7135888338088989, + "num_tokens": 98628128.0, + "step": 3938 + }, + { + "epoch": 0.43257193059521193, + "grad_norm": 2.382045269012451, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6969588994979858, + "num_tokens": 98649971.0, + "step": 3939 + }, + { + "epoch": 0.43268174829782563, + "grad_norm": 2.1223976612091064, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7239519953727722, + "num_tokens": 98676109.0, + "step": 3940 + }, + { + "epoch": 0.4327915660004393, + "grad_norm": 2.47163462638855, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.69707190990448, + "num_tokens": 98697230.0, + "step": 3941 + }, + { + "epoch": 0.4329013837030529, + "grad_norm": 2.445434093475342, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.707977294921875, + "num_tokens": 98716317.0, + "step": 3942 + }, + { + "epoch": 0.43301120140566657, + "grad_norm": 2.3203465938568115, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.696517288684845, + "num_tokens": 98739798.0, + "step": 3943 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 2.3021299839019775, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7067755460739136, + "num_tokens": 98762813.0, + "step": 3944 + }, + { + "epoch": 0.4332308368108939, + "grad_norm": 2.4972333908081055, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7054111361503601, + "num_tokens": 98782491.0, + "step": 3945 + }, + { + "epoch": 0.43334065451350756, + "grad_norm": 2.6939990520477295, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7186036109924316, + "num_tokens": 98800033.0, + "step": 3946 + }, + { + "epoch": 0.43345047221612126, + "grad_norm": 2.2469499111175537, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.6981694102287292, + "num_tokens": 98824185.0, + "step": 3947 + }, + { + "epoch": 0.4335602899187349, + "grad_norm": 2.2457668781280518, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7140482664108276, + "num_tokens": 98850032.0, + "step": 3948 + }, + { + "epoch": 0.43367010762134856, + "grad_norm": 2.417478561401367, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7144379019737244, + "num_tokens": 98871496.0, + "step": 3949 + }, + { + "epoch": 0.4337799253239622, + "grad_norm": 2.2777180671691895, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6909684538841248, + "num_tokens": 98895488.0, + "step": 3950 + }, + { + "epoch": 0.4338897430265759, + "grad_norm": 2.3522567749023438, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7214469313621521, + "num_tokens": 98916962.0, + "step": 3951 + }, + { + "epoch": 0.43399956072918955, + "grad_norm": 2.129356861114502, + "learning_rate": 1e-06, + "loss": 1.131, + "mean_token_accuracy": 0.66242915391922, + "num_tokens": 98948931.0, + "step": 3952 + }, + { + "epoch": 0.4341093784318032, + "grad_norm": 2.7341156005859375, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.716167688369751, + "num_tokens": 98966877.0, + "step": 3953 + }, + { + "epoch": 0.43421919613441684, + "grad_norm": 2.2414112091064453, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6931456923484802, + "num_tokens": 98993147.0, + "step": 3954 + }, + { + "epoch": 0.43432901383703054, + "grad_norm": 2.1702516078948975, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7231693267822266, + "num_tokens": 99017520.0, + "step": 3955 + }, + { + "epoch": 0.4344388315396442, + "grad_norm": 2.583380699157715, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7136089205741882, + "num_tokens": 99036584.0, + "step": 3956 + }, + { + "epoch": 0.43454864924225783, + "grad_norm": 2.2019588947296143, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6830259561538696, + "num_tokens": 99064810.0, + "step": 3957 + }, + { + "epoch": 0.43465846694487154, + "grad_norm": 2.341606616973877, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6846626996994019, + "num_tokens": 99087970.0, + "step": 3958 + }, + { + "epoch": 0.4347682846474852, + "grad_norm": 2.0422894954681396, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7079764008522034, + "num_tokens": 99116393.0, + "step": 3959 + }, + { + "epoch": 0.4348781023500988, + "grad_norm": 2.321607828140259, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7030816078186035, + "num_tokens": 99141044.0, + "step": 3960 + }, + { + "epoch": 0.4349879200527125, + "grad_norm": 2.2436392307281494, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6852637529373169, + "num_tokens": 99165389.0, + "step": 3961 + }, + { + "epoch": 0.4350977377553262, + "grad_norm": 2.5749056339263916, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7038714289665222, + "num_tokens": 99185289.0, + "step": 3962 + }, + { + "epoch": 0.4352075554579398, + "grad_norm": 2.6352200508117676, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7409119606018066, + "num_tokens": 99203024.0, + "step": 3963 + }, + { + "epoch": 0.43531737316055347, + "grad_norm": 2.191789388656616, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6957522630691528, + "num_tokens": 99228421.0, + "step": 3964 + }, + { + "epoch": 0.43542719086316717, + "grad_norm": 2.2478015422821045, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7033069133758545, + "num_tokens": 99254238.0, + "step": 3965 + }, + { + "epoch": 0.4355370085657808, + "grad_norm": 2.2167632579803467, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6871383190155029, + "num_tokens": 99281339.0, + "step": 3966 + }, + { + "epoch": 0.43564682626839446, + "grad_norm": 2.0874722003936768, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6775619983673096, + "num_tokens": 99310915.0, + "step": 3967 + }, + { + "epoch": 0.4357566439710081, + "grad_norm": 2.174008369445801, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7154182195663452, + "num_tokens": 99336894.0, + "step": 3968 + }, + { + "epoch": 0.4358664616736218, + "grad_norm": 2.2805051803588867, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6912038326263428, + "num_tokens": 99364722.0, + "step": 3969 + }, + { + "epoch": 0.43597627937623545, + "grad_norm": 2.3236074447631836, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.688690185546875, + "num_tokens": 99390370.0, + "step": 3970 + }, + { + "epoch": 0.4360860970788491, + "grad_norm": 2.738570213317871, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7240310907363892, + "num_tokens": 99407551.0, + "step": 3971 + }, + { + "epoch": 0.43619591478146275, + "grad_norm": 2.1438863277435303, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7114978432655334, + "num_tokens": 99432669.0, + "step": 3972 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 1.9254952669143677, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7010456323623657, + "num_tokens": 99464012.0, + "step": 3973 + }, + { + "epoch": 0.4364155501866901, + "grad_norm": 2.1629321575164795, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7050410509109497, + "num_tokens": 99489140.0, + "step": 3974 + }, + { + "epoch": 0.43652536788930374, + "grad_norm": 2.189326047897339, + "learning_rate": 1e-06, + "loss": 1.1348, + "mean_token_accuracy": 0.6694347858428955, + "num_tokens": 99514326.0, + "step": 3975 + }, + { + "epoch": 0.43663518559191744, + "grad_norm": 2.5188777446746826, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7179131507873535, + "num_tokens": 99534190.0, + "step": 3976 + }, + { + "epoch": 0.4367450032945311, + "grad_norm": 2.4666495323181152, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7213332653045654, + "num_tokens": 99556187.0, + "step": 3977 + }, + { + "epoch": 0.43685482099714473, + "grad_norm": 2.1108438968658447, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6905980706214905, + "num_tokens": 99585430.0, + "step": 3978 + }, + { + "epoch": 0.4369646386997584, + "grad_norm": 2.48614501953125, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7168830633163452, + "num_tokens": 99605088.0, + "step": 3979 + }, + { + "epoch": 0.4370744564023721, + "grad_norm": 2.289088487625122, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7285027503967285, + "num_tokens": 99626500.0, + "step": 3980 + }, + { + "epoch": 0.4371842741049857, + "grad_norm": 2.175626516342163, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7064229249954224, + "num_tokens": 99651480.0, + "step": 3981 + }, + { + "epoch": 0.43729409180759937, + "grad_norm": 2.353811502456665, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7205847501754761, + "num_tokens": 99674420.0, + "step": 3982 + }, + { + "epoch": 0.4374039095102131, + "grad_norm": 2.4267873764038086, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.712546169757843, + "num_tokens": 99696365.0, + "step": 3983 + }, + { + "epoch": 0.4375137272128267, + "grad_norm": 2.1172103881835938, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7056983113288879, + "num_tokens": 99721957.0, + "step": 3984 + }, + { + "epoch": 0.43762354491544037, + "grad_norm": 2.4396655559539795, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7050211429595947, + "num_tokens": 99744225.0, + "step": 3985 + }, + { + "epoch": 0.437733362618054, + "grad_norm": 2.246631622314453, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6976284980773926, + "num_tokens": 99768151.0, + "step": 3986 + }, + { + "epoch": 0.4378431803206677, + "grad_norm": 2.293074369430542, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6871367692947388, + "num_tokens": 99792078.0, + "step": 3987 + }, + { + "epoch": 0.43795299802328136, + "grad_norm": 2.437544822692871, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7015905380249023, + "num_tokens": 99814501.0, + "step": 3988 + }, + { + "epoch": 0.438062815725895, + "grad_norm": 2.258518695831299, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7257927060127258, + "num_tokens": 99839096.0, + "step": 3989 + }, + { + "epoch": 0.43817263342850865, + "grad_norm": 2.218456745147705, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6879634857177734, + "num_tokens": 99863911.0, + "step": 3990 + }, + { + "epoch": 0.43828245113112235, + "grad_norm": 2.3203539848327637, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7162269949913025, + "num_tokens": 99887641.0, + "step": 3991 + }, + { + "epoch": 0.438392268833736, + "grad_norm": 2.157688856124878, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7084213495254517, + "num_tokens": 99914550.0, + "step": 3992 + }, + { + "epoch": 0.43850208653634964, + "grad_norm": 2.228827476501465, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6976081132888794, + "num_tokens": 99940215.0, + "step": 3993 + }, + { + "epoch": 0.43861190423896335, + "grad_norm": 1.7897402048110962, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.707036018371582, + "num_tokens": 99975682.0, + "step": 3994 + }, + { + "epoch": 0.438721721941577, + "grad_norm": 2.5558483600616455, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.7039074897766113, + "num_tokens": 99996497.0, + "step": 3995 + }, + { + "epoch": 0.43883153964419064, + "grad_norm": 2.296123743057251, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7141834497451782, + "num_tokens": 100017828.0, + "step": 3996 + }, + { + "epoch": 0.4389413573468043, + "grad_norm": 2.483583688735962, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7090334892272949, + "num_tokens": 100037670.0, + "step": 3997 + }, + { + "epoch": 0.439051175049418, + "grad_norm": 2.387192726135254, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6816287636756897, + "num_tokens": 100061660.0, + "step": 3998 + }, + { + "epoch": 0.43916099275203163, + "grad_norm": 2.442185878753662, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7041144371032715, + "num_tokens": 100082983.0, + "step": 3999 + }, + { + "epoch": 0.4392708104546453, + "grad_norm": 2.238381862640381, + "learning_rate": 1e-06, + "loss": 1.1354, + "mean_token_accuracy": 0.6719470620155334, + "num_tokens": 100111765.0, + "step": 4000 + }, + { + "epoch": 0.4393806281572589, + "grad_norm": 2.2442712783813477, + "learning_rate": 1e-06, + "loss": 1.0869, + "mean_token_accuracy": 0.6744258403778076, + "num_tokens": 100138593.0, + "step": 4001 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 2.3859753608703613, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6890732645988464, + "num_tokens": 100160496.0, + "step": 4002 + }, + { + "epoch": 0.43960026356248627, + "grad_norm": 2.1947574615478516, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6887117028236389, + "num_tokens": 100189219.0, + "step": 4003 + }, + { + "epoch": 0.4397100812650999, + "grad_norm": 2.5226008892059326, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.702717125415802, + "num_tokens": 100208933.0, + "step": 4004 + }, + { + "epoch": 0.4398198989677136, + "grad_norm": 2.2114899158477783, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6909352540969849, + "num_tokens": 100234387.0, + "step": 4005 + }, + { + "epoch": 0.43992971667032726, + "grad_norm": 2.2549240589141846, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.707714855670929, + "num_tokens": 100259712.0, + "step": 4006 + }, + { + "epoch": 0.4400395343729409, + "grad_norm": 2.018106460571289, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.711892306804657, + "num_tokens": 100288766.0, + "step": 4007 + }, + { + "epoch": 0.44014935207555456, + "grad_norm": 2.7236416339874268, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7082310318946838, + "num_tokens": 100307165.0, + "step": 4008 + }, + { + "epoch": 0.44025916977816826, + "grad_norm": 2.2012643814086914, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7011716365814209, + "num_tokens": 100333375.0, + "step": 4009 + }, + { + "epoch": 0.4403689874807819, + "grad_norm": 1.9953047037124634, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6948057413101196, + "num_tokens": 100364448.0, + "step": 4010 + }, + { + "epoch": 0.44047880518339555, + "grad_norm": 2.1645567417144775, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6972002387046814, + "num_tokens": 100393093.0, + "step": 4011 + }, + { + "epoch": 0.44058862288600925, + "grad_norm": 2.165364980697632, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.698824405670166, + "num_tokens": 100417603.0, + "step": 4012 + }, + { + "epoch": 0.4406984405886229, + "grad_norm": 2.6690008640289307, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7124608755111694, + "num_tokens": 100435978.0, + "step": 4013 + }, + { + "epoch": 0.44080825829123654, + "grad_norm": 2.171454668045044, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6956872940063477, + "num_tokens": 100460749.0, + "step": 4014 + }, + { + "epoch": 0.4409180759938502, + "grad_norm": 2.0336029529571533, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6946322917938232, + "num_tokens": 100490840.0, + "step": 4015 + }, + { + "epoch": 0.4410278936964639, + "grad_norm": 2.291940450668335, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7227221131324768, + "num_tokens": 100514213.0, + "step": 4016 + }, + { + "epoch": 0.44113771139907754, + "grad_norm": 2.3399369716644287, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7475252151489258, + "num_tokens": 100535683.0, + "step": 4017 + }, + { + "epoch": 0.4412475291016912, + "grad_norm": 2.0404610633850098, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6986420154571533, + "num_tokens": 100564607.0, + "step": 4018 + }, + { + "epoch": 0.44135734680430483, + "grad_norm": 2.250808000564575, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6968648433685303, + "num_tokens": 100590569.0, + "step": 4019 + }, + { + "epoch": 0.44146716450691853, + "grad_norm": 2.1678948402404785, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7458011507987976, + "num_tokens": 100616586.0, + "step": 4020 + }, + { + "epoch": 0.4415769822095322, + "grad_norm": 2.70449161529541, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7207679748535156, + "num_tokens": 100633329.0, + "step": 4021 + }, + { + "epoch": 0.4416867999121458, + "grad_norm": 2.1543803215026855, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7005500793457031, + "num_tokens": 100658881.0, + "step": 4022 + }, + { + "epoch": 0.4417966176147595, + "grad_norm": 2.102821111679077, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6790193319320679, + "num_tokens": 100687997.0, + "step": 4023 + }, + { + "epoch": 0.44190643531737317, + "grad_norm": 2.653186321258545, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7075135707855225, + "num_tokens": 100704784.0, + "step": 4024 + }, + { + "epoch": 0.4420162530199868, + "grad_norm": 2.25938081741333, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6945498585700989, + "num_tokens": 100731265.0, + "step": 4025 + }, + { + "epoch": 0.44212607072260046, + "grad_norm": 2.35821795463562, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6819074749946594, + "num_tokens": 100755374.0, + "step": 4026 + }, + { + "epoch": 0.44223588842521416, + "grad_norm": 2.6237025260925293, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7077189683914185, + "num_tokens": 100775121.0, + "step": 4027 + }, + { + "epoch": 0.4423457061278278, + "grad_norm": 2.32072377204895, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7151541113853455, + "num_tokens": 100796859.0, + "step": 4028 + }, + { + "epoch": 0.44245552383044146, + "grad_norm": 2.3120510578155518, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.714407205581665, + "num_tokens": 100819927.0, + "step": 4029 + }, + { + "epoch": 0.4425653415330551, + "grad_norm": 2.0505337715148926, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7127729654312134, + "num_tokens": 100849933.0, + "step": 4030 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 2.1104531288146973, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6927443742752075, + "num_tokens": 100878799.0, + "step": 4031 + }, + { + "epoch": 0.44278497693828245, + "grad_norm": 2.0106611251831055, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6978640556335449, + "num_tokens": 100908134.0, + "step": 4032 + }, + { + "epoch": 0.4428947946408961, + "grad_norm": 3.0187439918518066, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7000418305397034, + "num_tokens": 100923177.0, + "step": 4033 + }, + { + "epoch": 0.4430046123435098, + "grad_norm": 2.296809196472168, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7058873176574707, + "num_tokens": 100946994.0, + "step": 4034 + }, + { + "epoch": 0.44311443004612344, + "grad_norm": 2.293765068054199, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7036636471748352, + "num_tokens": 100971259.0, + "step": 4035 + }, + { + "epoch": 0.4432242477487371, + "grad_norm": 2.3631646633148193, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.708831787109375, + "num_tokens": 100994092.0, + "step": 4036 + }, + { + "epoch": 0.44333406545135073, + "grad_norm": 2.247413396835327, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7045542597770691, + "num_tokens": 101018068.0, + "step": 4037 + }, + { + "epoch": 0.44344388315396444, + "grad_norm": 2.3199422359466553, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6952074766159058, + "num_tokens": 101042763.0, + "step": 4038 + }, + { + "epoch": 0.4435537008565781, + "grad_norm": 2.3962442874908447, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6903815865516663, + "num_tokens": 101065143.0, + "step": 4039 + }, + { + "epoch": 0.4436635185591917, + "grad_norm": 1.759886622428894, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7029867172241211, + "num_tokens": 101100764.0, + "step": 4040 + }, + { + "epoch": 0.44377333626180543, + "grad_norm": 2.3439531326293945, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7185801267623901, + "num_tokens": 101124293.0, + "step": 4041 + }, + { + "epoch": 0.4438831539644191, + "grad_norm": 2.3416874408721924, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.7003265619277954, + "num_tokens": 101147925.0, + "step": 4042 + }, + { + "epoch": 0.4439929716670327, + "grad_norm": 2.266976833343506, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6892409920692444, + "num_tokens": 101172356.0, + "step": 4043 + }, + { + "epoch": 0.44410278936964637, + "grad_norm": 2.330134391784668, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7286981344223022, + "num_tokens": 101192890.0, + "step": 4044 + }, + { + "epoch": 0.44421260707226007, + "grad_norm": 2.46478271484375, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7085646986961365, + "num_tokens": 101214943.0, + "step": 4045 + }, + { + "epoch": 0.4443224247748737, + "grad_norm": 2.441171646118164, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.693808913230896, + "num_tokens": 101237602.0, + "step": 4046 + }, + { + "epoch": 0.44443224247748736, + "grad_norm": 2.256335735321045, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6986680030822754, + "num_tokens": 101264505.0, + "step": 4047 + }, + { + "epoch": 0.444542060180101, + "grad_norm": 2.318187713623047, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7044280767440796, + "num_tokens": 101286329.0, + "step": 4048 + }, + { + "epoch": 0.4446518778827147, + "grad_norm": 2.1304428577423096, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.7079170942306519, + "num_tokens": 101314369.0, + "step": 4049 + }, + { + "epoch": 0.44476169558532835, + "grad_norm": 2.275270700454712, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6992446184158325, + "num_tokens": 101338940.0, + "step": 4050 + }, + { + "epoch": 0.444871513287942, + "grad_norm": 2.4082653522491455, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.686202883720398, + "num_tokens": 101361399.0, + "step": 4051 + }, + { + "epoch": 0.4449813309905557, + "grad_norm": 2.279590606689453, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7039130926132202, + "num_tokens": 101385725.0, + "step": 4052 + }, + { + "epoch": 0.44509114869316935, + "grad_norm": 2.082568883895874, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6963415145874023, + "num_tokens": 101414592.0, + "step": 4053 + }, + { + "epoch": 0.445200966395783, + "grad_norm": 2.174365520477295, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.722159206867218, + "num_tokens": 101441411.0, + "step": 4054 + }, + { + "epoch": 0.44531078409839664, + "grad_norm": 2.342820644378662, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7065577507019043, + "num_tokens": 101465979.0, + "step": 4055 + }, + { + "epoch": 0.44542060180101034, + "grad_norm": 2.11006236076355, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.714489221572876, + "num_tokens": 101493532.0, + "step": 4056 + }, + { + "epoch": 0.445530419503624, + "grad_norm": 2.250823497772217, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6878401041030884, + "num_tokens": 101517940.0, + "step": 4057 + }, + { + "epoch": 0.44564023720623763, + "grad_norm": 2.499225378036499, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7212990522384644, + "num_tokens": 101538170.0, + "step": 4058 + }, + { + "epoch": 0.44575005490885133, + "grad_norm": 2.5265090465545654, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6835970878601074, + "num_tokens": 101560712.0, + "step": 4059 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 2.260423183441162, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6890071630477905, + "num_tokens": 101587495.0, + "step": 4060 + }, + { + "epoch": 0.4459696903140786, + "grad_norm": 2.2881357669830322, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7024962902069092, + "num_tokens": 101611903.0, + "step": 4061 + }, + { + "epoch": 0.44607950801669227, + "grad_norm": 2.209627389907837, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7082401514053345, + "num_tokens": 101637889.0, + "step": 4062 + }, + { + "epoch": 0.446189325719306, + "grad_norm": 2.4256491661071777, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7131609320640564, + "num_tokens": 101659469.0, + "step": 4063 + }, + { + "epoch": 0.4462991434219196, + "grad_norm": 2.4980628490448, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7032583951950073, + "num_tokens": 101682566.0, + "step": 4064 + }, + { + "epoch": 0.44640896112453327, + "grad_norm": 2.0968754291534424, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7060398459434509, + "num_tokens": 101710128.0, + "step": 4065 + }, + { + "epoch": 0.4465187788271469, + "grad_norm": 2.251697540283203, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6921659708023071, + "num_tokens": 101734767.0, + "step": 4066 + }, + { + "epoch": 0.4466285965297606, + "grad_norm": 2.089777708053589, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6833744049072266, + "num_tokens": 101763490.0, + "step": 4067 + }, + { + "epoch": 0.44673841423237426, + "grad_norm": 2.4624061584472656, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7060810923576355, + "num_tokens": 101786066.0, + "step": 4068 + }, + { + "epoch": 0.4468482319349879, + "grad_norm": 2.8479535579681396, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7390632629394531, + "num_tokens": 101803553.0, + "step": 4069 + }, + { + "epoch": 0.4469580496376016, + "grad_norm": 2.184272527694702, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7024768590927124, + "num_tokens": 101832448.0, + "step": 4070 + }, + { + "epoch": 0.44706786734021525, + "grad_norm": 2.368542194366455, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.6802444458007812, + "num_tokens": 101857311.0, + "step": 4071 + }, + { + "epoch": 0.4471776850428289, + "grad_norm": 2.3116953372955322, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7057065367698669, + "num_tokens": 101879908.0, + "step": 4072 + }, + { + "epoch": 0.44728750274544254, + "grad_norm": 2.3170595169067383, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7033897042274475, + "num_tokens": 101903419.0, + "step": 4073 + }, + { + "epoch": 0.44739732044805625, + "grad_norm": 2.458385705947876, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7131751179695129, + "num_tokens": 101926469.0, + "step": 4074 + }, + { + "epoch": 0.4475071381506699, + "grad_norm": 2.4105007648468018, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6742451786994934, + "num_tokens": 101951000.0, + "step": 4075 + }, + { + "epoch": 0.44761695585328354, + "grad_norm": 2.372633218765259, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7168662548065186, + "num_tokens": 101974300.0, + "step": 4076 + }, + { + "epoch": 0.4477267735558972, + "grad_norm": 2.545743942260742, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7172716856002808, + "num_tokens": 101994961.0, + "step": 4077 + }, + { + "epoch": 0.4478365912585109, + "grad_norm": 2.2940752506256104, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7055558562278748, + "num_tokens": 102020163.0, + "step": 4078 + }, + { + "epoch": 0.44794640896112453, + "grad_norm": 2.111581563949585, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.685390830039978, + "num_tokens": 102050736.0, + "step": 4079 + }, + { + "epoch": 0.4480562266637382, + "grad_norm": 2.351212978363037, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7008464336395264, + "num_tokens": 102075293.0, + "step": 4080 + }, + { + "epoch": 0.4481660443663519, + "grad_norm": 2.4733173847198486, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7109276056289673, + "num_tokens": 102095923.0, + "step": 4081 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 2.2093825340270996, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6904677748680115, + "num_tokens": 102120527.0, + "step": 4082 + }, + { + "epoch": 0.44838567977157917, + "grad_norm": 2.4734888076782227, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7300369739532471, + "num_tokens": 102141355.0, + "step": 4083 + }, + { + "epoch": 0.4484954974741928, + "grad_norm": 2.1851119995117188, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7092932462692261, + "num_tokens": 102166625.0, + "step": 4084 + }, + { + "epoch": 0.4486053151768065, + "grad_norm": 2.238363742828369, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6929324865341187, + "num_tokens": 102191863.0, + "step": 4085 + }, + { + "epoch": 0.44871513287942016, + "grad_norm": 2.1711771488189697, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.6981592178344727, + "num_tokens": 102218062.0, + "step": 4086 + }, + { + "epoch": 0.4488249505820338, + "grad_norm": 2.2795701026916504, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.715427577495575, + "num_tokens": 102242965.0, + "step": 4087 + }, + { + "epoch": 0.4489347682846475, + "grad_norm": 2.2467539310455322, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7038904428482056, + "num_tokens": 102265841.0, + "step": 4088 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 2.704972505569458, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7315441370010376, + "num_tokens": 102282828.0, + "step": 4089 + }, + { + "epoch": 0.4491544036898748, + "grad_norm": 2.4862711429595947, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6995108723640442, + "num_tokens": 102304456.0, + "step": 4090 + }, + { + "epoch": 0.44926422139248845, + "grad_norm": 2.1638295650482178, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6834621429443359, + "num_tokens": 102333067.0, + "step": 4091 + }, + { + "epoch": 0.44937403909510215, + "grad_norm": 1.8106170892715454, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6996224522590637, + "num_tokens": 102370582.0, + "step": 4092 + }, + { + "epoch": 0.4494838567977158, + "grad_norm": 2.176917314529419, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7178454399108887, + "num_tokens": 102398281.0, + "step": 4093 + }, + { + "epoch": 0.44959367450032944, + "grad_norm": 2.2987003326416016, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7077939510345459, + "num_tokens": 102422385.0, + "step": 4094 + }, + { + "epoch": 0.4497034922029431, + "grad_norm": 2.189927101135254, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7145673036575317, + "num_tokens": 102449382.0, + "step": 4095 + }, + { + "epoch": 0.4498133099055568, + "grad_norm": 2.188797950744629, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7325891256332397, + "num_tokens": 102473874.0, + "step": 4096 + }, + { + "epoch": 0.44992312760817044, + "grad_norm": 2.2507667541503906, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7072624564170837, + "num_tokens": 102499922.0, + "step": 4097 + }, + { + "epoch": 0.4500329453107841, + "grad_norm": 2.14312744140625, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.721859335899353, + "num_tokens": 102523637.0, + "step": 4098 + }, + { + "epoch": 0.4501427630133978, + "grad_norm": 2.000164747238159, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6963036060333252, + "num_tokens": 102553370.0, + "step": 4099 + }, + { + "epoch": 0.45025258071601143, + "grad_norm": 2.2967631816864014, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7113827466964722, + "num_tokens": 102578768.0, + "step": 4100 + }, + { + "epoch": 0.4503623984186251, + "grad_norm": 2.101577043533325, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7112962603569031, + "num_tokens": 102607112.0, + "step": 4101 + }, + { + "epoch": 0.4504722161212387, + "grad_norm": 2.0758767127990723, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7148982286453247, + "num_tokens": 102631606.0, + "step": 4102 + }, + { + "epoch": 0.4505820338238524, + "grad_norm": 2.0920886993408203, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6935678720474243, + "num_tokens": 102659921.0, + "step": 4103 + }, + { + "epoch": 0.45069185152646607, + "grad_norm": 2.0988359451293945, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.68877112865448, + "num_tokens": 102687509.0, + "step": 4104 + }, + { + "epoch": 0.4508016692290797, + "grad_norm": 2.457460880279541, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6940294504165649, + "num_tokens": 102708866.0, + "step": 4105 + }, + { + "epoch": 0.45091148693169336, + "grad_norm": 2.3423397541046143, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7119516134262085, + "num_tokens": 102729874.0, + "step": 4106 + }, + { + "epoch": 0.45102130463430706, + "grad_norm": 2.204308271408081, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7048271298408508, + "num_tokens": 102754346.0, + "step": 4107 + }, + { + "epoch": 0.4511311223369207, + "grad_norm": 2.382211208343506, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7216477990150452, + "num_tokens": 102777480.0, + "step": 4108 + }, + { + "epoch": 0.45124094003953436, + "grad_norm": 2.227440595626831, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7323030233383179, + "num_tokens": 102802122.0, + "step": 4109 + }, + { + "epoch": 0.45135075774214806, + "grad_norm": 2.369246482849121, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7066227197647095, + "num_tokens": 102824132.0, + "step": 4110 + }, + { + "epoch": 0.4514605754447617, + "grad_norm": 2.230922222137451, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6950087547302246, + "num_tokens": 102850975.0, + "step": 4111 + }, + { + "epoch": 0.45157039314737535, + "grad_norm": 2.1086835861206055, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7010153532028198, + "num_tokens": 102878943.0, + "step": 4112 + }, + { + "epoch": 0.451680210849989, + "grad_norm": 2.5565896034240723, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7065204977989197, + "num_tokens": 102899569.0, + "step": 4113 + }, + { + "epoch": 0.4517900285526027, + "grad_norm": 2.0009593963623047, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6870720386505127, + "num_tokens": 102932437.0, + "step": 4114 + }, + { + "epoch": 0.45189984625521634, + "grad_norm": 1.9830527305603027, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6804159879684448, + "num_tokens": 102967348.0, + "step": 4115 + }, + { + "epoch": 0.45200966395783, + "grad_norm": 2.314051628112793, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6982573866844177, + "num_tokens": 102990617.0, + "step": 4116 + }, + { + "epoch": 0.4521194816604437, + "grad_norm": 2.1105523109436035, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6971225142478943, + "num_tokens": 103018289.0, + "step": 4117 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 2.2531986236572266, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7070067524909973, + "num_tokens": 103042172.0, + "step": 4118 + }, + { + "epoch": 0.452339117065671, + "grad_norm": 2.1103060245513916, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6846551299095154, + "num_tokens": 103072201.0, + "step": 4119 + }, + { + "epoch": 0.4524489347682846, + "grad_norm": 2.201052665710449, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6932558417320251, + "num_tokens": 103098632.0, + "step": 4120 + }, + { + "epoch": 0.45255875247089833, + "grad_norm": 2.1314122676849365, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7283574342727661, + "num_tokens": 103122414.0, + "step": 4121 + }, + { + "epoch": 0.452668570173512, + "grad_norm": 2.060401201248169, + "learning_rate": 1e-06, + "loss": 1.1596, + "mean_token_accuracy": 0.650100827217102, + "num_tokens": 103155475.0, + "step": 4122 + }, + { + "epoch": 0.4527783878761256, + "grad_norm": 2.262892484664917, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7079534530639648, + "num_tokens": 103183237.0, + "step": 4123 + }, + { + "epoch": 0.45288820557873927, + "grad_norm": 2.649205207824707, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7166421413421631, + "num_tokens": 103200860.0, + "step": 4124 + }, + { + "epoch": 0.45299802328135297, + "grad_norm": 2.059213638305664, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7045546770095825, + "num_tokens": 103228782.0, + "step": 4125 + }, + { + "epoch": 0.4531078409839666, + "grad_norm": 2.3376708030700684, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7089527249336243, + "num_tokens": 103252314.0, + "step": 4126 + }, + { + "epoch": 0.45321765868658026, + "grad_norm": 2.432846784591675, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7066327929496765, + "num_tokens": 103274225.0, + "step": 4127 + }, + { + "epoch": 0.45332747638919396, + "grad_norm": 2.2386810779571533, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7049490213394165, + "num_tokens": 103299086.0, + "step": 4128 + }, + { + "epoch": 0.4534372940918076, + "grad_norm": 2.2611963748931885, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6841819882392883, + "num_tokens": 103323598.0, + "step": 4129 + }, + { + "epoch": 0.45354711179442125, + "grad_norm": 2.3693270683288574, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7179181575775146, + "num_tokens": 103346032.0, + "step": 4130 + }, + { + "epoch": 0.4536569294970349, + "grad_norm": 2.2550625801086426, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6963610649108887, + "num_tokens": 103369587.0, + "step": 4131 + }, + { + "epoch": 0.4537667471996486, + "grad_norm": 2.207460403442383, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7036285400390625, + "num_tokens": 103396052.0, + "step": 4132 + }, + { + "epoch": 0.45387656490226225, + "grad_norm": 2.65010929107666, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7009311318397522, + "num_tokens": 103414848.0, + "step": 4133 + }, + { + "epoch": 0.4539863826048759, + "grad_norm": 2.0648632049560547, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6872720718383789, + "num_tokens": 103442884.0, + "step": 4134 + }, + { + "epoch": 0.4540962003074896, + "grad_norm": 2.21406626701355, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7077361941337585, + "num_tokens": 103468873.0, + "step": 4135 + }, + { + "epoch": 0.45420601801010324, + "grad_norm": 2.319653034210205, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7100728154182434, + "num_tokens": 103491665.0, + "step": 4136 + }, + { + "epoch": 0.4543158357127169, + "grad_norm": 2.4820261001586914, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6923162937164307, + "num_tokens": 103513309.0, + "step": 4137 + }, + { + "epoch": 0.45442565341533053, + "grad_norm": 2.4165217876434326, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7246151566505432, + "num_tokens": 103532474.0, + "step": 4138 + }, + { + "epoch": 0.45453547111794423, + "grad_norm": 2.4690029621124268, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7120038270950317, + "num_tokens": 103553693.0, + "step": 4139 + }, + { + "epoch": 0.4546452888205579, + "grad_norm": 2.2868170738220215, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6921713948249817, + "num_tokens": 103580401.0, + "step": 4140 + }, + { + "epoch": 0.4547551065231715, + "grad_norm": 2.012352228164673, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6906620264053345, + "num_tokens": 103609270.0, + "step": 4141 + }, + { + "epoch": 0.45486492422578517, + "grad_norm": 2.2302799224853516, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6962939500808716, + "num_tokens": 103632697.0, + "step": 4142 + }, + { + "epoch": 0.4549747419283989, + "grad_norm": 2.254483699798584, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7266557216644287, + "num_tokens": 103655491.0, + "step": 4143 + }, + { + "epoch": 0.4550845596310125, + "grad_norm": 2.580673933029175, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6941289901733398, + "num_tokens": 103675273.0, + "step": 4144 + }, + { + "epoch": 0.45519437733362617, + "grad_norm": 2.1385791301727295, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7076283693313599, + "num_tokens": 103701300.0, + "step": 4145 + }, + { + "epoch": 0.45530419503623987, + "grad_norm": 2.492213249206543, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7129011750221252, + "num_tokens": 103721218.0, + "step": 4146 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 2.0184624195098877, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6932480335235596, + "num_tokens": 103752175.0, + "step": 4147 + }, + { + "epoch": 0.45552383044146716, + "grad_norm": 2.1865391731262207, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7181878685951233, + "num_tokens": 103778198.0, + "step": 4148 + }, + { + "epoch": 0.4556336481440808, + "grad_norm": 2.517338752746582, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7130516767501831, + "num_tokens": 103797340.0, + "step": 4149 + }, + { + "epoch": 0.4557434658466945, + "grad_norm": 2.1058554649353027, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7057209014892578, + "num_tokens": 103824675.0, + "step": 4150 + }, + { + "epoch": 0.45585328354930815, + "grad_norm": 2.4156455993652344, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.715537428855896, + "num_tokens": 103845941.0, + "step": 4151 + }, + { + "epoch": 0.4559631012519218, + "grad_norm": 2.2380216121673584, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.702019453048706, + "num_tokens": 103870936.0, + "step": 4152 + }, + { + "epoch": 0.45607291895453544, + "grad_norm": 2.2928714752197266, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.722434401512146, + "num_tokens": 103892688.0, + "step": 4153 + }, + { + "epoch": 0.45618273665714915, + "grad_norm": 2.267444133758545, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7101320028305054, + "num_tokens": 103916691.0, + "step": 4154 + }, + { + "epoch": 0.4562925543597628, + "grad_norm": 2.058138608932495, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7157500386238098, + "num_tokens": 103944782.0, + "step": 4155 + }, + { + "epoch": 0.45640237206237644, + "grad_norm": 2.0332682132720947, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7181467413902283, + "num_tokens": 103974285.0, + "step": 4156 + }, + { + "epoch": 0.45651218976499014, + "grad_norm": 2.3474059104919434, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7350219488143921, + "num_tokens": 103995065.0, + "step": 4157 + }, + { + "epoch": 0.4566220074676038, + "grad_norm": 2.1687419414520264, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7188907861709595, + "num_tokens": 104020040.0, + "step": 4158 + }, + { + "epoch": 0.45673182517021743, + "grad_norm": 2.4479875564575195, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.71108078956604, + "num_tokens": 104039249.0, + "step": 4159 + }, + { + "epoch": 0.4568416428728311, + "grad_norm": 2.1687369346618652, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7072385549545288, + "num_tokens": 104065433.0, + "step": 4160 + }, + { + "epoch": 0.4569514605754448, + "grad_norm": 1.765107274055481, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7243125438690186, + "num_tokens": 104100369.0, + "step": 4161 + }, + { + "epoch": 0.4570612782780584, + "grad_norm": 2.0988922119140625, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6986823081970215, + "num_tokens": 104126554.0, + "step": 4162 + }, + { + "epoch": 0.45717109598067207, + "grad_norm": 2.010253667831421, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7144047021865845, + "num_tokens": 104153588.0, + "step": 4163 + }, + { + "epoch": 0.4572809136832858, + "grad_norm": 2.370849132537842, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.726902186870575, + "num_tokens": 104175733.0, + "step": 4164 + }, + { + "epoch": 0.4573907313858994, + "grad_norm": 2.44429612159729, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7064497470855713, + "num_tokens": 104196517.0, + "step": 4165 + }, + { + "epoch": 0.45750054908851306, + "grad_norm": 2.015892267227173, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6800726652145386, + "num_tokens": 104227042.0, + "step": 4166 + }, + { + "epoch": 0.4576103667911267, + "grad_norm": 2.140481948852539, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.7061319947242737, + "num_tokens": 104252634.0, + "step": 4167 + }, + { + "epoch": 0.4577201844937404, + "grad_norm": 2.0169811248779297, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7022898197174072, + "num_tokens": 104282376.0, + "step": 4168 + }, + { + "epoch": 0.45783000219635406, + "grad_norm": 2.293795108795166, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6988193988800049, + "num_tokens": 104306320.0, + "step": 4169 + }, + { + "epoch": 0.4579398198989677, + "grad_norm": 2.4991044998168945, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7391438484191895, + "num_tokens": 104325747.0, + "step": 4170 + }, + { + "epoch": 0.45804963760158135, + "grad_norm": 2.465127944946289, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7215794324874878, + "num_tokens": 104345224.0, + "step": 4171 + }, + { + "epoch": 0.45815945530419505, + "grad_norm": 2.3622994422912598, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.718477189540863, + "num_tokens": 104366243.0, + "step": 4172 + }, + { + "epoch": 0.4582692730068087, + "grad_norm": 2.279252052307129, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6833788156509399, + "num_tokens": 104390523.0, + "step": 4173 + }, + { + "epoch": 0.45837909070942234, + "grad_norm": 2.0376646518707275, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7041429281234741, + "num_tokens": 104419464.0, + "step": 4174 + }, + { + "epoch": 0.45848890841203604, + "grad_norm": 2.1502797603607178, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7011205554008484, + "num_tokens": 104446505.0, + "step": 4175 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 2.2696337699890137, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6982696056365967, + "num_tokens": 104473197.0, + "step": 4176 + }, + { + "epoch": 0.45870854381726334, + "grad_norm": 2.359830856323242, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6931630969047546, + "num_tokens": 104495906.0, + "step": 4177 + }, + { + "epoch": 0.458818361519877, + "grad_norm": 2.2288103103637695, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7109513878822327, + "num_tokens": 104519556.0, + "step": 4178 + }, + { + "epoch": 0.4589281792224907, + "grad_norm": 2.175354242324829, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6980751752853394, + "num_tokens": 104545930.0, + "step": 4179 + }, + { + "epoch": 0.45903799692510433, + "grad_norm": 2.294142246246338, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7258601188659668, + "num_tokens": 104567660.0, + "step": 4180 + }, + { + "epoch": 0.459147814627718, + "grad_norm": 2.4014668464660645, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7087811827659607, + "num_tokens": 104589859.0, + "step": 4181 + }, + { + "epoch": 0.4592576323303316, + "grad_norm": 2.0746772289276123, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6939492225646973, + "num_tokens": 104618135.0, + "step": 4182 + }, + { + "epoch": 0.4593674500329453, + "grad_norm": 2.3226735591888428, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7014899253845215, + "num_tokens": 104642020.0, + "step": 4183 + }, + { + "epoch": 0.45947726773555897, + "grad_norm": 2.299792766571045, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.704198956489563, + "num_tokens": 104666725.0, + "step": 4184 + }, + { + "epoch": 0.4595870854381726, + "grad_norm": 2.2780797481536865, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7275519967079163, + "num_tokens": 104691496.0, + "step": 4185 + }, + { + "epoch": 0.4596969031407863, + "grad_norm": 2.3868565559387207, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7046341896057129, + "num_tokens": 104714562.0, + "step": 4186 + }, + { + "epoch": 0.45980672084339996, + "grad_norm": 2.206404447555542, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6869698762893677, + "num_tokens": 104740566.0, + "step": 4187 + }, + { + "epoch": 0.4599165385460136, + "grad_norm": 2.13161563873291, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7058955430984497, + "num_tokens": 104771068.0, + "step": 4188 + }, + { + "epoch": 0.46002635624862726, + "grad_norm": 2.4560418128967285, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7074099779129028, + "num_tokens": 104793684.0, + "step": 4189 + }, + { + "epoch": 0.46013617395124096, + "grad_norm": 2.315978765487671, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.706419825553894, + "num_tokens": 104817606.0, + "step": 4190 + }, + { + "epoch": 0.4602459916538546, + "grad_norm": 2.1883156299591064, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7025355100631714, + "num_tokens": 104843864.0, + "step": 4191 + }, + { + "epoch": 0.46035580935646825, + "grad_norm": 2.209559440612793, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6920698285102844, + "num_tokens": 104871632.0, + "step": 4192 + }, + { + "epoch": 0.46046562705908195, + "grad_norm": 2.326584815979004, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7140520811080933, + "num_tokens": 104895331.0, + "step": 4193 + }, + { + "epoch": 0.4605754447616956, + "grad_norm": 2.2744317054748535, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7331862449645996, + "num_tokens": 104918290.0, + "step": 4194 + }, + { + "epoch": 0.46068526246430924, + "grad_norm": 2.0123441219329834, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6870684623718262, + "num_tokens": 104947042.0, + "step": 4195 + }, + { + "epoch": 0.4607950801669229, + "grad_norm": 2.2422914505004883, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6900414824485779, + "num_tokens": 104971561.0, + "step": 4196 + }, + { + "epoch": 0.4609048978695366, + "grad_norm": 2.4171528816223145, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7118877172470093, + "num_tokens": 104993874.0, + "step": 4197 + }, + { + "epoch": 0.46101471557215024, + "grad_norm": 2.2156543731689453, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7076749801635742, + "num_tokens": 105018986.0, + "step": 4198 + }, + { + "epoch": 0.4611245332747639, + "grad_norm": 2.333423614501953, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7078272700309753, + "num_tokens": 105043534.0, + "step": 4199 + }, + { + "epoch": 0.4612343509773775, + "grad_norm": 2.3227508068084717, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7151424884796143, + "num_tokens": 105066537.0, + "step": 4200 + }, + { + "epoch": 0.46134416867999123, + "grad_norm": 1.9253820180892944, + "learning_rate": 1e-06, + "loss": 1.0413, + "mean_token_accuracy": 0.6838203072547913, + "num_tokens": 105100273.0, + "step": 4201 + }, + { + "epoch": 0.4614539863826049, + "grad_norm": 1.819819688796997, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6866181492805481, + "num_tokens": 105136300.0, + "step": 4202 + }, + { + "epoch": 0.4615638040852185, + "grad_norm": 2.4979515075683594, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6839017271995544, + "num_tokens": 105157717.0, + "step": 4203 + }, + { + "epoch": 0.4616736217878322, + "grad_norm": 1.9193233251571655, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7177606821060181, + "num_tokens": 105188823.0, + "step": 4204 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 2.0666568279266357, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.684940755367279, + "num_tokens": 105219986.0, + "step": 4205 + }, + { + "epoch": 0.4618932571930595, + "grad_norm": 2.459329843521118, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7007283568382263, + "num_tokens": 105243179.0, + "step": 4206 + }, + { + "epoch": 0.46200307489567316, + "grad_norm": 2.315171718597412, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7015736103057861, + "num_tokens": 105266752.0, + "step": 4207 + }, + { + "epoch": 0.46211289259828686, + "grad_norm": 2.0774405002593994, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7119448184967041, + "num_tokens": 105294815.0, + "step": 4208 + }, + { + "epoch": 0.4622227103009005, + "grad_norm": 2.0546751022338867, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7015262246131897, + "num_tokens": 105323481.0, + "step": 4209 + }, + { + "epoch": 0.46233252800351415, + "grad_norm": 2.1358871459960938, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7023352384567261, + "num_tokens": 105348645.0, + "step": 4210 + }, + { + "epoch": 0.46244234570612786, + "grad_norm": 2.380253553390503, + "learning_rate": 1e-06, + "loss": 1.1082, + "mean_token_accuracy": 0.6734514832496643, + "num_tokens": 105371365.0, + "step": 4211 + }, + { + "epoch": 0.4625521634087415, + "grad_norm": 2.1014280319213867, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6957554817199707, + "num_tokens": 105397724.0, + "step": 4212 + }, + { + "epoch": 0.46266198111135515, + "grad_norm": 2.322449207305908, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7162162065505981, + "num_tokens": 105421427.0, + "step": 4213 + }, + { + "epoch": 0.4627717988139688, + "grad_norm": 2.15643310546875, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7262980937957764, + "num_tokens": 105445920.0, + "step": 4214 + }, + { + "epoch": 0.4628816165165825, + "grad_norm": 2.3409714698791504, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7177911400794983, + "num_tokens": 105470094.0, + "step": 4215 + }, + { + "epoch": 0.46299143421919614, + "grad_norm": 2.000126361846924, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7290880680084229, + "num_tokens": 105497401.0, + "step": 4216 + }, + { + "epoch": 0.4631012519218098, + "grad_norm": 2.1932735443115234, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7159240245819092, + "num_tokens": 105521431.0, + "step": 4217 + }, + { + "epoch": 0.46321106962442343, + "grad_norm": 2.4065563678741455, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7250087261199951, + "num_tokens": 105545666.0, + "step": 4218 + }, + { + "epoch": 0.46332088732703713, + "grad_norm": 2.0918052196502686, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.701509416103363, + "num_tokens": 105575101.0, + "step": 4219 + }, + { + "epoch": 0.4634307050296508, + "grad_norm": 1.9199353456497192, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.702819287776947, + "num_tokens": 105608272.0, + "step": 4220 + }, + { + "epoch": 0.4635405227322644, + "grad_norm": 2.3409345149993896, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7053250074386597, + "num_tokens": 105631498.0, + "step": 4221 + }, + { + "epoch": 0.4636503404348781, + "grad_norm": 2.2169477939605713, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7162411212921143, + "num_tokens": 105656279.0, + "step": 4222 + }, + { + "epoch": 0.4637601581374918, + "grad_norm": 2.0725512504577637, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6951845288276672, + "num_tokens": 105685620.0, + "step": 4223 + }, + { + "epoch": 0.4638699758401054, + "grad_norm": 2.237208127975464, + "learning_rate": 1e-06, + "loss": 1.1288, + "mean_token_accuracy": 0.6688903570175171, + "num_tokens": 105711886.0, + "step": 4224 + }, + { + "epoch": 0.46397979354271907, + "grad_norm": 2.2338290214538574, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7010392546653748, + "num_tokens": 105738096.0, + "step": 4225 + }, + { + "epoch": 0.46408961124533277, + "grad_norm": 2.1478323936462402, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7000507116317749, + "num_tokens": 105767438.0, + "step": 4226 + }, + { + "epoch": 0.4641994289479464, + "grad_norm": 2.1517090797424316, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7306386232376099, + "num_tokens": 105792667.0, + "step": 4227 + }, + { + "epoch": 0.46430924665056006, + "grad_norm": 2.6139426231384277, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.7059644460678101, + "num_tokens": 105811348.0, + "step": 4228 + }, + { + "epoch": 0.4644190643531737, + "grad_norm": 2.262934446334839, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6994790434837341, + "num_tokens": 105835848.0, + "step": 4229 + }, + { + "epoch": 0.4645288820557874, + "grad_norm": 2.0091187953948975, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.698143482208252, + "num_tokens": 105865181.0, + "step": 4230 + }, + { + "epoch": 0.46463869975840105, + "grad_norm": 2.3846795558929443, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7273178100585938, + "num_tokens": 105885909.0, + "step": 4231 + }, + { + "epoch": 0.4647485174610147, + "grad_norm": 2.347656011581421, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7097924947738647, + "num_tokens": 105909625.0, + "step": 4232 + }, + { + "epoch": 0.4648583351636284, + "grad_norm": 2.07621693611145, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7232325077056885, + "num_tokens": 105937060.0, + "step": 4233 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 2.1426570415496826, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7193195819854736, + "num_tokens": 105961007.0, + "step": 4234 + }, + { + "epoch": 0.4650779705688557, + "grad_norm": 2.106743335723877, + "learning_rate": 1e-06, + "loss": 1.1182, + "mean_token_accuracy": 0.6763337254524231, + "num_tokens": 105991273.0, + "step": 4235 + }, + { + "epoch": 0.46518778827146934, + "grad_norm": 2.0341317653656006, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.720810055732727, + "num_tokens": 106019133.0, + "step": 4236 + }, + { + "epoch": 0.46529760597408304, + "grad_norm": 2.337733268737793, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6943211555480957, + "num_tokens": 106042652.0, + "step": 4237 + }, + { + "epoch": 0.4654074236766967, + "grad_norm": 2.10638689994812, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.6791282892227173, + "num_tokens": 106070458.0, + "step": 4238 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 2.144404411315918, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.706233024597168, + "num_tokens": 106098136.0, + "step": 4239 + }, + { + "epoch": 0.46562705908192403, + "grad_norm": 2.265913963317871, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7022217512130737, + "num_tokens": 106123391.0, + "step": 4240 + }, + { + "epoch": 0.4657368767845377, + "grad_norm": 2.1800825595855713, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6971553564071655, + "num_tokens": 106148138.0, + "step": 4241 + }, + { + "epoch": 0.4658466944871513, + "grad_norm": 2.3199450969696045, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.717581033706665, + "num_tokens": 106169681.0, + "step": 4242 + }, + { + "epoch": 0.46595651218976497, + "grad_norm": 2.3578152656555176, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7019026875495911, + "num_tokens": 106193064.0, + "step": 4243 + }, + { + "epoch": 0.4660663298923787, + "grad_norm": 2.193671703338623, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6970235109329224, + "num_tokens": 106219736.0, + "step": 4244 + }, + { + "epoch": 0.4661761475949923, + "grad_norm": 1.9770723581314087, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7089021801948547, + "num_tokens": 106249106.0, + "step": 4245 + }, + { + "epoch": 0.46628596529760596, + "grad_norm": 2.754889965057373, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7061440348625183, + "num_tokens": 106267495.0, + "step": 4246 + }, + { + "epoch": 0.4663957830002196, + "grad_norm": 2.089181661605835, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7117955684661865, + "num_tokens": 106294113.0, + "step": 4247 + }, + { + "epoch": 0.4665056007028333, + "grad_norm": 2.523533821105957, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7047158479690552, + "num_tokens": 106315847.0, + "step": 4248 + }, + { + "epoch": 0.46661541840544696, + "grad_norm": 2.185741424560547, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6901046633720398, + "num_tokens": 106344332.0, + "step": 4249 + }, + { + "epoch": 0.4667252361080606, + "grad_norm": 2.341055393218994, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7047328948974609, + "num_tokens": 106366666.0, + "step": 4250 + }, + { + "epoch": 0.4668350538106743, + "grad_norm": 2.434938669204712, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6942419409751892, + "num_tokens": 106389412.0, + "step": 4251 + }, + { + "epoch": 0.46694487151328795, + "grad_norm": 2.422895908355713, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7216482758522034, + "num_tokens": 106409273.0, + "step": 4252 + }, + { + "epoch": 0.4670546892159016, + "grad_norm": 2.0947391986846924, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7082685232162476, + "num_tokens": 106435418.0, + "step": 4253 + }, + { + "epoch": 0.46716450691851524, + "grad_norm": 2.2565081119537354, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7149817943572998, + "num_tokens": 106457880.0, + "step": 4254 + }, + { + "epoch": 0.46727432462112894, + "grad_norm": 2.6378109455108643, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7132046818733215, + "num_tokens": 106475901.0, + "step": 4255 + }, + { + "epoch": 0.4673841423237426, + "grad_norm": 2.2994344234466553, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7212274670600891, + "num_tokens": 106499967.0, + "step": 4256 + }, + { + "epoch": 0.46749396002635624, + "grad_norm": 2.222181797027588, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6998370289802551, + "num_tokens": 106524816.0, + "step": 4257 + }, + { + "epoch": 0.4676037777289699, + "grad_norm": 2.381376266479492, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7120202779769897, + "num_tokens": 106547106.0, + "step": 4258 + }, + { + "epoch": 0.4677135954315836, + "grad_norm": 2.2694497108459473, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6984233856201172, + "num_tokens": 106571924.0, + "step": 4259 + }, + { + "epoch": 0.46782341313419723, + "grad_norm": 2.313209295272827, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7076989412307739, + "num_tokens": 106593884.0, + "step": 4260 + }, + { + "epoch": 0.4679332308368109, + "grad_norm": 2.1326370239257812, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6910728216171265, + "num_tokens": 106620923.0, + "step": 4261 + }, + { + "epoch": 0.4680430485394246, + "grad_norm": 2.4373371601104736, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7033430337905884, + "num_tokens": 106641513.0, + "step": 4262 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 2.518813371658325, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7054451704025269, + "num_tokens": 106661302.0, + "step": 4263 + }, + { + "epoch": 0.46826268394465187, + "grad_norm": 2.289506435394287, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.703108549118042, + "num_tokens": 106685562.0, + "step": 4264 + }, + { + "epoch": 0.4683725016472655, + "grad_norm": 2.2482399940490723, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7039312124252319, + "num_tokens": 106711389.0, + "step": 4265 + }, + { + "epoch": 0.4684823193498792, + "grad_norm": 2.2250359058380127, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7177391052246094, + "num_tokens": 106736168.0, + "step": 4266 + }, + { + "epoch": 0.46859213705249286, + "grad_norm": 2.5441102981567383, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7265543937683105, + "num_tokens": 106755833.0, + "step": 4267 + }, + { + "epoch": 0.4687019547551065, + "grad_norm": 2.4810314178466797, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7142807841300964, + "num_tokens": 106776020.0, + "step": 4268 + }, + { + "epoch": 0.4688117724577202, + "grad_norm": 2.3064146041870117, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7060677409172058, + "num_tokens": 106799715.0, + "step": 4269 + }, + { + "epoch": 0.46892159016033386, + "grad_norm": 2.2906672954559326, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.709062933921814, + "num_tokens": 106822776.0, + "step": 4270 + }, + { + "epoch": 0.4690314078629475, + "grad_norm": 2.2543928623199463, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7079620361328125, + "num_tokens": 106846999.0, + "step": 4271 + }, + { + "epoch": 0.46914122556556115, + "grad_norm": 2.201824426651001, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6898320913314819, + "num_tokens": 106872105.0, + "step": 4272 + }, + { + "epoch": 0.46925104326817485, + "grad_norm": 2.0454211235046387, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7140737175941467, + "num_tokens": 106899509.0, + "step": 4273 + }, + { + "epoch": 0.4693608609707885, + "grad_norm": 2.344149589538574, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7054668664932251, + "num_tokens": 106923250.0, + "step": 4274 + }, + { + "epoch": 0.46947067867340214, + "grad_norm": 2.006631374359131, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7100881338119507, + "num_tokens": 106950082.0, + "step": 4275 + }, + { + "epoch": 0.4695804963760158, + "grad_norm": 2.2431628704071045, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.7015836238861084, + "num_tokens": 106975327.0, + "step": 4276 + }, + { + "epoch": 0.4696903140786295, + "grad_norm": 2.4818029403686523, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7006744146347046, + "num_tokens": 106996063.0, + "step": 4277 + }, + { + "epoch": 0.46980013178124314, + "grad_norm": 2.0239572525024414, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6774657964706421, + "num_tokens": 107025517.0, + "step": 4278 + }, + { + "epoch": 0.4699099494838568, + "grad_norm": 2.2607476711273193, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7083665132522583, + "num_tokens": 107050010.0, + "step": 4279 + }, + { + "epoch": 0.4700197671864705, + "grad_norm": 2.0216145515441895, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6823327541351318, + "num_tokens": 107077874.0, + "step": 4280 + }, + { + "epoch": 0.47012958488908413, + "grad_norm": 2.423762083053589, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7012709975242615, + "num_tokens": 107099526.0, + "step": 4281 + }, + { + "epoch": 0.4702394025916978, + "grad_norm": 2.0179412364959717, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6921554207801819, + "num_tokens": 107128864.0, + "step": 4282 + }, + { + "epoch": 0.4703492202943114, + "grad_norm": 2.5197842121124268, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7374959588050842, + "num_tokens": 107150219.0, + "step": 4283 + }, + { + "epoch": 0.4704590379969251, + "grad_norm": 2.1904826164245605, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7090603709220886, + "num_tokens": 107176068.0, + "step": 4284 + }, + { + "epoch": 0.47056885569953877, + "grad_norm": 2.4579551219940186, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7017837762832642, + "num_tokens": 107196991.0, + "step": 4285 + }, + { + "epoch": 0.4706786734021524, + "grad_norm": 2.2851874828338623, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.690730631351471, + "num_tokens": 107223523.0, + "step": 4286 + }, + { + "epoch": 0.4707884911047661, + "grad_norm": 2.3434343338012695, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6911647915840149, + "num_tokens": 107247887.0, + "step": 4287 + }, + { + "epoch": 0.47089830880737976, + "grad_norm": 2.16001033782959, + "learning_rate": 1e-06, + "loss": 1.0615, + "mean_token_accuracy": 0.6827804446220398, + "num_tokens": 107276643.0, + "step": 4288 + }, + { + "epoch": 0.4710081265099934, + "grad_norm": 1.9449353218078613, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.708629846572876, + "num_tokens": 107307206.0, + "step": 4289 + }, + { + "epoch": 0.47111794421260705, + "grad_norm": 2.3597195148468018, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6955660581588745, + "num_tokens": 107333456.0, + "step": 4290 + }, + { + "epoch": 0.47122776191522076, + "grad_norm": 2.392901659011841, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7129908800125122, + "num_tokens": 107354413.0, + "step": 4291 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 1.9977794885635376, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.6958888173103333, + "num_tokens": 107385753.0, + "step": 4292 + }, + { + "epoch": 0.47144739732044805, + "grad_norm": 2.0579335689544678, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7042049765586853, + "num_tokens": 107416073.0, + "step": 4293 + }, + { + "epoch": 0.4715572150230617, + "grad_norm": 2.1557059288024902, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6973716616630554, + "num_tokens": 107442054.0, + "step": 4294 + }, + { + "epoch": 0.4716670327256754, + "grad_norm": 2.131058931350708, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7210768461227417, + "num_tokens": 107465752.0, + "step": 4295 + }, + { + "epoch": 0.47177685042828904, + "grad_norm": 2.4254024028778076, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7063934803009033, + "num_tokens": 107490350.0, + "step": 4296 + }, + { + "epoch": 0.4718866681309027, + "grad_norm": 2.149834394454956, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7299677133560181, + "num_tokens": 107517150.0, + "step": 4297 + }, + { + "epoch": 0.4719964858335164, + "grad_norm": 2.1924688816070557, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7028454542160034, + "num_tokens": 107542306.0, + "step": 4298 + }, + { + "epoch": 0.47210630353613003, + "grad_norm": 2.4364781379699707, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6963539719581604, + "num_tokens": 107565259.0, + "step": 4299 + }, + { + "epoch": 0.4722161212387437, + "grad_norm": 2.1696228981018066, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7091020941734314, + "num_tokens": 107591791.0, + "step": 4300 + }, + { + "epoch": 0.4723259389413573, + "grad_norm": 2.0464465618133545, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7091478109359741, + "num_tokens": 107619729.0, + "step": 4301 + }, + { + "epoch": 0.472435756643971, + "grad_norm": 2.4119510650634766, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.723243236541748, + "num_tokens": 107641263.0, + "step": 4302 + }, + { + "epoch": 0.4725455743465847, + "grad_norm": 2.2969720363616943, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7112817764282227, + "num_tokens": 107665236.0, + "step": 4303 + }, + { + "epoch": 0.4726553920491983, + "grad_norm": 2.1211273670196533, + "learning_rate": 1e-06, + "loss": 1.1032, + "mean_token_accuracy": 0.6752049922943115, + "num_tokens": 107695179.0, + "step": 4304 + }, + { + "epoch": 0.47276520975181197, + "grad_norm": 2.208986759185791, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6818068027496338, + "num_tokens": 107723058.0, + "step": 4305 + }, + { + "epoch": 0.47287502745442567, + "grad_norm": 2.2275032997131348, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7128158807754517, + "num_tokens": 107748664.0, + "step": 4306 + }, + { + "epoch": 0.4729848451570393, + "grad_norm": 2.253462791442871, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6968204975128174, + "num_tokens": 107775061.0, + "step": 4307 + }, + { + "epoch": 0.47309466285965296, + "grad_norm": 2.493746757507324, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7198818325996399, + "num_tokens": 107795254.0, + "step": 4308 + }, + { + "epoch": 0.47320448056226666, + "grad_norm": 2.689939260482788, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7237669825553894, + "num_tokens": 107814006.0, + "step": 4309 + }, + { + "epoch": 0.4733142982648803, + "grad_norm": 2.1524407863616943, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7179033160209656, + "num_tokens": 107841353.0, + "step": 4310 + }, + { + "epoch": 0.47342411596749395, + "grad_norm": 2.127063274383545, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6970617175102234, + "num_tokens": 107867450.0, + "step": 4311 + }, + { + "epoch": 0.4735339336701076, + "grad_norm": 2.10019850730896, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7070082426071167, + "num_tokens": 107893358.0, + "step": 4312 + }, + { + "epoch": 0.4736437513727213, + "grad_norm": 2.2068417072296143, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7272577881813049, + "num_tokens": 107918157.0, + "step": 4313 + }, + { + "epoch": 0.47375356907533495, + "grad_norm": 1.8653010129928589, + "learning_rate": 1e-06, + "loss": 1.0711, + "mean_token_accuracy": 0.688633143901825, + "num_tokens": 107950876.0, + "step": 4314 + }, + { + "epoch": 0.4738633867779486, + "grad_norm": 2.4041824340820312, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7128980159759521, + "num_tokens": 107973332.0, + "step": 4315 + }, + { + "epoch": 0.4739732044805623, + "grad_norm": 2.4355623722076416, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7161506414413452, + "num_tokens": 107995399.0, + "step": 4316 + }, + { + "epoch": 0.47408302218317594, + "grad_norm": 2.055136203765869, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6889721751213074, + "num_tokens": 108025405.0, + "step": 4317 + }, + { + "epoch": 0.4741928398857896, + "grad_norm": 2.0474109649658203, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6978980302810669, + "num_tokens": 108052167.0, + "step": 4318 + }, + { + "epoch": 0.47430265758840323, + "grad_norm": 2.0230491161346436, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6840066909790039, + "num_tokens": 108083114.0, + "step": 4319 + }, + { + "epoch": 0.47441247529101693, + "grad_norm": 2.066887378692627, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7129173278808594, + "num_tokens": 108110806.0, + "step": 4320 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 2.1122841835021973, + "learning_rate": 1e-06, + "loss": 1.0916, + "mean_token_accuracy": 0.6755957007408142, + "num_tokens": 108140505.0, + "step": 4321 + }, + { + "epoch": 0.4746321106962442, + "grad_norm": 2.1283483505249023, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7202137112617493, + "num_tokens": 108166108.0, + "step": 4322 + }, + { + "epoch": 0.47474192839885787, + "grad_norm": 2.2170987129211426, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7097184658050537, + "num_tokens": 108190327.0, + "step": 4323 + }, + { + "epoch": 0.4748517461014716, + "grad_norm": 2.2295751571655273, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6918068528175354, + "num_tokens": 108215372.0, + "step": 4324 + }, + { + "epoch": 0.4749615638040852, + "grad_norm": 2.2673356533050537, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.70326828956604, + "num_tokens": 108240608.0, + "step": 4325 + }, + { + "epoch": 0.47507138150669886, + "grad_norm": 2.571774482727051, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7138004302978516, + "num_tokens": 108259691.0, + "step": 4326 + }, + { + "epoch": 0.47518119920931257, + "grad_norm": 2.058372735977173, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6823621392250061, + "num_tokens": 108288561.0, + "step": 4327 + }, + { + "epoch": 0.4752910169119262, + "grad_norm": 2.0140228271484375, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6943609714508057, + "num_tokens": 108321935.0, + "step": 4328 + }, + { + "epoch": 0.47540083461453986, + "grad_norm": 2.498257875442505, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7003977298736572, + "num_tokens": 108344609.0, + "step": 4329 + }, + { + "epoch": 0.4755106523171535, + "grad_norm": 2.390564203262329, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.7009913921356201, + "num_tokens": 108366932.0, + "step": 4330 + }, + { + "epoch": 0.4756204700197672, + "grad_norm": 2.1887905597686768, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.676735520362854, + "num_tokens": 108393068.0, + "step": 4331 + }, + { + "epoch": 0.47573028772238085, + "grad_norm": 2.095780849456787, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7138179540634155, + "num_tokens": 108420017.0, + "step": 4332 + }, + { + "epoch": 0.4758401054249945, + "grad_norm": 2.421790599822998, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.708163321018219, + "num_tokens": 108443294.0, + "step": 4333 + }, + { + "epoch": 0.47594992312760814, + "grad_norm": 2.1579439640045166, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7172656059265137, + "num_tokens": 108467870.0, + "step": 4334 + }, + { + "epoch": 0.47605974083022184, + "grad_norm": 2.3162379264831543, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7092787027359009, + "num_tokens": 108489313.0, + "step": 4335 + }, + { + "epoch": 0.4761695585328355, + "grad_norm": 2.3039987087249756, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.687735378742218, + "num_tokens": 108514779.0, + "step": 4336 + }, + { + "epoch": 0.47627937623544914, + "grad_norm": 2.210937023162842, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7187204360961914, + "num_tokens": 108538492.0, + "step": 4337 + }, + { + "epoch": 0.47638919393806284, + "grad_norm": 2.2496044635772705, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7106649875640869, + "num_tokens": 108562478.0, + "step": 4338 + }, + { + "epoch": 0.4764990116406765, + "grad_norm": 2.6816036701202393, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7161558270454407, + "num_tokens": 108582533.0, + "step": 4339 + }, + { + "epoch": 0.47660882934329013, + "grad_norm": 2.5229251384735107, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7057114243507385, + "num_tokens": 108604096.0, + "step": 4340 + }, + { + "epoch": 0.4767186470459038, + "grad_norm": 2.8310556411743164, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7191574573516846, + "num_tokens": 108621186.0, + "step": 4341 + }, + { + "epoch": 0.4768284647485175, + "grad_norm": 2.1166367530822754, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7125609517097473, + "num_tokens": 108648002.0, + "step": 4342 + }, + { + "epoch": 0.4769382824511311, + "grad_norm": 2.5109992027282715, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7248725891113281, + "num_tokens": 108669225.0, + "step": 4343 + }, + { + "epoch": 0.47704810015374477, + "grad_norm": 2.4993932247161865, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6896081566810608, + "num_tokens": 108691060.0, + "step": 4344 + }, + { + "epoch": 0.47715791785635847, + "grad_norm": 1.97648024559021, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6856272220611572, + "num_tokens": 108723163.0, + "step": 4345 + }, + { + "epoch": 0.4772677355589721, + "grad_norm": 2.1229360103607178, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7216202020645142, + "num_tokens": 108748661.0, + "step": 4346 + }, + { + "epoch": 0.47737755326158576, + "grad_norm": 2.2836029529571533, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7065231204032898, + "num_tokens": 108773626.0, + "step": 4347 + }, + { + "epoch": 0.4774873709641994, + "grad_norm": 2.081413984298706, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7051984071731567, + "num_tokens": 108800397.0, + "step": 4348 + }, + { + "epoch": 0.4775971886668131, + "grad_norm": 2.4609122276306152, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7146100997924805, + "num_tokens": 108819785.0, + "step": 4349 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 1.8846853971481323, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6954929828643799, + "num_tokens": 108852723.0, + "step": 4350 + }, + { + "epoch": 0.4778168240720404, + "grad_norm": 2.3599681854248047, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7037465572357178, + "num_tokens": 108877603.0, + "step": 4351 + }, + { + "epoch": 0.47792664177465405, + "grad_norm": 2.1386306285858154, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6907669901847839, + "num_tokens": 108903572.0, + "step": 4352 + }, + { + "epoch": 0.47803645947726775, + "grad_norm": 2.0019452571868896, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6961898803710938, + "num_tokens": 108936827.0, + "step": 4353 + }, + { + "epoch": 0.4781462771798814, + "grad_norm": 2.682342767715454, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7208155393600464, + "num_tokens": 108955489.0, + "step": 4354 + }, + { + "epoch": 0.47825609488249504, + "grad_norm": 2.4791226387023926, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7147818207740784, + "num_tokens": 108975270.0, + "step": 4355 + }, + { + "epoch": 0.47836591258510874, + "grad_norm": 2.2055063247680664, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.722349226474762, + "num_tokens": 108999071.0, + "step": 4356 + }, + { + "epoch": 0.4784757302877224, + "grad_norm": 2.130845785140991, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7012662887573242, + "num_tokens": 109026052.0, + "step": 4357 + }, + { + "epoch": 0.47858554799033604, + "grad_norm": 2.065114736557007, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6961625814437866, + "num_tokens": 109056853.0, + "step": 4358 + }, + { + "epoch": 0.4786953656929497, + "grad_norm": 2.299320936203003, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7028977870941162, + "num_tokens": 109081213.0, + "step": 4359 + }, + { + "epoch": 0.4788051833955634, + "grad_norm": 2.1388895511627197, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6995131373405457, + "num_tokens": 109109596.0, + "step": 4360 + }, + { + "epoch": 0.47891500109817703, + "grad_norm": 2.6239688396453857, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7195292115211487, + "num_tokens": 109127675.0, + "step": 4361 + }, + { + "epoch": 0.4790248188007907, + "grad_norm": 2.434412956237793, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7108070850372314, + "num_tokens": 109147784.0, + "step": 4362 + }, + { + "epoch": 0.4791346365034044, + "grad_norm": 2.564471960067749, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.70094895362854, + "num_tokens": 109168197.0, + "step": 4363 + }, + { + "epoch": 0.479244454206018, + "grad_norm": 2.2931370735168457, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7187451124191284, + "num_tokens": 109191367.0, + "step": 4364 + }, + { + "epoch": 0.47935427190863167, + "grad_norm": 2.10536527633667, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7102829217910767, + "num_tokens": 109218229.0, + "step": 4365 + }, + { + "epoch": 0.4794640896112453, + "grad_norm": 2.166038990020752, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6953567862510681, + "num_tokens": 109245694.0, + "step": 4366 + }, + { + "epoch": 0.479573907313859, + "grad_norm": 2.0534348487854004, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6852807402610779, + "num_tokens": 109274761.0, + "step": 4367 + }, + { + "epoch": 0.47968372501647266, + "grad_norm": 2.3923418521881104, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6929510831832886, + "num_tokens": 109295803.0, + "step": 4368 + }, + { + "epoch": 0.4797935427190863, + "grad_norm": 2.180126667022705, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6872805953025818, + "num_tokens": 109322868.0, + "step": 4369 + }, + { + "epoch": 0.47990336042169995, + "grad_norm": 2.1411712169647217, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.688955545425415, + "num_tokens": 109350401.0, + "step": 4370 + }, + { + "epoch": 0.48001317812431366, + "grad_norm": 2.4670603275299072, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7304126620292664, + "num_tokens": 109370936.0, + "step": 4371 + }, + { + "epoch": 0.4801229958269273, + "grad_norm": 2.281560182571411, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7015988826751709, + "num_tokens": 109396755.0, + "step": 4372 + }, + { + "epoch": 0.48023281352954095, + "grad_norm": 2.6098153591156006, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6989027857780457, + "num_tokens": 109417107.0, + "step": 4373 + }, + { + "epoch": 0.48034263123215465, + "grad_norm": 2.209278106689453, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6830551624298096, + "num_tokens": 109444594.0, + "step": 4374 + }, + { + "epoch": 0.4804524489347683, + "grad_norm": 2.090271234512329, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.705601692199707, + "num_tokens": 109471405.0, + "step": 4375 + }, + { + "epoch": 0.48056226663738194, + "grad_norm": 2.151259422302246, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7118465900421143, + "num_tokens": 109496651.0, + "step": 4376 + }, + { + "epoch": 0.4806720843399956, + "grad_norm": 2.3893589973449707, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7143637537956238, + "num_tokens": 109518340.0, + "step": 4377 + }, + { + "epoch": 0.4807819020426093, + "grad_norm": 2.1617252826690674, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7239217758178711, + "num_tokens": 109543839.0, + "step": 4378 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 2.4799673557281494, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7111384868621826, + "num_tokens": 109566362.0, + "step": 4379 + }, + { + "epoch": 0.4810015374478366, + "grad_norm": 2.5066680908203125, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6913145780563354, + "num_tokens": 109587230.0, + "step": 4380 + }, + { + "epoch": 0.4811113551504502, + "grad_norm": 2.1845896244049072, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.705370306968689, + "num_tokens": 109612772.0, + "step": 4381 + }, + { + "epoch": 0.4812211728530639, + "grad_norm": 2.0591864585876465, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.700887143611908, + "num_tokens": 109639239.0, + "step": 4382 + }, + { + "epoch": 0.4813309905556776, + "grad_norm": 2.310997486114502, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7211425304412842, + "num_tokens": 109660803.0, + "step": 4383 + }, + { + "epoch": 0.4814408082582912, + "grad_norm": 2.2826313972473145, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6963292360305786, + "num_tokens": 109686480.0, + "step": 4384 + }, + { + "epoch": 0.4815506259609049, + "grad_norm": 2.5610897541046143, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7044078707695007, + "num_tokens": 109706292.0, + "step": 4385 + }, + { + "epoch": 0.48166044366351857, + "grad_norm": 2.136852979660034, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7338560223579407, + "num_tokens": 109731501.0, + "step": 4386 + }, + { + "epoch": 0.4817702613661322, + "grad_norm": 2.6556711196899414, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.709232747554779, + "num_tokens": 109750521.0, + "step": 4387 + }, + { + "epoch": 0.48188007906874586, + "grad_norm": 2.244197130203247, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7154592275619507, + "num_tokens": 109773131.0, + "step": 4388 + }, + { + "epoch": 0.48198989677135956, + "grad_norm": 2.3513684272766113, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7186787128448486, + "num_tokens": 109794773.0, + "step": 4389 + }, + { + "epoch": 0.4820997144739732, + "grad_norm": 2.478891611099243, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7004439830780029, + "num_tokens": 109817005.0, + "step": 4390 + }, + { + "epoch": 0.48220953217658685, + "grad_norm": 2.1848485469818115, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7120305299758911, + "num_tokens": 109843169.0, + "step": 4391 + }, + { + "epoch": 0.48231934987920055, + "grad_norm": 2.062835931777954, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6946702003479004, + "num_tokens": 109871475.0, + "step": 4392 + }, + { + "epoch": 0.4824291675818142, + "grad_norm": 2.4684927463531494, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7115985751152039, + "num_tokens": 109891631.0, + "step": 4393 + }, + { + "epoch": 0.48253898528442785, + "grad_norm": 2.808820962905884, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7274569272994995, + "num_tokens": 109908107.0, + "step": 4394 + }, + { + "epoch": 0.4826488029870415, + "grad_norm": 2.047823667526245, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7059676647186279, + "num_tokens": 109936277.0, + "step": 4395 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 2.0496978759765625, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7254478931427002, + "num_tokens": 109961499.0, + "step": 4396 + }, + { + "epoch": 0.48286843839226884, + "grad_norm": 2.6216869354248047, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7136899828910828, + "num_tokens": 109982169.0, + "step": 4397 + }, + { + "epoch": 0.4829782560948825, + "grad_norm": 2.5758132934570312, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6988832950592041, + "num_tokens": 110001346.0, + "step": 4398 + }, + { + "epoch": 0.48308807379749613, + "grad_norm": 2.5136654376983643, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7043180465698242, + "num_tokens": 110022023.0, + "step": 4399 + }, + { + "epoch": 0.48319789150010983, + "grad_norm": 2.3416624069213867, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7299425601959229, + "num_tokens": 110043386.0, + "step": 4400 + }, + { + "epoch": 0.4833077092027235, + "grad_norm": 2.1314516067504883, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7092294096946716, + "num_tokens": 110068956.0, + "step": 4401 + }, + { + "epoch": 0.4834175269053371, + "grad_norm": 2.3180034160614014, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6902883648872375, + "num_tokens": 110092720.0, + "step": 4402 + }, + { + "epoch": 0.4835273446079508, + "grad_norm": 2.2301816940307617, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.679643452167511, + "num_tokens": 110117305.0, + "step": 4403 + }, + { + "epoch": 0.4836371623105645, + "grad_norm": 2.0395631790161133, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7029523253440857, + "num_tokens": 110145038.0, + "step": 4404 + }, + { + "epoch": 0.4837469800131781, + "grad_norm": 2.1081366539001465, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7209484577178955, + "num_tokens": 110171021.0, + "step": 4405 + }, + { + "epoch": 0.48385679771579176, + "grad_norm": 2.1021697521209717, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.712221622467041, + "num_tokens": 110199474.0, + "step": 4406 + }, + { + "epoch": 0.48396661541840547, + "grad_norm": 2.222994327545166, + "learning_rate": 1e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6888059377670288, + "num_tokens": 110227319.0, + "step": 4407 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 2.2193830013275146, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7006937861442566, + "num_tokens": 110252161.0, + "step": 4408 + }, + { + "epoch": 0.48418625082363276, + "grad_norm": 2.4674980640411377, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7063122987747192, + "num_tokens": 110273273.0, + "step": 4409 + }, + { + "epoch": 0.4842960685262464, + "grad_norm": 2.1394145488739014, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7003146409988403, + "num_tokens": 110302313.0, + "step": 4410 + }, + { + "epoch": 0.4844058862288601, + "grad_norm": 2.557645082473755, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7283486127853394, + "num_tokens": 110322431.0, + "step": 4411 + }, + { + "epoch": 0.48451570393147375, + "grad_norm": 2.3284029960632324, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7162699103355408, + "num_tokens": 110346177.0, + "step": 4412 + }, + { + "epoch": 0.4846255216340874, + "grad_norm": 2.35756516456604, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7082672119140625, + "num_tokens": 110367582.0, + "step": 4413 + }, + { + "epoch": 0.4847353393367011, + "grad_norm": 2.359703302383423, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7122778296470642, + "num_tokens": 110390963.0, + "step": 4414 + }, + { + "epoch": 0.48484515703931474, + "grad_norm": 2.2833995819091797, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.697622537612915, + "num_tokens": 110416761.0, + "step": 4415 + }, + { + "epoch": 0.4849549747419284, + "grad_norm": 2.7661962509155273, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7196974754333496, + "num_tokens": 110434274.0, + "step": 4416 + }, + { + "epoch": 0.48506479244454204, + "grad_norm": 2.385268449783325, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7103340029716492, + "num_tokens": 110456166.0, + "step": 4417 + }, + { + "epoch": 0.48517461014715574, + "grad_norm": 2.261495351791382, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.706173837184906, + "num_tokens": 110479660.0, + "step": 4418 + }, + { + "epoch": 0.4852844278497694, + "grad_norm": 2.179877281188965, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.724075973033905, + "num_tokens": 110504218.0, + "step": 4419 + }, + { + "epoch": 0.48539424555238303, + "grad_norm": 2.121000051498413, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6930316090583801, + "num_tokens": 110530452.0, + "step": 4420 + }, + { + "epoch": 0.48550406325499673, + "grad_norm": 2.424692392349243, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7078533172607422, + "num_tokens": 110552398.0, + "step": 4421 + }, + { + "epoch": 0.4856138809576104, + "grad_norm": 2.5410828590393066, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7015547156333923, + "num_tokens": 110574092.0, + "step": 4422 + }, + { + "epoch": 0.485723698660224, + "grad_norm": 2.0342252254486084, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6850705146789551, + "num_tokens": 110602749.0, + "step": 4423 + }, + { + "epoch": 0.48583351636283767, + "grad_norm": 2.4662771224975586, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7385439276695251, + "num_tokens": 110623890.0, + "step": 4424 + }, + { + "epoch": 0.48594333406545137, + "grad_norm": 2.4581758975982666, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7087245583534241, + "num_tokens": 110643573.0, + "step": 4425 + }, + { + "epoch": 0.486053151768065, + "grad_norm": 2.55834698677063, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7107877135276794, + "num_tokens": 110662834.0, + "step": 4426 + }, + { + "epoch": 0.48616296947067866, + "grad_norm": 1.9321125745773315, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6883715391159058, + "num_tokens": 110692616.0, + "step": 4427 + }, + { + "epoch": 0.4862727871732923, + "grad_norm": 2.162980556488037, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7004627585411072, + "num_tokens": 110720571.0, + "step": 4428 + }, + { + "epoch": 0.486382604875906, + "grad_norm": 2.2302050590515137, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7159977555274963, + "num_tokens": 110744081.0, + "step": 4429 + }, + { + "epoch": 0.48649242257851966, + "grad_norm": 2.2532248497009277, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.6967979669570923, + "num_tokens": 110770220.0, + "step": 4430 + }, + { + "epoch": 0.4866022402811333, + "grad_norm": 2.1562671661376953, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.706443190574646, + "num_tokens": 110795120.0, + "step": 4431 + }, + { + "epoch": 0.486712057983747, + "grad_norm": 1.9576915502548218, + "learning_rate": 1e-06, + "loss": 1.0992, + "mean_token_accuracy": 0.6758389472961426, + "num_tokens": 110829022.0, + "step": 4432 + }, + { + "epoch": 0.48682187568636065, + "grad_norm": 2.315537214279175, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6799097061157227, + "num_tokens": 110852624.0, + "step": 4433 + }, + { + "epoch": 0.4869316933889743, + "grad_norm": 2.238069772720337, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.717736542224884, + "num_tokens": 110877304.0, + "step": 4434 + }, + { + "epoch": 0.48704151109158794, + "grad_norm": 2.381558895111084, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7110422849655151, + "num_tokens": 110902454.0, + "step": 4435 + }, + { + "epoch": 0.48715132879420164, + "grad_norm": 1.9074968099594116, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7001799941062927, + "num_tokens": 110933542.0, + "step": 4436 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 2.37131667137146, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.7003562450408936, + "num_tokens": 110954353.0, + "step": 4437 + }, + { + "epoch": 0.48737096419942894, + "grad_norm": 2.02327036857605, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7027793526649475, + "num_tokens": 110986168.0, + "step": 4438 + }, + { + "epoch": 0.48748078190204264, + "grad_norm": 2.2145795822143555, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6899172067642212, + "num_tokens": 111013056.0, + "step": 4439 + }, + { + "epoch": 0.4875905996046563, + "grad_norm": 2.3337624073028564, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7082855105400085, + "num_tokens": 111036338.0, + "step": 4440 + }, + { + "epoch": 0.48770041730726993, + "grad_norm": 2.1483516693115234, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7337979674339294, + "num_tokens": 111060517.0, + "step": 4441 + }, + { + "epoch": 0.4878102350098836, + "grad_norm": 2.237367630004883, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.716754674911499, + "num_tokens": 111084665.0, + "step": 4442 + }, + { + "epoch": 0.4879200527124973, + "grad_norm": 2.246824026107788, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7058451771736145, + "num_tokens": 111108753.0, + "step": 4443 + }, + { + "epoch": 0.4880298704151109, + "grad_norm": 2.4304139614105225, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7345095276832581, + "num_tokens": 111127233.0, + "step": 4444 + }, + { + "epoch": 0.48813968811772457, + "grad_norm": 2.2337002754211426, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6894888877868652, + "num_tokens": 111151736.0, + "step": 4445 + }, + { + "epoch": 0.4882495058203382, + "grad_norm": 2.344787836074829, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7009575366973877, + "num_tokens": 111174018.0, + "step": 4446 + }, + { + "epoch": 0.4883593235229519, + "grad_norm": 2.3908767700195312, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7065637707710266, + "num_tokens": 111196967.0, + "step": 4447 + }, + { + "epoch": 0.48846914122556556, + "grad_norm": 2.44407320022583, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7245798110961914, + "num_tokens": 111216803.0, + "step": 4448 + }, + { + "epoch": 0.4885789589281792, + "grad_norm": 2.337451219558716, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7056180238723755, + "num_tokens": 111240074.0, + "step": 4449 + }, + { + "epoch": 0.4886887766307929, + "grad_norm": 2.415811538696289, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7237464785575867, + "num_tokens": 111260765.0, + "step": 4450 + }, + { + "epoch": 0.48879859433340656, + "grad_norm": 2.1635499000549316, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6918468475341797, + "num_tokens": 111288516.0, + "step": 4451 + }, + { + "epoch": 0.4889084120360202, + "grad_norm": 2.1064512729644775, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6812697649002075, + "num_tokens": 111316630.0, + "step": 4452 + }, + { + "epoch": 0.48901822973863385, + "grad_norm": 2.0179178714752197, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.713707685470581, + "num_tokens": 111345309.0, + "step": 4453 + }, + { + "epoch": 0.48912804744124755, + "grad_norm": 2.17257022857666, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.722333550453186, + "num_tokens": 111369081.0, + "step": 4454 + }, + { + "epoch": 0.4892378651438612, + "grad_norm": 2.144552230834961, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6879673004150391, + "num_tokens": 111397987.0, + "step": 4455 + }, + { + "epoch": 0.48934768284647484, + "grad_norm": 2.4799704551696777, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6873206496238708, + "num_tokens": 111421806.0, + "step": 4456 + }, + { + "epoch": 0.4894575005490885, + "grad_norm": 2.188657522201538, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6862820386886597, + "num_tokens": 111447293.0, + "step": 4457 + }, + { + "epoch": 0.4895673182517022, + "grad_norm": 2.2651257514953613, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7318229079246521, + "num_tokens": 111472279.0, + "step": 4458 + }, + { + "epoch": 0.48967713595431583, + "grad_norm": 2.4161417484283447, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6905909180641174, + "num_tokens": 111494046.0, + "step": 4459 + }, + { + "epoch": 0.4897869536569295, + "grad_norm": 2.4837474822998047, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7167606353759766, + "num_tokens": 111514398.0, + "step": 4460 + }, + { + "epoch": 0.4898967713595432, + "grad_norm": 2.2908713817596436, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6860088109970093, + "num_tokens": 111539503.0, + "step": 4461 + }, + { + "epoch": 0.4900065890621568, + "grad_norm": 2.1347854137420654, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.698938250541687, + "num_tokens": 111567348.0, + "step": 4462 + }, + { + "epoch": 0.4901164067647705, + "grad_norm": 2.0773873329162598, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7104294896125793, + "num_tokens": 111595171.0, + "step": 4463 + }, + { + "epoch": 0.4902262244673841, + "grad_norm": 2.399113178253174, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7142545580863953, + "num_tokens": 111616938.0, + "step": 4464 + }, + { + "epoch": 0.4903360421699978, + "grad_norm": 2.883801221847534, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7257041335105896, + "num_tokens": 111632753.0, + "step": 4465 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 2.139155149459839, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7187714576721191, + "num_tokens": 111657233.0, + "step": 4466 + }, + { + "epoch": 0.4905556775752251, + "grad_norm": 2.0951497554779053, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7015708684921265, + "num_tokens": 111685317.0, + "step": 4467 + }, + { + "epoch": 0.4906654952778388, + "grad_norm": 1.858011245727539, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6971174478530884, + "num_tokens": 111718117.0, + "step": 4468 + }, + { + "epoch": 0.49077531298045246, + "grad_norm": 2.115450859069824, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6962243318557739, + "num_tokens": 111745235.0, + "step": 4469 + }, + { + "epoch": 0.4908851306830661, + "grad_norm": 1.7932915687561035, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6754595041275024, + "num_tokens": 111785156.0, + "step": 4470 + }, + { + "epoch": 0.49099494838567975, + "grad_norm": 2.4961740970611572, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7019612789154053, + "num_tokens": 111806180.0, + "step": 4471 + }, + { + "epoch": 0.49110476608829345, + "grad_norm": 2.198817253112793, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6880102753639221, + "num_tokens": 111835534.0, + "step": 4472 + }, + { + "epoch": 0.4912145837909071, + "grad_norm": 2.0050268173217773, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.695874035358429, + "num_tokens": 111864007.0, + "step": 4473 + }, + { + "epoch": 0.49132440149352075, + "grad_norm": 2.810605764389038, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7201367616653442, + "num_tokens": 111880849.0, + "step": 4474 + }, + { + "epoch": 0.4914342191961344, + "grad_norm": 2.028062582015991, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6696146130561829, + "num_tokens": 111911193.0, + "step": 4475 + }, + { + "epoch": 0.4915440368987481, + "grad_norm": 2.6804330348968506, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7103460431098938, + "num_tokens": 111928861.0, + "step": 4476 + }, + { + "epoch": 0.49165385460136174, + "grad_norm": 2.119262456893921, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7105522155761719, + "num_tokens": 111955696.0, + "step": 4477 + }, + { + "epoch": 0.4917636723039754, + "grad_norm": 2.0410733222961426, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6948927044868469, + "num_tokens": 111987432.0, + "step": 4478 + }, + { + "epoch": 0.4918734900065891, + "grad_norm": 1.9914944171905518, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.699657678604126, + "num_tokens": 112017143.0, + "step": 4479 + }, + { + "epoch": 0.49198330770920273, + "grad_norm": 2.3110225200653076, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7098325490951538, + "num_tokens": 112041552.0, + "step": 4480 + }, + { + "epoch": 0.4920931254118164, + "grad_norm": 2.185311794281006, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.713269054889679, + "num_tokens": 112065605.0, + "step": 4481 + }, + { + "epoch": 0.49220294311443, + "grad_norm": 2.185044527053833, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7247272729873657, + "num_tokens": 112092045.0, + "step": 4482 + }, + { + "epoch": 0.4923127608170437, + "grad_norm": 2.09161376953125, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6989858746528625, + "num_tokens": 112120671.0, + "step": 4483 + }, + { + "epoch": 0.4924225785196574, + "grad_norm": 2.197925329208374, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6898747682571411, + "num_tokens": 112145484.0, + "step": 4484 + }, + { + "epoch": 0.492532396222271, + "grad_norm": 2.5324981212615967, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7189499139785767, + "num_tokens": 112166369.0, + "step": 4485 + }, + { + "epoch": 0.49264221392488466, + "grad_norm": 2.2711682319641113, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6961784362792969, + "num_tokens": 112192927.0, + "step": 4486 + }, + { + "epoch": 0.49275203162749837, + "grad_norm": 2.033891439437866, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7066878080368042, + "num_tokens": 112222373.0, + "step": 4487 + }, + { + "epoch": 0.492861849330112, + "grad_norm": 2.343996047973633, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7085971236228943, + "num_tokens": 112247060.0, + "step": 4488 + }, + { + "epoch": 0.49297166703272566, + "grad_norm": 2.2049200534820557, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7102317214012146, + "num_tokens": 112271688.0, + "step": 4489 + }, + { + "epoch": 0.49308148473533936, + "grad_norm": 2.337684154510498, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7118273377418518, + "num_tokens": 112294400.0, + "step": 4490 + }, + { + "epoch": 0.493191302437953, + "grad_norm": 2.198582649230957, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7000613212585449, + "num_tokens": 112319822.0, + "step": 4491 + }, + { + "epoch": 0.49330112014056665, + "grad_norm": 2.10384464263916, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7012611627578735, + "num_tokens": 112347738.0, + "step": 4492 + }, + { + "epoch": 0.4934109378431803, + "grad_norm": 2.199767827987671, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6960954666137695, + "num_tokens": 112373694.0, + "step": 4493 + }, + { + "epoch": 0.493520755545794, + "grad_norm": 2.577141284942627, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7276896238327026, + "num_tokens": 112392785.0, + "step": 4494 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 2.312056303024292, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6829781532287598, + "num_tokens": 112416217.0, + "step": 4495 + }, + { + "epoch": 0.4937403909510213, + "grad_norm": 1.9497363567352295, + "learning_rate": 1e-06, + "loss": 1.0603, + "mean_token_accuracy": 0.6802642345428467, + "num_tokens": 112451379.0, + "step": 4496 + }, + { + "epoch": 0.493850208653635, + "grad_norm": 2.3885786533355713, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7006300687789917, + "num_tokens": 112474519.0, + "step": 4497 + }, + { + "epoch": 0.49396002635624864, + "grad_norm": 2.476818799972534, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7233788967132568, + "num_tokens": 112495981.0, + "step": 4498 + }, + { + "epoch": 0.4940698440588623, + "grad_norm": 2.256446599960327, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7136144638061523, + "num_tokens": 112519136.0, + "step": 4499 + }, + { + "epoch": 0.49417966176147593, + "grad_norm": 2.177734851837158, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6906816959381104, + "num_tokens": 112546494.0, + "step": 4500 + }, + { + "epoch": 0.49428947946408963, + "grad_norm": 2.217092275619507, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.701428234577179, + "num_tokens": 112570269.0, + "step": 4501 + }, + { + "epoch": 0.4943992971667033, + "grad_norm": 2.0633537769317627, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.700047492980957, + "num_tokens": 112599474.0, + "step": 4502 + }, + { + "epoch": 0.4945091148693169, + "grad_norm": 2.2217140197753906, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7077343463897705, + "num_tokens": 112624866.0, + "step": 4503 + }, + { + "epoch": 0.49461893257193057, + "grad_norm": 2.27691388130188, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7166191339492798, + "num_tokens": 112647547.0, + "step": 4504 + }, + { + "epoch": 0.49472875027454427, + "grad_norm": 2.284290313720703, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7342530488967896, + "num_tokens": 112670150.0, + "step": 4505 + }, + { + "epoch": 0.4948385679771579, + "grad_norm": 2.203270196914673, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7115070819854736, + "num_tokens": 112692140.0, + "step": 4506 + }, + { + "epoch": 0.49494838567977156, + "grad_norm": 2.231271266937256, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6881982088088989, + "num_tokens": 112717157.0, + "step": 4507 + }, + { + "epoch": 0.49505820338238526, + "grad_norm": 2.376875400543213, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7132437229156494, + "num_tokens": 112738663.0, + "step": 4508 + }, + { + "epoch": 0.4951680210849989, + "grad_norm": 2.019458055496216, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7215224504470825, + "num_tokens": 112766800.0, + "step": 4509 + }, + { + "epoch": 0.49527783878761256, + "grad_norm": 2.325385093688965, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6816962361335754, + "num_tokens": 112793259.0, + "step": 4510 + }, + { + "epoch": 0.4953876564902262, + "grad_norm": 2.073878288269043, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7092614769935608, + "num_tokens": 112821944.0, + "step": 4511 + }, + { + "epoch": 0.4954974741928399, + "grad_norm": 2.410132884979248, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.694564938545227, + "num_tokens": 112845491.0, + "step": 4512 + }, + { + "epoch": 0.49560729189545355, + "grad_norm": 2.342639923095703, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6815757751464844, + "num_tokens": 112868742.0, + "step": 4513 + }, + { + "epoch": 0.4957171095980672, + "grad_norm": 2.2229907512664795, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7457956075668335, + "num_tokens": 112890468.0, + "step": 4514 + }, + { + "epoch": 0.4958269273006809, + "grad_norm": 2.455698013305664, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7126003503799438, + "num_tokens": 112910570.0, + "step": 4515 + }, + { + "epoch": 0.49593674500329454, + "grad_norm": 2.1347668170928955, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7099449634552002, + "num_tokens": 112938536.0, + "step": 4516 + }, + { + "epoch": 0.4960465627059082, + "grad_norm": 2.359311103820801, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7031038999557495, + "num_tokens": 112959498.0, + "step": 4517 + }, + { + "epoch": 0.49615638040852184, + "grad_norm": 2.2561957836151123, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7084460258483887, + "num_tokens": 112984487.0, + "step": 4518 + }, + { + "epoch": 0.49626619811113554, + "grad_norm": 2.3417246341705322, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7051105499267578, + "num_tokens": 113006601.0, + "step": 4519 + }, + { + "epoch": 0.4963760158137492, + "grad_norm": 2.491849184036255, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.724855899810791, + "num_tokens": 113027360.0, + "step": 4520 + }, + { + "epoch": 0.49648583351636283, + "grad_norm": 2.1012582778930664, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7060811519622803, + "num_tokens": 113055047.0, + "step": 4521 + }, + { + "epoch": 0.4965956512189765, + "grad_norm": 2.14663028717041, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6921021938323975, + "num_tokens": 113085300.0, + "step": 4522 + }, + { + "epoch": 0.4967054689215902, + "grad_norm": 2.33412504196167, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.699425458908081, + "num_tokens": 113109243.0, + "step": 4523 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 2.0373566150665283, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7173531651496887, + "num_tokens": 113138113.0, + "step": 4524 + }, + { + "epoch": 0.49692510432681747, + "grad_norm": 2.373058319091797, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7285991907119751, + "num_tokens": 113159181.0, + "step": 4525 + }, + { + "epoch": 0.49703492202943117, + "grad_norm": 2.394381284713745, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7167668342590332, + "num_tokens": 113181464.0, + "step": 4526 + }, + { + "epoch": 0.4971447397320448, + "grad_norm": 2.20095157623291, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7061747312545776, + "num_tokens": 113205683.0, + "step": 4527 + }, + { + "epoch": 0.49725455743465846, + "grad_norm": 2.4688267707824707, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7004574537277222, + "num_tokens": 113225886.0, + "step": 4528 + }, + { + "epoch": 0.4973643751372721, + "grad_norm": 2.2653462886810303, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7179480791091919, + "num_tokens": 113248342.0, + "step": 4529 + }, + { + "epoch": 0.4974741928398858, + "grad_norm": 2.4318106174468994, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7192859053611755, + "num_tokens": 113269535.0, + "step": 4530 + }, + { + "epoch": 0.49758401054249946, + "grad_norm": 2.3067498207092285, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7106616497039795, + "num_tokens": 113292380.0, + "step": 4531 + }, + { + "epoch": 0.4976938282451131, + "grad_norm": 2.2653968334198, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7046157717704773, + "num_tokens": 113317397.0, + "step": 4532 + }, + { + "epoch": 0.49780364594772675, + "grad_norm": 2.299535036087036, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7019994258880615, + "num_tokens": 113340878.0, + "step": 4533 + }, + { + "epoch": 0.49791346365034045, + "grad_norm": 2.519420623779297, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7382127046585083, + "num_tokens": 113359933.0, + "step": 4534 + }, + { + "epoch": 0.4980232813529541, + "grad_norm": 2.3662161827087402, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6996197700500488, + "num_tokens": 113381343.0, + "step": 4535 + }, + { + "epoch": 0.49813309905556774, + "grad_norm": 2.224501848220825, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7228780388832092, + "num_tokens": 113406365.0, + "step": 4536 + }, + { + "epoch": 0.49824291675818144, + "grad_norm": 2.3274238109588623, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.708329439163208, + "num_tokens": 113430361.0, + "step": 4537 + }, + { + "epoch": 0.4983527344607951, + "grad_norm": 2.3600330352783203, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7108260989189148, + "num_tokens": 113451882.0, + "step": 4538 + }, + { + "epoch": 0.49846255216340873, + "grad_norm": 2.0818800926208496, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7169255018234253, + "num_tokens": 113480132.0, + "step": 4539 + }, + { + "epoch": 0.4985723698660224, + "grad_norm": 2.2348668575286865, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6878586411476135, + "num_tokens": 113505734.0, + "step": 4540 + }, + { + "epoch": 0.4986821875686361, + "grad_norm": 2.2009940147399902, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7055807113647461, + "num_tokens": 113530624.0, + "step": 4541 + }, + { + "epoch": 0.4987920052712497, + "grad_norm": 2.6993494033813477, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7090229988098145, + "num_tokens": 113548335.0, + "step": 4542 + }, + { + "epoch": 0.4989018229738634, + "grad_norm": 2.415433645248413, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.6998676657676697, + "num_tokens": 113570033.0, + "step": 4543 + }, + { + "epoch": 0.4990116406764771, + "grad_norm": 2.083272933959961, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.713023841381073, + "num_tokens": 113596619.0, + "step": 4544 + }, + { + "epoch": 0.4991214583790907, + "grad_norm": 2.594754695892334, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7069056630134583, + "num_tokens": 113616835.0, + "step": 4545 + }, + { + "epoch": 0.49923127608170437, + "grad_norm": 2.180718421936035, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7004081010818481, + "num_tokens": 113643555.0, + "step": 4546 + }, + { + "epoch": 0.499341093784318, + "grad_norm": 2.2499890327453613, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6868503093719482, + "num_tokens": 113670556.0, + "step": 4547 + }, + { + "epoch": 0.4994509114869317, + "grad_norm": 2.1360526084899902, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7033147215843201, + "num_tokens": 113697876.0, + "step": 4548 + }, + { + "epoch": 0.49956072918954536, + "grad_norm": 2.1850106716156006, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6947528123855591, + "num_tokens": 113722328.0, + "step": 4549 + }, + { + "epoch": 0.499670546892159, + "grad_norm": 1.9914507865905762, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7044357061386108, + "num_tokens": 113750311.0, + "step": 4550 + }, + { + "epoch": 0.49978036459477265, + "grad_norm": 2.0587525367736816, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.679477334022522, + "num_tokens": 113778770.0, + "step": 4551 + }, + { + "epoch": 0.49989018229738635, + "grad_norm": 2.30659556388855, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7057060599327087, + "num_tokens": 113801430.0, + "step": 4552 + }, + { + "epoch": 0.5, + "grad_norm": 2.0586607456207275, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7058526873588562, + "num_tokens": 113828571.0, + "step": 4553 + }, + { + "epoch": 0.5001098177026136, + "grad_norm": 2.179313898086548, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7107158899307251, + "num_tokens": 113853404.0, + "step": 4554 + }, + { + "epoch": 0.5002196354052273, + "grad_norm": 2.1635560989379883, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7101274728775024, + "num_tokens": 113878267.0, + "step": 4555 + }, + { + "epoch": 0.5003294531078409, + "grad_norm": 2.2705743312835693, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6781500577926636, + "num_tokens": 113903218.0, + "step": 4556 + }, + { + "epoch": 0.5004392708104547, + "grad_norm": 2.4905567169189453, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.688654363155365, + "num_tokens": 113927670.0, + "step": 4557 + }, + { + "epoch": 0.5005490885130683, + "grad_norm": 2.0255534648895264, + "learning_rate": 1e-06, + "loss": 1.0984, + "mean_token_accuracy": 0.6718971729278564, + "num_tokens": 113957205.0, + "step": 4558 + }, + { + "epoch": 0.500658906215682, + "grad_norm": 2.18290638923645, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7029150724411011, + "num_tokens": 113983214.0, + "step": 4559 + }, + { + "epoch": 0.5007687239182956, + "grad_norm": 2.773516893386841, + "learning_rate": 1e-06, + "loss": 0.7781, + "mean_token_accuracy": 0.7542842626571655, + "num_tokens": 113998538.0, + "step": 4560 + }, + { + "epoch": 0.5008785416209093, + "grad_norm": 2.3863327503204346, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7049458026885986, + "num_tokens": 114020128.0, + "step": 4561 + }, + { + "epoch": 0.5009883593235229, + "grad_norm": 2.438737154006958, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7068766951560974, + "num_tokens": 114039931.0, + "step": 4562 + }, + { + "epoch": 0.5010981770261366, + "grad_norm": 2.166778564453125, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.71629399061203, + "num_tokens": 114064019.0, + "step": 4563 + }, + { + "epoch": 0.5012079947287503, + "grad_norm": 2.4709970951080322, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.7127668261528015, + "num_tokens": 114083817.0, + "step": 4564 + }, + { + "epoch": 0.501317812431364, + "grad_norm": 2.208946943283081, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6966605186462402, + "num_tokens": 114108939.0, + "step": 4565 + }, + { + "epoch": 0.5014276301339776, + "grad_norm": 2.5397274494171143, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6940666437149048, + "num_tokens": 114129511.0, + "step": 4566 + }, + { + "epoch": 0.5015374478365913, + "grad_norm": 2.5811171531677246, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7270759344100952, + "num_tokens": 114150066.0, + "step": 4567 + }, + { + "epoch": 0.5016472655392049, + "grad_norm": 2.356640577316284, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7012178301811218, + "num_tokens": 114173853.0, + "step": 4568 + }, + { + "epoch": 0.5017570832418186, + "grad_norm": 2.2681047916412354, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7181810140609741, + "num_tokens": 114197645.0, + "step": 4569 + }, + { + "epoch": 0.5018669009444322, + "grad_norm": 2.5199451446533203, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7054011821746826, + "num_tokens": 114218285.0, + "step": 4570 + }, + { + "epoch": 0.5019767186470458, + "grad_norm": 2.404482841491699, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.717162013053894, + "num_tokens": 114240292.0, + "step": 4571 + }, + { + "epoch": 0.5020865363496596, + "grad_norm": 2.176028251647949, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6845847964286804, + "num_tokens": 114267316.0, + "step": 4572 + }, + { + "epoch": 0.5021963540522733, + "grad_norm": 2.1988654136657715, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6905712485313416, + "num_tokens": 114292724.0, + "step": 4573 + }, + { + "epoch": 0.5023061717548869, + "grad_norm": 2.4058127403259277, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6939576268196106, + "num_tokens": 114315115.0, + "step": 4574 + }, + { + "epoch": 0.5024159894575005, + "grad_norm": 2.168483257293701, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6976274251937866, + "num_tokens": 114342300.0, + "step": 4575 + }, + { + "epoch": 0.5025258071601142, + "grad_norm": 2.615396022796631, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7057831883430481, + "num_tokens": 114362859.0, + "step": 4576 + }, + { + "epoch": 0.5026356248627278, + "grad_norm": 2.2716879844665527, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7017782926559448, + "num_tokens": 114386119.0, + "step": 4577 + }, + { + "epoch": 0.5027454425653415, + "grad_norm": 2.517066717147827, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7276511788368225, + "num_tokens": 114406274.0, + "step": 4578 + }, + { + "epoch": 0.5028552602679552, + "grad_norm": 2.5559425354003906, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7177644371986389, + "num_tokens": 114426986.0, + "step": 4579 + }, + { + "epoch": 0.5029650779705689, + "grad_norm": 2.3303685188293457, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6951080560684204, + "num_tokens": 114449568.0, + "step": 4580 + }, + { + "epoch": 0.5030748956731825, + "grad_norm": 2.0928843021392822, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7267221212387085, + "num_tokens": 114474521.0, + "step": 4581 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 2.254500389099121, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7108529806137085, + "num_tokens": 114498269.0, + "step": 4582 + }, + { + "epoch": 0.5032945310784098, + "grad_norm": 2.3941256999969482, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7005730867385864, + "num_tokens": 114521763.0, + "step": 4583 + }, + { + "epoch": 0.5034043487810235, + "grad_norm": 2.3619234561920166, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7085753679275513, + "num_tokens": 114544230.0, + "step": 4584 + }, + { + "epoch": 0.5035141664836371, + "grad_norm": 1.8918782472610474, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7130997180938721, + "num_tokens": 114575924.0, + "step": 4585 + }, + { + "epoch": 0.5036239841862509, + "grad_norm": 1.9961310625076294, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6787518858909607, + "num_tokens": 114607499.0, + "step": 4586 + }, + { + "epoch": 0.5037338018888645, + "grad_norm": 1.9953505992889404, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.69459068775177, + "num_tokens": 114635907.0, + "step": 4587 + }, + { + "epoch": 0.5038436195914782, + "grad_norm": 2.2040021419525146, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7063027024269104, + "num_tokens": 114661091.0, + "step": 4588 + }, + { + "epoch": 0.5039534372940918, + "grad_norm": 2.1761741638183594, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6944829225540161, + "num_tokens": 114687468.0, + "step": 4589 + }, + { + "epoch": 0.5040632549967055, + "grad_norm": 1.8332843780517578, + "learning_rate": 1e-06, + "loss": 1.142, + "mean_token_accuracy": 0.6718308925628662, + "num_tokens": 114725287.0, + "step": 4590 + }, + { + "epoch": 0.5041730726993191, + "grad_norm": 2.009006977081299, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7017351984977722, + "num_tokens": 114755827.0, + "step": 4591 + }, + { + "epoch": 0.5042828904019327, + "grad_norm": 2.320387125015259, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7171790599822998, + "num_tokens": 114779398.0, + "step": 4592 + }, + { + "epoch": 0.5043927081045465, + "grad_norm": 2.196812629699707, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7086442112922668, + "num_tokens": 114805074.0, + "step": 4593 + }, + { + "epoch": 0.5045025258071602, + "grad_norm": 2.494338035583496, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7117182016372681, + "num_tokens": 114827779.0, + "step": 4594 + }, + { + "epoch": 0.5046123435097738, + "grad_norm": 2.231765031814575, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6927881836891174, + "num_tokens": 114854778.0, + "step": 4595 + }, + { + "epoch": 0.5047221612123874, + "grad_norm": 2.1283321380615234, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7283127903938293, + "num_tokens": 114881104.0, + "step": 4596 + }, + { + "epoch": 0.5048319789150011, + "grad_norm": 2.202324151992798, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7032438516616821, + "num_tokens": 114906703.0, + "step": 4597 + }, + { + "epoch": 0.5049417966176147, + "grad_norm": 2.320047616958618, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7016229629516602, + "num_tokens": 114929553.0, + "step": 4598 + }, + { + "epoch": 0.5050516143202284, + "grad_norm": 2.1737239360809326, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6976953148841858, + "num_tokens": 114955518.0, + "step": 4599 + }, + { + "epoch": 0.505161432022842, + "grad_norm": 2.337089776992798, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6913261413574219, + "num_tokens": 114980002.0, + "step": 4600 + }, + { + "epoch": 0.5052712497254558, + "grad_norm": 2.4165682792663574, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6939057111740112, + "num_tokens": 115002106.0, + "step": 4601 + }, + { + "epoch": 0.5053810674280694, + "grad_norm": 2.20403790473938, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7082041501998901, + "num_tokens": 115026511.0, + "step": 4602 + }, + { + "epoch": 0.5054908851306831, + "grad_norm": 2.187376022338867, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6907702684402466, + "num_tokens": 115052917.0, + "step": 4603 + }, + { + "epoch": 0.5056007028332967, + "grad_norm": 2.3219075202941895, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7166929244995117, + "num_tokens": 115076087.0, + "step": 4604 + }, + { + "epoch": 0.5057105205359104, + "grad_norm": 2.3073952198028564, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7253148555755615, + "num_tokens": 115099950.0, + "step": 4605 + }, + { + "epoch": 0.505820338238524, + "grad_norm": 2.1946518421173096, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6880689263343811, + "num_tokens": 115127251.0, + "step": 4606 + }, + { + "epoch": 0.5059301559411377, + "grad_norm": 2.141798734664917, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.709307074546814, + "num_tokens": 115153519.0, + "step": 4607 + }, + { + "epoch": 0.5060399736437514, + "grad_norm": 2.111696481704712, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.693931519985199, + "num_tokens": 115179994.0, + "step": 4608 + }, + { + "epoch": 0.5061497913463651, + "grad_norm": 2.2739555835723877, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6989098787307739, + "num_tokens": 115205390.0, + "step": 4609 + }, + { + "epoch": 0.5062596090489787, + "grad_norm": 2.0267767906188965, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6985734701156616, + "num_tokens": 115236103.0, + "step": 4610 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 2.196312189102173, + "learning_rate": 1e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.6691403985023499, + "num_tokens": 115264604.0, + "step": 4611 + }, + { + "epoch": 0.506479244454206, + "grad_norm": 1.9377000331878662, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6923398375511169, + "num_tokens": 115297116.0, + "step": 4612 + }, + { + "epoch": 0.5065890621568196, + "grad_norm": 2.0514976978302, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7240830659866333, + "num_tokens": 115324682.0, + "step": 4613 + }, + { + "epoch": 0.5066988798594333, + "grad_norm": 2.2304182052612305, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7073869109153748, + "num_tokens": 115349795.0, + "step": 4614 + }, + { + "epoch": 0.506808697562047, + "grad_norm": 2.2137508392333984, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6797871589660645, + "num_tokens": 115376526.0, + "step": 4615 + }, + { + "epoch": 0.5069185152646607, + "grad_norm": 2.2989795207977295, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.687645673751831, + "num_tokens": 115401043.0, + "step": 4616 + }, + { + "epoch": 0.5070283329672743, + "grad_norm": 2.098820447921753, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6897198557853699, + "num_tokens": 115427449.0, + "step": 4617 + }, + { + "epoch": 0.507138150669888, + "grad_norm": 2.112589120864868, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6884071826934814, + "num_tokens": 115454951.0, + "step": 4618 + }, + { + "epoch": 0.5072479683725016, + "grad_norm": 2.2349915504455566, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7131597995758057, + "num_tokens": 115480618.0, + "step": 4619 + }, + { + "epoch": 0.5073577860751153, + "grad_norm": 2.339156150817871, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7263585329055786, + "num_tokens": 115501564.0, + "step": 4620 + }, + { + "epoch": 0.5074676037777289, + "grad_norm": 2.3917746543884277, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7031986117362976, + "num_tokens": 115523384.0, + "step": 4621 + }, + { + "epoch": 0.5075774214803427, + "grad_norm": 2.4350216388702393, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6902459263801575, + "num_tokens": 115546377.0, + "step": 4622 + }, + { + "epoch": 0.5076872391829563, + "grad_norm": 2.073176622390747, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7092729806900024, + "num_tokens": 115573213.0, + "step": 4623 + }, + { + "epoch": 0.50779705688557, + "grad_norm": 2.2037317752838135, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7066432237625122, + "num_tokens": 115597189.0, + "step": 4624 + }, + { + "epoch": 0.5079068745881836, + "grad_norm": 2.0707101821899414, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7030118703842163, + "num_tokens": 115626324.0, + "step": 4625 + }, + { + "epoch": 0.5080166922907973, + "grad_norm": 2.3789987564086914, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6955758333206177, + "num_tokens": 115649121.0, + "step": 4626 + }, + { + "epoch": 0.5081265099934109, + "grad_norm": 2.2449545860290527, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6874991655349731, + "num_tokens": 115674446.0, + "step": 4627 + }, + { + "epoch": 0.5082363276960246, + "grad_norm": 2.0226829051971436, + "learning_rate": 1e-06, + "loss": 1.0732, + "mean_token_accuracy": 0.6802746057510376, + "num_tokens": 115703200.0, + "step": 4628 + }, + { + "epoch": 0.5083461453986382, + "grad_norm": 1.9610710144042969, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7093526721000671, + "num_tokens": 115733502.0, + "step": 4629 + }, + { + "epoch": 0.508455963101252, + "grad_norm": 2.1671135425567627, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7049548625946045, + "num_tokens": 115760180.0, + "step": 4630 + }, + { + "epoch": 0.5085657808038656, + "grad_norm": 2.163048505783081, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.688300371170044, + "num_tokens": 115788417.0, + "step": 4631 + }, + { + "epoch": 0.5086755985064793, + "grad_norm": 2.3166370391845703, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.701556921005249, + "num_tokens": 115813396.0, + "step": 4632 + }, + { + "epoch": 0.5087854162090929, + "grad_norm": 2.3934707641601562, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7191697955131531, + "num_tokens": 115835793.0, + "step": 4633 + }, + { + "epoch": 0.5088952339117065, + "grad_norm": 2.4107632637023926, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7043600082397461, + "num_tokens": 115859083.0, + "step": 4634 + }, + { + "epoch": 0.5090050516143202, + "grad_norm": 2.523132085800171, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.726656436920166, + "num_tokens": 115879159.0, + "step": 4635 + }, + { + "epoch": 0.5091148693169338, + "grad_norm": 2.4032626152038574, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7050869464874268, + "num_tokens": 115901561.0, + "step": 4636 + }, + { + "epoch": 0.5092246870195476, + "grad_norm": 2.2204573154449463, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6861991882324219, + "num_tokens": 115927688.0, + "step": 4637 + }, + { + "epoch": 0.5093345047221612, + "grad_norm": 2.07547926902771, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7163811326026917, + "num_tokens": 115953740.0, + "step": 4638 + }, + { + "epoch": 0.5094443224247749, + "grad_norm": 1.9111348390579224, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6963803768157959, + "num_tokens": 115986730.0, + "step": 4639 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 2.2784197330474854, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6895660161972046, + "num_tokens": 116011828.0, + "step": 4640 + }, + { + "epoch": 0.5096639578300022, + "grad_norm": 2.261441469192505, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7044613361358643, + "num_tokens": 116036371.0, + "step": 4641 + }, + { + "epoch": 0.5097737755326158, + "grad_norm": 2.1432125568389893, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7003668546676636, + "num_tokens": 116063116.0, + "step": 4642 + }, + { + "epoch": 0.5098835932352295, + "grad_norm": 1.9451295137405396, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.715947151184082, + "num_tokens": 116094225.0, + "step": 4643 + }, + { + "epoch": 0.5099934109378432, + "grad_norm": 2.458127498626709, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7125547528266907, + "num_tokens": 116115321.0, + "step": 4644 + }, + { + "epoch": 0.5101032286404569, + "grad_norm": 1.9691228866577148, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7033241987228394, + "num_tokens": 116146752.0, + "step": 4645 + }, + { + "epoch": 0.5102130463430705, + "grad_norm": 2.5187745094299316, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6859219074249268, + "num_tokens": 116168496.0, + "step": 4646 + }, + { + "epoch": 0.5103228640456842, + "grad_norm": 2.594008207321167, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6992116570472717, + "num_tokens": 116189297.0, + "step": 4647 + }, + { + "epoch": 0.5104326817482978, + "grad_norm": 2.001960515975952, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7004700303077698, + "num_tokens": 116217805.0, + "step": 4648 + }, + { + "epoch": 0.5105424994509115, + "grad_norm": 2.3115670680999756, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6895838975906372, + "num_tokens": 116241089.0, + "step": 4649 + }, + { + "epoch": 0.5106523171535251, + "grad_norm": 2.0796287059783936, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7031400799751282, + "num_tokens": 116268036.0, + "step": 4650 + }, + { + "epoch": 0.5107621348561389, + "grad_norm": 2.2777726650238037, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7331048250198364, + "num_tokens": 116290197.0, + "step": 4651 + }, + { + "epoch": 0.5108719525587525, + "grad_norm": 2.4444165229797363, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6945376396179199, + "num_tokens": 116311354.0, + "step": 4652 + }, + { + "epoch": 0.5109817702613662, + "grad_norm": 2.2077810764312744, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7093291878700256, + "num_tokens": 116336952.0, + "step": 4653 + }, + { + "epoch": 0.5110915879639798, + "grad_norm": 2.100371837615967, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7148017883300781, + "num_tokens": 116362803.0, + "step": 4654 + }, + { + "epoch": 0.5112014056665934, + "grad_norm": 2.0758461952209473, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7225393652915955, + "num_tokens": 116388284.0, + "step": 4655 + }, + { + "epoch": 0.5113112233692071, + "grad_norm": 1.8688019514083862, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7275424003601074, + "num_tokens": 116417945.0, + "step": 4656 + }, + { + "epoch": 0.5114210410718207, + "grad_norm": 1.8845880031585693, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6980222463607788, + "num_tokens": 116450547.0, + "step": 4657 + }, + { + "epoch": 0.5115308587744345, + "grad_norm": 2.192615032196045, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6832940578460693, + "num_tokens": 116477071.0, + "step": 4658 + }, + { + "epoch": 0.5116406764770481, + "grad_norm": 2.7404181957244873, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.720525324344635, + "num_tokens": 116495120.0, + "step": 4659 + }, + { + "epoch": 0.5117504941796618, + "grad_norm": 2.0761187076568604, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6928472518920898, + "num_tokens": 116525559.0, + "step": 4660 + }, + { + "epoch": 0.5118603118822754, + "grad_norm": 1.9617674350738525, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.709807813167572, + "num_tokens": 116554698.0, + "step": 4661 + }, + { + "epoch": 0.5119701295848891, + "grad_norm": 2.199639320373535, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6941617727279663, + "num_tokens": 116582906.0, + "step": 4662 + }, + { + "epoch": 0.5120799472875027, + "grad_norm": 2.049940824508667, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7036463022232056, + "num_tokens": 116612623.0, + "step": 4663 + }, + { + "epoch": 0.5121897649901164, + "grad_norm": 2.308095693588257, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7166224122047424, + "num_tokens": 116634913.0, + "step": 4664 + }, + { + "epoch": 0.51229958269273, + "grad_norm": 2.4055099487304688, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7070916295051575, + "num_tokens": 116657041.0, + "step": 4665 + }, + { + "epoch": 0.5124094003953438, + "grad_norm": 2.000110149383545, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7002904415130615, + "num_tokens": 116686989.0, + "step": 4666 + }, + { + "epoch": 0.5125192180979574, + "grad_norm": 2.3981125354766846, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.708794355392456, + "num_tokens": 116710997.0, + "step": 4667 + }, + { + "epoch": 0.5126290358005711, + "grad_norm": 2.073993444442749, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7148116827011108, + "num_tokens": 116737267.0, + "step": 4668 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 2.5258090496063232, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7352544069290161, + "num_tokens": 116755112.0, + "step": 4669 + }, + { + "epoch": 0.5128486712057984, + "grad_norm": 2.150484561920166, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.708770751953125, + "num_tokens": 116781564.0, + "step": 4670 + }, + { + "epoch": 0.512958488908412, + "grad_norm": 2.3159215450286865, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7025600671768188, + "num_tokens": 116804586.0, + "step": 4671 + }, + { + "epoch": 0.5130683066110256, + "grad_norm": 2.6548688411712646, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6937786340713501, + "num_tokens": 116825160.0, + "step": 4672 + }, + { + "epoch": 0.5131781243136394, + "grad_norm": 2.4474148750305176, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7143637537956238, + "num_tokens": 116846340.0, + "step": 4673 + }, + { + "epoch": 0.513287942016253, + "grad_norm": 2.1333119869232178, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6949877738952637, + "num_tokens": 116874005.0, + "step": 4674 + }, + { + "epoch": 0.5133977597188667, + "grad_norm": 2.4718708992004395, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7165913581848145, + "num_tokens": 116894697.0, + "step": 4675 + }, + { + "epoch": 0.5135075774214803, + "grad_norm": 2.4267401695251465, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7108098864555359, + "num_tokens": 116916403.0, + "step": 4676 + }, + { + "epoch": 0.513617395124094, + "grad_norm": 1.9666475057601929, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7154092788696289, + "num_tokens": 116945722.0, + "step": 4677 + }, + { + "epoch": 0.5137272128267076, + "grad_norm": 2.303490161895752, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7298212647438049, + "num_tokens": 116969154.0, + "step": 4678 + }, + { + "epoch": 0.5138370305293213, + "grad_norm": 2.8195934295654297, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7190210819244385, + "num_tokens": 116985745.0, + "step": 4679 + }, + { + "epoch": 0.513946848231935, + "grad_norm": 2.650871515274048, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7134780287742615, + "num_tokens": 117005068.0, + "step": 4680 + }, + { + "epoch": 0.5140566659345487, + "grad_norm": 2.5427069664001465, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7291682958602905, + "num_tokens": 117023798.0, + "step": 4681 + }, + { + "epoch": 0.5141664836371623, + "grad_norm": 2.427283763885498, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7101963758468628, + "num_tokens": 117044763.0, + "step": 4682 + }, + { + "epoch": 0.514276301339776, + "grad_norm": 2.2123680114746094, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7070443034172058, + "num_tokens": 117072077.0, + "step": 4683 + }, + { + "epoch": 0.5143861190423896, + "grad_norm": 2.1752660274505615, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7197524309158325, + "num_tokens": 117097812.0, + "step": 4684 + }, + { + "epoch": 0.5144959367450033, + "grad_norm": 2.3787269592285156, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7358565330505371, + "num_tokens": 117120498.0, + "step": 4685 + }, + { + "epoch": 0.5146057544476169, + "grad_norm": 2.0357747077941895, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7074952125549316, + "num_tokens": 117149122.0, + "step": 4686 + }, + { + "epoch": 0.5147155721502307, + "grad_norm": 2.428313732147217, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.7005797028541565, + "num_tokens": 117171653.0, + "step": 4687 + }, + { + "epoch": 0.5148253898528443, + "grad_norm": 2.6540703773498535, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6920051574707031, + "num_tokens": 117191705.0, + "step": 4688 + }, + { + "epoch": 0.514935207555458, + "grad_norm": 2.1886675357818604, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6926331520080566, + "num_tokens": 117216988.0, + "step": 4689 + }, + { + "epoch": 0.5150450252580716, + "grad_norm": 2.1890347003936768, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7201054692268372, + "num_tokens": 117242063.0, + "step": 4690 + }, + { + "epoch": 0.5151548429606853, + "grad_norm": 2.1064867973327637, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7123509645462036, + "num_tokens": 117269443.0, + "step": 4691 + }, + { + "epoch": 0.5152646606632989, + "grad_norm": 1.9886903762817383, + "learning_rate": 1e-06, + "loss": 1.062, + "mean_token_accuracy": 0.6753589510917664, + "num_tokens": 117299535.0, + "step": 4692 + }, + { + "epoch": 0.5153744783659125, + "grad_norm": 2.447530508041382, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6792556047439575, + "num_tokens": 117322625.0, + "step": 4693 + }, + { + "epoch": 0.5154842960685262, + "grad_norm": 2.6768412590026855, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7262574434280396, + "num_tokens": 117340897.0, + "step": 4694 + }, + { + "epoch": 0.51559411377114, + "grad_norm": 2.4329874515533447, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.6975827217102051, + "num_tokens": 117363908.0, + "step": 4695 + }, + { + "epoch": 0.5157039314737536, + "grad_norm": 2.4503560066223145, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.726398229598999, + "num_tokens": 117383283.0, + "step": 4696 + }, + { + "epoch": 0.5158137491763672, + "grad_norm": 2.1983487606048584, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7229731678962708, + "num_tokens": 117408463.0, + "step": 4697 + }, + { + "epoch": 0.5159235668789809, + "grad_norm": 2.21832013130188, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.734797477722168, + "num_tokens": 117432455.0, + "step": 4698 + }, + { + "epoch": 0.5160333845815945, + "grad_norm": 2.0431296825408936, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6901869773864746, + "num_tokens": 117465196.0, + "step": 4699 + }, + { + "epoch": 0.5161432022842082, + "grad_norm": 2.119799852371216, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7212820053100586, + "num_tokens": 117491508.0, + "step": 4700 + }, + { + "epoch": 0.5162530199868218, + "grad_norm": 2.2329115867614746, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.698401689529419, + "num_tokens": 117516652.0, + "step": 4701 + }, + { + "epoch": 0.5163628376894356, + "grad_norm": 2.280722141265869, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7070354223251343, + "num_tokens": 117541520.0, + "step": 4702 + }, + { + "epoch": 0.5164726553920492, + "grad_norm": 2.1887941360473633, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7208670377731323, + "num_tokens": 117567280.0, + "step": 4703 + }, + { + "epoch": 0.5165824730946629, + "grad_norm": 1.9106329679489136, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6962552070617676, + "num_tokens": 117599210.0, + "step": 4704 + }, + { + "epoch": 0.5166922907972765, + "grad_norm": 2.246699571609497, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7163637280464172, + "num_tokens": 117622558.0, + "step": 4705 + }, + { + "epoch": 0.5168021084998902, + "grad_norm": 2.1060407161712646, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7131166458129883, + "num_tokens": 117648431.0, + "step": 4706 + }, + { + "epoch": 0.5169119262025038, + "grad_norm": 2.317582607269287, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7138000726699829, + "num_tokens": 117670668.0, + "step": 4707 + }, + { + "epoch": 0.5170217439051175, + "grad_norm": 2.6471385955810547, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7190748453140259, + "num_tokens": 117691506.0, + "step": 4708 + }, + { + "epoch": 0.5171315616077312, + "grad_norm": 2.379570484161377, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7230150699615479, + "num_tokens": 117712608.0, + "step": 4709 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 2.491887331008911, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.727791965007782, + "num_tokens": 117731611.0, + "step": 4710 + }, + { + "epoch": 0.5173511970129585, + "grad_norm": 2.098541021347046, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7224510908126831, + "num_tokens": 117758644.0, + "step": 4711 + }, + { + "epoch": 0.5174610147155722, + "grad_norm": 2.849294900894165, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7053810358047485, + "num_tokens": 117779770.0, + "step": 4712 + }, + { + "epoch": 0.5175708324181858, + "grad_norm": 2.570314407348633, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.72845858335495, + "num_tokens": 117798741.0, + "step": 4713 + }, + { + "epoch": 0.5176806501207994, + "grad_norm": 2.1677064895629883, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6939705610275269, + "num_tokens": 117823849.0, + "step": 4714 + }, + { + "epoch": 0.5177904678234131, + "grad_norm": 2.306519031524658, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7080371379852295, + "num_tokens": 117848175.0, + "step": 4715 + }, + { + "epoch": 0.5179002855260268, + "grad_norm": 2.4463374614715576, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7073580622673035, + "num_tokens": 117869543.0, + "step": 4716 + }, + { + "epoch": 0.5180101032286405, + "grad_norm": 2.1052656173706055, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6897612810134888, + "num_tokens": 117898160.0, + "step": 4717 + }, + { + "epoch": 0.5181199209312541, + "grad_norm": 2.532313823699951, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7077214121818542, + "num_tokens": 117919465.0, + "step": 4718 + }, + { + "epoch": 0.5182297386338678, + "grad_norm": 2.0073623657226562, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7024766206741333, + "num_tokens": 117950125.0, + "step": 4719 + }, + { + "epoch": 0.5183395563364814, + "grad_norm": 2.092503309249878, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.705028235912323, + "num_tokens": 117976866.0, + "step": 4720 + }, + { + "epoch": 0.5184493740390951, + "grad_norm": 1.8322532176971436, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7057552933692932, + "num_tokens": 118008379.0, + "step": 4721 + }, + { + "epoch": 0.5185591917417087, + "grad_norm": 2.1017019748687744, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7209072113037109, + "num_tokens": 118037017.0, + "step": 4722 + }, + { + "epoch": 0.5186690094443224, + "grad_norm": 2.1327919960021973, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7005801200866699, + "num_tokens": 118063756.0, + "step": 4723 + }, + { + "epoch": 0.5187788271469361, + "grad_norm": 2.291780471801758, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6969616413116455, + "num_tokens": 118087964.0, + "step": 4724 + }, + { + "epoch": 0.5188886448495498, + "grad_norm": 2.1008970737457275, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6930069923400879, + "num_tokens": 118117675.0, + "step": 4725 + }, + { + "epoch": 0.5189984625521634, + "grad_norm": 2.5912084579467773, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7226965427398682, + "num_tokens": 118136651.0, + "step": 4726 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 2.4796245098114014, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7195308208465576, + "num_tokens": 118157246.0, + "step": 4727 + }, + { + "epoch": 0.5192180979573907, + "grad_norm": 2.1508352756500244, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6951335668563843, + "num_tokens": 118185074.0, + "step": 4728 + }, + { + "epoch": 0.5193279156600044, + "grad_norm": 2.246131658554077, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6993932723999023, + "num_tokens": 118210903.0, + "step": 4729 + }, + { + "epoch": 0.519437733362618, + "grad_norm": 2.394901990890503, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7170343399047852, + "num_tokens": 118231969.0, + "step": 4730 + }, + { + "epoch": 0.5195475510652318, + "grad_norm": 2.22408127784729, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7294036746025085, + "num_tokens": 118255516.0, + "step": 4731 + }, + { + "epoch": 0.5196573687678454, + "grad_norm": 2.1667473316192627, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7140090465545654, + "num_tokens": 118282395.0, + "step": 4732 + }, + { + "epoch": 0.519767186470459, + "grad_norm": 2.109255790710449, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6929413080215454, + "num_tokens": 118308575.0, + "step": 4733 + }, + { + "epoch": 0.5198770041730727, + "grad_norm": 2.222583770751953, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7395890951156616, + "num_tokens": 118332588.0, + "step": 4734 + }, + { + "epoch": 0.5199868218756863, + "grad_norm": 2.306527853012085, + "learning_rate": 1e-06, + "loss": 1.1057, + "mean_token_accuracy": 0.6676392555236816, + "num_tokens": 118355577.0, + "step": 4735 + }, + { + "epoch": 0.5200966395783, + "grad_norm": 2.109036922454834, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6923823952674866, + "num_tokens": 118384555.0, + "step": 4736 + }, + { + "epoch": 0.5202064572809136, + "grad_norm": 2.2735447883605957, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.721163809299469, + "num_tokens": 118407241.0, + "step": 4737 + }, + { + "epoch": 0.5203162749835274, + "grad_norm": 2.146820306777954, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7138581275939941, + "num_tokens": 118433310.0, + "step": 4738 + }, + { + "epoch": 0.520426092686141, + "grad_norm": 2.2948076725006104, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6975849270820618, + "num_tokens": 118457336.0, + "step": 4739 + }, + { + "epoch": 0.5205359103887547, + "grad_norm": 2.501410484313965, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7046226263046265, + "num_tokens": 118479353.0, + "step": 4740 + }, + { + "epoch": 0.5206457280913683, + "grad_norm": 2.1634328365325928, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7169767022132874, + "num_tokens": 118503662.0, + "step": 4741 + }, + { + "epoch": 0.520755545793982, + "grad_norm": 2.3217735290527344, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7132905721664429, + "num_tokens": 118526252.0, + "step": 4742 + }, + { + "epoch": 0.5208653634965956, + "grad_norm": 2.0840539932250977, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7202243804931641, + "num_tokens": 118555730.0, + "step": 4743 + }, + { + "epoch": 0.5209751811992093, + "grad_norm": 2.366062879562378, + "learning_rate": 1e-06, + "loss": 1.0701, + "mean_token_accuracy": 0.6811580061912537, + "num_tokens": 118579124.0, + "step": 4744 + }, + { + "epoch": 0.521084998901823, + "grad_norm": 2.2995870113372803, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7244793176651001, + "num_tokens": 118601581.0, + "step": 4745 + }, + { + "epoch": 0.5211948166044367, + "grad_norm": 2.2735064029693604, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6878058910369873, + "num_tokens": 118626092.0, + "step": 4746 + }, + { + "epoch": 0.5213046343070503, + "grad_norm": 2.2819230556488037, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7173409461975098, + "num_tokens": 118648908.0, + "step": 4747 + }, + { + "epoch": 0.521414452009664, + "grad_norm": 2.342207431793213, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7007929682731628, + "num_tokens": 118671596.0, + "step": 4748 + }, + { + "epoch": 0.5215242697122776, + "grad_norm": 1.9039076566696167, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6910685896873474, + "num_tokens": 118706008.0, + "step": 4749 + }, + { + "epoch": 0.5216340874148913, + "grad_norm": 2.4313251972198486, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7058349847793579, + "num_tokens": 118725945.0, + "step": 4750 + }, + { + "epoch": 0.5217439051175049, + "grad_norm": 2.323631525039673, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7123112678527832, + "num_tokens": 118749105.0, + "step": 4751 + }, + { + "epoch": 0.5218537228201185, + "grad_norm": 2.302889823913574, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7187209129333496, + "num_tokens": 118773080.0, + "step": 4752 + }, + { + "epoch": 0.5219635405227323, + "grad_norm": 2.1360559463500977, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.687740683555603, + "num_tokens": 118800689.0, + "step": 4753 + }, + { + "epoch": 0.522073358225346, + "grad_norm": 2.0454752445220947, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7029412984848022, + "num_tokens": 118830907.0, + "step": 4754 + }, + { + "epoch": 0.5221831759279596, + "grad_norm": 1.9752249717712402, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6885561347007751, + "num_tokens": 118863521.0, + "step": 4755 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 2.262404680252075, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.70039963722229, + "num_tokens": 118887442.0, + "step": 4756 + }, + { + "epoch": 0.5224028113331869, + "grad_norm": 2.1027302742004395, + "learning_rate": 1e-06, + "loss": 1.0854, + "mean_token_accuracy": 0.6735857129096985, + "num_tokens": 118917413.0, + "step": 4757 + }, + { + "epoch": 0.5225126290358005, + "grad_norm": 2.063401460647583, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6920729875564575, + "num_tokens": 118944576.0, + "step": 4758 + }, + { + "epoch": 0.5226224467384142, + "grad_norm": 1.9702825546264648, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6791715621948242, + "num_tokens": 118978310.0, + "step": 4759 + }, + { + "epoch": 0.5227322644410279, + "grad_norm": 2.132585287094116, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7294886112213135, + "num_tokens": 119004105.0, + "step": 4760 + }, + { + "epoch": 0.5228420821436416, + "grad_norm": 2.0218608379364014, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.714184045791626, + "num_tokens": 119032127.0, + "step": 4761 + }, + { + "epoch": 0.5229518998462552, + "grad_norm": 2.257934808731079, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7197533845901489, + "num_tokens": 119056451.0, + "step": 4762 + }, + { + "epoch": 0.5230617175488689, + "grad_norm": 2.2006404399871826, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7312978506088257, + "num_tokens": 119079547.0, + "step": 4763 + }, + { + "epoch": 0.5231715352514825, + "grad_norm": 2.3475944995880127, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7153095006942749, + "num_tokens": 119101840.0, + "step": 4764 + }, + { + "epoch": 0.5232813529540962, + "grad_norm": 2.5228090286254883, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7181816697120667, + "num_tokens": 119120561.0, + "step": 4765 + }, + { + "epoch": 0.5233911706567098, + "grad_norm": 2.361513137817383, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.7005045413970947, + "num_tokens": 119144325.0, + "step": 4766 + }, + { + "epoch": 0.5235009883593236, + "grad_norm": 2.1209380626678467, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7014741897583008, + "num_tokens": 119171218.0, + "step": 4767 + }, + { + "epoch": 0.5236108060619372, + "grad_norm": 2.5546274185180664, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7035163044929504, + "num_tokens": 119190792.0, + "step": 4768 + }, + { + "epoch": 0.5237206237645509, + "grad_norm": 2.3779280185699463, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7194271087646484, + "num_tokens": 119214017.0, + "step": 4769 + }, + { + "epoch": 0.5238304414671645, + "grad_norm": 1.9682053327560425, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7112517356872559, + "num_tokens": 119243692.0, + "step": 4770 + }, + { + "epoch": 0.5239402591697782, + "grad_norm": 2.3773391246795654, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7371422052383423, + "num_tokens": 119265776.0, + "step": 4771 + }, + { + "epoch": 0.5240500768723918, + "grad_norm": 1.9070788621902466, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.725751519203186, + "num_tokens": 119296658.0, + "step": 4772 + }, + { + "epoch": 0.5241598945750054, + "grad_norm": 2.286947727203369, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7235628366470337, + "num_tokens": 119318178.0, + "step": 4773 + }, + { + "epoch": 0.5242697122776192, + "grad_norm": 2.1514997482299805, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7091571092605591, + "num_tokens": 119343321.0, + "step": 4774 + }, + { + "epoch": 0.5243795299802329, + "grad_norm": 2.192768096923828, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6983712315559387, + "num_tokens": 119370487.0, + "step": 4775 + }, + { + "epoch": 0.5244893476828465, + "grad_norm": 2.051008701324463, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6984006762504578, + "num_tokens": 119398253.0, + "step": 4776 + }, + { + "epoch": 0.5245991653854601, + "grad_norm": 2.3382890224456787, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7350385189056396, + "num_tokens": 119419534.0, + "step": 4777 + }, + { + "epoch": 0.5247089830880738, + "grad_norm": 2.1891653537750244, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.740636944770813, + "num_tokens": 119443577.0, + "step": 4778 + }, + { + "epoch": 0.5248188007906874, + "grad_norm": 2.7572596073150635, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6910583972930908, + "num_tokens": 119462772.0, + "step": 4779 + }, + { + "epoch": 0.5249286184933011, + "grad_norm": 2.089512348175049, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7064625024795532, + "num_tokens": 119490353.0, + "step": 4780 + }, + { + "epoch": 0.5250384361959147, + "grad_norm": 2.2120115756988525, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6977856159210205, + "num_tokens": 119516987.0, + "step": 4781 + }, + { + "epoch": 0.5251482538985285, + "grad_norm": 2.2431588172912598, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.703112781047821, + "num_tokens": 119541893.0, + "step": 4782 + }, + { + "epoch": 0.5252580716011421, + "grad_norm": 2.179527759552002, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6953283548355103, + "num_tokens": 119569616.0, + "step": 4783 + }, + { + "epoch": 0.5253678893037558, + "grad_norm": 2.141101121902466, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6904234886169434, + "num_tokens": 119598862.0, + "step": 4784 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 2.581340789794922, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7315835356712341, + "num_tokens": 119616755.0, + "step": 4785 + }, + { + "epoch": 0.5255875247089831, + "grad_norm": 2.4096477031707764, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6946917772293091, + "num_tokens": 119639186.0, + "step": 4786 + }, + { + "epoch": 0.5256973424115967, + "grad_norm": 2.325117826461792, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6953456401824951, + "num_tokens": 119661553.0, + "step": 4787 + }, + { + "epoch": 0.5258071601142104, + "grad_norm": 2.401305675506592, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7183582782745361, + "num_tokens": 119682657.0, + "step": 4788 + }, + { + "epoch": 0.5259169778168241, + "grad_norm": 2.2872421741485596, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7253743410110474, + "num_tokens": 119704538.0, + "step": 4789 + }, + { + "epoch": 0.5260267955194378, + "grad_norm": 2.3791251182556152, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7021279335021973, + "num_tokens": 119726297.0, + "step": 4790 + }, + { + "epoch": 0.5261366132220514, + "grad_norm": 2.30842924118042, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7067703008651733, + "num_tokens": 119751866.0, + "step": 4791 + }, + { + "epoch": 0.526246430924665, + "grad_norm": 2.3010213375091553, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7000397443771362, + "num_tokens": 119777222.0, + "step": 4792 + }, + { + "epoch": 0.5263562486272787, + "grad_norm": 2.3588991165161133, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7085121870040894, + "num_tokens": 119799615.0, + "step": 4793 + }, + { + "epoch": 0.5264660663298923, + "grad_norm": 2.1521382331848145, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7043672800064087, + "num_tokens": 119825689.0, + "step": 4794 + }, + { + "epoch": 0.526575884032506, + "grad_norm": 2.008607864379883, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7000378370285034, + "num_tokens": 119853354.0, + "step": 4795 + }, + { + "epoch": 0.5266857017351197, + "grad_norm": 2.2108495235443115, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6926359534263611, + "num_tokens": 119879863.0, + "step": 4796 + }, + { + "epoch": 0.5267955194377334, + "grad_norm": 2.1808388233184814, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7165629863739014, + "num_tokens": 119903473.0, + "step": 4797 + }, + { + "epoch": 0.526905337140347, + "grad_norm": 2.263171434402466, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7191966772079468, + "num_tokens": 119926005.0, + "step": 4798 + }, + { + "epoch": 0.5270151548429607, + "grad_norm": 2.2217509746551514, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6978948712348938, + "num_tokens": 119953010.0, + "step": 4799 + }, + { + "epoch": 0.5271249725455743, + "grad_norm": 2.53873348236084, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7163490056991577, + "num_tokens": 119972794.0, + "step": 4800 + }, + { + "epoch": 0.527234790248188, + "grad_norm": 2.0919578075408936, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6936262845993042, + "num_tokens": 119997907.0, + "step": 4801 + }, + { + "epoch": 0.5273446079508016, + "grad_norm": 2.407740831375122, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.7003957033157349, + "num_tokens": 120021085.0, + "step": 4802 + }, + { + "epoch": 0.5274544256534154, + "grad_norm": 2.338846206665039, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6949750185012817, + "num_tokens": 120044312.0, + "step": 4803 + }, + { + "epoch": 0.527564243356029, + "grad_norm": 2.1191632747650146, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.683015763759613, + "num_tokens": 120070963.0, + "step": 4804 + }, + { + "epoch": 0.5276740610586427, + "grad_norm": 2.166588544845581, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7168961763381958, + "num_tokens": 120096770.0, + "step": 4805 + }, + { + "epoch": 0.5277838787612563, + "grad_norm": 2.323237895965576, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7143964767456055, + "num_tokens": 120119861.0, + "step": 4806 + }, + { + "epoch": 0.52789369646387, + "grad_norm": 2.105635643005371, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7182770371437073, + "num_tokens": 120149851.0, + "step": 4807 + }, + { + "epoch": 0.5280035141664836, + "grad_norm": 2.3903188705444336, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7000502347946167, + "num_tokens": 120171819.0, + "step": 4808 + }, + { + "epoch": 0.5281133318690973, + "grad_norm": 2.182415008544922, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7012408375740051, + "num_tokens": 120197207.0, + "step": 4809 + }, + { + "epoch": 0.528223149571711, + "grad_norm": 2.1086084842681885, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7073723673820496, + "num_tokens": 120225568.0, + "step": 4810 + }, + { + "epoch": 0.5283329672743247, + "grad_norm": 2.3057210445404053, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6976780891418457, + "num_tokens": 120250577.0, + "step": 4811 + }, + { + "epoch": 0.5284427849769383, + "grad_norm": 2.265998125076294, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7159613370895386, + "num_tokens": 120276215.0, + "step": 4812 + }, + { + "epoch": 0.528552602679552, + "grad_norm": 2.2100119590759277, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.702162504196167, + "num_tokens": 120303409.0, + "step": 4813 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 2.496070623397827, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7077434659004211, + "num_tokens": 120324518.0, + "step": 4814 + }, + { + "epoch": 0.5287722380847792, + "grad_norm": 1.9883931875228882, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.683400571346283, + "num_tokens": 120354139.0, + "step": 4815 + }, + { + "epoch": 0.5288820557873929, + "grad_norm": 2.1835317611694336, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6949049234390259, + "num_tokens": 120380599.0, + "step": 4816 + }, + { + "epoch": 0.5289918734900065, + "grad_norm": 2.464621067047119, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7214967012405396, + "num_tokens": 120400878.0, + "step": 4817 + }, + { + "epoch": 0.5291016911926203, + "grad_norm": 1.9171303510665894, + "learning_rate": 1e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6792653203010559, + "num_tokens": 120432331.0, + "step": 4818 + }, + { + "epoch": 0.5292115088952339, + "grad_norm": 2.32071852684021, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6954655051231384, + "num_tokens": 120459885.0, + "step": 4819 + }, + { + "epoch": 0.5293213265978476, + "grad_norm": 2.288360834121704, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7031732797622681, + "num_tokens": 120485577.0, + "step": 4820 + }, + { + "epoch": 0.5294311443004612, + "grad_norm": 2.1841208934783936, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6931936740875244, + "num_tokens": 120513166.0, + "step": 4821 + }, + { + "epoch": 0.5295409620030749, + "grad_norm": 2.009568214416504, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6995100975036621, + "num_tokens": 120542442.0, + "step": 4822 + }, + { + "epoch": 0.5296507797056885, + "grad_norm": 2.1928834915161133, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.696461021900177, + "num_tokens": 120569041.0, + "step": 4823 + }, + { + "epoch": 0.5297605974083022, + "grad_norm": 2.247136116027832, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7294853329658508, + "num_tokens": 120593126.0, + "step": 4824 + }, + { + "epoch": 0.5298704151109159, + "grad_norm": 2.195155620574951, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7044466137886047, + "num_tokens": 120619517.0, + "step": 4825 + }, + { + "epoch": 0.5299802328135296, + "grad_norm": 2.232927083969116, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7224647402763367, + "num_tokens": 120641913.0, + "step": 4826 + }, + { + "epoch": 0.5300900505161432, + "grad_norm": 2.4276788234710693, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7163963317871094, + "num_tokens": 120665388.0, + "step": 4827 + }, + { + "epoch": 0.5301998682187569, + "grad_norm": 2.1727559566497803, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7004892230033875, + "num_tokens": 120690425.0, + "step": 4828 + }, + { + "epoch": 0.5303096859213705, + "grad_norm": 2.4011940956115723, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7168287038803101, + "num_tokens": 120710880.0, + "step": 4829 + }, + { + "epoch": 0.5304195036239842, + "grad_norm": 2.035702705383301, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.699414849281311, + "num_tokens": 120741500.0, + "step": 4830 + }, + { + "epoch": 0.5305293213265978, + "grad_norm": 2.083709955215454, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7247321605682373, + "num_tokens": 120768451.0, + "step": 4831 + }, + { + "epoch": 0.5306391390292116, + "grad_norm": 2.163959503173828, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6971337199211121, + "num_tokens": 120793334.0, + "step": 4832 + }, + { + "epoch": 0.5307489567318252, + "grad_norm": 2.5913729667663574, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6968929767608643, + "num_tokens": 120813958.0, + "step": 4833 + }, + { + "epoch": 0.5308587744344389, + "grad_norm": 2.3614392280578613, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7470338344573975, + "num_tokens": 120835145.0, + "step": 4834 + }, + { + "epoch": 0.5309685921370525, + "grad_norm": 2.3338115215301514, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6978120803833008, + "num_tokens": 120860908.0, + "step": 4835 + }, + { + "epoch": 0.5310784098396661, + "grad_norm": 2.3106532096862793, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6941133737564087, + "num_tokens": 120884461.0, + "step": 4836 + }, + { + "epoch": 0.5311882275422798, + "grad_norm": 2.3293561935424805, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.725232720375061, + "num_tokens": 120906943.0, + "step": 4837 + }, + { + "epoch": 0.5312980452448934, + "grad_norm": 2.2037415504455566, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7128067016601562, + "num_tokens": 120932569.0, + "step": 4838 + }, + { + "epoch": 0.5314078629475072, + "grad_norm": 2.2198028564453125, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6947657465934753, + "num_tokens": 120958451.0, + "step": 4839 + }, + { + "epoch": 0.5315176806501208, + "grad_norm": 2.4038755893707275, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7133862972259521, + "num_tokens": 120978435.0, + "step": 4840 + }, + { + "epoch": 0.5316274983527345, + "grad_norm": 2.0523576736450195, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6854659914970398, + "num_tokens": 121006748.0, + "step": 4841 + }, + { + "epoch": 0.5317373160553481, + "grad_norm": 2.3391835689544678, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7022266387939453, + "num_tokens": 121029756.0, + "step": 4842 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 2.147770643234253, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7262067794799805, + "num_tokens": 121053488.0, + "step": 4843 + }, + { + "epoch": 0.5319569514605754, + "grad_norm": 2.452294111251831, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7116791009902954, + "num_tokens": 121074058.0, + "step": 4844 + }, + { + "epoch": 0.5320667691631891, + "grad_norm": 2.3208558559417725, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6863059997558594, + "num_tokens": 121097937.0, + "step": 4845 + }, + { + "epoch": 0.5321765868658027, + "grad_norm": 2.2972114086151123, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6933889389038086, + "num_tokens": 121121458.0, + "step": 4846 + }, + { + "epoch": 0.5322864045684165, + "grad_norm": 2.236785650253296, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7038351893424988, + "num_tokens": 121144072.0, + "step": 4847 + }, + { + "epoch": 0.5323962222710301, + "grad_norm": 1.9165782928466797, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7001993656158447, + "num_tokens": 121174103.0, + "step": 4848 + }, + { + "epoch": 0.5325060399736438, + "grad_norm": 2.280844211578369, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7269089221954346, + "num_tokens": 121197563.0, + "step": 4849 + }, + { + "epoch": 0.5326158576762574, + "grad_norm": 2.2239835262298584, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6914129257202148, + "num_tokens": 121223039.0, + "step": 4850 + }, + { + "epoch": 0.532725675378871, + "grad_norm": 2.222607374191284, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7057662010192871, + "num_tokens": 121249395.0, + "step": 4851 + }, + { + "epoch": 0.5328354930814847, + "grad_norm": 2.3135507106781006, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7032409310340881, + "num_tokens": 121271236.0, + "step": 4852 + }, + { + "epoch": 0.5329453107840983, + "grad_norm": 2.1445014476776123, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6821685433387756, + "num_tokens": 121300214.0, + "step": 4853 + }, + { + "epoch": 0.5330551284867121, + "grad_norm": 2.7448134422302246, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7046016454696655, + "num_tokens": 121318665.0, + "step": 4854 + }, + { + "epoch": 0.5331649461893258, + "grad_norm": 2.1047534942626953, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6988659501075745, + "num_tokens": 121347320.0, + "step": 4855 + }, + { + "epoch": 0.5332747638919394, + "grad_norm": 2.2287824153900146, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7044180631637573, + "num_tokens": 121372129.0, + "step": 4856 + }, + { + "epoch": 0.533384581594553, + "grad_norm": 2.0979859828948975, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7087321281433105, + "num_tokens": 121398258.0, + "step": 4857 + }, + { + "epoch": 0.5334943992971667, + "grad_norm": 2.2574141025543213, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.719490647315979, + "num_tokens": 121423076.0, + "step": 4858 + }, + { + "epoch": 0.5336042169997803, + "grad_norm": 1.9934020042419434, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7022595405578613, + "num_tokens": 121450829.0, + "step": 4859 + }, + { + "epoch": 0.533714034702394, + "grad_norm": 2.0344560146331787, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6988649964332581, + "num_tokens": 121481558.0, + "step": 4860 + }, + { + "epoch": 0.5338238524050077, + "grad_norm": 2.052739381790161, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6965929865837097, + "num_tokens": 121510570.0, + "step": 4861 + }, + { + "epoch": 0.5339336701076214, + "grad_norm": 2.205723524093628, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.705431342124939, + "num_tokens": 121536149.0, + "step": 4862 + }, + { + "epoch": 0.534043487810235, + "grad_norm": 1.9208412170410156, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6972043514251709, + "num_tokens": 121569794.0, + "step": 4863 + }, + { + "epoch": 0.5341533055128487, + "grad_norm": 2.027439594268799, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7008935809135437, + "num_tokens": 121596726.0, + "step": 4864 + }, + { + "epoch": 0.5342631232154623, + "grad_norm": 2.1281332969665527, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6969249844551086, + "num_tokens": 121624189.0, + "step": 4865 + }, + { + "epoch": 0.534372940918076, + "grad_norm": 2.544571876525879, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.708104133605957, + "num_tokens": 121645278.0, + "step": 4866 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 2.4811575412750244, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.704224705696106, + "num_tokens": 121668869.0, + "step": 4867 + }, + { + "epoch": 0.5345925763233034, + "grad_norm": 2.5578222274780273, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7085598707199097, + "num_tokens": 121689601.0, + "step": 4868 + }, + { + "epoch": 0.534702394025917, + "grad_norm": 2.1602818965911865, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7277758121490479, + "num_tokens": 121713350.0, + "step": 4869 + }, + { + "epoch": 0.5348122117285307, + "grad_norm": 2.310436964035034, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7029509544372559, + "num_tokens": 121737312.0, + "step": 4870 + }, + { + "epoch": 0.5349220294311443, + "grad_norm": 2.253871202468872, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6920090913772583, + "num_tokens": 121764144.0, + "step": 4871 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 2.1983354091644287, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.720881462097168, + "num_tokens": 121786377.0, + "step": 4872 + }, + { + "epoch": 0.5351416648363716, + "grad_norm": 2.1064705848693848, + "learning_rate": 1e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.6756008863449097, + "num_tokens": 121817151.0, + "step": 4873 + }, + { + "epoch": 0.5352514825389852, + "grad_norm": 2.1478147506713867, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7079681158065796, + "num_tokens": 121842411.0, + "step": 4874 + }, + { + "epoch": 0.5353613002415989, + "grad_norm": 2.688718795776367, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7043220400810242, + "num_tokens": 121861854.0, + "step": 4875 + }, + { + "epoch": 0.5354711179442126, + "grad_norm": 2.7360012531280518, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7133920788764954, + "num_tokens": 121880635.0, + "step": 4876 + }, + { + "epoch": 0.5355809356468263, + "grad_norm": 2.324228286743164, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.690208911895752, + "num_tokens": 121904530.0, + "step": 4877 + }, + { + "epoch": 0.5356907533494399, + "grad_norm": 2.101076364517212, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.674947202205658, + "num_tokens": 121936703.0, + "step": 4878 + }, + { + "epoch": 0.5358005710520536, + "grad_norm": 2.0544419288635254, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6959081888198853, + "num_tokens": 121964551.0, + "step": 4879 + }, + { + "epoch": 0.5359103887546672, + "grad_norm": 2.2129335403442383, + "learning_rate": 1e-06, + "loss": 1.0816, + "mean_token_accuracy": 0.6811949014663696, + "num_tokens": 121989932.0, + "step": 4880 + }, + { + "epoch": 0.5360202064572809, + "grad_norm": 2.0602304935455322, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7086199522018433, + "num_tokens": 122016115.0, + "step": 4881 + }, + { + "epoch": 0.5361300241598945, + "grad_norm": 2.423349380493164, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7073804140090942, + "num_tokens": 122036226.0, + "step": 4882 + }, + { + "epoch": 0.5362398418625083, + "grad_norm": 2.3184378147125244, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7279431819915771, + "num_tokens": 122059082.0, + "step": 4883 + }, + { + "epoch": 0.5363496595651219, + "grad_norm": 2.402074098587036, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7220391631126404, + "num_tokens": 122079654.0, + "step": 4884 + }, + { + "epoch": 0.5364594772677356, + "grad_norm": 2.4907569885253906, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7204453349113464, + "num_tokens": 122099789.0, + "step": 4885 + }, + { + "epoch": 0.5365692949703492, + "grad_norm": 2.325791597366333, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.709144115447998, + "num_tokens": 122121788.0, + "step": 4886 + }, + { + "epoch": 0.5366791126729629, + "grad_norm": 2.3950486183166504, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7132164239883423, + "num_tokens": 122144160.0, + "step": 4887 + }, + { + "epoch": 0.5367889303755765, + "grad_norm": 2.136653423309326, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.714340090751648, + "num_tokens": 122173956.0, + "step": 4888 + }, + { + "epoch": 0.5368987480781902, + "grad_norm": 2.3326048851013184, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6821873784065247, + "num_tokens": 122199431.0, + "step": 4889 + }, + { + "epoch": 0.5370085657808039, + "grad_norm": 1.931511640548706, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6887493133544922, + "num_tokens": 122231878.0, + "step": 4890 + }, + { + "epoch": 0.5371183834834176, + "grad_norm": 2.486663818359375, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7029289603233337, + "num_tokens": 122254971.0, + "step": 4891 + }, + { + "epoch": 0.5372282011860312, + "grad_norm": 2.3966355323791504, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7187532186508179, + "num_tokens": 122278489.0, + "step": 4892 + }, + { + "epoch": 0.5373380188886449, + "grad_norm": 1.8881257772445679, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6859816312789917, + "num_tokens": 122311269.0, + "step": 4893 + }, + { + "epoch": 0.5374478365912585, + "grad_norm": 2.253857135772705, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7139593362808228, + "num_tokens": 122334368.0, + "step": 4894 + }, + { + "epoch": 0.5375576542938721, + "grad_norm": 2.1508195400238037, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6912885904312134, + "num_tokens": 122361371.0, + "step": 4895 + }, + { + "epoch": 0.5376674719964858, + "grad_norm": 2.306061029434204, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7161701917648315, + "num_tokens": 122383140.0, + "step": 4896 + }, + { + "epoch": 0.5377772896990995, + "grad_norm": 2.1852867603302, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6986168026924133, + "num_tokens": 122412296.0, + "step": 4897 + }, + { + "epoch": 0.5378871074017132, + "grad_norm": 2.3530166149139404, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6853139996528625, + "num_tokens": 122437062.0, + "step": 4898 + }, + { + "epoch": 0.5379969251043268, + "grad_norm": 2.5087263584136963, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7034720182418823, + "num_tokens": 122458786.0, + "step": 4899 + }, + { + "epoch": 0.5381067428069405, + "grad_norm": 2.3560264110565186, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.705081582069397, + "num_tokens": 122482520.0, + "step": 4900 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 2.2554945945739746, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6924782991409302, + "num_tokens": 122509074.0, + "step": 4901 + }, + { + "epoch": 0.5383263782121678, + "grad_norm": 2.1617374420166016, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6923338174819946, + "num_tokens": 122534352.0, + "step": 4902 + }, + { + "epoch": 0.5384361959147814, + "grad_norm": 2.1788864135742188, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6920769214630127, + "num_tokens": 122560782.0, + "step": 4903 + }, + { + "epoch": 0.5385460136173951, + "grad_norm": 2.3180596828460693, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7231030464172363, + "num_tokens": 122584657.0, + "step": 4904 + }, + { + "epoch": 0.5386558313200088, + "grad_norm": 2.321226119995117, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6792904138565063, + "num_tokens": 122609525.0, + "step": 4905 + }, + { + "epoch": 0.5387656490226225, + "grad_norm": 2.4878053665161133, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6783952116966248, + "num_tokens": 122631836.0, + "step": 4906 + }, + { + "epoch": 0.5388754667252361, + "grad_norm": 2.50128173828125, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7217907905578613, + "num_tokens": 122651884.0, + "step": 4907 + }, + { + "epoch": 0.5389852844278498, + "grad_norm": 2.3344063758850098, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7304937243461609, + "num_tokens": 122672930.0, + "step": 4908 + }, + { + "epoch": 0.5390951021304634, + "grad_norm": 1.968878149986267, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7125217914581299, + "num_tokens": 122702689.0, + "step": 4909 + }, + { + "epoch": 0.5392049198330771, + "grad_norm": 2.195094108581543, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7196637392044067, + "num_tokens": 122729671.0, + "step": 4910 + }, + { + "epoch": 0.5393147375356907, + "grad_norm": 2.244241714477539, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7063291072845459, + "num_tokens": 122752772.0, + "step": 4911 + }, + { + "epoch": 0.5394245552383045, + "grad_norm": 2.4529924392700195, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6926212906837463, + "num_tokens": 122774246.0, + "step": 4912 + }, + { + "epoch": 0.5395343729409181, + "grad_norm": 2.2160212993621826, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6997869610786438, + "num_tokens": 122799480.0, + "step": 4913 + }, + { + "epoch": 0.5396441906435318, + "grad_norm": 2.178584575653076, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7079443335533142, + "num_tokens": 122825499.0, + "step": 4914 + }, + { + "epoch": 0.5397540083461454, + "grad_norm": 2.1258203983306885, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7093552350997925, + "num_tokens": 122851701.0, + "step": 4915 + }, + { + "epoch": 0.539863826048759, + "grad_norm": 2.142961025238037, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7185831069946289, + "num_tokens": 122877209.0, + "step": 4916 + }, + { + "epoch": 0.5399736437513727, + "grad_norm": 2.171442985534668, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6912573575973511, + "num_tokens": 122902854.0, + "step": 4917 + }, + { + "epoch": 0.5400834614539863, + "grad_norm": 2.2909176349639893, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7110504508018494, + "num_tokens": 122925405.0, + "step": 4918 + }, + { + "epoch": 0.5401932791566001, + "grad_norm": 2.5186235904693604, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6877824068069458, + "num_tokens": 122947001.0, + "step": 4919 + }, + { + "epoch": 0.5403030968592137, + "grad_norm": 2.8117032051086426, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7024564146995544, + "num_tokens": 122964611.0, + "step": 4920 + }, + { + "epoch": 0.5404129145618274, + "grad_norm": 2.0003747940063477, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7027288675308228, + "num_tokens": 122994944.0, + "step": 4921 + }, + { + "epoch": 0.540522732264441, + "grad_norm": 1.993649959564209, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6869070529937744, + "num_tokens": 123024744.0, + "step": 4922 + }, + { + "epoch": 0.5406325499670547, + "grad_norm": 2.320976495742798, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6952622532844543, + "num_tokens": 123049759.0, + "step": 4923 + }, + { + "epoch": 0.5407423676696683, + "grad_norm": 2.6625006198883057, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7110192775726318, + "num_tokens": 123068418.0, + "step": 4924 + }, + { + "epoch": 0.540852185372282, + "grad_norm": 2.0799365043640137, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6845523118972778, + "num_tokens": 123098618.0, + "step": 4925 + }, + { + "epoch": 0.5409620030748957, + "grad_norm": 2.2781355381011963, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7215149998664856, + "num_tokens": 123121520.0, + "step": 4926 + }, + { + "epoch": 0.5410718207775094, + "grad_norm": 2.232539415359497, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.687842845916748, + "num_tokens": 123145943.0, + "step": 4927 + }, + { + "epoch": 0.541181638480123, + "grad_norm": 2.133857011795044, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7224483489990234, + "num_tokens": 123171653.0, + "step": 4928 + }, + { + "epoch": 0.5412914561827367, + "grad_norm": 2.390507221221924, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6963182687759399, + "num_tokens": 123194967.0, + "step": 4929 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 2.7228660583496094, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7064712047576904, + "num_tokens": 123213344.0, + "step": 4930 + }, + { + "epoch": 0.541511091587964, + "grad_norm": 2.351926565170288, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7162108421325684, + "num_tokens": 123234857.0, + "step": 4931 + }, + { + "epoch": 0.5416209092905776, + "grad_norm": 2.2676842212677, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6973715424537659, + "num_tokens": 123258782.0, + "step": 4932 + }, + { + "epoch": 0.5417307269931912, + "grad_norm": 2.0747575759887695, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7044773697853088, + "num_tokens": 123285587.0, + "step": 4933 + }, + { + "epoch": 0.541840544695805, + "grad_norm": 2.148787021636963, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6902393102645874, + "num_tokens": 123313133.0, + "step": 4934 + }, + { + "epoch": 0.5419503623984187, + "grad_norm": 2.5372331142425537, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7127386331558228, + "num_tokens": 123332959.0, + "step": 4935 + }, + { + "epoch": 0.5420601801010323, + "grad_norm": 2.1393744945526123, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7225834131240845, + "num_tokens": 123358606.0, + "step": 4936 + }, + { + "epoch": 0.5421699978036459, + "grad_norm": 2.2581839561462402, + "learning_rate": 1e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.6750537753105164, + "num_tokens": 123382722.0, + "step": 4937 + }, + { + "epoch": 0.5422798155062596, + "grad_norm": 2.1109917163848877, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6963487863540649, + "num_tokens": 123411842.0, + "step": 4938 + }, + { + "epoch": 0.5423896332088732, + "grad_norm": 2.1548924446105957, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.7053981423377991, + "num_tokens": 123438825.0, + "step": 4939 + }, + { + "epoch": 0.5424994509114869, + "grad_norm": 2.193066358566284, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7037163972854614, + "num_tokens": 123464175.0, + "step": 4940 + }, + { + "epoch": 0.5426092686141006, + "grad_norm": 2.3744406700134277, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6831945776939392, + "num_tokens": 123488653.0, + "step": 4941 + }, + { + "epoch": 0.5427190863167143, + "grad_norm": 2.3625221252441406, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7342816591262817, + "num_tokens": 123510949.0, + "step": 4942 + }, + { + "epoch": 0.5428289040193279, + "grad_norm": 2.295334577560425, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7204364538192749, + "num_tokens": 123533941.0, + "step": 4943 + }, + { + "epoch": 0.5429387217219416, + "grad_norm": 2.283661127090454, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7163267135620117, + "num_tokens": 123557304.0, + "step": 4944 + }, + { + "epoch": 0.5430485394245552, + "grad_norm": 2.2180967330932617, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7165261507034302, + "num_tokens": 123581781.0, + "step": 4945 + }, + { + "epoch": 0.5431583571271689, + "grad_norm": 1.974320888519287, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6939737796783447, + "num_tokens": 123612996.0, + "step": 4946 + }, + { + "epoch": 0.5432681748297825, + "grad_norm": 2.3884503841400146, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7059595584869385, + "num_tokens": 123635636.0, + "step": 4947 + }, + { + "epoch": 0.5433779925323963, + "grad_norm": 2.3309671878814697, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7073280811309814, + "num_tokens": 123658624.0, + "step": 4948 + }, + { + "epoch": 0.5434878102350099, + "grad_norm": 2.201016664505005, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6914628148078918, + "num_tokens": 123684084.0, + "step": 4949 + }, + { + "epoch": 0.5435976279376236, + "grad_norm": 2.0597100257873535, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7049930095672607, + "num_tokens": 123712691.0, + "step": 4950 + }, + { + "epoch": 0.5437074456402372, + "grad_norm": 2.297950029373169, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6997792720794678, + "num_tokens": 123736081.0, + "step": 4951 + }, + { + "epoch": 0.5438172633428509, + "grad_norm": 2.580770492553711, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7033497095108032, + "num_tokens": 123755884.0, + "step": 4952 + }, + { + "epoch": 0.5439270810454645, + "grad_norm": 2.3928189277648926, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7184128761291504, + "num_tokens": 123777934.0, + "step": 4953 + }, + { + "epoch": 0.5440368987480781, + "grad_norm": 2.0284547805786133, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6777282953262329, + "num_tokens": 123808490.0, + "step": 4954 + }, + { + "epoch": 0.5441467164506919, + "grad_norm": 2.0427072048187256, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7028902769088745, + "num_tokens": 123836879.0, + "step": 4955 + }, + { + "epoch": 0.5442565341533055, + "grad_norm": 2.2134130001068115, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7063999772071838, + "num_tokens": 123863296.0, + "step": 4956 + }, + { + "epoch": 0.5443663518559192, + "grad_norm": 2.433074951171875, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7228214740753174, + "num_tokens": 123884952.0, + "step": 4957 + }, + { + "epoch": 0.5444761695585328, + "grad_norm": 2.498098850250244, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6948442459106445, + "num_tokens": 123905910.0, + "step": 4958 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 2.520493745803833, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7222046256065369, + "num_tokens": 123925803.0, + "step": 4959 + }, + { + "epoch": 0.5446958049637601, + "grad_norm": 2.2495665550231934, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6715288758277893, + "num_tokens": 123952552.0, + "step": 4960 + }, + { + "epoch": 0.5448056226663738, + "grad_norm": 2.238032817840576, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6821950078010559, + "num_tokens": 123977124.0, + "step": 4961 + }, + { + "epoch": 0.5449154403689875, + "grad_norm": 2.5801475048065186, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7144202589988708, + "num_tokens": 123997159.0, + "step": 4962 + }, + { + "epoch": 0.5450252580716012, + "grad_norm": 2.411593198776245, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7134488821029663, + "num_tokens": 124019254.0, + "step": 4963 + }, + { + "epoch": 0.5451350757742148, + "grad_norm": 2.425823211669922, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7023998498916626, + "num_tokens": 124041352.0, + "step": 4964 + }, + { + "epoch": 0.5452448934768285, + "grad_norm": 2.704315185546875, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7199665307998657, + "num_tokens": 124059938.0, + "step": 4965 + }, + { + "epoch": 0.5453547111794421, + "grad_norm": 2.357267141342163, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7170352935791016, + "num_tokens": 124081876.0, + "step": 4966 + }, + { + "epoch": 0.5454645288820558, + "grad_norm": 2.01790189743042, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7061734199523926, + "num_tokens": 124112976.0, + "step": 4967 + }, + { + "epoch": 0.5455743465846694, + "grad_norm": 2.4490389823913574, + "learning_rate": 1e-06, + "loss": 1.0666, + "mean_token_accuracy": 0.6792009472846985, + "num_tokens": 124134433.0, + "step": 4968 + }, + { + "epoch": 0.5456841642872831, + "grad_norm": 2.087653875350952, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7090142965316772, + "num_tokens": 124163327.0, + "step": 4969 + }, + { + "epoch": 0.5457939819898968, + "grad_norm": 2.252906560897827, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7117944359779358, + "num_tokens": 124188157.0, + "step": 4970 + }, + { + "epoch": 0.5459037996925105, + "grad_norm": 2.610351324081421, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7209514379501343, + "num_tokens": 124207908.0, + "step": 4971 + }, + { + "epoch": 0.5460136173951241, + "grad_norm": 2.0462677478790283, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7051327228546143, + "num_tokens": 124239055.0, + "step": 4972 + }, + { + "epoch": 0.5461234350977378, + "grad_norm": 2.425873041152954, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7028024196624756, + "num_tokens": 124261708.0, + "step": 4973 + }, + { + "epoch": 0.5462332528003514, + "grad_norm": 2.1300208568573, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7018612027168274, + "num_tokens": 124288372.0, + "step": 4974 + }, + { + "epoch": 0.546343070502965, + "grad_norm": 2.334054708480835, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7081137895584106, + "num_tokens": 124310072.0, + "step": 4975 + }, + { + "epoch": 0.5464528882055787, + "grad_norm": 2.1127567291259766, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7014719843864441, + "num_tokens": 124337730.0, + "step": 4976 + }, + { + "epoch": 0.5465627059081924, + "grad_norm": 2.26570463180542, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6926624774932861, + "num_tokens": 124363074.0, + "step": 4977 + }, + { + "epoch": 0.5466725236108061, + "grad_norm": 2.1819913387298584, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7032214403152466, + "num_tokens": 124389330.0, + "step": 4978 + }, + { + "epoch": 0.5467823413134197, + "grad_norm": 2.134035110473633, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6869702339172363, + "num_tokens": 124416830.0, + "step": 4979 + }, + { + "epoch": 0.5468921590160334, + "grad_norm": 2.0606136322021484, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7169663310050964, + "num_tokens": 124441255.0, + "step": 4980 + }, + { + "epoch": 0.547001976718647, + "grad_norm": 2.495600461959839, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6927617192268372, + "num_tokens": 124461992.0, + "step": 4981 + }, + { + "epoch": 0.5471117944212607, + "grad_norm": 2.6244723796844482, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7182143330574036, + "num_tokens": 124483394.0, + "step": 4982 + }, + { + "epoch": 0.5472216121238743, + "grad_norm": 2.7200284004211426, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7087308168411255, + "num_tokens": 124500004.0, + "step": 4983 + }, + { + "epoch": 0.5473314298264881, + "grad_norm": 2.0762250423431396, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7197819948196411, + "num_tokens": 124527792.0, + "step": 4984 + }, + { + "epoch": 0.5474412475291017, + "grad_norm": 2.6452643871307373, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7148308753967285, + "num_tokens": 124548030.0, + "step": 4985 + }, + { + "epoch": 0.5475510652317154, + "grad_norm": 2.1411328315734863, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7244452834129333, + "num_tokens": 124574278.0, + "step": 4986 + }, + { + "epoch": 0.547660882934329, + "grad_norm": 2.4076972007751465, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7101361155509949, + "num_tokens": 124597307.0, + "step": 4987 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 2.127750873565674, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6928889751434326, + "num_tokens": 124624714.0, + "step": 4988 + }, + { + "epoch": 0.5478805183395563, + "grad_norm": 2.104501247406006, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7133898735046387, + "num_tokens": 124651660.0, + "step": 4989 + }, + { + "epoch": 0.54799033604217, + "grad_norm": 2.228825092315674, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7178444862365723, + "num_tokens": 124677561.0, + "step": 4990 + }, + { + "epoch": 0.5481001537447837, + "grad_norm": 2.5158474445343018, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7006514668464661, + "num_tokens": 124697849.0, + "step": 4991 + }, + { + "epoch": 0.5482099714473974, + "grad_norm": 2.343319892883301, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7105787992477417, + "num_tokens": 124721507.0, + "step": 4992 + }, + { + "epoch": 0.548319789150011, + "grad_norm": 2.0798182487487793, + "learning_rate": 1e-06, + "loss": 1.1013, + "mean_token_accuracy": 0.6716997623443604, + "num_tokens": 124750541.0, + "step": 4993 + }, + { + "epoch": 0.5484296068526247, + "grad_norm": 2.2261483669281006, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7233452796936035, + "num_tokens": 124774298.0, + "step": 4994 + }, + { + "epoch": 0.5485394245552383, + "grad_norm": 1.8728731870651245, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7067046165466309, + "num_tokens": 124806006.0, + "step": 4995 + }, + { + "epoch": 0.5486492422578519, + "grad_norm": 2.0976479053497314, + "learning_rate": 1e-06, + "loss": 1.1012, + "mean_token_accuracy": 0.6686244010925293, + "num_tokens": 124835663.0, + "step": 4996 + }, + { + "epoch": 0.5487590599604656, + "grad_norm": 2.2394039630889893, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7101919651031494, + "num_tokens": 124860327.0, + "step": 4997 + }, + { + "epoch": 0.5488688776630792, + "grad_norm": 2.441305160522461, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7237856388092041, + "num_tokens": 124882180.0, + "step": 4998 + }, + { + "epoch": 0.548978695365693, + "grad_norm": 2.077747106552124, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6929335594177246, + "num_tokens": 124909432.0, + "step": 4999 + }, + { + "epoch": 0.5490885130683066, + "grad_norm": 2.6881601810455322, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6907480955123901, + "num_tokens": 124927717.0, + "step": 5000 + }, + { + "epoch": 0.5491983307709203, + "grad_norm": 2.0739848613739014, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7127796411514282, + "num_tokens": 124955887.0, + "step": 5001 + }, + { + "epoch": 0.5493081484735339, + "grad_norm": 2.14361572265625, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7038986086845398, + "num_tokens": 124982356.0, + "step": 5002 + }, + { + "epoch": 0.5494179661761476, + "grad_norm": 2.342167854309082, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.720903217792511, + "num_tokens": 125003096.0, + "step": 5003 + }, + { + "epoch": 0.5495277838787612, + "grad_norm": 2.0443317890167236, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.721610963344574, + "num_tokens": 125032206.0, + "step": 5004 + }, + { + "epoch": 0.5496376015813749, + "grad_norm": 2.1898152828216553, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7298569679260254, + "num_tokens": 125056544.0, + "step": 5005 + }, + { + "epoch": 0.5497474192839886, + "grad_norm": 2.127537965774536, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7239702939987183, + "num_tokens": 125081043.0, + "step": 5006 + }, + { + "epoch": 0.5498572369866023, + "grad_norm": 2.1999282836914062, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7139657139778137, + "num_tokens": 125105602.0, + "step": 5007 + }, + { + "epoch": 0.5499670546892159, + "grad_norm": 2.3830602169036865, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6968543529510498, + "num_tokens": 125128777.0, + "step": 5008 + }, + { + "epoch": 0.5500768723918296, + "grad_norm": 2.27518892288208, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7077505588531494, + "num_tokens": 125151593.0, + "step": 5009 + }, + { + "epoch": 0.5501866900944432, + "grad_norm": 2.8070452213287354, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7263911366462708, + "num_tokens": 125168301.0, + "step": 5010 + }, + { + "epoch": 0.5502965077970569, + "grad_norm": 2.505387783050537, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7384264469146729, + "num_tokens": 125186555.0, + "step": 5011 + }, + { + "epoch": 0.5504063254996705, + "grad_norm": 2.131911277770996, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6891107559204102, + "num_tokens": 125215306.0, + "step": 5012 + }, + { + "epoch": 0.5505161432022843, + "grad_norm": 2.172100067138672, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6899351477622986, + "num_tokens": 125239946.0, + "step": 5013 + }, + { + "epoch": 0.5506259609048979, + "grad_norm": 2.136441946029663, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.736387312412262, + "num_tokens": 125265174.0, + "step": 5014 + }, + { + "epoch": 0.5507357786075116, + "grad_norm": 2.198871374130249, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.719152569770813, + "num_tokens": 125290365.0, + "step": 5015 + }, + { + "epoch": 0.5508455963101252, + "grad_norm": 2.290952205657959, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6956524848937988, + "num_tokens": 125313733.0, + "step": 5016 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 2.4096806049346924, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.714638888835907, + "num_tokens": 125335509.0, + "step": 5017 + }, + { + "epoch": 0.5510652317153525, + "grad_norm": 2.4163670539855957, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7112030982971191, + "num_tokens": 125358516.0, + "step": 5018 + }, + { + "epoch": 0.5511750494179661, + "grad_norm": 2.0763895511627197, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7035895586013794, + "num_tokens": 125385520.0, + "step": 5019 + }, + { + "epoch": 0.5512848671205799, + "grad_norm": 2.3302018642425537, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7228110432624817, + "num_tokens": 125409693.0, + "step": 5020 + }, + { + "epoch": 0.5513946848231935, + "grad_norm": 2.108156204223633, + "learning_rate": 1e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.677710235118866, + "num_tokens": 125439058.0, + "step": 5021 + }, + { + "epoch": 0.5515045025258072, + "grad_norm": 2.372894525527954, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7151022553443909, + "num_tokens": 125459935.0, + "step": 5022 + }, + { + "epoch": 0.5516143202284208, + "grad_norm": 2.5319554805755615, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7438258528709412, + "num_tokens": 125479080.0, + "step": 5023 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0229177474975586, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6929289698600769, + "num_tokens": 125508946.0, + "step": 5024 + }, + { + "epoch": 0.5518339556336481, + "grad_norm": 2.1472575664520264, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6984395980834961, + "num_tokens": 125535748.0, + "step": 5025 + }, + { + "epoch": 0.5519437733362618, + "grad_norm": 2.3902227878570557, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7123470306396484, + "num_tokens": 125558081.0, + "step": 5026 + }, + { + "epoch": 0.5520535910388754, + "grad_norm": 2.244986057281494, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7247834205627441, + "num_tokens": 125582452.0, + "step": 5027 + }, + { + "epoch": 0.5521634087414892, + "grad_norm": 2.3353254795074463, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7100889682769775, + "num_tokens": 125607163.0, + "step": 5028 + }, + { + "epoch": 0.5522732264441028, + "grad_norm": 2.635875940322876, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.6781386137008667, + "num_tokens": 125627151.0, + "step": 5029 + }, + { + "epoch": 0.5523830441467165, + "grad_norm": 2.388420581817627, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7008751630783081, + "num_tokens": 125648816.0, + "step": 5030 + }, + { + "epoch": 0.5524928618493301, + "grad_norm": 2.4265055656433105, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6953833103179932, + "num_tokens": 125672149.0, + "step": 5031 + }, + { + "epoch": 0.5526026795519438, + "grad_norm": 2.1247425079345703, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7200177311897278, + "num_tokens": 125697704.0, + "step": 5032 + }, + { + "epoch": 0.5527124972545574, + "grad_norm": 2.1974382400512695, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6950330138206482, + "num_tokens": 125724378.0, + "step": 5033 + }, + { + "epoch": 0.552822314957171, + "grad_norm": 2.052698850631714, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7051287889480591, + "num_tokens": 125751163.0, + "step": 5034 + }, + { + "epoch": 0.5529321326597848, + "grad_norm": 2.4611153602600098, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7143038511276245, + "num_tokens": 125770363.0, + "step": 5035 + }, + { + "epoch": 0.5530419503623984, + "grad_norm": 2.097987413406372, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7089489698410034, + "num_tokens": 125798982.0, + "step": 5036 + }, + { + "epoch": 0.5531517680650121, + "grad_norm": 2.5646729469299316, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7301438450813293, + "num_tokens": 125817234.0, + "step": 5037 + }, + { + "epoch": 0.5532615857676257, + "grad_norm": 2.0967421531677246, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6878474950790405, + "num_tokens": 125846513.0, + "step": 5038 + }, + { + "epoch": 0.5533714034702394, + "grad_norm": 2.068721294403076, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6991806030273438, + "num_tokens": 125875992.0, + "step": 5039 + }, + { + "epoch": 0.553481221172853, + "grad_norm": 2.120739221572876, + "learning_rate": 1e-06, + "loss": 1.133, + "mean_token_accuracy": 0.6639819741249084, + "num_tokens": 125904609.0, + "step": 5040 + }, + { + "epoch": 0.5535910388754667, + "grad_norm": 2.086221933364868, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.691805362701416, + "num_tokens": 125934929.0, + "step": 5041 + }, + { + "epoch": 0.5537008565780804, + "grad_norm": 2.4256861209869385, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7168130874633789, + "num_tokens": 125956710.0, + "step": 5042 + }, + { + "epoch": 0.5538106742806941, + "grad_norm": 2.2627615928649902, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6827400326728821, + "num_tokens": 125981007.0, + "step": 5043 + }, + { + "epoch": 0.5539204919833077, + "grad_norm": 2.177830934524536, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7047871351242065, + "num_tokens": 126006822.0, + "step": 5044 + }, + { + "epoch": 0.5540303096859214, + "grad_norm": 1.9579373598098755, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.702513575553894, + "num_tokens": 126037050.0, + "step": 5045 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 2.3277575969696045, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7131630182266235, + "num_tokens": 126060660.0, + "step": 5046 + }, + { + "epoch": 0.5542499450911487, + "grad_norm": 2.1803669929504395, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.708907961845398, + "num_tokens": 126088419.0, + "step": 5047 + }, + { + "epoch": 0.5543597627937623, + "grad_norm": 2.119340419769287, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7148563265800476, + "num_tokens": 126114748.0, + "step": 5048 + }, + { + "epoch": 0.5544695804963761, + "grad_norm": 2.2249507904052734, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7141176462173462, + "num_tokens": 126138153.0, + "step": 5049 + }, + { + "epoch": 0.5545793981989897, + "grad_norm": 2.5217764377593994, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6959781646728516, + "num_tokens": 126159447.0, + "step": 5050 + }, + { + "epoch": 0.5546892159016034, + "grad_norm": 2.6123464107513428, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7122071981430054, + "num_tokens": 126177852.0, + "step": 5051 + }, + { + "epoch": 0.554799033604217, + "grad_norm": 1.9416227340698242, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7192928194999695, + "num_tokens": 126207670.0, + "step": 5052 + }, + { + "epoch": 0.5549088513068307, + "grad_norm": 2.1493194103240967, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6990213394165039, + "num_tokens": 126234359.0, + "step": 5053 + }, + { + "epoch": 0.5550186690094443, + "grad_norm": 2.1960549354553223, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7082791328430176, + "num_tokens": 126259419.0, + "step": 5054 + }, + { + "epoch": 0.555128486712058, + "grad_norm": 2.3805856704711914, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6953256726264954, + "num_tokens": 126284529.0, + "step": 5055 + }, + { + "epoch": 0.5552383044146716, + "grad_norm": 2.151714563369751, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6969501376152039, + "num_tokens": 126311048.0, + "step": 5056 + }, + { + "epoch": 0.5553481221172853, + "grad_norm": 2.004061460494995, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6966392993927002, + "num_tokens": 126342162.0, + "step": 5057 + }, + { + "epoch": 0.555457939819899, + "grad_norm": 2.430302381515503, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7198641300201416, + "num_tokens": 126363260.0, + "step": 5058 + }, + { + "epoch": 0.5555677575225126, + "grad_norm": 2.4328958988189697, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6887907981872559, + "num_tokens": 126385769.0, + "step": 5059 + }, + { + "epoch": 0.5556775752251263, + "grad_norm": 2.09975528717041, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6870560646057129, + "num_tokens": 126413702.0, + "step": 5060 + }, + { + "epoch": 0.5557873929277399, + "grad_norm": 2.3541741371154785, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6991126537322998, + "num_tokens": 126436895.0, + "step": 5061 + }, + { + "epoch": 0.5558972106303536, + "grad_norm": 2.1968750953674316, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6901212334632874, + "num_tokens": 126464092.0, + "step": 5062 + }, + { + "epoch": 0.5560070283329672, + "grad_norm": 2.378504753112793, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6904193162918091, + "num_tokens": 126488007.0, + "step": 5063 + }, + { + "epoch": 0.556116846035581, + "grad_norm": 2.312753200531006, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7044267058372498, + "num_tokens": 126512448.0, + "step": 5064 + }, + { + "epoch": 0.5562266637381946, + "grad_norm": 1.9745099544525146, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6881729364395142, + "num_tokens": 126544842.0, + "step": 5065 + }, + { + "epoch": 0.5563364814408083, + "grad_norm": 2.4752094745635986, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6977171301841736, + "num_tokens": 126566900.0, + "step": 5066 + }, + { + "epoch": 0.5564462991434219, + "grad_norm": 2.313810110092163, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7180943489074707, + "num_tokens": 126591209.0, + "step": 5067 + }, + { + "epoch": 0.5565561168460356, + "grad_norm": 2.157501697540283, + "learning_rate": 1e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.6795911192893982, + "num_tokens": 126622471.0, + "step": 5068 + }, + { + "epoch": 0.5566659345486492, + "grad_norm": 2.226463556289673, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6935105323791504, + "num_tokens": 126645595.0, + "step": 5069 + }, + { + "epoch": 0.5567757522512629, + "grad_norm": 2.16166090965271, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.6978847980499268, + "num_tokens": 126671666.0, + "step": 5070 + }, + { + "epoch": 0.5568855699538766, + "grad_norm": 2.5111892223358154, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7345749735832214, + "num_tokens": 126690204.0, + "step": 5071 + }, + { + "epoch": 0.5569953876564903, + "grad_norm": 2.16923189163208, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6841044425964355, + "num_tokens": 126714727.0, + "step": 5072 + }, + { + "epoch": 0.5571052053591039, + "grad_norm": 2.138871669769287, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6755459308624268, + "num_tokens": 126741146.0, + "step": 5073 + }, + { + "epoch": 0.5572150230617176, + "grad_norm": 2.0693535804748535, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6979187726974487, + "num_tokens": 126769010.0, + "step": 5074 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 2.222519874572754, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6990392804145813, + "num_tokens": 126793112.0, + "step": 5075 + }, + { + "epoch": 0.5574346584669448, + "grad_norm": 2.423604726791382, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7152608036994934, + "num_tokens": 126814859.0, + "step": 5076 + }, + { + "epoch": 0.5575444761695585, + "grad_norm": 2.1926329135894775, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6844016313552856, + "num_tokens": 126839891.0, + "step": 5077 + }, + { + "epoch": 0.5576542938721722, + "grad_norm": 2.149172067642212, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7170386910438538, + "num_tokens": 126866327.0, + "step": 5078 + }, + { + "epoch": 0.5577641115747859, + "grad_norm": 2.217418909072876, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6985034346580505, + "num_tokens": 126891272.0, + "step": 5079 + }, + { + "epoch": 0.5578739292773995, + "grad_norm": 2.262030839920044, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.7045891284942627, + "num_tokens": 126915321.0, + "step": 5080 + }, + { + "epoch": 0.5579837469800132, + "grad_norm": 2.2411885261535645, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7141137719154358, + "num_tokens": 126939911.0, + "step": 5081 + }, + { + "epoch": 0.5580935646826268, + "grad_norm": 2.1730539798736572, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7198092341423035, + "num_tokens": 126963560.0, + "step": 5082 + }, + { + "epoch": 0.5582033823852405, + "grad_norm": 2.1006991863250732, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7023090720176697, + "num_tokens": 126989466.0, + "step": 5083 + }, + { + "epoch": 0.5583132000878541, + "grad_norm": 2.1564741134643555, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7009623050689697, + "num_tokens": 127015022.0, + "step": 5084 + }, + { + "epoch": 0.5584230177904678, + "grad_norm": 2.1399261951446533, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7037380933761597, + "num_tokens": 127041533.0, + "step": 5085 + }, + { + "epoch": 0.5585328354930815, + "grad_norm": 2.5550549030303955, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7042075395584106, + "num_tokens": 127061644.0, + "step": 5086 + }, + { + "epoch": 0.5586426531956952, + "grad_norm": 2.0242574214935303, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7060343027114868, + "num_tokens": 127090429.0, + "step": 5087 + }, + { + "epoch": 0.5587524708983088, + "grad_norm": 2.388550281524658, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6898870468139648, + "num_tokens": 127113201.0, + "step": 5088 + }, + { + "epoch": 0.5588622886009225, + "grad_norm": 2.4712493419647217, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6947447657585144, + "num_tokens": 127134010.0, + "step": 5089 + }, + { + "epoch": 0.5589721063035361, + "grad_norm": 2.2028300762176514, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.687423050403595, + "num_tokens": 127158990.0, + "step": 5090 + }, + { + "epoch": 0.5590819240061498, + "grad_norm": 2.1216249465942383, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7143694162368774, + "num_tokens": 127182951.0, + "step": 5091 + }, + { + "epoch": 0.5591917417087634, + "grad_norm": 2.2594552040100098, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6926593780517578, + "num_tokens": 127211421.0, + "step": 5092 + }, + { + "epoch": 0.5593015594113772, + "grad_norm": 1.9860680103302002, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7105728983879089, + "num_tokens": 127239123.0, + "step": 5093 + }, + { + "epoch": 0.5594113771139908, + "grad_norm": 2.082017421722412, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6929739117622375, + "num_tokens": 127268851.0, + "step": 5094 + }, + { + "epoch": 0.5595211948166045, + "grad_norm": 2.1863276958465576, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6869649887084961, + "num_tokens": 127295602.0, + "step": 5095 + }, + { + "epoch": 0.5596310125192181, + "grad_norm": 2.0568900108337402, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7000027298927307, + "num_tokens": 127325366.0, + "step": 5096 + }, + { + "epoch": 0.5597408302218317, + "grad_norm": 1.9974254369735718, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6797033548355103, + "num_tokens": 127355657.0, + "step": 5097 + }, + { + "epoch": 0.5598506479244454, + "grad_norm": 2.281862497329712, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.690684974193573, + "num_tokens": 127380369.0, + "step": 5098 + }, + { + "epoch": 0.559960465627059, + "grad_norm": 2.182175397872925, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7043519020080566, + "num_tokens": 127410420.0, + "step": 5099 + }, + { + "epoch": 0.5600702833296728, + "grad_norm": 2.2092297077178955, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.7026183009147644, + "num_tokens": 127437562.0, + "step": 5100 + }, + { + "epoch": 0.5601801010322864, + "grad_norm": 2.425935983657837, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7047887444496155, + "num_tokens": 127459284.0, + "step": 5101 + }, + { + "epoch": 0.5602899187349001, + "grad_norm": 2.2582292556762695, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6921601891517639, + "num_tokens": 127483033.0, + "step": 5102 + }, + { + "epoch": 0.5603997364375137, + "grad_norm": 2.171715497970581, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6974607110023499, + "num_tokens": 127510267.0, + "step": 5103 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 2.3012452125549316, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.693939208984375, + "num_tokens": 127533355.0, + "step": 5104 + }, + { + "epoch": 0.560619371842741, + "grad_norm": 2.236464500427246, + "learning_rate": 1e-06, + "loss": 1.088, + "mean_token_accuracy": 0.6852592825889587, + "num_tokens": 127560539.0, + "step": 5105 + }, + { + "epoch": 0.5607291895453547, + "grad_norm": 2.1755871772766113, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6947880387306213, + "num_tokens": 127585671.0, + "step": 5106 + }, + { + "epoch": 0.5608390072479684, + "grad_norm": 2.2222700119018555, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7325460314750671, + "num_tokens": 127612556.0, + "step": 5107 + }, + { + "epoch": 0.5609488249505821, + "grad_norm": 2.4119577407836914, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7039998173713684, + "num_tokens": 127634574.0, + "step": 5108 + }, + { + "epoch": 0.5610586426531957, + "grad_norm": 2.227247714996338, + "learning_rate": 1e-06, + "loss": 1.0581, + "mean_token_accuracy": 0.6764597296714783, + "num_tokens": 127660137.0, + "step": 5109 + }, + { + "epoch": 0.5611684603558094, + "grad_norm": 2.4796712398529053, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7109615802764893, + "num_tokens": 127680751.0, + "step": 5110 + }, + { + "epoch": 0.561278278058423, + "grad_norm": 1.970867395401001, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6948948502540588, + "num_tokens": 127709317.0, + "step": 5111 + }, + { + "epoch": 0.5613880957610367, + "grad_norm": 2.136798620223999, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6961309313774109, + "num_tokens": 127735271.0, + "step": 5112 + }, + { + "epoch": 0.5614979134636503, + "grad_norm": 2.4563400745391846, + "learning_rate": 1e-06, + "loss": 1.0847, + "mean_token_accuracy": 0.6818265914916992, + "num_tokens": 127758441.0, + "step": 5113 + }, + { + "epoch": 0.5616077311662641, + "grad_norm": 2.345353841781616, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7063108086585999, + "num_tokens": 127780039.0, + "step": 5114 + }, + { + "epoch": 0.5617175488688777, + "grad_norm": 2.1392455101013184, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6952085494995117, + "num_tokens": 127807134.0, + "step": 5115 + }, + { + "epoch": 0.5618273665714913, + "grad_norm": 2.1021406650543213, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7292622327804565, + "num_tokens": 127833767.0, + "step": 5116 + }, + { + "epoch": 0.561937184274105, + "grad_norm": 2.243494987487793, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7312785983085632, + "num_tokens": 127856909.0, + "step": 5117 + }, + { + "epoch": 0.5620470019767186, + "grad_norm": 1.9427428245544434, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6733556985855103, + "num_tokens": 127890616.0, + "step": 5118 + }, + { + "epoch": 0.5621568196793323, + "grad_norm": 2.4471614360809326, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7217779755592346, + "num_tokens": 127912939.0, + "step": 5119 + }, + { + "epoch": 0.5622666373819459, + "grad_norm": 2.1088507175445557, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7052168846130371, + "num_tokens": 127942387.0, + "step": 5120 + }, + { + "epoch": 0.5623764550845596, + "grad_norm": 2.2249341011047363, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7011860609054565, + "num_tokens": 127969245.0, + "step": 5121 + }, + { + "epoch": 0.5624862727871733, + "grad_norm": 2.131725311279297, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7197535634040833, + "num_tokens": 127993952.0, + "step": 5122 + }, + { + "epoch": 0.562596090489787, + "grad_norm": 1.9720196723937988, + "learning_rate": 1e-06, + "loss": 1.0863, + "mean_token_accuracy": 0.6730319261550903, + "num_tokens": 128027533.0, + "step": 5123 + }, + { + "epoch": 0.5627059081924006, + "grad_norm": 2.216041326522827, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7204697132110596, + "num_tokens": 128052627.0, + "step": 5124 + }, + { + "epoch": 0.5628157258950143, + "grad_norm": 2.411881923675537, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6929537057876587, + "num_tokens": 128077312.0, + "step": 5125 + }, + { + "epoch": 0.5629255435976279, + "grad_norm": 2.0871167182922363, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7008006572723389, + "num_tokens": 128105090.0, + "step": 5126 + }, + { + "epoch": 0.5630353613002416, + "grad_norm": 2.314884662628174, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.712003767490387, + "num_tokens": 128127377.0, + "step": 5127 + }, + { + "epoch": 0.5631451790028552, + "grad_norm": 2.4924895763397217, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7161560654640198, + "num_tokens": 128147665.0, + "step": 5128 + }, + { + "epoch": 0.563254996705469, + "grad_norm": 2.1903433799743652, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7027889490127563, + "num_tokens": 128174742.0, + "step": 5129 + }, + { + "epoch": 0.5633648144080826, + "grad_norm": 2.0659544467926025, + "learning_rate": 1e-06, + "loss": 1.1123, + "mean_token_accuracy": 0.6730324029922485, + "num_tokens": 128205988.0, + "step": 5130 + }, + { + "epoch": 0.5634746321106963, + "grad_norm": 1.9611600637435913, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7184075117111206, + "num_tokens": 128236551.0, + "step": 5131 + }, + { + "epoch": 0.5635844498133099, + "grad_norm": 2.2857742309570312, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6942086219787598, + "num_tokens": 128261977.0, + "step": 5132 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 2.344825267791748, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7049511075019836, + "num_tokens": 128283937.0, + "step": 5133 + }, + { + "epoch": 0.5638040852185372, + "grad_norm": 2.1406185626983643, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7322845458984375, + "num_tokens": 128310781.0, + "step": 5134 + }, + { + "epoch": 0.5639139029211508, + "grad_norm": 2.0461018085479736, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7061991095542908, + "num_tokens": 128339169.0, + "step": 5135 + }, + { + "epoch": 0.5640237206237646, + "grad_norm": 2.2398409843444824, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7014281153678894, + "num_tokens": 128362877.0, + "step": 5136 + }, + { + "epoch": 0.5641335383263782, + "grad_norm": 2.0653297901153564, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7062430381774902, + "num_tokens": 128390568.0, + "step": 5137 + }, + { + "epoch": 0.5642433560289919, + "grad_norm": 2.4154882431030273, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7040128707885742, + "num_tokens": 128414430.0, + "step": 5138 + }, + { + "epoch": 0.5643531737316055, + "grad_norm": 2.1223065853118896, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.720969557762146, + "num_tokens": 128439517.0, + "step": 5139 + }, + { + "epoch": 0.5644629914342192, + "grad_norm": 2.174816846847534, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6917345523834229, + "num_tokens": 128467432.0, + "step": 5140 + }, + { + "epoch": 0.5645728091368328, + "grad_norm": 2.2589643001556396, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7336910963058472, + "num_tokens": 128492645.0, + "step": 5141 + }, + { + "epoch": 0.5646826268394465, + "grad_norm": 2.2946465015411377, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6847690343856812, + "num_tokens": 128517941.0, + "step": 5142 + }, + { + "epoch": 0.5647924445420602, + "grad_norm": 2.215418577194214, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7097307443618774, + "num_tokens": 128545092.0, + "step": 5143 + }, + { + "epoch": 0.5649022622446739, + "grad_norm": 2.4575650691986084, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7179687023162842, + "num_tokens": 128564759.0, + "step": 5144 + }, + { + "epoch": 0.5650120799472875, + "grad_norm": 2.2662556171417236, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.709591269493103, + "num_tokens": 128587766.0, + "step": 5145 + }, + { + "epoch": 0.5651218976499012, + "grad_norm": 2.1443793773651123, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7025766968727112, + "num_tokens": 128615533.0, + "step": 5146 + }, + { + "epoch": 0.5652317153525148, + "grad_norm": 2.2520415782928467, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.682845950126648, + "num_tokens": 128640379.0, + "step": 5147 + }, + { + "epoch": 0.5653415330551285, + "grad_norm": 2.1465201377868652, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7070480585098267, + "num_tokens": 128666140.0, + "step": 5148 + }, + { + "epoch": 0.5654513507577421, + "grad_norm": 2.0629312992095947, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7200879454612732, + "num_tokens": 128693887.0, + "step": 5149 + }, + { + "epoch": 0.5655611684603558, + "grad_norm": 2.2878355979919434, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7075463533401489, + "num_tokens": 128717154.0, + "step": 5150 + }, + { + "epoch": 0.5656709861629695, + "grad_norm": 2.441420793533325, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7119458913803101, + "num_tokens": 128739028.0, + "step": 5151 + }, + { + "epoch": 0.5657808038655832, + "grad_norm": 2.2219042778015137, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7129923105239868, + "num_tokens": 128762441.0, + "step": 5152 + }, + { + "epoch": 0.5658906215681968, + "grad_norm": 2.170687198638916, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6983197927474976, + "num_tokens": 128789572.0, + "step": 5153 + }, + { + "epoch": 0.5660004392708105, + "grad_norm": 2.0068776607513428, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7131561636924744, + "num_tokens": 128818578.0, + "step": 5154 + }, + { + "epoch": 0.5661102569734241, + "grad_norm": 2.117858409881592, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7098459005355835, + "num_tokens": 128846910.0, + "step": 5155 + }, + { + "epoch": 0.5662200746760377, + "grad_norm": 2.2363603115081787, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.685863733291626, + "num_tokens": 128872874.0, + "step": 5156 + }, + { + "epoch": 0.5663298923786514, + "grad_norm": 2.2581899166107178, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7171435356140137, + "num_tokens": 128898576.0, + "step": 5157 + }, + { + "epoch": 0.5664397100812651, + "grad_norm": 2.136025905609131, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7090510129928589, + "num_tokens": 128925426.0, + "step": 5158 + }, + { + "epoch": 0.5665495277838788, + "grad_norm": 2.675600528717041, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6989511847496033, + "num_tokens": 128943419.0, + "step": 5159 + }, + { + "epoch": 0.5666593454864924, + "grad_norm": 2.2609870433807373, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7281861305236816, + "num_tokens": 128967700.0, + "step": 5160 + }, + { + "epoch": 0.5667691631891061, + "grad_norm": 2.7943809032440186, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.733168363571167, + "num_tokens": 128983016.0, + "step": 5161 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 2.252016544342041, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7190932035446167, + "num_tokens": 129005656.0, + "step": 5162 + }, + { + "epoch": 0.5669887985943334, + "grad_norm": 2.043304443359375, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6957969069480896, + "num_tokens": 129034710.0, + "step": 5163 + }, + { + "epoch": 0.567098616296947, + "grad_norm": 1.9779356718063354, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7151080369949341, + "num_tokens": 129065054.0, + "step": 5164 + }, + { + "epoch": 0.5672084339995608, + "grad_norm": 2.406885862350464, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.695958137512207, + "num_tokens": 129086379.0, + "step": 5165 + }, + { + "epoch": 0.5673182517021744, + "grad_norm": 2.072244882583618, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6996177434921265, + "num_tokens": 129117448.0, + "step": 5166 + }, + { + "epoch": 0.5674280694047881, + "grad_norm": 2.092714548110962, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6977970600128174, + "num_tokens": 129146400.0, + "step": 5167 + }, + { + "epoch": 0.5675378871074017, + "grad_norm": 2.5672240257263184, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7489382028579712, + "num_tokens": 129164059.0, + "step": 5168 + }, + { + "epoch": 0.5676477048100154, + "grad_norm": 1.9766751527786255, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6862093210220337, + "num_tokens": 129194088.0, + "step": 5169 + }, + { + "epoch": 0.567757522512629, + "grad_norm": 2.6227612495422363, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7077357769012451, + "num_tokens": 129213593.0, + "step": 5170 + }, + { + "epoch": 0.5678673402152427, + "grad_norm": 2.197903871536255, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7178782820701599, + "num_tokens": 129238896.0, + "step": 5171 + }, + { + "epoch": 0.5679771579178564, + "grad_norm": 2.5109949111938477, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.6984964609146118, + "num_tokens": 129259992.0, + "step": 5172 + }, + { + "epoch": 0.5680869756204701, + "grad_norm": 2.0995426177978516, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.702717661857605, + "num_tokens": 129287761.0, + "step": 5173 + }, + { + "epoch": 0.5681967933230837, + "grad_norm": 2.0765743255615234, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7004506587982178, + "num_tokens": 129317131.0, + "step": 5174 + }, + { + "epoch": 0.5683066110256974, + "grad_norm": 2.4724297523498535, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7235070466995239, + "num_tokens": 129338134.0, + "step": 5175 + }, + { + "epoch": 0.568416428728311, + "grad_norm": 2.116758108139038, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6968281269073486, + "num_tokens": 129367093.0, + "step": 5176 + }, + { + "epoch": 0.5685262464309246, + "grad_norm": 2.618244171142578, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7089962959289551, + "num_tokens": 129388713.0, + "step": 5177 + }, + { + "epoch": 0.5686360641335383, + "grad_norm": 1.8639482259750366, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6875730752944946, + "num_tokens": 129423441.0, + "step": 5178 + }, + { + "epoch": 0.5687458818361519, + "grad_norm": 2.2445731163024902, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6973069310188293, + "num_tokens": 129448785.0, + "step": 5179 + }, + { + "epoch": 0.5688556995387657, + "grad_norm": 2.0514609813690186, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.693080484867096, + "num_tokens": 129477473.0, + "step": 5180 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 2.1226234436035156, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6913420557975769, + "num_tokens": 129505016.0, + "step": 5181 + }, + { + "epoch": 0.569075334943993, + "grad_norm": 2.110701322555542, + "learning_rate": 1e-06, + "loss": 1.0494, + "mean_token_accuracy": 0.686397910118103, + "num_tokens": 129533565.0, + "step": 5182 + }, + { + "epoch": 0.5691851526466066, + "grad_norm": 1.91909921169281, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7033724784851074, + "num_tokens": 129565793.0, + "step": 5183 + }, + { + "epoch": 0.5692949703492203, + "grad_norm": 2.3573434352874756, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.699776291847229, + "num_tokens": 129590365.0, + "step": 5184 + }, + { + "epoch": 0.5694047880518339, + "grad_norm": 2.099701166152954, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7111096382141113, + "num_tokens": 129618369.0, + "step": 5185 + }, + { + "epoch": 0.5695146057544476, + "grad_norm": 2.194478988647461, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7352592349052429, + "num_tokens": 129642611.0, + "step": 5186 + }, + { + "epoch": 0.5696244234570613, + "grad_norm": 2.3095948696136475, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6944280862808228, + "num_tokens": 129667021.0, + "step": 5187 + }, + { + "epoch": 0.569734241159675, + "grad_norm": 2.161499500274658, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7309894561767578, + "num_tokens": 129693377.0, + "step": 5188 + }, + { + "epoch": 0.5698440588622886, + "grad_norm": 2.5607035160064697, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7271319031715393, + "num_tokens": 129713072.0, + "step": 5189 + }, + { + "epoch": 0.5699538765649023, + "grad_norm": 2.2882981300354004, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.707464337348938, + "num_tokens": 129735879.0, + "step": 5190 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 2.1070165634155273, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.691332995891571, + "num_tokens": 129762654.0, + "step": 5191 + }, + { + "epoch": 0.5701735119701296, + "grad_norm": 2.0118327140808105, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6955831050872803, + "num_tokens": 129790621.0, + "step": 5192 + }, + { + "epoch": 0.5702833296727432, + "grad_norm": 2.2862768173217773, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7000594139099121, + "num_tokens": 129814123.0, + "step": 5193 + }, + { + "epoch": 0.570393147375357, + "grad_norm": 2.481046438217163, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7272090315818787, + "num_tokens": 129833540.0, + "step": 5194 + }, + { + "epoch": 0.5705029650779706, + "grad_norm": 1.931002140045166, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6813327074050903, + "num_tokens": 129866824.0, + "step": 5195 + }, + { + "epoch": 0.5706127827805842, + "grad_norm": 2.266662359237671, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7228485941886902, + "num_tokens": 129889969.0, + "step": 5196 + }, + { + "epoch": 0.5707226004831979, + "grad_norm": 2.1837570667266846, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6951939463615417, + "num_tokens": 129915421.0, + "step": 5197 + }, + { + "epoch": 0.5708324181858115, + "grad_norm": 2.3152215480804443, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7015121579170227, + "num_tokens": 129937163.0, + "step": 5198 + }, + { + "epoch": 0.5709422358884252, + "grad_norm": 2.3178114891052246, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7033246755599976, + "num_tokens": 129961124.0, + "step": 5199 + }, + { + "epoch": 0.5710520535910388, + "grad_norm": 2.1955771446228027, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7177397608757019, + "num_tokens": 129985740.0, + "step": 5200 + }, + { + "epoch": 0.5711618712936526, + "grad_norm": 2.249650478363037, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.7006161212921143, + "num_tokens": 130012650.0, + "step": 5201 + }, + { + "epoch": 0.5712716889962662, + "grad_norm": 1.9655423164367676, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6986949443817139, + "num_tokens": 130045009.0, + "step": 5202 + }, + { + "epoch": 0.5713815066988799, + "grad_norm": 2.1156930923461914, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7082281708717346, + "num_tokens": 130070059.0, + "step": 5203 + }, + { + "epoch": 0.5714913244014935, + "grad_norm": 2.1156105995178223, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7029237151145935, + "num_tokens": 130096587.0, + "step": 5204 + }, + { + "epoch": 0.5716011421041072, + "grad_norm": 2.5776431560516357, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.712303102016449, + "num_tokens": 130116121.0, + "step": 5205 + }, + { + "epoch": 0.5717109598067208, + "grad_norm": 2.253387212753296, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6912966966629028, + "num_tokens": 130142649.0, + "step": 5206 + }, + { + "epoch": 0.5718207775093345, + "grad_norm": 2.0720269680023193, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7311270236968994, + "num_tokens": 130169385.0, + "step": 5207 + }, + { + "epoch": 0.5719305952119481, + "grad_norm": 2.365062952041626, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.697755753993988, + "num_tokens": 130192066.0, + "step": 5208 + }, + { + "epoch": 0.5720404129145619, + "grad_norm": 2.249203681945801, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6929411292076111, + "num_tokens": 130216843.0, + "step": 5209 + }, + { + "epoch": 0.5721502306171755, + "grad_norm": 2.1433982849121094, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6963270902633667, + "num_tokens": 130243114.0, + "step": 5210 + }, + { + "epoch": 0.5722600483197892, + "grad_norm": 2.591182231903076, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7048866152763367, + "num_tokens": 130262972.0, + "step": 5211 + }, + { + "epoch": 0.5723698660224028, + "grad_norm": 2.4250831604003906, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7107115387916565, + "num_tokens": 130284271.0, + "step": 5212 + }, + { + "epoch": 0.5724796837250165, + "grad_norm": 2.119521379470825, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6997338533401489, + "num_tokens": 130312915.0, + "step": 5213 + }, + { + "epoch": 0.5725895014276301, + "grad_norm": 2.126927614212036, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7142376899719238, + "num_tokens": 130338400.0, + "step": 5214 + }, + { + "epoch": 0.5726993191302437, + "grad_norm": 2.2876763343811035, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6920717358589172, + "num_tokens": 130363007.0, + "step": 5215 + }, + { + "epoch": 0.5728091368328575, + "grad_norm": 2.0826830863952637, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6978095769882202, + "num_tokens": 130390303.0, + "step": 5216 + }, + { + "epoch": 0.5729189545354711, + "grad_norm": 2.229976177215576, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6866663694381714, + "num_tokens": 130416804.0, + "step": 5217 + }, + { + "epoch": 0.5730287722380848, + "grad_norm": 2.067070245742798, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6902815103530884, + "num_tokens": 130447283.0, + "step": 5218 + }, + { + "epoch": 0.5731385899406984, + "grad_norm": 2.3343236446380615, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7068933248519897, + "num_tokens": 130470716.0, + "step": 5219 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 2.4652209281921387, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7215460538864136, + "num_tokens": 130491185.0, + "step": 5220 + }, + { + "epoch": 0.5733582253459257, + "grad_norm": 2.2366902828216553, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.709182858467102, + "num_tokens": 130515915.0, + "step": 5221 + }, + { + "epoch": 0.5734680430485394, + "grad_norm": 2.279282808303833, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6956807971000671, + "num_tokens": 130539651.0, + "step": 5222 + }, + { + "epoch": 0.5735778607511531, + "grad_norm": 2.6801917552948, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7096070051193237, + "num_tokens": 130558339.0, + "step": 5223 + }, + { + "epoch": 0.5736876784537668, + "grad_norm": 2.224989175796509, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6910297870635986, + "num_tokens": 130585615.0, + "step": 5224 + }, + { + "epoch": 0.5737974961563804, + "grad_norm": 2.0374367237091064, + "learning_rate": 1e-06, + "loss": 1.0799, + "mean_token_accuracy": 0.6758021116256714, + "num_tokens": 130616250.0, + "step": 5225 + }, + { + "epoch": 0.5739073138589941, + "grad_norm": 2.1685893535614014, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7072519063949585, + "num_tokens": 130644523.0, + "step": 5226 + }, + { + "epoch": 0.5740171315616077, + "grad_norm": 2.377138614654541, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7150195240974426, + "num_tokens": 130666542.0, + "step": 5227 + }, + { + "epoch": 0.5741269492642214, + "grad_norm": 2.1079888343811035, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6917069554328918, + "num_tokens": 130695411.0, + "step": 5228 + }, + { + "epoch": 0.574236766966835, + "grad_norm": 3.10072922706604, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7043482065200806, + "num_tokens": 130716068.0, + "step": 5229 + }, + { + "epoch": 0.5743465846694488, + "grad_norm": 2.054138422012329, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6963940858840942, + "num_tokens": 130746064.0, + "step": 5230 + }, + { + "epoch": 0.5744564023720624, + "grad_norm": 2.261786699295044, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.7066160440444946, + "num_tokens": 130771400.0, + "step": 5231 + }, + { + "epoch": 0.5745662200746761, + "grad_norm": 2.4077308177948, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.715687096118927, + "num_tokens": 130792253.0, + "step": 5232 + }, + { + "epoch": 0.5746760377772897, + "grad_norm": 2.3075692653656006, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6823039650917053, + "num_tokens": 130816268.0, + "step": 5233 + }, + { + "epoch": 0.5747858554799034, + "grad_norm": 2.0245556831359863, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7067966461181641, + "num_tokens": 130846337.0, + "step": 5234 + }, + { + "epoch": 0.574895673182517, + "grad_norm": 2.321709394454956, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7160158753395081, + "num_tokens": 130869746.0, + "step": 5235 + }, + { + "epoch": 0.5750054908851306, + "grad_norm": 2.0299391746520996, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7150363922119141, + "num_tokens": 130897400.0, + "step": 5236 + }, + { + "epoch": 0.5751153085877443, + "grad_norm": 2.4923925399780273, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.712649941444397, + "num_tokens": 130916834.0, + "step": 5237 + }, + { + "epoch": 0.575225126290358, + "grad_norm": 1.9808812141418457, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7065281867980957, + "num_tokens": 130946675.0, + "step": 5238 + }, + { + "epoch": 0.5753349439929717, + "grad_norm": 2.279160499572754, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6981821060180664, + "num_tokens": 130971163.0, + "step": 5239 + }, + { + "epoch": 0.5754447616955853, + "grad_norm": 2.0379867553710938, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7031322717666626, + "num_tokens": 131001759.0, + "step": 5240 + }, + { + "epoch": 0.575554579398199, + "grad_norm": 2.445125102996826, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7066026926040649, + "num_tokens": 131023180.0, + "step": 5241 + }, + { + "epoch": 0.5756643971008126, + "grad_norm": 1.8759331703186035, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6865388751029968, + "num_tokens": 131057358.0, + "step": 5242 + }, + { + "epoch": 0.5757742148034263, + "grad_norm": 2.400735855102539, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.7003858089447021, + "num_tokens": 131082049.0, + "step": 5243 + }, + { + "epoch": 0.5758840325060399, + "grad_norm": 2.479212999343872, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7235816717147827, + "num_tokens": 131101638.0, + "step": 5244 + }, + { + "epoch": 0.5759938502086537, + "grad_norm": 2.311793804168701, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.718437671661377, + "num_tokens": 131125045.0, + "step": 5245 + }, + { + "epoch": 0.5761036679112673, + "grad_norm": 2.2371721267700195, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7245233654975891, + "num_tokens": 131148489.0, + "step": 5246 + }, + { + "epoch": 0.576213485613881, + "grad_norm": 2.3438730239868164, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.711422860622406, + "num_tokens": 131171312.0, + "step": 5247 + }, + { + "epoch": 0.5763233033164946, + "grad_norm": 1.982515573501587, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7010906934738159, + "num_tokens": 131200686.0, + "step": 5248 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 2.0093789100646973, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6884716153144836, + "num_tokens": 131233036.0, + "step": 5249 + }, + { + "epoch": 0.5765429387217219, + "grad_norm": 2.4080610275268555, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7044721841812134, + "num_tokens": 131256908.0, + "step": 5250 + }, + { + "epoch": 0.5766527564243356, + "grad_norm": 2.284222364425659, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7043889164924622, + "num_tokens": 131280386.0, + "step": 5251 + }, + { + "epoch": 0.5767625741269493, + "grad_norm": 2.2718334197998047, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7083818912506104, + "num_tokens": 131302841.0, + "step": 5252 + }, + { + "epoch": 0.576872391829563, + "grad_norm": 2.1150083541870117, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6966198682785034, + "num_tokens": 131330689.0, + "step": 5253 + }, + { + "epoch": 0.5769822095321766, + "grad_norm": 2.404151201248169, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.7040966153144836, + "num_tokens": 131355041.0, + "step": 5254 + }, + { + "epoch": 0.5770920272347903, + "grad_norm": 2.35780930519104, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6882970333099365, + "num_tokens": 131378621.0, + "step": 5255 + }, + { + "epoch": 0.5772018449374039, + "grad_norm": 2.2547903060913086, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7017239928245544, + "num_tokens": 131403008.0, + "step": 5256 + }, + { + "epoch": 0.5773116626400175, + "grad_norm": 2.57401967048645, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7286554574966431, + "num_tokens": 131423551.0, + "step": 5257 + }, + { + "epoch": 0.5774214803426312, + "grad_norm": 2.2729218006134033, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7014621496200562, + "num_tokens": 131449781.0, + "step": 5258 + }, + { + "epoch": 0.577531298045245, + "grad_norm": 2.2168447971343994, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7220704555511475, + "num_tokens": 131474659.0, + "step": 5259 + }, + { + "epoch": 0.5776411157478586, + "grad_norm": 1.9916400909423828, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7171632647514343, + "num_tokens": 131503357.0, + "step": 5260 + }, + { + "epoch": 0.5777509334504722, + "grad_norm": 2.023481845855713, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.695935070514679, + "num_tokens": 131532777.0, + "step": 5261 + }, + { + "epoch": 0.5778607511530859, + "grad_norm": 2.780780553817749, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7257756590843201, + "num_tokens": 131549415.0, + "step": 5262 + }, + { + "epoch": 0.5779705688556995, + "grad_norm": 2.1677045822143555, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.717621922492981, + "num_tokens": 131575180.0, + "step": 5263 + }, + { + "epoch": 0.5780803865583132, + "grad_norm": 2.385575771331787, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7049878835678101, + "num_tokens": 131597051.0, + "step": 5264 + }, + { + "epoch": 0.5781902042609268, + "grad_norm": 2.6542890071868896, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7062254548072815, + "num_tokens": 131616314.0, + "step": 5265 + }, + { + "epoch": 0.5783000219635406, + "grad_norm": 2.1670122146606445, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7038098573684692, + "num_tokens": 131645013.0, + "step": 5266 + }, + { + "epoch": 0.5784098396661542, + "grad_norm": 2.3309342861175537, + "learning_rate": 1e-06, + "loss": 1.1155, + "mean_token_accuracy": 0.6773724555969238, + "num_tokens": 131670804.0, + "step": 5267 + }, + { + "epoch": 0.5785196573687679, + "grad_norm": 1.9563151597976685, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7030470371246338, + "num_tokens": 131702880.0, + "step": 5268 + }, + { + "epoch": 0.5786294750713815, + "grad_norm": 2.594977855682373, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6864054203033447, + "num_tokens": 131723918.0, + "step": 5269 + }, + { + "epoch": 0.5787392927739952, + "grad_norm": 2.3215556144714355, + "learning_rate": 1e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6830211877822876, + "num_tokens": 131747620.0, + "step": 5270 + }, + { + "epoch": 0.5788491104766088, + "grad_norm": 2.15190052986145, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7003598213195801, + "num_tokens": 131774535.0, + "step": 5271 + }, + { + "epoch": 0.5789589281792225, + "grad_norm": 2.3134639263153076, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7153613567352295, + "num_tokens": 131797846.0, + "step": 5272 + }, + { + "epoch": 0.5790687458818361, + "grad_norm": 2.123055934906006, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.688606858253479, + "num_tokens": 131824772.0, + "step": 5273 + }, + { + "epoch": 0.5791785635844499, + "grad_norm": 2.4954142570495605, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7215121388435364, + "num_tokens": 131846057.0, + "step": 5274 + }, + { + "epoch": 0.5792883812870635, + "grad_norm": 2.379438877105713, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7304768562316895, + "num_tokens": 131867359.0, + "step": 5275 + }, + { + "epoch": 0.5793981989896771, + "grad_norm": 2.4042000770568848, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7181388139724731, + "num_tokens": 131887796.0, + "step": 5276 + }, + { + "epoch": 0.5795080166922908, + "grad_norm": 2.326934337615967, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6956154704093933, + "num_tokens": 131911027.0, + "step": 5277 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 2.3661537170410156, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7159854173660278, + "num_tokens": 131932969.0, + "step": 5278 + }, + { + "epoch": 0.5797276520975181, + "grad_norm": 2.054572820663452, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6936267018318176, + "num_tokens": 131961255.0, + "step": 5279 + }, + { + "epoch": 0.5798374698001317, + "grad_norm": 2.3669354915618896, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7032181024551392, + "num_tokens": 131984448.0, + "step": 5280 + }, + { + "epoch": 0.5799472875027455, + "grad_norm": 2.129624843597412, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7079175710678101, + "num_tokens": 132010515.0, + "step": 5281 + }, + { + "epoch": 0.5800571052053591, + "grad_norm": 2.430617094039917, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7383261919021606, + "num_tokens": 132031008.0, + "step": 5282 + }, + { + "epoch": 0.5801669229079728, + "grad_norm": 2.5748860836029053, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7103131413459778, + "num_tokens": 132051401.0, + "step": 5283 + }, + { + "epoch": 0.5802767406105864, + "grad_norm": 2.287416696548462, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7071005702018738, + "num_tokens": 132075640.0, + "step": 5284 + }, + { + "epoch": 0.5803865583132001, + "grad_norm": 2.269136428833008, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6947996616363525, + "num_tokens": 132100740.0, + "step": 5285 + }, + { + "epoch": 0.5804963760158137, + "grad_norm": 2.1535961627960205, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7117266654968262, + "num_tokens": 132126125.0, + "step": 5286 + }, + { + "epoch": 0.5806061937184274, + "grad_norm": 2.27333402633667, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6916382908821106, + "num_tokens": 132151515.0, + "step": 5287 + }, + { + "epoch": 0.5807160114210411, + "grad_norm": 2.1266937255859375, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7013379335403442, + "num_tokens": 132178615.0, + "step": 5288 + }, + { + "epoch": 0.5808258291236548, + "grad_norm": 2.2701447010040283, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.728034496307373, + "num_tokens": 132202922.0, + "step": 5289 + }, + { + "epoch": 0.5809356468262684, + "grad_norm": 2.0480587482452393, + "learning_rate": 1e-06, + "loss": 1.0688, + "mean_token_accuracy": 0.6724169254302979, + "num_tokens": 132231548.0, + "step": 5290 + }, + { + "epoch": 0.5810454645288821, + "grad_norm": 2.2000558376312256, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.698992908000946, + "num_tokens": 132256530.0, + "step": 5291 + }, + { + "epoch": 0.5811552822314957, + "grad_norm": 2.2569849491119385, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7322399616241455, + "num_tokens": 132283091.0, + "step": 5292 + }, + { + "epoch": 0.5812650999341094, + "grad_norm": 2.3921682834625244, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7092945575714111, + "num_tokens": 132305500.0, + "step": 5293 + }, + { + "epoch": 0.581374917636723, + "grad_norm": 2.081382989883423, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7217841148376465, + "num_tokens": 132332183.0, + "step": 5294 + }, + { + "epoch": 0.5814847353393368, + "grad_norm": 2.1141133308410645, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.700562596321106, + "num_tokens": 132360927.0, + "step": 5295 + }, + { + "epoch": 0.5815945530419504, + "grad_norm": 2.1491613388061523, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7095799446105957, + "num_tokens": 132385741.0, + "step": 5296 + }, + { + "epoch": 0.581704370744564, + "grad_norm": 2.059997797012329, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7122176885604858, + "num_tokens": 132413908.0, + "step": 5297 + }, + { + "epoch": 0.5818141884471777, + "grad_norm": 2.3961827754974365, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7145671844482422, + "num_tokens": 132435797.0, + "step": 5298 + }, + { + "epoch": 0.5819240061497913, + "grad_norm": 2.2726612091064453, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6946649551391602, + "num_tokens": 132461414.0, + "step": 5299 + }, + { + "epoch": 0.582033823852405, + "grad_norm": 2.268846273422241, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.698256254196167, + "num_tokens": 132486378.0, + "step": 5300 + }, + { + "epoch": 0.5821436415550186, + "grad_norm": 1.977235198020935, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.732330858707428, + "num_tokens": 132514798.0, + "step": 5301 + }, + { + "epoch": 0.5822534592576323, + "grad_norm": 2.0102553367614746, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6936644315719604, + "num_tokens": 132543419.0, + "step": 5302 + }, + { + "epoch": 0.582363276960246, + "grad_norm": 2.199685573577881, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6963763236999512, + "num_tokens": 132569260.0, + "step": 5303 + }, + { + "epoch": 0.5824730946628597, + "grad_norm": 2.1268091201782227, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7286676168441772, + "num_tokens": 132594970.0, + "step": 5304 + }, + { + "epoch": 0.5825829123654733, + "grad_norm": 2.1379570960998535, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6849199533462524, + "num_tokens": 132623306.0, + "step": 5305 + }, + { + "epoch": 0.582692730068087, + "grad_norm": 2.2188220024108887, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7018272280693054, + "num_tokens": 132647036.0, + "step": 5306 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 2.1068220138549805, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6789448857307434, + "num_tokens": 132673504.0, + "step": 5307 + }, + { + "epoch": 0.5829123654733143, + "grad_norm": 2.0346016883850098, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.67622971534729, + "num_tokens": 132703720.0, + "step": 5308 + }, + { + "epoch": 0.5830221831759279, + "grad_norm": 2.141655683517456, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.677980899810791, + "num_tokens": 132732210.0, + "step": 5309 + }, + { + "epoch": 0.5831320008785417, + "grad_norm": 2.4881649017333984, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7176020741462708, + "num_tokens": 132751745.0, + "step": 5310 + }, + { + "epoch": 0.5832418185811553, + "grad_norm": 2.1529202461242676, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7009338140487671, + "num_tokens": 132778094.0, + "step": 5311 + }, + { + "epoch": 0.583351636283769, + "grad_norm": 2.0257534980773926, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7042557001113892, + "num_tokens": 132807103.0, + "step": 5312 + }, + { + "epoch": 0.5834614539863826, + "grad_norm": 2.3518741130828857, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7070308923721313, + "num_tokens": 132829841.0, + "step": 5313 + }, + { + "epoch": 0.5835712716889963, + "grad_norm": 2.203084707260132, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7000774145126343, + "num_tokens": 132855542.0, + "step": 5314 + }, + { + "epoch": 0.5836810893916099, + "grad_norm": 2.021721839904785, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6866638660430908, + "num_tokens": 132883491.0, + "step": 5315 + }, + { + "epoch": 0.5837909070942235, + "grad_norm": 2.800581216812134, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7086125016212463, + "num_tokens": 132900702.0, + "step": 5316 + }, + { + "epoch": 0.5839007247968373, + "grad_norm": 2.4177207946777344, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7123668789863586, + "num_tokens": 132921536.0, + "step": 5317 + }, + { + "epoch": 0.584010542499451, + "grad_norm": 2.2220547199249268, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7054613828659058, + "num_tokens": 132946597.0, + "step": 5318 + }, + { + "epoch": 0.5841203602020646, + "grad_norm": 2.1221044063568115, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7237752676010132, + "num_tokens": 132972045.0, + "step": 5319 + }, + { + "epoch": 0.5842301779046782, + "grad_norm": 2.3952419757843018, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7038784027099609, + "num_tokens": 132992932.0, + "step": 5320 + }, + { + "epoch": 0.5843399956072919, + "grad_norm": 2.083070755004883, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6979954838752747, + "num_tokens": 133021382.0, + "step": 5321 + }, + { + "epoch": 0.5844498133099055, + "grad_norm": 2.336435079574585, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6928892731666565, + "num_tokens": 133046803.0, + "step": 5322 + }, + { + "epoch": 0.5845596310125192, + "grad_norm": 1.8388326168060303, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6805057525634766, + "num_tokens": 133081915.0, + "step": 5323 + }, + { + "epoch": 0.5846694487151329, + "grad_norm": 1.9340858459472656, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6858339905738831, + "num_tokens": 133114088.0, + "step": 5324 + }, + { + "epoch": 0.5847792664177466, + "grad_norm": 1.9866677522659302, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6911949515342712, + "num_tokens": 133145971.0, + "step": 5325 + }, + { + "epoch": 0.5848890841203602, + "grad_norm": 2.359530448913574, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7188390493392944, + "num_tokens": 133167472.0, + "step": 5326 + }, + { + "epoch": 0.5849989018229739, + "grad_norm": 2.523873805999756, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7146906852722168, + "num_tokens": 133188414.0, + "step": 5327 + }, + { + "epoch": 0.5851087195255875, + "grad_norm": 2.3369479179382324, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7071475386619568, + "num_tokens": 133210697.0, + "step": 5328 + }, + { + "epoch": 0.5852185372282012, + "grad_norm": 2.4124436378479004, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7261638641357422, + "num_tokens": 133231010.0, + "step": 5329 + }, + { + "epoch": 0.5853283549308148, + "grad_norm": 2.469508409500122, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.690024733543396, + "num_tokens": 133253101.0, + "step": 5330 + }, + { + "epoch": 0.5854381726334285, + "grad_norm": 2.383920907974243, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7304525375366211, + "num_tokens": 133273631.0, + "step": 5331 + }, + { + "epoch": 0.5855479903360422, + "grad_norm": 2.2900235652923584, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.737411618232727, + "num_tokens": 133297097.0, + "step": 5332 + }, + { + "epoch": 0.5856578080386559, + "grad_norm": 2.071749448776245, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7028616666793823, + "num_tokens": 133325434.0, + "step": 5333 + }, + { + "epoch": 0.5857676257412695, + "grad_norm": 2.3939638137817383, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7335149645805359, + "num_tokens": 133345061.0, + "step": 5334 + }, + { + "epoch": 0.5858774434438832, + "grad_norm": 2.337887763977051, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6947107315063477, + "num_tokens": 133370313.0, + "step": 5335 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 2.430863618850708, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6946142315864563, + "num_tokens": 133393936.0, + "step": 5336 + }, + { + "epoch": 0.5860970788491104, + "grad_norm": 2.046480417251587, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6761175394058228, + "num_tokens": 133423919.0, + "step": 5337 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 2.339853048324585, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7477180361747742, + "num_tokens": 133445571.0, + "step": 5338 + }, + { + "epoch": 0.5863167142543378, + "grad_norm": 2.2209489345550537, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6973440051078796, + "num_tokens": 133472640.0, + "step": 5339 + }, + { + "epoch": 0.5864265319569515, + "grad_norm": 1.930537223815918, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7061710357666016, + "num_tokens": 133501873.0, + "step": 5340 + }, + { + "epoch": 0.5865363496595651, + "grad_norm": 2.220885992050171, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7035544514656067, + "num_tokens": 133527635.0, + "step": 5341 + }, + { + "epoch": 0.5866461673621788, + "grad_norm": 2.3184361457824707, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6879259347915649, + "num_tokens": 133550158.0, + "step": 5342 + }, + { + "epoch": 0.5867559850647924, + "grad_norm": 2.1025848388671875, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6998624205589294, + "num_tokens": 133579026.0, + "step": 5343 + }, + { + "epoch": 0.5868658027674061, + "grad_norm": 2.049076795578003, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.688807487487793, + "num_tokens": 133608865.0, + "step": 5344 + }, + { + "epoch": 0.5869756204700197, + "grad_norm": 2.2037863731384277, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7318154573440552, + "num_tokens": 133633288.0, + "step": 5345 + }, + { + "epoch": 0.5870854381726335, + "grad_norm": 2.597785472869873, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7341709733009338, + "num_tokens": 133651386.0, + "step": 5346 + }, + { + "epoch": 0.5871952558752471, + "grad_norm": 2.500859498977661, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7042419910430908, + "num_tokens": 133672560.0, + "step": 5347 + }, + { + "epoch": 0.5873050735778608, + "grad_norm": 2.0788867473602295, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.677483320236206, + "num_tokens": 133703271.0, + "step": 5348 + }, + { + "epoch": 0.5874148912804744, + "grad_norm": 2.0495235919952393, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.697488009929657, + "num_tokens": 133732368.0, + "step": 5349 + }, + { + "epoch": 0.5875247089830881, + "grad_norm": 2.058422327041626, + "learning_rate": 1e-06, + "loss": 1.111, + "mean_token_accuracy": 0.6674306988716125, + "num_tokens": 133762642.0, + "step": 5350 + }, + { + "epoch": 0.5876345266857017, + "grad_norm": 2.0511579513549805, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7126178741455078, + "num_tokens": 133790964.0, + "step": 5351 + }, + { + "epoch": 0.5877443443883154, + "grad_norm": 2.3099205493927, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7250396013259888, + "num_tokens": 133812829.0, + "step": 5352 + }, + { + "epoch": 0.5878541620909291, + "grad_norm": 2.1632649898529053, + "learning_rate": 1e-06, + "loss": 1.0906, + "mean_token_accuracy": 0.6932926177978516, + "num_tokens": 133838721.0, + "step": 5353 + }, + { + "epoch": 0.5879639797935428, + "grad_norm": 2.094118118286133, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6763181686401367, + "num_tokens": 133866789.0, + "step": 5354 + }, + { + "epoch": 0.5880737974961564, + "grad_norm": 2.144770860671997, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7050143480300903, + "num_tokens": 133892082.0, + "step": 5355 + }, + { + "epoch": 0.58818361519877, + "grad_norm": 2.2016823291778564, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6968330144882202, + "num_tokens": 133916735.0, + "step": 5356 + }, + { + "epoch": 0.5882934329013837, + "grad_norm": 2.795832633972168, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7266631126403809, + "num_tokens": 133933614.0, + "step": 5357 + }, + { + "epoch": 0.5884032506039973, + "grad_norm": 2.3548812866210938, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7047067880630493, + "num_tokens": 133957608.0, + "step": 5358 + }, + { + "epoch": 0.588513068306611, + "grad_norm": 2.5825088024139404, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7042772769927979, + "num_tokens": 133977957.0, + "step": 5359 + }, + { + "epoch": 0.5886228860092246, + "grad_norm": 2.1439645290374756, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6949106454849243, + "num_tokens": 134005835.0, + "step": 5360 + }, + { + "epoch": 0.5887327037118384, + "grad_norm": 2.0642075538635254, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7459492087364197, + "num_tokens": 134032593.0, + "step": 5361 + }, + { + "epoch": 0.588842521414452, + "grad_norm": 2.395296335220337, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7010581493377686, + "num_tokens": 134054341.0, + "step": 5362 + }, + { + "epoch": 0.5889523391170657, + "grad_norm": 2.093722105026245, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7148253321647644, + "num_tokens": 134080889.0, + "step": 5363 + }, + { + "epoch": 0.5890621568196793, + "grad_norm": 2.3388113975524902, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7315243482589722, + "num_tokens": 134101532.0, + "step": 5364 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 2.3600013256073, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7272844910621643, + "num_tokens": 134122710.0, + "step": 5365 + }, + { + "epoch": 0.5892817922249066, + "grad_norm": 2.3788774013519287, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7395786046981812, + "num_tokens": 134143993.0, + "step": 5366 + }, + { + "epoch": 0.5893916099275203, + "grad_norm": 2.7053442001342773, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7190546989440918, + "num_tokens": 134162853.0, + "step": 5367 + }, + { + "epoch": 0.589501427630134, + "grad_norm": 2.2812345027923584, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7219167351722717, + "num_tokens": 134185365.0, + "step": 5368 + }, + { + "epoch": 0.5896112453327477, + "grad_norm": 2.268564224243164, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.696912407875061, + "num_tokens": 134209443.0, + "step": 5369 + }, + { + "epoch": 0.5897210630353613, + "grad_norm": 2.5744073390960693, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7133430242538452, + "num_tokens": 134229423.0, + "step": 5370 + }, + { + "epoch": 0.589830880737975, + "grad_norm": 2.0311665534973145, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7048947811126709, + "num_tokens": 134258497.0, + "step": 5371 + }, + { + "epoch": 0.5899406984405886, + "grad_norm": 2.9228098392486572, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7220577597618103, + "num_tokens": 134273502.0, + "step": 5372 + }, + { + "epoch": 0.5900505161432023, + "grad_norm": 2.071711301803589, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6973158121109009, + "num_tokens": 134303358.0, + "step": 5373 + }, + { + "epoch": 0.5901603338458159, + "grad_norm": 2.132033348083496, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.730383038520813, + "num_tokens": 134329314.0, + "step": 5374 + }, + { + "epoch": 0.5902701515484297, + "grad_norm": 2.159050941467285, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6952975988388062, + "num_tokens": 134357357.0, + "step": 5375 + }, + { + "epoch": 0.5903799692510433, + "grad_norm": 2.324660539627075, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6966955661773682, + "num_tokens": 134383150.0, + "step": 5376 + }, + { + "epoch": 0.590489786953657, + "grad_norm": 2.0330429077148438, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7066088914871216, + "num_tokens": 134410552.0, + "step": 5377 + }, + { + "epoch": 0.5905996046562706, + "grad_norm": 2.0945940017700195, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7049862742424011, + "num_tokens": 134437872.0, + "step": 5378 + }, + { + "epoch": 0.5907094223588842, + "grad_norm": 2.1598775386810303, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7024253606796265, + "num_tokens": 134464467.0, + "step": 5379 + }, + { + "epoch": 0.5908192400614979, + "grad_norm": 2.081866979598999, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7165588736534119, + "num_tokens": 134492882.0, + "step": 5380 + }, + { + "epoch": 0.5909290577641115, + "grad_norm": 1.8703030347824097, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7038078308105469, + "num_tokens": 134524721.0, + "step": 5381 + }, + { + "epoch": 0.5910388754667253, + "grad_norm": 2.2449886798858643, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7037858963012695, + "num_tokens": 134550028.0, + "step": 5382 + }, + { + "epoch": 0.5911486931693389, + "grad_norm": 2.181666851043701, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6961405277252197, + "num_tokens": 134577263.0, + "step": 5383 + }, + { + "epoch": 0.5912585108719526, + "grad_norm": 2.423086166381836, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7090020775794983, + "num_tokens": 134597994.0, + "step": 5384 + }, + { + "epoch": 0.5913683285745662, + "grad_norm": 2.1205601692199707, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6857341527938843, + "num_tokens": 134626966.0, + "step": 5385 + }, + { + "epoch": 0.5914781462771799, + "grad_norm": 2.1717031002044678, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6947850584983826, + "num_tokens": 134653760.0, + "step": 5386 + }, + { + "epoch": 0.5915879639797935, + "grad_norm": 2.241830348968506, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6794420480728149, + "num_tokens": 134676989.0, + "step": 5387 + }, + { + "epoch": 0.5916977816824072, + "grad_norm": 2.0991156101226807, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6972949504852295, + "num_tokens": 134704472.0, + "step": 5388 + }, + { + "epoch": 0.5918075993850208, + "grad_norm": 2.3261773586273193, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6968741416931152, + "num_tokens": 134728293.0, + "step": 5389 + }, + { + "epoch": 0.5919174170876346, + "grad_norm": 2.412855386734009, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7031574845314026, + "num_tokens": 134750224.0, + "step": 5390 + }, + { + "epoch": 0.5920272347902482, + "grad_norm": 2.4637577533721924, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7138365507125854, + "num_tokens": 134771223.0, + "step": 5391 + }, + { + "epoch": 0.5921370524928619, + "grad_norm": 2.1297011375427246, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7099097371101379, + "num_tokens": 134798037.0, + "step": 5392 + }, + { + "epoch": 0.5922468701954755, + "grad_norm": 1.9150477647781372, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6861332654953003, + "num_tokens": 134830478.0, + "step": 5393 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 2.1253087520599365, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7026325464248657, + "num_tokens": 134856162.0, + "step": 5394 + }, + { + "epoch": 0.5924665056007028, + "grad_norm": 2.089162826538086, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7084097862243652, + "num_tokens": 134884385.0, + "step": 5395 + }, + { + "epoch": 0.5925763233033164, + "grad_norm": 2.463071823120117, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6958651542663574, + "num_tokens": 134906371.0, + "step": 5396 + }, + { + "epoch": 0.5926861410059302, + "grad_norm": 2.3406152725219727, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7150627374649048, + "num_tokens": 134929593.0, + "step": 5397 + }, + { + "epoch": 0.5927959587085438, + "grad_norm": 2.1909122467041016, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.686130702495575, + "num_tokens": 134955063.0, + "step": 5398 + }, + { + "epoch": 0.5929057764111575, + "grad_norm": 2.159497022628784, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7157065868377686, + "num_tokens": 134980573.0, + "step": 5399 + }, + { + "epoch": 0.5930155941137711, + "grad_norm": 2.404289484024048, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6922833323478699, + "num_tokens": 135001591.0, + "step": 5400 + }, + { + "epoch": 0.5931254118163848, + "grad_norm": 2.5409789085388184, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7346082925796509, + "num_tokens": 135021429.0, + "step": 5401 + }, + { + "epoch": 0.5932352295189984, + "grad_norm": 2.133549928665161, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7061108946800232, + "num_tokens": 135047431.0, + "step": 5402 + }, + { + "epoch": 0.5933450472216121, + "grad_norm": 2.391195297241211, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7155228853225708, + "num_tokens": 135069139.0, + "step": 5403 + }, + { + "epoch": 0.5934548649242258, + "grad_norm": 2.0277445316314697, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7201188802719116, + "num_tokens": 135098853.0, + "step": 5404 + }, + { + "epoch": 0.5935646826268395, + "grad_norm": 2.191822052001953, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6989449262619019, + "num_tokens": 135125791.0, + "step": 5405 + }, + { + "epoch": 0.5936745003294531, + "grad_norm": 2.357123613357544, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7112696170806885, + "num_tokens": 135149578.0, + "step": 5406 + }, + { + "epoch": 0.5937843180320668, + "grad_norm": 2.392277956008911, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7150534391403198, + "num_tokens": 135171640.0, + "step": 5407 + }, + { + "epoch": 0.5938941357346804, + "grad_norm": 2.290005683898926, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7075386047363281, + "num_tokens": 135194850.0, + "step": 5408 + }, + { + "epoch": 0.5940039534372941, + "grad_norm": 2.101506233215332, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6981753706932068, + "num_tokens": 135221448.0, + "step": 5409 + }, + { + "epoch": 0.5941137711399077, + "grad_norm": 2.2417685985565186, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6821622848510742, + "num_tokens": 135245586.0, + "step": 5410 + }, + { + "epoch": 0.5942235888425215, + "grad_norm": 2.440931797027588, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7241026163101196, + "num_tokens": 135263513.0, + "step": 5411 + }, + { + "epoch": 0.5943334065451351, + "grad_norm": 2.0053305625915527, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.707747220993042, + "num_tokens": 135290665.0, + "step": 5412 + }, + { + "epoch": 0.5944432242477488, + "grad_norm": 2.090670347213745, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.686789870262146, + "num_tokens": 135318185.0, + "step": 5413 + }, + { + "epoch": 0.5945530419503624, + "grad_norm": 2.120856761932373, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7020275592803955, + "num_tokens": 135345213.0, + "step": 5414 + }, + { + "epoch": 0.594662859652976, + "grad_norm": 2.3040404319763184, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7128452062606812, + "num_tokens": 135368327.0, + "step": 5415 + }, + { + "epoch": 0.5947726773555897, + "grad_norm": 2.380199670791626, + "learning_rate": 1e-06, + "loss": 1.0575, + "mean_token_accuracy": 0.6912351846694946, + "num_tokens": 135392777.0, + "step": 5416 + }, + { + "epoch": 0.5948824950582033, + "grad_norm": 2.082324743270874, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7170820832252502, + "num_tokens": 135420951.0, + "step": 5417 + }, + { + "epoch": 0.5949923127608171, + "grad_norm": 2.2644617557525635, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7076882719993591, + "num_tokens": 135444244.0, + "step": 5418 + }, + { + "epoch": 0.5951021304634307, + "grad_norm": 1.9827251434326172, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7385234832763672, + "num_tokens": 135473483.0, + "step": 5419 + }, + { + "epoch": 0.5952119481660444, + "grad_norm": 2.1154720783233643, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.709685742855072, + "num_tokens": 135499402.0, + "step": 5420 + }, + { + "epoch": 0.595321765868658, + "grad_norm": 2.0316786766052246, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7002808451652527, + "num_tokens": 135528198.0, + "step": 5421 + }, + { + "epoch": 0.5954315835712717, + "grad_norm": 2.2368125915527344, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6890265941619873, + "num_tokens": 135553685.0, + "step": 5422 + }, + { + "epoch": 0.5955414012738853, + "grad_norm": 2.5379621982574463, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6981332898139954, + "num_tokens": 135573270.0, + "step": 5423 + }, + { + "epoch": 0.595651218976499, + "grad_norm": 2.225705146789551, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.720979630947113, + "num_tokens": 135597466.0, + "step": 5424 + }, + { + "epoch": 0.5957610366791126, + "grad_norm": 2.4557554721832275, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6958112716674805, + "num_tokens": 135619141.0, + "step": 5425 + }, + { + "epoch": 0.5958708543817264, + "grad_norm": 2.0461394786834717, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.7000688314437866, + "num_tokens": 135647805.0, + "step": 5426 + }, + { + "epoch": 0.59598067208434, + "grad_norm": 2.1997134685516357, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7159972786903381, + "num_tokens": 135672530.0, + "step": 5427 + }, + { + "epoch": 0.5960904897869537, + "grad_norm": 2.130099296569824, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.703801155090332, + "num_tokens": 135701636.0, + "step": 5428 + }, + { + "epoch": 0.5962003074895673, + "grad_norm": 2.156519651412964, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6949717998504639, + "num_tokens": 135729072.0, + "step": 5429 + }, + { + "epoch": 0.596310125192181, + "grad_norm": 2.427872657775879, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7123114466667175, + "num_tokens": 135751820.0, + "step": 5430 + }, + { + "epoch": 0.5964199428947946, + "grad_norm": 2.446009635925293, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7082322835922241, + "num_tokens": 135772662.0, + "step": 5431 + }, + { + "epoch": 0.5965297605974083, + "grad_norm": 2.146183490753174, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7208953499794006, + "num_tokens": 135800838.0, + "step": 5432 + }, + { + "epoch": 0.596639578300022, + "grad_norm": 2.2582218647003174, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7000866532325745, + "num_tokens": 135825751.0, + "step": 5433 + }, + { + "epoch": 0.5967493960026357, + "grad_norm": 2.1200528144836426, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.716555655002594, + "num_tokens": 135852417.0, + "step": 5434 + }, + { + "epoch": 0.5968592137052493, + "grad_norm": 2.091064929962158, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7245151996612549, + "num_tokens": 135877202.0, + "step": 5435 + }, + { + "epoch": 0.596969031407863, + "grad_norm": 2.294905662536621, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6845565438270569, + "num_tokens": 135902624.0, + "step": 5436 + }, + { + "epoch": 0.5970788491104766, + "grad_norm": 2.2583816051483154, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7096145749092102, + "num_tokens": 135926621.0, + "step": 5437 + }, + { + "epoch": 0.5971886668130902, + "grad_norm": 2.3168797492980957, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7103065252304077, + "num_tokens": 135949157.0, + "step": 5438 + }, + { + "epoch": 0.5972984845157039, + "grad_norm": 2.216710090637207, + "learning_rate": 1e-06, + "loss": 1.1174, + "mean_token_accuracy": 0.6645462512969971, + "num_tokens": 135976724.0, + "step": 5439 + }, + { + "epoch": 0.5974083022183176, + "grad_norm": 2.280876398086548, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7025407552719116, + "num_tokens": 136000959.0, + "step": 5440 + }, + { + "epoch": 0.5975181199209313, + "grad_norm": 2.150460720062256, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7079059481620789, + "num_tokens": 136026737.0, + "step": 5441 + }, + { + "epoch": 0.5976279376235449, + "grad_norm": 2.2859530448913574, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6882478594779968, + "num_tokens": 136050682.0, + "step": 5442 + }, + { + "epoch": 0.5977377553261586, + "grad_norm": 2.095561981201172, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6986178159713745, + "num_tokens": 136076924.0, + "step": 5443 + }, + { + "epoch": 0.5978475730287722, + "grad_norm": 2.3621392250061035, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7012400031089783, + "num_tokens": 136099138.0, + "step": 5444 + }, + { + "epoch": 0.5979573907313859, + "grad_norm": 2.2108356952667236, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6849467754364014, + "num_tokens": 136124746.0, + "step": 5445 + }, + { + "epoch": 0.5980672084339995, + "grad_norm": 1.9224940538406372, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6898128986358643, + "num_tokens": 136158251.0, + "step": 5446 + }, + { + "epoch": 0.5981770261366133, + "grad_norm": 2.2191214561462402, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6781340837478638, + "num_tokens": 136183525.0, + "step": 5447 + }, + { + "epoch": 0.5982868438392269, + "grad_norm": 2.3782198429107666, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7029626965522766, + "num_tokens": 136204721.0, + "step": 5448 + }, + { + "epoch": 0.5983966615418406, + "grad_norm": 2.231811046600342, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.7010802626609802, + "num_tokens": 136230326.0, + "step": 5449 + }, + { + "epoch": 0.5985064792444542, + "grad_norm": 2.5295207500457764, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7159138917922974, + "num_tokens": 136250541.0, + "step": 5450 + }, + { + "epoch": 0.5986162969470679, + "grad_norm": 2.4707987308502197, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.701941728591919, + "num_tokens": 136270673.0, + "step": 5451 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 2.417538642883301, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7195151448249817, + "num_tokens": 136291156.0, + "step": 5452 + }, + { + "epoch": 0.5988359323522952, + "grad_norm": 2.0442426204681396, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6988031268119812, + "num_tokens": 136322721.0, + "step": 5453 + }, + { + "epoch": 0.5989457500549088, + "grad_norm": 2.0123300552368164, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7040649652481079, + "num_tokens": 136351430.0, + "step": 5454 + }, + { + "epoch": 0.5990555677575226, + "grad_norm": 2.154740571975708, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6897056102752686, + "num_tokens": 136378710.0, + "step": 5455 + }, + { + "epoch": 0.5991653854601362, + "grad_norm": 2.0777339935302734, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6796500086784363, + "num_tokens": 136406017.0, + "step": 5456 + }, + { + "epoch": 0.5992752031627498, + "grad_norm": 2.0350406169891357, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7088211178779602, + "num_tokens": 136433777.0, + "step": 5457 + }, + { + "epoch": 0.5993850208653635, + "grad_norm": 2.484308958053589, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7023611068725586, + "num_tokens": 136455284.0, + "step": 5458 + }, + { + "epoch": 0.5994948385679771, + "grad_norm": 2.1090455055236816, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7018259167671204, + "num_tokens": 136481623.0, + "step": 5459 + }, + { + "epoch": 0.5996046562705908, + "grad_norm": 2.132681131362915, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7265558838844299, + "num_tokens": 136505875.0, + "step": 5460 + }, + { + "epoch": 0.5997144739732044, + "grad_norm": 1.9156670570373535, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6966254711151123, + "num_tokens": 136536199.0, + "step": 5461 + }, + { + "epoch": 0.5998242916758182, + "grad_norm": 2.197620153427124, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6935245990753174, + "num_tokens": 136563656.0, + "step": 5462 + }, + { + "epoch": 0.5999341093784318, + "grad_norm": 2.4497902393341064, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7268943786621094, + "num_tokens": 136585426.0, + "step": 5463 + }, + { + "epoch": 0.6000439270810455, + "grad_norm": 2.3941643238067627, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7014554738998413, + "num_tokens": 136606469.0, + "step": 5464 + }, + { + "epoch": 0.6001537447836591, + "grad_norm": 2.3303329944610596, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6944066286087036, + "num_tokens": 136631720.0, + "step": 5465 + }, + { + "epoch": 0.6002635624862728, + "grad_norm": 2.105407476425171, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.725322961807251, + "num_tokens": 136658382.0, + "step": 5466 + }, + { + "epoch": 0.6003733801888864, + "grad_norm": 2.1104443073272705, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7220889329910278, + "num_tokens": 136686818.0, + "step": 5467 + }, + { + "epoch": 0.6004831978915001, + "grad_norm": 2.581604242324829, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7207235097885132, + "num_tokens": 136705942.0, + "step": 5468 + }, + { + "epoch": 0.6005930155941138, + "grad_norm": 2.353428363800049, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6867067217826843, + "num_tokens": 136731033.0, + "step": 5469 + }, + { + "epoch": 0.6007028332967275, + "grad_norm": 2.1091244220733643, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.671938955783844, + "num_tokens": 136758207.0, + "step": 5470 + }, + { + "epoch": 0.6008126509993411, + "grad_norm": 2.3613674640655518, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.7008922696113586, + "num_tokens": 136782390.0, + "step": 5471 + }, + { + "epoch": 0.6009224687019548, + "grad_norm": 2.1133551597595215, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.71107017993927, + "num_tokens": 136808313.0, + "step": 5472 + }, + { + "epoch": 0.6010322864045684, + "grad_norm": 2.313364267349243, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6831465363502502, + "num_tokens": 136835179.0, + "step": 5473 + }, + { + "epoch": 0.601142104107182, + "grad_norm": 2.1089789867401123, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7110533714294434, + "num_tokens": 136860354.0, + "step": 5474 + }, + { + "epoch": 0.6012519218097957, + "grad_norm": 2.4175548553466797, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6950976252555847, + "num_tokens": 136882325.0, + "step": 5475 + }, + { + "epoch": 0.6013617395124095, + "grad_norm": 2.4665215015411377, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7168188095092773, + "num_tokens": 136903732.0, + "step": 5476 + }, + { + "epoch": 0.6014715572150231, + "grad_norm": 2.3993046283721924, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7080162763595581, + "num_tokens": 136925648.0, + "step": 5477 + }, + { + "epoch": 0.6015813749176367, + "grad_norm": 2.302910804748535, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6831053495407104, + "num_tokens": 136948262.0, + "step": 5478 + }, + { + "epoch": 0.6016911926202504, + "grad_norm": 2.054094076156616, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6936254501342773, + "num_tokens": 136977206.0, + "step": 5479 + }, + { + "epoch": 0.601801010322864, + "grad_norm": 2.4017174243927, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6925609707832336, + "num_tokens": 136998243.0, + "step": 5480 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 2.175076961517334, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7096772193908691, + "num_tokens": 137022894.0, + "step": 5481 + }, + { + "epoch": 0.6020206457280913, + "grad_norm": 2.3748056888580322, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7189353704452515, + "num_tokens": 137044245.0, + "step": 5482 + }, + { + "epoch": 0.602130463430705, + "grad_norm": 2.493204116821289, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7153723239898682, + "num_tokens": 137065403.0, + "step": 5483 + }, + { + "epoch": 0.6022402811333187, + "grad_norm": 2.8369412422180176, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7293838858604431, + "num_tokens": 137081333.0, + "step": 5484 + }, + { + "epoch": 0.6023500988359324, + "grad_norm": 2.2898690700531006, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7079272270202637, + "num_tokens": 137106384.0, + "step": 5485 + }, + { + "epoch": 0.602459916538546, + "grad_norm": 2.3442344665527344, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.718273401260376, + "num_tokens": 137128172.0, + "step": 5486 + }, + { + "epoch": 0.6025697342411597, + "grad_norm": 2.3374061584472656, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7151690721511841, + "num_tokens": 137150237.0, + "step": 5487 + }, + { + "epoch": 0.6026795519437733, + "grad_norm": 2.662353754043579, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7401578426361084, + "num_tokens": 137167591.0, + "step": 5488 + }, + { + "epoch": 0.602789369646387, + "grad_norm": 2.427700996398926, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7088382244110107, + "num_tokens": 137189621.0, + "step": 5489 + }, + { + "epoch": 0.6028991873490006, + "grad_norm": 2.3328285217285156, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6827646493911743, + "num_tokens": 137217202.0, + "step": 5490 + }, + { + "epoch": 0.6030090050516144, + "grad_norm": 2.4042844772338867, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7339869737625122, + "num_tokens": 137238653.0, + "step": 5491 + }, + { + "epoch": 0.603118822754228, + "grad_norm": 2.1780571937561035, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7062164545059204, + "num_tokens": 137264736.0, + "step": 5492 + }, + { + "epoch": 0.6032286404568417, + "grad_norm": 2.248892307281494, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7071601152420044, + "num_tokens": 137288479.0, + "step": 5493 + }, + { + "epoch": 0.6033384581594553, + "grad_norm": 2.167607069015503, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7106006145477295, + "num_tokens": 137314620.0, + "step": 5494 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 1.9759145975112915, + "learning_rate": 1e-06, + "loss": 1.0795, + "mean_token_accuracy": 0.6854569911956787, + "num_tokens": 137347320.0, + "step": 5495 + }, + { + "epoch": 0.6035580935646826, + "grad_norm": 2.1814723014831543, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6856945753097534, + "num_tokens": 137372686.0, + "step": 5496 + }, + { + "epoch": 0.6036679112672962, + "grad_norm": 2.3061397075653076, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7154843807220459, + "num_tokens": 137394627.0, + "step": 5497 + }, + { + "epoch": 0.60377772896991, + "grad_norm": 2.411196708679199, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7232174873352051, + "num_tokens": 137415502.0, + "step": 5498 + }, + { + "epoch": 0.6038875466725236, + "grad_norm": 2.1706292629241943, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7010642290115356, + "num_tokens": 137441655.0, + "step": 5499 + }, + { + "epoch": 0.6039973643751373, + "grad_norm": 2.106128215789795, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6878541707992554, + "num_tokens": 137467591.0, + "step": 5500 + }, + { + "epoch": 0.6041071820777509, + "grad_norm": 2.0426666736602783, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.71150141954422, + "num_tokens": 137494443.0, + "step": 5501 + }, + { + "epoch": 0.6042169997803646, + "grad_norm": 2.007045030593872, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7153844833374023, + "num_tokens": 137521496.0, + "step": 5502 + }, + { + "epoch": 0.6043268174829782, + "grad_norm": 2.2014386653900146, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7178182601928711, + "num_tokens": 137545353.0, + "step": 5503 + }, + { + "epoch": 0.6044366351855919, + "grad_norm": 2.298330068588257, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7053598761558533, + "num_tokens": 137568265.0, + "step": 5504 + }, + { + "epoch": 0.6045464528882056, + "grad_norm": 1.960784912109375, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.690251350402832, + "num_tokens": 137602234.0, + "step": 5505 + }, + { + "epoch": 0.6046562705908193, + "grad_norm": 2.2448198795318604, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6951277256011963, + "num_tokens": 137628121.0, + "step": 5506 + }, + { + "epoch": 0.6047660882934329, + "grad_norm": 2.108156442642212, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7032647132873535, + "num_tokens": 137654824.0, + "step": 5507 + }, + { + "epoch": 0.6048759059960466, + "grad_norm": 2.2119388580322266, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7186743021011353, + "num_tokens": 137679828.0, + "step": 5508 + }, + { + "epoch": 0.6049857236986602, + "grad_norm": 2.2210886478424072, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7169718742370605, + "num_tokens": 137704004.0, + "step": 5509 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 2.0798189640045166, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6974325180053711, + "num_tokens": 137731757.0, + "step": 5510 + }, + { + "epoch": 0.6052053591038875, + "grad_norm": 2.0040135383605957, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7230662107467651, + "num_tokens": 137761675.0, + "step": 5511 + }, + { + "epoch": 0.6053151768065012, + "grad_norm": 2.3273379802703857, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7068857550621033, + "num_tokens": 137782990.0, + "step": 5512 + }, + { + "epoch": 0.6054249945091149, + "grad_norm": 2.110605239868164, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7007016539573669, + "num_tokens": 137811503.0, + "step": 5513 + }, + { + "epoch": 0.6055348122117286, + "grad_norm": 2.5681660175323486, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7129051685333252, + "num_tokens": 137831766.0, + "step": 5514 + }, + { + "epoch": 0.6056446299143422, + "grad_norm": 2.1804473400115967, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6898784637451172, + "num_tokens": 137859188.0, + "step": 5515 + }, + { + "epoch": 0.6057544476169558, + "grad_norm": 2.4098660945892334, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.712365984916687, + "num_tokens": 137880276.0, + "step": 5516 + }, + { + "epoch": 0.6058642653195695, + "grad_norm": 2.1481449604034424, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7019708752632141, + "num_tokens": 137907394.0, + "step": 5517 + }, + { + "epoch": 0.6059740830221831, + "grad_norm": 2.168227434158325, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7060570120811462, + "num_tokens": 137935504.0, + "step": 5518 + }, + { + "epoch": 0.6060839007247968, + "grad_norm": 2.0912725925445557, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6907968521118164, + "num_tokens": 137963692.0, + "step": 5519 + }, + { + "epoch": 0.6061937184274105, + "grad_norm": 2.3594136238098145, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7036702632904053, + "num_tokens": 137989549.0, + "step": 5520 + }, + { + "epoch": 0.6063035361300242, + "grad_norm": 2.375387668609619, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.6982799172401428, + "num_tokens": 138009987.0, + "step": 5521 + }, + { + "epoch": 0.6064133538326378, + "grad_norm": 2.2225611209869385, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6926504373550415, + "num_tokens": 138034458.0, + "step": 5522 + }, + { + "epoch": 0.6065231715352515, + "grad_norm": 2.6989293098449707, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7044034004211426, + "num_tokens": 138053568.0, + "step": 5523 + }, + { + "epoch": 0.6066329892378651, + "grad_norm": 1.9898964166641235, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7059805393218994, + "num_tokens": 138082906.0, + "step": 5524 + }, + { + "epoch": 0.6067428069404788, + "grad_norm": 1.9664208889007568, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.704541027545929, + "num_tokens": 138114612.0, + "step": 5525 + }, + { + "epoch": 0.6068526246430924, + "grad_norm": 2.2708890438079834, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6936749815940857, + "num_tokens": 138138681.0, + "step": 5526 + }, + { + "epoch": 0.6069624423457062, + "grad_norm": 2.207836151123047, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7087367177009583, + "num_tokens": 138164262.0, + "step": 5527 + }, + { + "epoch": 0.6070722600483198, + "grad_norm": 2.4923908710479736, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.703214168548584, + "num_tokens": 138186264.0, + "step": 5528 + }, + { + "epoch": 0.6071820777509335, + "grad_norm": 2.4323179721832275, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7173258066177368, + "num_tokens": 138207482.0, + "step": 5529 + }, + { + "epoch": 0.6072918954535471, + "grad_norm": 1.9748293161392212, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6980055570602417, + "num_tokens": 138238213.0, + "step": 5530 + }, + { + "epoch": 0.6074017131561608, + "grad_norm": 2.5043177604675293, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7089170217514038, + "num_tokens": 138257561.0, + "step": 5531 + }, + { + "epoch": 0.6075115308587744, + "grad_norm": 2.1896982192993164, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.7006134986877441, + "num_tokens": 138282045.0, + "step": 5532 + }, + { + "epoch": 0.607621348561388, + "grad_norm": 2.152695417404175, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6977818012237549, + "num_tokens": 138309247.0, + "step": 5533 + }, + { + "epoch": 0.6077311662640018, + "grad_norm": 2.2839150428771973, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7084113359451294, + "num_tokens": 138331508.0, + "step": 5534 + }, + { + "epoch": 0.6078409839666155, + "grad_norm": 2.140554666519165, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7015514969825745, + "num_tokens": 138357781.0, + "step": 5535 + }, + { + "epoch": 0.6079508016692291, + "grad_norm": 2.1892616748809814, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7075390815734863, + "num_tokens": 138385569.0, + "step": 5536 + }, + { + "epoch": 0.6080606193718427, + "grad_norm": 2.194667339324951, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.7013300657272339, + "num_tokens": 138409850.0, + "step": 5537 + }, + { + "epoch": 0.6081704370744564, + "grad_norm": 2.269541025161743, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7276349067687988, + "num_tokens": 138432174.0, + "step": 5538 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 2.147155523300171, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7195093631744385, + "num_tokens": 138461885.0, + "step": 5539 + }, + { + "epoch": 0.6083900724796837, + "grad_norm": 2.3852226734161377, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6776900291442871, + "num_tokens": 138485497.0, + "step": 5540 + }, + { + "epoch": 0.6084998901822973, + "grad_norm": 1.8935283422470093, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7017070055007935, + "num_tokens": 138516413.0, + "step": 5541 + }, + { + "epoch": 0.6086097078849111, + "grad_norm": 2.0598082542419434, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6931322813034058, + "num_tokens": 138546681.0, + "step": 5542 + }, + { + "epoch": 0.6087195255875247, + "grad_norm": 2.0872044563293457, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7061363458633423, + "num_tokens": 138575664.0, + "step": 5543 + }, + { + "epoch": 0.6088293432901384, + "grad_norm": 2.3509631156921387, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.714745283126831, + "num_tokens": 138598404.0, + "step": 5544 + }, + { + "epoch": 0.608939160992752, + "grad_norm": 1.947998285293579, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7083470821380615, + "num_tokens": 138628838.0, + "step": 5545 + }, + { + "epoch": 0.6090489786953657, + "grad_norm": 2.1212809085845947, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6907123327255249, + "num_tokens": 138655942.0, + "step": 5546 + }, + { + "epoch": 0.6091587963979793, + "grad_norm": 2.1053836345672607, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7063302993774414, + "num_tokens": 138683965.0, + "step": 5547 + }, + { + "epoch": 0.609268614100593, + "grad_norm": 2.3251190185546875, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7323164343833923, + "num_tokens": 138706327.0, + "step": 5548 + }, + { + "epoch": 0.6093784318032067, + "grad_norm": 2.8446426391601562, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7348188757896423, + "num_tokens": 138722901.0, + "step": 5549 + }, + { + "epoch": 0.6094882495058204, + "grad_norm": 2.53845477104187, + "learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.683623194694519, + "num_tokens": 138743781.0, + "step": 5550 + }, + { + "epoch": 0.609598067208434, + "grad_norm": 1.9699842929840088, + "learning_rate": 1e-06, + "loss": 1.1019, + "mean_token_accuracy": 0.6685507893562317, + "num_tokens": 138775980.0, + "step": 5551 + }, + { + "epoch": 0.6097078849110477, + "grad_norm": 1.9160964488983154, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6982027292251587, + "num_tokens": 138809836.0, + "step": 5552 + }, + { + "epoch": 0.6098177026136613, + "grad_norm": 2.35911226272583, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.701073169708252, + "num_tokens": 138833530.0, + "step": 5553 + }, + { + "epoch": 0.609927520316275, + "grad_norm": 2.023496627807617, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6944454312324524, + "num_tokens": 138863370.0, + "step": 5554 + }, + { + "epoch": 0.6100373380188886, + "grad_norm": 2.2583067417144775, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6866315603256226, + "num_tokens": 138889722.0, + "step": 5555 + }, + { + "epoch": 0.6101471557215024, + "grad_norm": 2.4224205017089844, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7076322436332703, + "num_tokens": 138912491.0, + "step": 5556 + }, + { + "epoch": 0.610256973424116, + "grad_norm": 2.5133209228515625, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7392159700393677, + "num_tokens": 138930565.0, + "step": 5557 + }, + { + "epoch": 0.6103667911267296, + "grad_norm": 2.2573328018188477, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.696946382522583, + "num_tokens": 138957540.0, + "step": 5558 + }, + { + "epoch": 0.6104766088293433, + "grad_norm": 2.2263288497924805, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6872563362121582, + "num_tokens": 138984224.0, + "step": 5559 + }, + { + "epoch": 0.6105864265319569, + "grad_norm": 2.486950159072876, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.709845781326294, + "num_tokens": 139007127.0, + "step": 5560 + }, + { + "epoch": 0.6106962442345706, + "grad_norm": 2.0731565952301025, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.707683801651001, + "num_tokens": 139034722.0, + "step": 5561 + }, + { + "epoch": 0.6108060619371842, + "grad_norm": 2.3759920597076416, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7008749842643738, + "num_tokens": 139057587.0, + "step": 5562 + }, + { + "epoch": 0.610915879639798, + "grad_norm": 2.3698222637176514, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.696014404296875, + "num_tokens": 139081558.0, + "step": 5563 + }, + { + "epoch": 0.6110256973424116, + "grad_norm": 2.4515128135681152, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6942933797836304, + "num_tokens": 139103865.0, + "step": 5564 + }, + { + "epoch": 0.6111355150450253, + "grad_norm": 2.16652250289917, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6915148496627808, + "num_tokens": 139129364.0, + "step": 5565 + }, + { + "epoch": 0.6112453327476389, + "grad_norm": 2.251089572906494, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6839485168457031, + "num_tokens": 139154734.0, + "step": 5566 + }, + { + "epoch": 0.6113551504502526, + "grad_norm": 2.065692663192749, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7204586267471313, + "num_tokens": 139181691.0, + "step": 5567 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 2.2801713943481445, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7167053818702698, + "num_tokens": 139205137.0, + "step": 5568 + }, + { + "epoch": 0.6115747858554799, + "grad_norm": 2.41084623336792, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7216936945915222, + "num_tokens": 139226421.0, + "step": 5569 + }, + { + "epoch": 0.6116846035580935, + "grad_norm": 1.9372590780258179, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7038543224334717, + "num_tokens": 139259033.0, + "step": 5570 + }, + { + "epoch": 0.6117944212607073, + "grad_norm": 1.8893742561340332, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7028131484985352, + "num_tokens": 139291234.0, + "step": 5571 + }, + { + "epoch": 0.6119042389633209, + "grad_norm": 2.206716299057007, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7315381765365601, + "num_tokens": 139315675.0, + "step": 5572 + }, + { + "epoch": 0.6120140566659346, + "grad_norm": 2.226149320602417, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7140226364135742, + "num_tokens": 139340202.0, + "step": 5573 + }, + { + "epoch": 0.6121238743685482, + "grad_norm": 2.175720691680908, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.703314483165741, + "num_tokens": 139366503.0, + "step": 5574 + }, + { + "epoch": 0.6122336920711619, + "grad_norm": 2.365673065185547, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6909077167510986, + "num_tokens": 139390454.0, + "step": 5575 + }, + { + "epoch": 0.6123435097737755, + "grad_norm": 2.6270246505737305, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7183150053024292, + "num_tokens": 139410649.0, + "step": 5576 + }, + { + "epoch": 0.6124533274763891, + "grad_norm": 2.5538811683654785, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7104192972183228, + "num_tokens": 139430753.0, + "step": 5577 + }, + { + "epoch": 0.6125631451790029, + "grad_norm": 2.0526487827301025, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7262027859687805, + "num_tokens": 139457078.0, + "step": 5578 + }, + { + "epoch": 0.6126729628816165, + "grad_norm": 2.067096710205078, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7150101661682129, + "num_tokens": 139486481.0, + "step": 5579 + }, + { + "epoch": 0.6127827805842302, + "grad_norm": 2.2810349464416504, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6960241794586182, + "num_tokens": 139511782.0, + "step": 5580 + }, + { + "epoch": 0.6128925982868438, + "grad_norm": 1.9009836912155151, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6860308647155762, + "num_tokens": 139544287.0, + "step": 5581 + }, + { + "epoch": 0.6130024159894575, + "grad_norm": 2.1472156047821045, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6994789838790894, + "num_tokens": 139570035.0, + "step": 5582 + }, + { + "epoch": 0.6131122336920711, + "grad_norm": 2.3417234420776367, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7253922820091248, + "num_tokens": 139592543.0, + "step": 5583 + }, + { + "epoch": 0.6132220513946848, + "grad_norm": 2.3324947357177734, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6907075047492981, + "num_tokens": 139615374.0, + "step": 5584 + }, + { + "epoch": 0.6133318690972985, + "grad_norm": 2.1978564262390137, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6901857256889343, + "num_tokens": 139642918.0, + "step": 5585 + }, + { + "epoch": 0.6134416867999122, + "grad_norm": 2.1663317680358887, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6850613355636597, + "num_tokens": 139669179.0, + "step": 5586 + }, + { + "epoch": 0.6135515045025258, + "grad_norm": 2.161101818084717, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.711807131767273, + "num_tokens": 139694764.0, + "step": 5587 + }, + { + "epoch": 0.6136613222051395, + "grad_norm": 2.0046517848968506, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7051841020584106, + "num_tokens": 139724076.0, + "step": 5588 + }, + { + "epoch": 0.6137711399077531, + "grad_norm": 2.2463064193725586, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7069903016090393, + "num_tokens": 139746346.0, + "step": 5589 + }, + { + "epoch": 0.6138809576103668, + "grad_norm": 2.2457103729248047, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7295638918876648, + "num_tokens": 139770470.0, + "step": 5590 + }, + { + "epoch": 0.6139907753129804, + "grad_norm": 2.3566598892211914, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7133036851882935, + "num_tokens": 139791971.0, + "step": 5591 + }, + { + "epoch": 0.6141005930155942, + "grad_norm": 2.2952935695648193, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.709069013595581, + "num_tokens": 139814868.0, + "step": 5592 + }, + { + "epoch": 0.6142104107182078, + "grad_norm": 2.2188832759857178, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6967230439186096, + "num_tokens": 139838550.0, + "step": 5593 + }, + { + "epoch": 0.6143202284208215, + "grad_norm": 2.455471992492676, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7212344408035278, + "num_tokens": 139859437.0, + "step": 5594 + }, + { + "epoch": 0.6144300461234351, + "grad_norm": 2.357666015625, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6972215175628662, + "num_tokens": 139884029.0, + "step": 5595 + }, + { + "epoch": 0.6145398638260487, + "grad_norm": 2.2112278938293457, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6875795125961304, + "num_tokens": 139909742.0, + "step": 5596 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 2.2683773040771484, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7192987203598022, + "num_tokens": 139932013.0, + "step": 5597 + }, + { + "epoch": 0.614759499231276, + "grad_norm": 2.2340667247772217, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7326445579528809, + "num_tokens": 139957129.0, + "step": 5598 + }, + { + "epoch": 0.6148693169338898, + "grad_norm": 2.609734058380127, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7103850245475769, + "num_tokens": 139975172.0, + "step": 5599 + }, + { + "epoch": 0.6149791346365034, + "grad_norm": 2.0410377979278564, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6894822120666504, + "num_tokens": 140004725.0, + "step": 5600 + }, + { + "epoch": 0.6150889523391171, + "grad_norm": 2.140418291091919, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7316036224365234, + "num_tokens": 140029167.0, + "step": 5601 + }, + { + "epoch": 0.6151987700417307, + "grad_norm": 2.6197426319122314, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7088331580162048, + "num_tokens": 140049995.0, + "step": 5602 + }, + { + "epoch": 0.6153085877443444, + "grad_norm": 2.5861854553222656, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7170003056526184, + "num_tokens": 140068398.0, + "step": 5603 + }, + { + "epoch": 0.615418405446958, + "grad_norm": 2.4994356632232666, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6922410726547241, + "num_tokens": 140090550.0, + "step": 5604 + }, + { + "epoch": 0.6155282231495717, + "grad_norm": 2.0562705993652344, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6822885870933533, + "num_tokens": 140118433.0, + "step": 5605 + }, + { + "epoch": 0.6156380408521853, + "grad_norm": 2.0633962154388428, + "learning_rate": 1e-06, + "loss": 1.0755, + "mean_token_accuracy": 0.6805264353752136, + "num_tokens": 140146964.0, + "step": 5606 + }, + { + "epoch": 0.6157478585547991, + "grad_norm": 2.3884565830230713, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6938320398330688, + "num_tokens": 140168435.0, + "step": 5607 + }, + { + "epoch": 0.6158576762574127, + "grad_norm": 2.3669302463531494, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7106358408927917, + "num_tokens": 140190219.0, + "step": 5608 + }, + { + "epoch": 0.6159674939600264, + "grad_norm": 2.5992326736450195, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7115910053253174, + "num_tokens": 140208046.0, + "step": 5609 + }, + { + "epoch": 0.61607731166264, + "grad_norm": 2.1623475551605225, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7200348377227783, + "num_tokens": 140233845.0, + "step": 5610 + }, + { + "epoch": 0.6161871293652537, + "grad_norm": 2.4558048248291016, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7182707786560059, + "num_tokens": 140255949.0, + "step": 5611 + }, + { + "epoch": 0.6162969470678673, + "grad_norm": 2.3239247798919678, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6993453502655029, + "num_tokens": 140279067.0, + "step": 5612 + }, + { + "epoch": 0.616406764770481, + "grad_norm": 2.3395938873291016, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7302074432373047, + "num_tokens": 140301406.0, + "step": 5613 + }, + { + "epoch": 0.6165165824730947, + "grad_norm": 2.2375946044921875, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7224331498146057, + "num_tokens": 140325899.0, + "step": 5614 + }, + { + "epoch": 0.6166264001757084, + "grad_norm": 2.2834370136260986, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7038438320159912, + "num_tokens": 140351129.0, + "step": 5615 + }, + { + "epoch": 0.616736217878322, + "grad_norm": 2.12967586517334, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6916815042495728, + "num_tokens": 140377640.0, + "step": 5616 + }, + { + "epoch": 0.6168460355809356, + "grad_norm": 2.235955238342285, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6849485635757446, + "num_tokens": 140402209.0, + "step": 5617 + }, + { + "epoch": 0.6169558532835493, + "grad_norm": 2.160510540008545, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7116203308105469, + "num_tokens": 140426239.0, + "step": 5618 + }, + { + "epoch": 0.6170656709861629, + "grad_norm": 1.9633127450942993, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6942380666732788, + "num_tokens": 140459310.0, + "step": 5619 + }, + { + "epoch": 0.6171754886887766, + "grad_norm": 2.476323366165161, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6850417256355286, + "num_tokens": 140483688.0, + "step": 5620 + }, + { + "epoch": 0.6172853063913903, + "grad_norm": 1.9586800336837769, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.708682656288147, + "num_tokens": 140513321.0, + "step": 5621 + }, + { + "epoch": 0.617395124094004, + "grad_norm": 2.082737445831299, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7005380988121033, + "num_tokens": 140541298.0, + "step": 5622 + }, + { + "epoch": 0.6175049417966176, + "grad_norm": 2.132742166519165, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7186499834060669, + "num_tokens": 140567557.0, + "step": 5623 + }, + { + "epoch": 0.6176147594992313, + "grad_norm": 2.2044670581817627, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.712065577507019, + "num_tokens": 140592571.0, + "step": 5624 + }, + { + "epoch": 0.6177245772018449, + "grad_norm": 2.1304140090942383, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6813519597053528, + "num_tokens": 140619885.0, + "step": 5625 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 2.3744468688964844, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7054263353347778, + "num_tokens": 140639785.0, + "step": 5626 + }, + { + "epoch": 0.6179442126070722, + "grad_norm": 2.1902337074279785, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6983377933502197, + "num_tokens": 140666398.0, + "step": 5627 + }, + { + "epoch": 0.618054030309686, + "grad_norm": 2.3090882301330566, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7254040837287903, + "num_tokens": 140690837.0, + "step": 5628 + }, + { + "epoch": 0.6181638480122996, + "grad_norm": 2.292473793029785, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7178411483764648, + "num_tokens": 140716044.0, + "step": 5629 + }, + { + "epoch": 0.6182736657149133, + "grad_norm": 2.249436855316162, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7132155895233154, + "num_tokens": 140741368.0, + "step": 5630 + }, + { + "epoch": 0.6183834834175269, + "grad_norm": 2.3728103637695312, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7156839370727539, + "num_tokens": 140763114.0, + "step": 5631 + }, + { + "epoch": 0.6184933011201406, + "grad_norm": 2.0439772605895996, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6938216686248779, + "num_tokens": 140791988.0, + "step": 5632 + }, + { + "epoch": 0.6186031188227542, + "grad_norm": 2.0523746013641357, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6980576515197754, + "num_tokens": 140821692.0, + "step": 5633 + }, + { + "epoch": 0.6187129365253679, + "grad_norm": 2.4308154582977295, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7235735654830933, + "num_tokens": 140841442.0, + "step": 5634 + }, + { + "epoch": 0.6188227542279815, + "grad_norm": 2.1452419757843018, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6880598068237305, + "num_tokens": 140867745.0, + "step": 5635 + }, + { + "epoch": 0.6189325719305953, + "grad_norm": 2.349583387374878, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7110037803649902, + "num_tokens": 140890072.0, + "step": 5636 + }, + { + "epoch": 0.6190423896332089, + "grad_norm": 2.1223223209381104, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7228739857673645, + "num_tokens": 140916252.0, + "step": 5637 + }, + { + "epoch": 0.6191522073358225, + "grad_norm": 2.188405752182007, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6998368501663208, + "num_tokens": 140942009.0, + "step": 5638 + }, + { + "epoch": 0.6192620250384362, + "grad_norm": 2.0179762840270996, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6833934783935547, + "num_tokens": 140971362.0, + "step": 5639 + }, + { + "epoch": 0.6193718427410498, + "grad_norm": 2.358006238937378, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.715945303440094, + "num_tokens": 140994992.0, + "step": 5640 + }, + { + "epoch": 0.6194816604436635, + "grad_norm": 2.1374552249908447, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7054252624511719, + "num_tokens": 141021376.0, + "step": 5641 + }, + { + "epoch": 0.6195914781462771, + "grad_norm": 2.5550341606140137, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6941570043563843, + "num_tokens": 141044168.0, + "step": 5642 + }, + { + "epoch": 0.6197012958488909, + "grad_norm": 2.778064250946045, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7143586874008179, + "num_tokens": 141061894.0, + "step": 5643 + }, + { + "epoch": 0.6198111135515045, + "grad_norm": 2.174713134765625, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6963346004486084, + "num_tokens": 141089042.0, + "step": 5644 + }, + { + "epoch": 0.6199209312541182, + "grad_norm": 2.257455825805664, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6860350370407104, + "num_tokens": 141113360.0, + "step": 5645 + }, + { + "epoch": 0.6200307489567318, + "grad_norm": 2.338923931121826, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7025412917137146, + "num_tokens": 141137172.0, + "step": 5646 + }, + { + "epoch": 0.6201405666593455, + "grad_norm": 2.662236452102661, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7309347987174988, + "num_tokens": 141155305.0, + "step": 5647 + }, + { + "epoch": 0.6202503843619591, + "grad_norm": 2.123263120651245, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.70058274269104, + "num_tokens": 141181622.0, + "step": 5648 + }, + { + "epoch": 0.6203602020645728, + "grad_norm": 2.3768372535705566, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7135698795318604, + "num_tokens": 141203789.0, + "step": 5649 + }, + { + "epoch": 0.6204700197671865, + "grad_norm": 2.7031242847442627, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7414547204971313, + "num_tokens": 141221262.0, + "step": 5650 + }, + { + "epoch": 0.6205798374698002, + "grad_norm": 2.2539165019989014, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7234151363372803, + "num_tokens": 141243566.0, + "step": 5651 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 2.2118093967437744, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7213476896286011, + "num_tokens": 141267425.0, + "step": 5652 + }, + { + "epoch": 0.6207994728750275, + "grad_norm": 2.062412977218628, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7004269361495972, + "num_tokens": 141296478.0, + "step": 5653 + }, + { + "epoch": 0.6209092905776411, + "grad_norm": 2.3798115253448486, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7273598909378052, + "num_tokens": 141318847.0, + "step": 5654 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 2.10188364982605, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7040276527404785, + "num_tokens": 141345553.0, + "step": 5655 + }, + { + "epoch": 0.6211289259828684, + "grad_norm": 2.2521233558654785, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.706279993057251, + "num_tokens": 141369113.0, + "step": 5656 + }, + { + "epoch": 0.6212387436854822, + "grad_norm": 2.3960392475128174, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6955177783966064, + "num_tokens": 141392199.0, + "step": 5657 + }, + { + "epoch": 0.6213485613880958, + "grad_norm": 2.465153932571411, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7124773263931274, + "num_tokens": 141412825.0, + "step": 5658 + }, + { + "epoch": 0.6214583790907094, + "grad_norm": 2.1309821605682373, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7133744359016418, + "num_tokens": 141439469.0, + "step": 5659 + }, + { + "epoch": 0.6215681967933231, + "grad_norm": 2.5174989700317383, + "learning_rate": 1e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7456305027008057, + "num_tokens": 141458208.0, + "step": 5660 + }, + { + "epoch": 0.6216780144959367, + "grad_norm": 2.3273251056671143, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7331423759460449, + "num_tokens": 141480108.0, + "step": 5661 + }, + { + "epoch": 0.6217878321985504, + "grad_norm": 2.630756139755249, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7216899394989014, + "num_tokens": 141498154.0, + "step": 5662 + }, + { + "epoch": 0.621897649901164, + "grad_norm": 2.4114317893981934, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7110826373100281, + "num_tokens": 141519059.0, + "step": 5663 + }, + { + "epoch": 0.6220074676037777, + "grad_norm": 1.8809345960617065, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6898877024650574, + "num_tokens": 141551820.0, + "step": 5664 + }, + { + "epoch": 0.6221172853063914, + "grad_norm": 2.4482171535491943, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6963109970092773, + "num_tokens": 141574327.0, + "step": 5665 + }, + { + "epoch": 0.6222271030090051, + "grad_norm": 2.266322612762451, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.707313060760498, + "num_tokens": 141598897.0, + "step": 5666 + }, + { + "epoch": 0.6223369207116187, + "grad_norm": 2.06036376953125, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7232062220573425, + "num_tokens": 141625854.0, + "step": 5667 + }, + { + "epoch": 0.6224467384142324, + "grad_norm": 2.4023008346557617, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7114963531494141, + "num_tokens": 141647143.0, + "step": 5668 + }, + { + "epoch": 0.622556556116846, + "grad_norm": 2.112931251525879, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6872420310974121, + "num_tokens": 141675556.0, + "step": 5669 + }, + { + "epoch": 0.6226663738194597, + "grad_norm": 2.1267971992492676, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7235108613967896, + "num_tokens": 141700974.0, + "step": 5670 + }, + { + "epoch": 0.6227761915220733, + "grad_norm": 2.1878859996795654, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7266775369644165, + "num_tokens": 141724678.0, + "step": 5671 + }, + { + "epoch": 0.6228860092246871, + "grad_norm": 2.208693027496338, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7089115381240845, + "num_tokens": 141747497.0, + "step": 5672 + }, + { + "epoch": 0.6229958269273007, + "grad_norm": 2.3795411586761475, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6921603679656982, + "num_tokens": 141771193.0, + "step": 5673 + }, + { + "epoch": 0.6231056446299144, + "grad_norm": 2.659010410308838, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7300843596458435, + "num_tokens": 141789887.0, + "step": 5674 + }, + { + "epoch": 0.623215462332528, + "grad_norm": 2.1643948554992676, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.70047926902771, + "num_tokens": 141814551.0, + "step": 5675 + }, + { + "epoch": 0.6233252800351416, + "grad_norm": 2.315031051635742, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6852384805679321, + "num_tokens": 141840451.0, + "step": 5676 + }, + { + "epoch": 0.6234350977377553, + "grad_norm": 2.0905134677886963, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.718684732913971, + "num_tokens": 141869921.0, + "step": 5677 + }, + { + "epoch": 0.6235449154403689, + "grad_norm": 2.1683993339538574, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.722862958908081, + "num_tokens": 141895600.0, + "step": 5678 + }, + { + "epoch": 0.6236547331429827, + "grad_norm": 2.3296213150024414, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6983577013015747, + "num_tokens": 141920415.0, + "step": 5679 + }, + { + "epoch": 0.6237645508455963, + "grad_norm": 2.327634572982788, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7053142189979553, + "num_tokens": 141943971.0, + "step": 5680 + }, + { + "epoch": 0.62387436854821, + "grad_norm": 2.120420217514038, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7114788293838501, + "num_tokens": 141972188.0, + "step": 5681 + }, + { + "epoch": 0.6239841862508236, + "grad_norm": 2.708219051361084, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7270267009735107, + "num_tokens": 141991745.0, + "step": 5682 + }, + { + "epoch": 0.6240940039534373, + "grad_norm": 2.0113723278045654, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7069284915924072, + "num_tokens": 142020602.0, + "step": 5683 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 1.9376606941223145, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7046723961830139, + "num_tokens": 142052111.0, + "step": 5684 + }, + { + "epoch": 0.6243136393586646, + "grad_norm": 2.0723330974578857, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7155869007110596, + "num_tokens": 142077979.0, + "step": 5685 + }, + { + "epoch": 0.6244234570612783, + "grad_norm": 2.2486443519592285, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.718895673751831, + "num_tokens": 142100526.0, + "step": 5686 + }, + { + "epoch": 0.624533274763892, + "grad_norm": 2.448441743850708, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.703040599822998, + "num_tokens": 142122782.0, + "step": 5687 + }, + { + "epoch": 0.6246430924665056, + "grad_norm": 2.391258716583252, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6819067597389221, + "num_tokens": 142145815.0, + "step": 5688 + }, + { + "epoch": 0.6247529101691193, + "grad_norm": 2.3871450424194336, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6804301738739014, + "num_tokens": 142171364.0, + "step": 5689 + }, + { + "epoch": 0.6248627278717329, + "grad_norm": 2.257998466491699, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7157737612724304, + "num_tokens": 142193887.0, + "step": 5690 + }, + { + "epoch": 0.6249725455743466, + "grad_norm": 2.2921180725097656, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6940867304801941, + "num_tokens": 142218934.0, + "step": 5691 + }, + { + "epoch": 0.6250823632769602, + "grad_norm": 2.2165513038635254, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7111108303070068, + "num_tokens": 142244164.0, + "step": 5692 + }, + { + "epoch": 0.6251921809795739, + "grad_norm": 2.0710060596466064, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6977740526199341, + "num_tokens": 142274665.0, + "step": 5693 + }, + { + "epoch": 0.6253019986821876, + "grad_norm": 2.062560558319092, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6917829513549805, + "num_tokens": 142304611.0, + "step": 5694 + }, + { + "epoch": 0.6254118163848013, + "grad_norm": 2.274503469467163, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7279403209686279, + "num_tokens": 142329206.0, + "step": 5695 + }, + { + "epoch": 0.6255216340874149, + "grad_norm": 2.2938342094421387, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7044232487678528, + "num_tokens": 142351668.0, + "step": 5696 + }, + { + "epoch": 0.6256314517900285, + "grad_norm": 2.1828575134277344, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7064172029495239, + "num_tokens": 142377395.0, + "step": 5697 + }, + { + "epoch": 0.6257412694926422, + "grad_norm": 2.2852559089660645, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.6992152333259583, + "num_tokens": 142401139.0, + "step": 5698 + }, + { + "epoch": 0.6258510871952558, + "grad_norm": 2.6270065307617188, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7199221253395081, + "num_tokens": 142420713.0, + "step": 5699 + }, + { + "epoch": 0.6259609048978695, + "grad_norm": 2.0891623497009277, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7045089602470398, + "num_tokens": 142448296.0, + "step": 5700 + }, + { + "epoch": 0.6260707226004832, + "grad_norm": 2.169496774673462, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6956669688224792, + "num_tokens": 142475228.0, + "step": 5701 + }, + { + "epoch": 0.6261805403030969, + "grad_norm": 2.288560628890991, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.687618613243103, + "num_tokens": 142502009.0, + "step": 5702 + }, + { + "epoch": 0.6262903580057105, + "grad_norm": 1.8934623003005981, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6850494146347046, + "num_tokens": 142535403.0, + "step": 5703 + }, + { + "epoch": 0.6264001757083242, + "grad_norm": 2.317333459854126, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6949480175971985, + "num_tokens": 142562085.0, + "step": 5704 + }, + { + "epoch": 0.6265099934109378, + "grad_norm": 2.3225083351135254, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7061910629272461, + "num_tokens": 142587473.0, + "step": 5705 + }, + { + "epoch": 0.6266198111135515, + "grad_norm": 2.318636178970337, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7134543657302856, + "num_tokens": 142610221.0, + "step": 5706 + }, + { + "epoch": 0.6267296288161651, + "grad_norm": 2.1236865520477295, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7348493933677673, + "num_tokens": 142634999.0, + "step": 5707 + }, + { + "epoch": 0.6268394465187789, + "grad_norm": 2.25435471534729, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6983805298805237, + "num_tokens": 142659753.0, + "step": 5708 + }, + { + "epoch": 0.6269492642213925, + "grad_norm": 2.250466823577881, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7019076347351074, + "num_tokens": 142683302.0, + "step": 5709 + }, + { + "epoch": 0.6270590819240062, + "grad_norm": 2.2045416831970215, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7067009210586548, + "num_tokens": 142708730.0, + "step": 5710 + }, + { + "epoch": 0.6271688996266198, + "grad_norm": 2.242844820022583, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7008463144302368, + "num_tokens": 142731323.0, + "step": 5711 + }, + { + "epoch": 0.6272787173292335, + "grad_norm": 2.1426198482513428, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.706139087677002, + "num_tokens": 142757164.0, + "step": 5712 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 2.2564311027526855, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7071608304977417, + "num_tokens": 142780337.0, + "step": 5713 + }, + { + "epoch": 0.6274983527344608, + "grad_norm": 2.079756021499634, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7274429798126221, + "num_tokens": 142807646.0, + "step": 5714 + }, + { + "epoch": 0.6276081704370745, + "grad_norm": 2.4131569862365723, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.6977167129516602, + "num_tokens": 142829544.0, + "step": 5715 + }, + { + "epoch": 0.6277179881396882, + "grad_norm": 2.124528169631958, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7221376299858093, + "num_tokens": 142854221.0, + "step": 5716 + }, + { + "epoch": 0.6278278058423018, + "grad_norm": 2.0967798233032227, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6985971927642822, + "num_tokens": 142881806.0, + "step": 5717 + }, + { + "epoch": 0.6279376235449154, + "grad_norm": 2.1641252040863037, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7100574970245361, + "num_tokens": 142907877.0, + "step": 5718 + }, + { + "epoch": 0.6280474412475291, + "grad_norm": 2.5559873580932617, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6997779607772827, + "num_tokens": 142928602.0, + "step": 5719 + }, + { + "epoch": 0.6281572589501427, + "grad_norm": 2.326930284500122, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6983957290649414, + "num_tokens": 142952507.0, + "step": 5720 + }, + { + "epoch": 0.6282670766527564, + "grad_norm": 2.305262804031372, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.693182110786438, + "num_tokens": 142975481.0, + "step": 5721 + }, + { + "epoch": 0.62837689435537, + "grad_norm": 2.3298614025115967, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7456756830215454, + "num_tokens": 142994157.0, + "step": 5722 + }, + { + "epoch": 0.6284867120579838, + "grad_norm": 2.509010076522827, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7004188299179077, + "num_tokens": 143014521.0, + "step": 5723 + }, + { + "epoch": 0.6285965297605974, + "grad_norm": 2.031024932861328, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6918187737464905, + "num_tokens": 143044615.0, + "step": 5724 + }, + { + "epoch": 0.6287063474632111, + "grad_norm": 2.4603002071380615, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7160348892211914, + "num_tokens": 143068124.0, + "step": 5725 + }, + { + "epoch": 0.6288161651658247, + "grad_norm": 2.5447680950164795, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7064536809921265, + "num_tokens": 143090824.0, + "step": 5726 + }, + { + "epoch": 0.6289259828684384, + "grad_norm": 2.297882080078125, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.71653813123703, + "num_tokens": 143112654.0, + "step": 5727 + }, + { + "epoch": 0.629035800571052, + "grad_norm": 1.939046859741211, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6792151927947998, + "num_tokens": 143145338.0, + "step": 5728 + }, + { + "epoch": 0.6291456182736657, + "grad_norm": 2.198293924331665, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6929690837860107, + "num_tokens": 143170251.0, + "step": 5729 + }, + { + "epoch": 0.6292554359762794, + "grad_norm": 2.4012720584869385, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7225691080093384, + "num_tokens": 143190329.0, + "step": 5730 + }, + { + "epoch": 0.6293652536788931, + "grad_norm": 2.464986801147461, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6783730983734131, + "num_tokens": 143212531.0, + "step": 5731 + }, + { + "epoch": 0.6294750713815067, + "grad_norm": 2.2632100582122803, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7178070545196533, + "num_tokens": 143236335.0, + "step": 5732 + }, + { + "epoch": 0.6295848890841204, + "grad_norm": 2.540360450744629, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7268738150596619, + "num_tokens": 143256900.0, + "step": 5733 + }, + { + "epoch": 0.629694706786734, + "grad_norm": 2.191661834716797, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7120081186294556, + "num_tokens": 143282702.0, + "step": 5734 + }, + { + "epoch": 0.6298045244893477, + "grad_norm": 2.2081124782562256, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7082960605621338, + "num_tokens": 143307227.0, + "step": 5735 + }, + { + "epoch": 0.6299143421919613, + "grad_norm": 1.9350693225860596, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.693292498588562, + "num_tokens": 143338719.0, + "step": 5736 + }, + { + "epoch": 0.630024159894575, + "grad_norm": 2.4804491996765137, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6999395489692688, + "num_tokens": 143358974.0, + "step": 5737 + }, + { + "epoch": 0.6301339775971887, + "grad_norm": 2.4027583599090576, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7162676453590393, + "num_tokens": 143380570.0, + "step": 5738 + }, + { + "epoch": 0.6302437952998023, + "grad_norm": 2.0473825931549072, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7133803367614746, + "num_tokens": 143407517.0, + "step": 5739 + }, + { + "epoch": 0.630353613002416, + "grad_norm": 2.3580243587493896, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7368109822273254, + "num_tokens": 143427852.0, + "step": 5740 + }, + { + "epoch": 0.6304634307050296, + "grad_norm": 2.1506083011627197, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7113806009292603, + "num_tokens": 143453307.0, + "step": 5741 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 2.2354209423065186, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6965631246566772, + "num_tokens": 143476713.0, + "step": 5742 + }, + { + "epoch": 0.6306830661102569, + "grad_norm": 2.718764543533325, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.712548017501831, + "num_tokens": 143493535.0, + "step": 5743 + }, + { + "epoch": 0.6307928838128707, + "grad_norm": 1.9398738145828247, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.6889127492904663, + "num_tokens": 143524095.0, + "step": 5744 + }, + { + "epoch": 0.6309027015154843, + "grad_norm": 2.185317277908325, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7014611959457397, + "num_tokens": 143550439.0, + "step": 5745 + }, + { + "epoch": 0.631012519218098, + "grad_norm": 2.1802055835723877, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6998507976531982, + "num_tokens": 143574999.0, + "step": 5746 + }, + { + "epoch": 0.6311223369207116, + "grad_norm": 2.0648536682128906, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7045401334762573, + "num_tokens": 143602039.0, + "step": 5747 + }, + { + "epoch": 0.6312321546233253, + "grad_norm": 2.4523727893829346, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7008180618286133, + "num_tokens": 143623234.0, + "step": 5748 + }, + { + "epoch": 0.6313419723259389, + "grad_norm": 2.135603666305542, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6847988963127136, + "num_tokens": 143651506.0, + "step": 5749 + }, + { + "epoch": 0.6314517900285526, + "grad_norm": 2.097407341003418, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.715377688407898, + "num_tokens": 143677834.0, + "step": 5750 + }, + { + "epoch": 0.6315616077311663, + "grad_norm": 2.1767756938934326, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7122724056243896, + "num_tokens": 143701322.0, + "step": 5751 + }, + { + "epoch": 0.63167142543378, + "grad_norm": 2.017054557800293, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7164158821105957, + "num_tokens": 143731281.0, + "step": 5752 + }, + { + "epoch": 0.6317812431363936, + "grad_norm": 2.5628113746643066, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7673878073692322, + "num_tokens": 143748442.0, + "step": 5753 + }, + { + "epoch": 0.6318910608390073, + "grad_norm": 2.4266133308410645, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7121416330337524, + "num_tokens": 143771289.0, + "step": 5754 + }, + { + "epoch": 0.6320008785416209, + "grad_norm": 2.320871114730835, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6897005438804626, + "num_tokens": 143794094.0, + "step": 5755 + }, + { + "epoch": 0.6321106962442345, + "grad_norm": 2.5530478954315186, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7045758962631226, + "num_tokens": 143814097.0, + "step": 5756 + }, + { + "epoch": 0.6322205139468482, + "grad_norm": 2.274937391281128, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7003039121627808, + "num_tokens": 143836689.0, + "step": 5757 + }, + { + "epoch": 0.6323303316494618, + "grad_norm": 2.3507022857666016, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.709025502204895, + "num_tokens": 143860758.0, + "step": 5758 + }, + { + "epoch": 0.6324401493520756, + "grad_norm": 2.1643896102905273, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7126604318618774, + "num_tokens": 143883614.0, + "step": 5759 + }, + { + "epoch": 0.6325499670546892, + "grad_norm": 2.141719341278076, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7010414004325867, + "num_tokens": 143909861.0, + "step": 5760 + }, + { + "epoch": 0.6326597847573029, + "grad_norm": 2.060930013656616, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7070983648300171, + "num_tokens": 143936340.0, + "step": 5761 + }, + { + "epoch": 0.6327696024599165, + "grad_norm": 2.5313494205474854, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7306864261627197, + "num_tokens": 143955171.0, + "step": 5762 + }, + { + "epoch": 0.6328794201625302, + "grad_norm": 2.26545786857605, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7096719145774841, + "num_tokens": 143977346.0, + "step": 5763 + }, + { + "epoch": 0.6329892378651438, + "grad_norm": 2.463277816772461, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7327865958213806, + "num_tokens": 143997577.0, + "step": 5764 + }, + { + "epoch": 0.6330990555677575, + "grad_norm": 2.170729637145996, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7157357931137085, + "num_tokens": 144022146.0, + "step": 5765 + }, + { + "epoch": 0.6332088732703712, + "grad_norm": 2.0288503170013428, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6774935722351074, + "num_tokens": 144050186.0, + "step": 5766 + }, + { + "epoch": 0.6333186909729849, + "grad_norm": 2.2284443378448486, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6776593923568726, + "num_tokens": 144075700.0, + "step": 5767 + }, + { + "epoch": 0.6334285086755985, + "grad_norm": 2.000491142272949, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7085310220718384, + "num_tokens": 144103986.0, + "step": 5768 + }, + { + "epoch": 0.6335383263782122, + "grad_norm": 1.9941922426223755, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7055162787437439, + "num_tokens": 144133655.0, + "step": 5769 + }, + { + "epoch": 0.6336481440808258, + "grad_norm": 2.190446615219116, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6951446533203125, + "num_tokens": 144158867.0, + "step": 5770 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 2.3277406692504883, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7061073780059814, + "num_tokens": 144182208.0, + "step": 5771 + }, + { + "epoch": 0.6338677794860531, + "grad_norm": 2.2079086303710938, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6838440895080566, + "num_tokens": 144207050.0, + "step": 5772 + }, + { + "epoch": 0.6339775971886669, + "grad_norm": 2.4600117206573486, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.7083145380020142, + "num_tokens": 144227425.0, + "step": 5773 + }, + { + "epoch": 0.6340874148912805, + "grad_norm": 2.215843439102173, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7171029448509216, + "num_tokens": 144250643.0, + "step": 5774 + }, + { + "epoch": 0.6341972325938942, + "grad_norm": 2.2679760456085205, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7003711462020874, + "num_tokens": 144274099.0, + "step": 5775 + }, + { + "epoch": 0.6343070502965078, + "grad_norm": 2.1055991649627686, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7035795450210571, + "num_tokens": 144301887.0, + "step": 5776 + }, + { + "epoch": 0.6344168679991214, + "grad_norm": 2.267834424972534, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7054680585861206, + "num_tokens": 144324409.0, + "step": 5777 + }, + { + "epoch": 0.6345266857017351, + "grad_norm": 2.5223565101623535, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7268493175506592, + "num_tokens": 144343572.0, + "step": 5778 + }, + { + "epoch": 0.6346365034043487, + "grad_norm": 2.3632025718688965, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.711970329284668, + "num_tokens": 144365655.0, + "step": 5779 + }, + { + "epoch": 0.6347463211069625, + "grad_norm": 2.247083902359009, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6998003721237183, + "num_tokens": 144390046.0, + "step": 5780 + }, + { + "epoch": 0.6348561388095761, + "grad_norm": 2.4094741344451904, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.716100811958313, + "num_tokens": 144411268.0, + "step": 5781 + }, + { + "epoch": 0.6349659565121898, + "grad_norm": 2.257072925567627, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6909092664718628, + "num_tokens": 144436263.0, + "step": 5782 + }, + { + "epoch": 0.6350757742148034, + "grad_norm": 2.452200174331665, + "learning_rate": 1e-06, + "loss": 1.0625, + "mean_token_accuracy": 0.6842970848083496, + "num_tokens": 144458687.0, + "step": 5783 + }, + { + "epoch": 0.6351855919174171, + "grad_norm": 2.0679140090942383, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6929446458816528, + "num_tokens": 144487080.0, + "step": 5784 + }, + { + "epoch": 0.6352954096200307, + "grad_norm": 2.2404956817626953, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7187893390655518, + "num_tokens": 144509530.0, + "step": 5785 + }, + { + "epoch": 0.6354052273226444, + "grad_norm": 2.109764337539673, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6913695335388184, + "num_tokens": 144538795.0, + "step": 5786 + }, + { + "epoch": 0.635515045025258, + "grad_norm": 1.866929054260254, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.6975216865539551, + "num_tokens": 144575066.0, + "step": 5787 + }, + { + "epoch": 0.6356248627278718, + "grad_norm": 2.116877794265747, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7034204006195068, + "num_tokens": 144602671.0, + "step": 5788 + }, + { + "epoch": 0.6357346804304854, + "grad_norm": 2.3992133140563965, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7040468454360962, + "num_tokens": 144625552.0, + "step": 5789 + }, + { + "epoch": 0.6358444981330991, + "grad_norm": 2.085430860519409, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7081267237663269, + "num_tokens": 144654141.0, + "step": 5790 + }, + { + "epoch": 0.6359543158357127, + "grad_norm": 2.3030261993408203, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7314109802246094, + "num_tokens": 144675920.0, + "step": 5791 + }, + { + "epoch": 0.6360641335383264, + "grad_norm": 2.2868592739105225, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7189770936965942, + "num_tokens": 144699557.0, + "step": 5792 + }, + { + "epoch": 0.63617395124094, + "grad_norm": 2.322815418243408, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7200562953948975, + "num_tokens": 144721464.0, + "step": 5793 + }, + { + "epoch": 0.6362837689435537, + "grad_norm": 2.1825568675994873, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6720314025878906, + "num_tokens": 144749562.0, + "step": 5794 + }, + { + "epoch": 0.6363935866461674, + "grad_norm": 2.204350471496582, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7020168304443359, + "num_tokens": 144774252.0, + "step": 5795 + }, + { + "epoch": 0.636503404348781, + "grad_norm": 2.031341314315796, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7007572650909424, + "num_tokens": 144803699.0, + "step": 5796 + }, + { + "epoch": 0.6366132220513947, + "grad_norm": 1.9562147855758667, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7193654179573059, + "num_tokens": 144836529.0, + "step": 5797 + }, + { + "epoch": 0.6367230397540083, + "grad_norm": 2.2176434993743896, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6898230314254761, + "num_tokens": 144864940.0, + "step": 5798 + }, + { + "epoch": 0.636832857456622, + "grad_norm": 2.4615209102630615, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7131163477897644, + "num_tokens": 144888625.0, + "step": 5799 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 1.9323073625564575, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6781684756278992, + "num_tokens": 144919955.0, + "step": 5800 + }, + { + "epoch": 0.6370524928618493, + "grad_norm": 2.1631786823272705, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7105250358581543, + "num_tokens": 144945845.0, + "step": 5801 + }, + { + "epoch": 0.637162310564463, + "grad_norm": 2.212892770767212, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7061988115310669, + "num_tokens": 144970011.0, + "step": 5802 + }, + { + "epoch": 0.6372721282670767, + "grad_norm": 2.2487566471099854, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7055454850196838, + "num_tokens": 144993605.0, + "step": 5803 + }, + { + "epoch": 0.6373819459696903, + "grad_norm": 2.458672523498535, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7193287014961243, + "num_tokens": 145012522.0, + "step": 5804 + }, + { + "epoch": 0.637491763672304, + "grad_norm": 2.228395938873291, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6947880983352661, + "num_tokens": 145037385.0, + "step": 5805 + }, + { + "epoch": 0.6376015813749176, + "grad_norm": 2.052865982055664, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6892367601394653, + "num_tokens": 145066924.0, + "step": 5806 + }, + { + "epoch": 0.6377113990775313, + "grad_norm": 2.396483898162842, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7026476860046387, + "num_tokens": 145088555.0, + "step": 5807 + }, + { + "epoch": 0.6378212167801449, + "grad_norm": 2.438567638397217, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7376933693885803, + "num_tokens": 145108307.0, + "step": 5808 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 2.5314738750457764, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7352676391601562, + "num_tokens": 145128534.0, + "step": 5809 + }, + { + "epoch": 0.6380408521853723, + "grad_norm": 2.421506404876709, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6995370984077454, + "num_tokens": 145150838.0, + "step": 5810 + }, + { + "epoch": 0.638150669887986, + "grad_norm": 2.5867764949798584, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7104575634002686, + "num_tokens": 145170207.0, + "step": 5811 + }, + { + "epoch": 0.6382604875905996, + "grad_norm": 2.352384090423584, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7022414207458496, + "num_tokens": 145197534.0, + "step": 5812 + }, + { + "epoch": 0.6383703052932133, + "grad_norm": 2.3233730792999268, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7156885862350464, + "num_tokens": 145220710.0, + "step": 5813 + }, + { + "epoch": 0.6384801229958269, + "grad_norm": 2.134206533432007, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6973428726196289, + "num_tokens": 145247928.0, + "step": 5814 + }, + { + "epoch": 0.6385899406984406, + "grad_norm": 2.1079800128936768, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7038998603820801, + "num_tokens": 145276076.0, + "step": 5815 + }, + { + "epoch": 0.6386997584010542, + "grad_norm": 2.485222339630127, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.718143105506897, + "num_tokens": 145297285.0, + "step": 5816 + }, + { + "epoch": 0.638809576103668, + "grad_norm": 2.329676866531372, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7270251512527466, + "num_tokens": 145318570.0, + "step": 5817 + }, + { + "epoch": 0.6389193938062816, + "grad_norm": 2.3282980918884277, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6886546611785889, + "num_tokens": 145341381.0, + "step": 5818 + }, + { + "epoch": 0.6390292115088952, + "grad_norm": 2.004179000854492, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6820806264877319, + "num_tokens": 145372886.0, + "step": 5819 + }, + { + "epoch": 0.6391390292115089, + "grad_norm": 2.206664800643921, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.7086538076400757, + "num_tokens": 145399750.0, + "step": 5820 + }, + { + "epoch": 0.6392488469141225, + "grad_norm": 2.22945499420166, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7073162794113159, + "num_tokens": 145423349.0, + "step": 5821 + }, + { + "epoch": 0.6393586646167362, + "grad_norm": 2.276599168777466, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6956225037574768, + "num_tokens": 145448484.0, + "step": 5822 + }, + { + "epoch": 0.6394684823193498, + "grad_norm": 2.535861015319824, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6930434703826904, + "num_tokens": 145469220.0, + "step": 5823 + }, + { + "epoch": 0.6395783000219636, + "grad_norm": 2.313692092895508, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6940768361091614, + "num_tokens": 145494146.0, + "step": 5824 + }, + { + "epoch": 0.6396881177245772, + "grad_norm": 2.668628215789795, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6923727989196777, + "num_tokens": 145521025.0, + "step": 5825 + }, + { + "epoch": 0.6397979354271909, + "grad_norm": 2.281886100769043, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.712074875831604, + "num_tokens": 145545189.0, + "step": 5826 + }, + { + "epoch": 0.6399077531298045, + "grad_norm": 2.2888901233673096, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.708564281463623, + "num_tokens": 145570335.0, + "step": 5827 + }, + { + "epoch": 0.6400175708324182, + "grad_norm": 2.343165397644043, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.718487024307251, + "num_tokens": 145594787.0, + "step": 5828 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 2.429713487625122, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7207705974578857, + "num_tokens": 145614913.0, + "step": 5829 + }, + { + "epoch": 0.6402372062376455, + "grad_norm": 2.1971065998077393, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7177430391311646, + "num_tokens": 145639545.0, + "step": 5830 + }, + { + "epoch": 0.6403470239402592, + "grad_norm": 2.200620412826538, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.6988216638565063, + "num_tokens": 145665778.0, + "step": 5831 + }, + { + "epoch": 0.6404568416428729, + "grad_norm": 2.0880556106567383, + "learning_rate": 1e-06, + "loss": 0.7711, + "mean_token_accuracy": 0.7484979629516602, + "num_tokens": 145690908.0, + "step": 5832 + }, + { + "epoch": 0.6405666593454865, + "grad_norm": 2.34710693359375, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.694621205329895, + "num_tokens": 145716192.0, + "step": 5833 + }, + { + "epoch": 0.6406764770481002, + "grad_norm": 2.416639804840088, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6947000026702881, + "num_tokens": 145740284.0, + "step": 5834 + }, + { + "epoch": 0.6407862947507138, + "grad_norm": 2.3988351821899414, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6829322576522827, + "num_tokens": 145764039.0, + "step": 5835 + }, + { + "epoch": 0.6408961124533274, + "grad_norm": 2.667602062225342, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7221142053604126, + "num_tokens": 145782975.0, + "step": 5836 + }, + { + "epoch": 0.6410059301559411, + "grad_norm": 2.1126341819763184, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7262505292892456, + "num_tokens": 145808648.0, + "step": 5837 + }, + { + "epoch": 0.6411157478585549, + "grad_norm": 1.89911949634552, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.689487099647522, + "num_tokens": 145842418.0, + "step": 5838 + }, + { + "epoch": 0.6412255655611685, + "grad_norm": 2.396573543548584, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7125346660614014, + "num_tokens": 145863418.0, + "step": 5839 + }, + { + "epoch": 0.6413353832637821, + "grad_norm": 2.1233718395233154, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7126078605651855, + "num_tokens": 145889122.0, + "step": 5840 + }, + { + "epoch": 0.6414452009663958, + "grad_norm": 2.0224459171295166, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7244741320610046, + "num_tokens": 145917348.0, + "step": 5841 + }, + { + "epoch": 0.6415550186690094, + "grad_norm": 1.9889271259307861, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6947191953659058, + "num_tokens": 145946584.0, + "step": 5842 + }, + { + "epoch": 0.6416648363716231, + "grad_norm": 2.541804075241089, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6885347962379456, + "num_tokens": 145968100.0, + "step": 5843 + }, + { + "epoch": 0.6417746540742367, + "grad_norm": 2.171426296234131, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6971991062164307, + "num_tokens": 145994469.0, + "step": 5844 + }, + { + "epoch": 0.6418844717768504, + "grad_norm": 2.1578400135040283, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6991687417030334, + "num_tokens": 146020753.0, + "step": 5845 + }, + { + "epoch": 0.6419942894794641, + "grad_norm": 2.528993844985962, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6991413831710815, + "num_tokens": 146043124.0, + "step": 5846 + }, + { + "epoch": 0.6421041071820778, + "grad_norm": 2.0428481101989746, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7312754392623901, + "num_tokens": 146069376.0, + "step": 5847 + }, + { + "epoch": 0.6422139248846914, + "grad_norm": 2.424196481704712, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7185279130935669, + "num_tokens": 146092838.0, + "step": 5848 + }, + { + "epoch": 0.6423237425873051, + "grad_norm": 2.0950701236724854, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6734691262245178, + "num_tokens": 146126696.0, + "step": 5849 + }, + { + "epoch": 0.6424335602899187, + "grad_norm": 2.052395820617676, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.691253125667572, + "num_tokens": 146154861.0, + "step": 5850 + }, + { + "epoch": 0.6425433779925324, + "grad_norm": 2.4640746116638184, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7157050371170044, + "num_tokens": 146175549.0, + "step": 5851 + }, + { + "epoch": 0.642653195695146, + "grad_norm": 1.9541014432907104, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7189007997512817, + "num_tokens": 146203592.0, + "step": 5852 + }, + { + "epoch": 0.6427630133977598, + "grad_norm": 2.2203290462493896, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7192306518554688, + "num_tokens": 146227829.0, + "step": 5853 + }, + { + "epoch": 0.6428728311003734, + "grad_norm": 2.3995718955993652, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7222460508346558, + "num_tokens": 146248857.0, + "step": 5854 + }, + { + "epoch": 0.6429826488029871, + "grad_norm": 2.156545877456665, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7143653631210327, + "num_tokens": 146274503.0, + "step": 5855 + }, + { + "epoch": 0.6430924665056007, + "grad_norm": 2.0871994495391846, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.7002438902854919, + "num_tokens": 146302998.0, + "step": 5856 + }, + { + "epoch": 0.6432022842082143, + "grad_norm": 1.995287299156189, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7144474983215332, + "num_tokens": 146329471.0, + "step": 5857 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 2.0296967029571533, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6880306005477905, + "num_tokens": 146357707.0, + "step": 5858 + }, + { + "epoch": 0.6434219196134416, + "grad_norm": 2.1043026447296143, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6930404901504517, + "num_tokens": 146385441.0, + "step": 5859 + }, + { + "epoch": 0.6435317373160554, + "grad_norm": 2.6284053325653076, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6911396980285645, + "num_tokens": 146405679.0, + "step": 5860 + }, + { + "epoch": 0.643641555018669, + "grad_norm": 2.2715060710906982, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6900191307067871, + "num_tokens": 146430797.0, + "step": 5861 + }, + { + "epoch": 0.6437513727212827, + "grad_norm": 2.056593656539917, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.691769003868103, + "num_tokens": 146461305.0, + "step": 5862 + }, + { + "epoch": 0.6438611904238963, + "grad_norm": 2.260019063949585, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7086564302444458, + "num_tokens": 146486505.0, + "step": 5863 + }, + { + "epoch": 0.64397100812651, + "grad_norm": 2.1332006454467773, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7075136303901672, + "num_tokens": 146514660.0, + "step": 5864 + }, + { + "epoch": 0.6440808258291236, + "grad_norm": 1.9512656927108765, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7138985395431519, + "num_tokens": 146544978.0, + "step": 5865 + }, + { + "epoch": 0.6441906435317373, + "grad_norm": 2.322979211807251, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6915123462677002, + "num_tokens": 146568757.0, + "step": 5866 + }, + { + "epoch": 0.644300461234351, + "grad_norm": 2.0054702758789062, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7063038945198059, + "num_tokens": 146598136.0, + "step": 5867 + }, + { + "epoch": 0.6444102789369647, + "grad_norm": 2.355003833770752, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7232125401496887, + "num_tokens": 146620699.0, + "step": 5868 + }, + { + "epoch": 0.6445200966395783, + "grad_norm": 2.3113749027252197, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7064777612686157, + "num_tokens": 146643197.0, + "step": 5869 + }, + { + "epoch": 0.644629914342192, + "grad_norm": 2.4042446613311768, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7024118900299072, + "num_tokens": 146665240.0, + "step": 5870 + }, + { + "epoch": 0.6447397320448056, + "grad_norm": 2.214059591293335, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7086782455444336, + "num_tokens": 146690549.0, + "step": 5871 + }, + { + "epoch": 0.6448495497474193, + "grad_norm": 2.271646738052368, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7149050831794739, + "num_tokens": 146712832.0, + "step": 5872 + }, + { + "epoch": 0.6449593674500329, + "grad_norm": 2.429476022720337, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7182530164718628, + "num_tokens": 146734934.0, + "step": 5873 + }, + { + "epoch": 0.6450691851526466, + "grad_norm": 2.289543390274048, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.706841230392456, + "num_tokens": 146759795.0, + "step": 5874 + }, + { + "epoch": 0.6451790028552603, + "grad_norm": 2.217029571533203, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7212302684783936, + "num_tokens": 146785475.0, + "step": 5875 + }, + { + "epoch": 0.645288820557874, + "grad_norm": 2.2410519123077393, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7312160730361938, + "num_tokens": 146807803.0, + "step": 5876 + }, + { + "epoch": 0.6453986382604876, + "grad_norm": 2.1576759815216064, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7098397016525269, + "num_tokens": 146832292.0, + "step": 5877 + }, + { + "epoch": 0.6455084559631012, + "grad_norm": 2.518221616744995, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7403219938278198, + "num_tokens": 146851578.0, + "step": 5878 + }, + { + "epoch": 0.6456182736657149, + "grad_norm": 1.907038927078247, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6920104026794434, + "num_tokens": 146882989.0, + "step": 5879 + }, + { + "epoch": 0.6457280913683285, + "grad_norm": 2.441387891769409, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7425227761268616, + "num_tokens": 146902845.0, + "step": 5880 + }, + { + "epoch": 0.6458379090709422, + "grad_norm": 2.0839009284973145, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6833192110061646, + "num_tokens": 146930935.0, + "step": 5881 + }, + { + "epoch": 0.6459477267735559, + "grad_norm": 2.1491408348083496, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6863536238670349, + "num_tokens": 146958808.0, + "step": 5882 + }, + { + "epoch": 0.6460575444761696, + "grad_norm": 2.243006706237793, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6968041658401489, + "num_tokens": 146983013.0, + "step": 5883 + }, + { + "epoch": 0.6461673621787832, + "grad_norm": 2.113011121749878, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.704816460609436, + "num_tokens": 147011844.0, + "step": 5884 + }, + { + "epoch": 0.6462771798813969, + "grad_norm": 2.3202171325683594, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7496082186698914, + "num_tokens": 147033991.0, + "step": 5885 + }, + { + "epoch": 0.6463869975840105, + "grad_norm": 2.0835776329040527, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7246996164321899, + "num_tokens": 147059585.0, + "step": 5886 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 2.165011167526245, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.6980018615722656, + "num_tokens": 147087925.0, + "step": 5887 + }, + { + "epoch": 0.6466066329892378, + "grad_norm": 2.281005620956421, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7091308236122131, + "num_tokens": 147110315.0, + "step": 5888 + }, + { + "epoch": 0.6467164506918516, + "grad_norm": 2.202751398086548, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7089452743530273, + "num_tokens": 147135677.0, + "step": 5889 + }, + { + "epoch": 0.6468262683944652, + "grad_norm": 2.561378002166748, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6874305009841919, + "num_tokens": 147157168.0, + "step": 5890 + }, + { + "epoch": 0.6469360860970789, + "grad_norm": 2.27187442779541, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.70870041847229, + "num_tokens": 147180116.0, + "step": 5891 + }, + { + "epoch": 0.6470459037996925, + "grad_norm": 1.9584324359893799, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7036555409431458, + "num_tokens": 147210301.0, + "step": 5892 + }, + { + "epoch": 0.6471557215023062, + "grad_norm": 2.0404880046844482, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6948022842407227, + "num_tokens": 147239170.0, + "step": 5893 + }, + { + "epoch": 0.6472655392049198, + "grad_norm": 2.4599080085754395, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7165431380271912, + "num_tokens": 147258695.0, + "step": 5894 + }, + { + "epoch": 0.6473753569075335, + "grad_norm": 1.97311532497406, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7186828851699829, + "num_tokens": 147287800.0, + "step": 5895 + }, + { + "epoch": 0.6474851746101472, + "grad_norm": 2.0818121433258057, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.6816298365592957, + "num_tokens": 147315648.0, + "step": 5896 + }, + { + "epoch": 0.6475949923127609, + "grad_norm": 2.4055938720703125, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7261807918548584, + "num_tokens": 147337797.0, + "step": 5897 + }, + { + "epoch": 0.6477048100153745, + "grad_norm": 2.159743547439575, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6973448395729065, + "num_tokens": 147363657.0, + "step": 5898 + }, + { + "epoch": 0.6478146277179881, + "grad_norm": 2.209677219390869, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6757526993751526, + "num_tokens": 147389550.0, + "step": 5899 + }, + { + "epoch": 0.6479244454206018, + "grad_norm": 1.9557359218597412, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7089641094207764, + "num_tokens": 147419761.0, + "step": 5900 + }, + { + "epoch": 0.6480342631232154, + "grad_norm": 2.4116156101226807, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.721032977104187, + "num_tokens": 147440479.0, + "step": 5901 + }, + { + "epoch": 0.6481440808258291, + "grad_norm": 2.19331693649292, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7064522504806519, + "num_tokens": 147466085.0, + "step": 5902 + }, + { + "epoch": 0.6482538985284428, + "grad_norm": 2.2786002159118652, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7236073613166809, + "num_tokens": 147488963.0, + "step": 5903 + }, + { + "epoch": 0.6483637162310565, + "grad_norm": 1.85758376121521, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.6797068119049072, + "num_tokens": 147525816.0, + "step": 5904 + }, + { + "epoch": 0.6484735339336701, + "grad_norm": 2.16168475151062, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.699547290802002, + "num_tokens": 147552487.0, + "step": 5905 + }, + { + "epoch": 0.6485833516362838, + "grad_norm": 2.49625825881958, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7352854609489441, + "num_tokens": 147571823.0, + "step": 5906 + }, + { + "epoch": 0.6486931693388974, + "grad_norm": 2.2789509296417236, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6919996738433838, + "num_tokens": 147598079.0, + "step": 5907 + }, + { + "epoch": 0.6488029870415111, + "grad_norm": 2.154919147491455, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7240756750106812, + "num_tokens": 147622795.0, + "step": 5908 + }, + { + "epoch": 0.6489128047441247, + "grad_norm": 2.1700093746185303, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7095034718513489, + "num_tokens": 147649095.0, + "step": 5909 + }, + { + "epoch": 0.6490226224467384, + "grad_norm": 2.3148319721221924, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7379186153411865, + "num_tokens": 147671519.0, + "step": 5910 + }, + { + "epoch": 0.6491324401493521, + "grad_norm": 2.1765754222869873, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6817869544029236, + "num_tokens": 147698343.0, + "step": 5911 + }, + { + "epoch": 0.6492422578519658, + "grad_norm": 2.2360141277313232, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.6987048387527466, + "num_tokens": 147722298.0, + "step": 5912 + }, + { + "epoch": 0.6493520755545794, + "grad_norm": 1.790253758430481, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7042964696884155, + "num_tokens": 147756078.0, + "step": 5913 + }, + { + "epoch": 0.6494618932571931, + "grad_norm": 1.9683594703674316, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7153028845787048, + "num_tokens": 147785412.0, + "step": 5914 + }, + { + "epoch": 0.6495717109598067, + "grad_norm": 2.201063632965088, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7272763848304749, + "num_tokens": 147811523.0, + "step": 5915 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 2.2248926162719727, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7051750421524048, + "num_tokens": 147835170.0, + "step": 5916 + }, + { + "epoch": 0.649791346365034, + "grad_norm": 2.02199387550354, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6973029375076294, + "num_tokens": 147863657.0, + "step": 5917 + }, + { + "epoch": 0.6499011640676478, + "grad_norm": 2.2157678604125977, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.699325680732727, + "num_tokens": 147890181.0, + "step": 5918 + }, + { + "epoch": 0.6500109817702614, + "grad_norm": 2.52073335647583, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7115585803985596, + "num_tokens": 147911376.0, + "step": 5919 + }, + { + "epoch": 0.650120799472875, + "grad_norm": 2.128573417663574, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7052435874938965, + "num_tokens": 147937130.0, + "step": 5920 + }, + { + "epoch": 0.6502306171754887, + "grad_norm": 2.3894166946411133, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7123612761497498, + "num_tokens": 147960806.0, + "step": 5921 + }, + { + "epoch": 0.6503404348781023, + "grad_norm": 2.283743143081665, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6949734687805176, + "num_tokens": 147985489.0, + "step": 5922 + }, + { + "epoch": 0.650450252580716, + "grad_norm": 2.0986876487731934, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7051904201507568, + "num_tokens": 148012895.0, + "step": 5923 + }, + { + "epoch": 0.6505600702833296, + "grad_norm": 2.28857421875, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7375217080116272, + "num_tokens": 148035197.0, + "step": 5924 + }, + { + "epoch": 0.6506698879859434, + "grad_norm": 2.1409332752227783, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6981091499328613, + "num_tokens": 148060717.0, + "step": 5925 + }, + { + "epoch": 0.650779705688557, + "grad_norm": 2.3022620677948, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.695885419845581, + "num_tokens": 148086057.0, + "step": 5926 + }, + { + "epoch": 0.6508895233911707, + "grad_norm": 2.0375025272369385, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7205256223678589, + "num_tokens": 148112793.0, + "step": 5927 + }, + { + "epoch": 0.6509993410937843, + "grad_norm": 2.0372605323791504, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6912801265716553, + "num_tokens": 148141268.0, + "step": 5928 + }, + { + "epoch": 0.651109158796398, + "grad_norm": 1.9957009553909302, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.682489275932312, + "num_tokens": 148170907.0, + "step": 5929 + }, + { + "epoch": 0.6512189764990116, + "grad_norm": 2.085035800933838, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6917866468429565, + "num_tokens": 148197806.0, + "step": 5930 + }, + { + "epoch": 0.6513287942016253, + "grad_norm": 2.21960711479187, + "learning_rate": 1e-06, + "loss": 1.1007, + "mean_token_accuracy": 0.6675741672515869, + "num_tokens": 148224953.0, + "step": 5931 + }, + { + "epoch": 0.651438611904239, + "grad_norm": 1.9398385286331177, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7021251320838928, + "num_tokens": 148255967.0, + "step": 5932 + }, + { + "epoch": 0.6515484296068527, + "grad_norm": 2.5075411796569824, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7174684405326843, + "num_tokens": 148274811.0, + "step": 5933 + }, + { + "epoch": 0.6516582473094663, + "grad_norm": 2.1467955112457275, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7152730226516724, + "num_tokens": 148299738.0, + "step": 5934 + }, + { + "epoch": 0.65176806501208, + "grad_norm": 2.201845169067383, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6986960172653198, + "num_tokens": 148325178.0, + "step": 5935 + }, + { + "epoch": 0.6518778827146936, + "grad_norm": 2.5725834369659424, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7282176613807678, + "num_tokens": 148343370.0, + "step": 5936 + }, + { + "epoch": 0.6519877004173072, + "grad_norm": 2.2410225868225098, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7179048657417297, + "num_tokens": 148365226.0, + "step": 5937 + }, + { + "epoch": 0.6520975181199209, + "grad_norm": 2.1673412322998047, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6820781826972961, + "num_tokens": 148392096.0, + "step": 5938 + }, + { + "epoch": 0.6522073358225345, + "grad_norm": 2.6027722358703613, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7266485095024109, + "num_tokens": 148410317.0, + "step": 5939 + }, + { + "epoch": 0.6523171535251483, + "grad_norm": 2.243542194366455, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6985771656036377, + "num_tokens": 148436479.0, + "step": 5940 + }, + { + "epoch": 0.6524269712277619, + "grad_norm": 2.4050047397613525, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7050168514251709, + "num_tokens": 148460057.0, + "step": 5941 + }, + { + "epoch": 0.6525367889303756, + "grad_norm": 2.5698513984680176, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6830907464027405, + "num_tokens": 148481608.0, + "step": 5942 + }, + { + "epoch": 0.6526466066329892, + "grad_norm": 2.189396381378174, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6895307898521423, + "num_tokens": 148509199.0, + "step": 5943 + }, + { + "epoch": 0.6527564243356029, + "grad_norm": 2.090139627456665, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7051024436950684, + "num_tokens": 148537207.0, + "step": 5944 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 2.0280838012695312, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7073667049407959, + "num_tokens": 148564018.0, + "step": 5945 + }, + { + "epoch": 0.6529760597408302, + "grad_norm": 2.13222074508667, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6768386363983154, + "num_tokens": 148591071.0, + "step": 5946 + }, + { + "epoch": 0.6530858774434439, + "grad_norm": 2.1825485229492188, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7082117795944214, + "num_tokens": 148615980.0, + "step": 5947 + }, + { + "epoch": 0.6531956951460576, + "grad_norm": 2.319044828414917, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7153258323669434, + "num_tokens": 148638350.0, + "step": 5948 + }, + { + "epoch": 0.6533055128486712, + "grad_norm": 2.2339704036712646, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.698663592338562, + "num_tokens": 148663226.0, + "step": 5949 + }, + { + "epoch": 0.6534153305512849, + "grad_norm": 2.395183801651001, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7050989866256714, + "num_tokens": 148687738.0, + "step": 5950 + }, + { + "epoch": 0.6535251482538985, + "grad_norm": 2.510582208633423, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7246382236480713, + "num_tokens": 148708537.0, + "step": 5951 + }, + { + "epoch": 0.6536349659565122, + "grad_norm": 2.119948148727417, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7191239595413208, + "num_tokens": 148735010.0, + "step": 5952 + }, + { + "epoch": 0.6537447836591258, + "grad_norm": 2.2406768798828125, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7289159297943115, + "num_tokens": 148760060.0, + "step": 5953 + }, + { + "epoch": 0.6538546013617396, + "grad_norm": 2.2124111652374268, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7190443873405457, + "num_tokens": 148784773.0, + "step": 5954 + }, + { + "epoch": 0.6539644190643532, + "grad_norm": 2.1916940212249756, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7112282514572144, + "num_tokens": 148811219.0, + "step": 5955 + }, + { + "epoch": 0.6540742367669669, + "grad_norm": 2.5525519847869873, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.719365119934082, + "num_tokens": 148830435.0, + "step": 5956 + }, + { + "epoch": 0.6541840544695805, + "grad_norm": 2.1929988861083984, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7320694923400879, + "num_tokens": 148853982.0, + "step": 5957 + }, + { + "epoch": 0.6542938721721941, + "grad_norm": 2.1339051723480225, + "learning_rate": 1e-06, + "loss": 1.0943, + "mean_token_accuracy": 0.680661678314209, + "num_tokens": 148879872.0, + "step": 5958 + }, + { + "epoch": 0.6544036898748078, + "grad_norm": 2.2147605419158936, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.706169068813324, + "num_tokens": 148905174.0, + "step": 5959 + }, + { + "epoch": 0.6545135075774214, + "grad_norm": 2.5499422550201416, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7349047660827637, + "num_tokens": 148923758.0, + "step": 5960 + }, + { + "epoch": 0.6546233252800352, + "grad_norm": 2.226959228515625, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.706413745880127, + "num_tokens": 148948891.0, + "step": 5961 + }, + { + "epoch": 0.6547331429826488, + "grad_norm": 2.1630289554595947, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6934319734573364, + "num_tokens": 148975094.0, + "step": 5962 + }, + { + "epoch": 0.6548429606852625, + "grad_norm": 2.2809581756591797, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7438874244689941, + "num_tokens": 148998448.0, + "step": 5963 + }, + { + "epoch": 0.6549527783878761, + "grad_norm": 2.336106538772583, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7040935754776001, + "num_tokens": 149019705.0, + "step": 5964 + }, + { + "epoch": 0.6550625960904898, + "grad_norm": 1.9348481893539429, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7035276889801025, + "num_tokens": 149052504.0, + "step": 5965 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 2.1325135231018066, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7154988050460815, + "num_tokens": 149078584.0, + "step": 5966 + }, + { + "epoch": 0.6552822314957171, + "grad_norm": 2.121994733810425, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.714993953704834, + "num_tokens": 149103747.0, + "step": 5967 + }, + { + "epoch": 0.6553920491983307, + "grad_norm": 2.1950714588165283, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7055662870407104, + "num_tokens": 149129151.0, + "step": 5968 + }, + { + "epoch": 0.6555018669009445, + "grad_norm": 2.318746328353882, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7306661605834961, + "num_tokens": 149151292.0, + "step": 5969 + }, + { + "epoch": 0.6556116846035581, + "grad_norm": 2.4496264457702637, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7166286110877991, + "num_tokens": 149169968.0, + "step": 5970 + }, + { + "epoch": 0.6557215023061718, + "grad_norm": 2.437126398086548, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.70830237865448, + "num_tokens": 149192947.0, + "step": 5971 + }, + { + "epoch": 0.6558313200087854, + "grad_norm": 2.182326078414917, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7371299266815186, + "num_tokens": 149215523.0, + "step": 5972 + }, + { + "epoch": 0.6559411377113991, + "grad_norm": 2.2524595260620117, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7000673413276672, + "num_tokens": 149241050.0, + "step": 5973 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 2.312450408935547, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7016335725784302, + "num_tokens": 149263726.0, + "step": 5974 + }, + { + "epoch": 0.6561607731166264, + "grad_norm": 2.2076430320739746, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7232897877693176, + "num_tokens": 149287244.0, + "step": 5975 + }, + { + "epoch": 0.6562705908192401, + "grad_norm": 2.394476890563965, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6854425072669983, + "num_tokens": 149310752.0, + "step": 5976 + }, + { + "epoch": 0.6563804085218538, + "grad_norm": 2.0762557983398438, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7248257994651794, + "num_tokens": 149336467.0, + "step": 5977 + }, + { + "epoch": 0.6564902262244674, + "grad_norm": 2.039314031600952, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6851644515991211, + "num_tokens": 149363555.0, + "step": 5978 + }, + { + "epoch": 0.656600043927081, + "grad_norm": 2.136180877685547, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7030656337738037, + "num_tokens": 149390010.0, + "step": 5979 + }, + { + "epoch": 0.6567098616296947, + "grad_norm": 2.1175456047058105, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7250088453292847, + "num_tokens": 149414860.0, + "step": 5980 + }, + { + "epoch": 0.6568196793323083, + "grad_norm": 2.515902280807495, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6976602077484131, + "num_tokens": 149434639.0, + "step": 5981 + }, + { + "epoch": 0.656929497034922, + "grad_norm": 2.3636696338653564, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7237703800201416, + "num_tokens": 149454692.0, + "step": 5982 + }, + { + "epoch": 0.6570393147375357, + "grad_norm": 2.0659008026123047, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7073504328727722, + "num_tokens": 149482649.0, + "step": 5983 + }, + { + "epoch": 0.6571491324401494, + "grad_norm": 2.3689208030700684, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6909249424934387, + "num_tokens": 149504976.0, + "step": 5984 + }, + { + "epoch": 0.657258950142763, + "grad_norm": 2.5192997455596924, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6830538511276245, + "num_tokens": 149526117.0, + "step": 5985 + }, + { + "epoch": 0.6573687678453767, + "grad_norm": 2.6349196434020996, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7337468862533569, + "num_tokens": 149544651.0, + "step": 5986 + }, + { + "epoch": 0.6574785855479903, + "grad_norm": 2.2695674896240234, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7342414259910583, + "num_tokens": 149567142.0, + "step": 5987 + }, + { + "epoch": 0.657588403250604, + "grad_norm": 2.207265853881836, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7244585156440735, + "num_tokens": 149591437.0, + "step": 5988 + }, + { + "epoch": 0.6576982209532176, + "grad_norm": 2.356123447418213, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7092506885528564, + "num_tokens": 149613174.0, + "step": 5989 + }, + { + "epoch": 0.6578080386558314, + "grad_norm": 2.429684638977051, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7109144926071167, + "num_tokens": 149634734.0, + "step": 5990 + }, + { + "epoch": 0.657917856358445, + "grad_norm": 2.4120705127716064, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6993913054466248, + "num_tokens": 149658388.0, + "step": 5991 + }, + { + "epoch": 0.6580276740610587, + "grad_norm": 2.2263572216033936, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6965181231498718, + "num_tokens": 149682978.0, + "step": 5992 + }, + { + "epoch": 0.6581374917636723, + "grad_norm": 2.177090644836426, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7501490116119385, + "num_tokens": 149706847.0, + "step": 5993 + }, + { + "epoch": 0.658247309466286, + "grad_norm": 2.522087812423706, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7136505842208862, + "num_tokens": 149725996.0, + "step": 5994 + }, + { + "epoch": 0.6583571271688996, + "grad_norm": 2.1398234367370605, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7050554752349854, + "num_tokens": 149752093.0, + "step": 5995 + }, + { + "epoch": 0.6584669448715132, + "grad_norm": 2.0836198329925537, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.700914740562439, + "num_tokens": 149779862.0, + "step": 5996 + }, + { + "epoch": 0.6585767625741269, + "grad_norm": 2.174696683883667, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7302290797233582, + "num_tokens": 149805073.0, + "step": 5997 + }, + { + "epoch": 0.6586865802767407, + "grad_norm": 2.3476011753082275, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7035439014434814, + "num_tokens": 149827910.0, + "step": 5998 + }, + { + "epoch": 0.6587963979793543, + "grad_norm": 1.8978519439697266, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7104446887969971, + "num_tokens": 149859944.0, + "step": 5999 + }, + { + "epoch": 0.6589062156819679, + "grad_norm": 1.983960509300232, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7191600799560547, + "num_tokens": 149886583.0, + "step": 6000 + }, + { + "epoch": 0.6590160333845816, + "grad_norm": 2.148294448852539, + "learning_rate": 1e-06, + "loss": 1.0686, + "mean_token_accuracy": 0.682844877243042, + "num_tokens": 149912693.0, + "step": 6001 + }, + { + "epoch": 0.6591258510871952, + "grad_norm": 2.3976566791534424, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7150076627731323, + "num_tokens": 149933568.0, + "step": 6002 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 2.3972861766815186, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7135295867919922, + "num_tokens": 149954726.0, + "step": 6003 + }, + { + "epoch": 0.6593454864924225, + "grad_norm": 2.467646837234497, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7106389403343201, + "num_tokens": 149975006.0, + "step": 6004 + }, + { + "epoch": 0.6594553041950363, + "grad_norm": 2.219604730606079, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7178707122802734, + "num_tokens": 149998370.0, + "step": 6005 + }, + { + "epoch": 0.6595651218976499, + "grad_norm": 2.349489450454712, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7246314287185669, + "num_tokens": 150019546.0, + "step": 6006 + }, + { + "epoch": 0.6596749396002636, + "grad_norm": 2.2443790435791016, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7038773894309998, + "num_tokens": 150043712.0, + "step": 6007 + }, + { + "epoch": 0.6597847573028772, + "grad_norm": 2.1180036067962646, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6960930824279785, + "num_tokens": 150071541.0, + "step": 6008 + }, + { + "epoch": 0.6598945750054909, + "grad_norm": 2.4816455841064453, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7099865078926086, + "num_tokens": 150092801.0, + "step": 6009 + }, + { + "epoch": 0.6600043927081045, + "grad_norm": 2.3341684341430664, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6949520111083984, + "num_tokens": 150116081.0, + "step": 6010 + }, + { + "epoch": 0.6601142104107182, + "grad_norm": 2.142362117767334, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7251346707344055, + "num_tokens": 150140565.0, + "step": 6011 + }, + { + "epoch": 0.6602240281133319, + "grad_norm": 2.4324605464935303, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6893924474716187, + "num_tokens": 150163197.0, + "step": 6012 + }, + { + "epoch": 0.6603338458159456, + "grad_norm": 2.429793119430542, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7067505121231079, + "num_tokens": 150185948.0, + "step": 6013 + }, + { + "epoch": 0.6604436635185592, + "grad_norm": 2.0381970405578613, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6902139186859131, + "num_tokens": 150218175.0, + "step": 6014 + }, + { + "epoch": 0.6605534812211729, + "grad_norm": 2.2398524284362793, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6986584663391113, + "num_tokens": 150243378.0, + "step": 6015 + }, + { + "epoch": 0.6606632989237865, + "grad_norm": 2.250882863998413, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6941107511520386, + "num_tokens": 150268885.0, + "step": 6016 + }, + { + "epoch": 0.6607731166264001, + "grad_norm": 2.3906707763671875, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7044423818588257, + "num_tokens": 150290407.0, + "step": 6017 + }, + { + "epoch": 0.6608829343290138, + "grad_norm": 2.213775873184204, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7124080061912537, + "num_tokens": 150313982.0, + "step": 6018 + }, + { + "epoch": 0.6609927520316276, + "grad_norm": 2.344282865524292, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.718070924282074, + "num_tokens": 150335840.0, + "step": 6019 + }, + { + "epoch": 0.6611025697342412, + "grad_norm": 1.9200725555419922, + "learning_rate": 1e-06, + "loss": 1.0915, + "mean_token_accuracy": 0.6766758561134338, + "num_tokens": 150371163.0, + "step": 6020 + }, + { + "epoch": 0.6612123874368548, + "grad_norm": 1.9860782623291016, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7058920860290527, + "num_tokens": 150399325.0, + "step": 6021 + }, + { + "epoch": 0.6613222051394685, + "grad_norm": 2.2224221229553223, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.725929856300354, + "num_tokens": 150423297.0, + "step": 6022 + }, + { + "epoch": 0.6614320228420821, + "grad_norm": 2.2703661918640137, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7102664113044739, + "num_tokens": 150446863.0, + "step": 6023 + }, + { + "epoch": 0.6615418405446958, + "grad_norm": 2.1520514488220215, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7018308639526367, + "num_tokens": 150473738.0, + "step": 6024 + }, + { + "epoch": 0.6616516582473094, + "grad_norm": 2.248445749282837, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.727342426776886, + "num_tokens": 150496723.0, + "step": 6025 + }, + { + "epoch": 0.6617614759499231, + "grad_norm": 2.1363463401794434, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7183700203895569, + "num_tokens": 150522227.0, + "step": 6026 + }, + { + "epoch": 0.6618712936525368, + "grad_norm": 2.446805477142334, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7024936676025391, + "num_tokens": 150542740.0, + "step": 6027 + }, + { + "epoch": 0.6619811113551505, + "grad_norm": 2.408663034439087, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6954137086868286, + "num_tokens": 150565361.0, + "step": 6028 + }, + { + "epoch": 0.6620909290577641, + "grad_norm": 2.204662561416626, + "learning_rate": 1e-06, + "loss": 1.0938, + "mean_token_accuracy": 0.6742714643478394, + "num_tokens": 150592776.0, + "step": 6029 + }, + { + "epoch": 0.6622007467603778, + "grad_norm": 2.1902902126312256, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7008492946624756, + "num_tokens": 150618225.0, + "step": 6030 + }, + { + "epoch": 0.6623105644629914, + "grad_norm": 2.5766608715057373, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7283530235290527, + "num_tokens": 150636374.0, + "step": 6031 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 2.203458786010742, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7154287695884705, + "num_tokens": 150660208.0, + "step": 6032 + }, + { + "epoch": 0.6625301998682187, + "grad_norm": 2.273329019546509, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6916738748550415, + "num_tokens": 150683226.0, + "step": 6033 + }, + { + "epoch": 0.6626400175708325, + "grad_norm": 1.9121503829956055, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7088520526885986, + "num_tokens": 150716421.0, + "step": 6034 + }, + { + "epoch": 0.6627498352734461, + "grad_norm": 2.436373710632324, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7145534753799438, + "num_tokens": 150737657.0, + "step": 6035 + }, + { + "epoch": 0.6628596529760598, + "grad_norm": 2.110642671585083, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7270058989524841, + "num_tokens": 150763741.0, + "step": 6036 + }, + { + "epoch": 0.6629694706786734, + "grad_norm": 2.1915807723999023, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6965972781181335, + "num_tokens": 150789535.0, + "step": 6037 + }, + { + "epoch": 0.663079288381287, + "grad_norm": 2.0021891593933105, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7271003723144531, + "num_tokens": 150817944.0, + "step": 6038 + }, + { + "epoch": 0.6631891060839007, + "grad_norm": 2.3675456047058105, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7356892228126526, + "num_tokens": 150839482.0, + "step": 6039 + }, + { + "epoch": 0.6632989237865143, + "grad_norm": 2.2539305686950684, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6860583424568176, + "num_tokens": 150865479.0, + "step": 6040 + }, + { + "epoch": 0.6634087414891281, + "grad_norm": 2.6176207065582275, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7283309698104858, + "num_tokens": 150882390.0, + "step": 6041 + }, + { + "epoch": 0.6635185591917417, + "grad_norm": 2.0705807209014893, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7120438814163208, + "num_tokens": 150912822.0, + "step": 6042 + }, + { + "epoch": 0.6636283768943554, + "grad_norm": 2.712876319885254, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7326375246047974, + "num_tokens": 150930256.0, + "step": 6043 + }, + { + "epoch": 0.663738194596969, + "grad_norm": 2.5148582458496094, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.707042396068573, + "num_tokens": 150950730.0, + "step": 6044 + }, + { + "epoch": 0.6638480122995827, + "grad_norm": 2.5813956260681152, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7229068875312805, + "num_tokens": 150970103.0, + "step": 6045 + }, + { + "epoch": 0.6639578300021963, + "grad_norm": 2.14528489112854, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.7176122665405273, + "num_tokens": 150997024.0, + "step": 6046 + }, + { + "epoch": 0.66406764770481, + "grad_norm": 2.2792210578918457, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7183349132537842, + "num_tokens": 151020402.0, + "step": 6047 + }, + { + "epoch": 0.6641774654074237, + "grad_norm": 2.2728145122528076, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7097669839859009, + "num_tokens": 151044772.0, + "step": 6048 + }, + { + "epoch": 0.6642872831100374, + "grad_norm": 2.790992498397827, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.734730064868927, + "num_tokens": 151061709.0, + "step": 6049 + }, + { + "epoch": 0.664397100812651, + "grad_norm": 2.1569080352783203, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6927088499069214, + "num_tokens": 151087573.0, + "step": 6050 + }, + { + "epoch": 0.6645069185152647, + "grad_norm": 2.1911470890045166, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7145956754684448, + "num_tokens": 151113118.0, + "step": 6051 + }, + { + "epoch": 0.6646167362178783, + "grad_norm": 2.0155396461486816, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7013430595397949, + "num_tokens": 151142178.0, + "step": 6052 + }, + { + "epoch": 0.664726553920492, + "grad_norm": 2.1325836181640625, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7151306867599487, + "num_tokens": 151168163.0, + "step": 6053 + }, + { + "epoch": 0.6648363716231056, + "grad_norm": 2.1555426120758057, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7209596037864685, + "num_tokens": 151191198.0, + "step": 6054 + }, + { + "epoch": 0.6649461893257194, + "grad_norm": 2.2320404052734375, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6850357055664062, + "num_tokens": 151218086.0, + "step": 6055 + }, + { + "epoch": 0.665056007028333, + "grad_norm": 1.8609962463378906, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7000044584274292, + "num_tokens": 151253375.0, + "step": 6056 + }, + { + "epoch": 0.6651658247309467, + "grad_norm": 2.5048019886016846, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7258408069610596, + "num_tokens": 151271945.0, + "step": 6057 + }, + { + "epoch": 0.6652756424335603, + "grad_norm": 2.2866175174713135, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7239705324172974, + "num_tokens": 151292981.0, + "step": 6058 + }, + { + "epoch": 0.665385460136174, + "grad_norm": 2.2395877838134766, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7091437578201294, + "num_tokens": 151316357.0, + "step": 6059 + }, + { + "epoch": 0.6654952778387876, + "grad_norm": 2.2519569396972656, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7269529700279236, + "num_tokens": 151338563.0, + "step": 6060 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 2.0334103107452393, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7285764217376709, + "num_tokens": 151365489.0, + "step": 6061 + }, + { + "epoch": 0.6657149132440149, + "grad_norm": 1.9555689096450806, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6782505512237549, + "num_tokens": 151395969.0, + "step": 6062 + }, + { + "epoch": 0.6658247309466286, + "grad_norm": 2.1524367332458496, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.69070965051651, + "num_tokens": 151423191.0, + "step": 6063 + }, + { + "epoch": 0.6659345486492423, + "grad_norm": 2.3527350425720215, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7117010354995728, + "num_tokens": 151445481.0, + "step": 6064 + }, + { + "epoch": 0.6660443663518559, + "grad_norm": 2.4354231357574463, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7317621111869812, + "num_tokens": 151467099.0, + "step": 6065 + }, + { + "epoch": 0.6661541840544696, + "grad_norm": 2.1848256587982178, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7271727919578552, + "num_tokens": 151491227.0, + "step": 6066 + }, + { + "epoch": 0.6662640017570832, + "grad_norm": 2.3163070678710938, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7086279392242432, + "num_tokens": 151514547.0, + "step": 6067 + }, + { + "epoch": 0.6663738194596969, + "grad_norm": 2.4680545330047607, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7388339042663574, + "num_tokens": 151535196.0, + "step": 6068 + }, + { + "epoch": 0.6664836371623105, + "grad_norm": 2.384230613708496, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6882920265197754, + "num_tokens": 151557845.0, + "step": 6069 + }, + { + "epoch": 0.6665934548649243, + "grad_norm": 1.953126072883606, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.6975694298744202, + "num_tokens": 151587087.0, + "step": 6070 + }, + { + "epoch": 0.6667032725675379, + "grad_norm": 2.2301509380340576, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7299501895904541, + "num_tokens": 151610531.0, + "step": 6071 + }, + { + "epoch": 0.6668130902701516, + "grad_norm": 1.9560574293136597, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.695587158203125, + "num_tokens": 151642044.0, + "step": 6072 + }, + { + "epoch": 0.6669229079727652, + "grad_norm": 1.9244391918182373, + "learning_rate": 1e-06, + "loss": 1.1113, + "mean_token_accuracy": 0.6635639071464539, + "num_tokens": 151675912.0, + "step": 6073 + }, + { + "epoch": 0.6670327256753789, + "grad_norm": 2.2365171909332275, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7130274772644043, + "num_tokens": 151702186.0, + "step": 6074 + }, + { + "epoch": 0.6671425433779925, + "grad_norm": 2.268101453781128, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6959646940231323, + "num_tokens": 151726820.0, + "step": 6075 + }, + { + "epoch": 0.6672523610806061, + "grad_norm": 2.2420694828033447, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7121438980102539, + "num_tokens": 151751601.0, + "step": 6076 + }, + { + "epoch": 0.6673621787832199, + "grad_norm": 2.1019017696380615, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7158679366111755, + "num_tokens": 151778459.0, + "step": 6077 + }, + { + "epoch": 0.6674719964858336, + "grad_norm": 2.3988795280456543, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6956987380981445, + "num_tokens": 151801261.0, + "step": 6078 + }, + { + "epoch": 0.6675818141884472, + "grad_norm": 2.2946484088897705, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7059683799743652, + "num_tokens": 151824000.0, + "step": 6079 + }, + { + "epoch": 0.6676916318910608, + "grad_norm": 2.137146472930908, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6921184659004211, + "num_tokens": 151850460.0, + "step": 6080 + }, + { + "epoch": 0.6678014495936745, + "grad_norm": 2.338860511779785, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7152270078659058, + "num_tokens": 151872870.0, + "step": 6081 + }, + { + "epoch": 0.6679112672962881, + "grad_norm": 2.335456132888794, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7010016441345215, + "num_tokens": 151896886.0, + "step": 6082 + }, + { + "epoch": 0.6680210849989018, + "grad_norm": 2.7508132457733154, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7057163119316101, + "num_tokens": 151914680.0, + "step": 6083 + }, + { + "epoch": 0.6681309027015155, + "grad_norm": 2.3263790607452393, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7213668823242188, + "num_tokens": 151937629.0, + "step": 6084 + }, + { + "epoch": 0.6682407204041292, + "grad_norm": 2.11470890045166, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6979231834411621, + "num_tokens": 151965875.0, + "step": 6085 + }, + { + "epoch": 0.6683505381067428, + "grad_norm": 2.259920597076416, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.71532142162323, + "num_tokens": 151990291.0, + "step": 6086 + }, + { + "epoch": 0.6684603558093565, + "grad_norm": 1.8961793184280396, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7014409899711609, + "num_tokens": 152022121.0, + "step": 6087 + }, + { + "epoch": 0.6685701735119701, + "grad_norm": 2.050260066986084, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7151904702186584, + "num_tokens": 152049777.0, + "step": 6088 + }, + { + "epoch": 0.6686799912145838, + "grad_norm": 2.3407795429229736, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7125834226608276, + "num_tokens": 152071733.0, + "step": 6089 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 2.009765863418579, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6993096470832825, + "num_tokens": 152102794.0, + "step": 6090 + }, + { + "epoch": 0.6688996266198111, + "grad_norm": 2.0265402793884277, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6918009519577026, + "num_tokens": 152133292.0, + "step": 6091 + }, + { + "epoch": 0.6690094443224248, + "grad_norm": 2.2092137336730957, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7116780281066895, + "num_tokens": 152158404.0, + "step": 6092 + }, + { + "epoch": 0.6691192620250385, + "grad_norm": 2.329718589782715, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6868065595626831, + "num_tokens": 152181917.0, + "step": 6093 + }, + { + "epoch": 0.6692290797276521, + "grad_norm": 2.0227091312408447, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7216889262199402, + "num_tokens": 152209445.0, + "step": 6094 + }, + { + "epoch": 0.6693388974302658, + "grad_norm": 2.4313485622406006, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7342641949653625, + "num_tokens": 152229155.0, + "step": 6095 + }, + { + "epoch": 0.6694487151328794, + "grad_norm": 2.1283295154571533, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.687876284122467, + "num_tokens": 152258381.0, + "step": 6096 + }, + { + "epoch": 0.669558532835493, + "grad_norm": 2.0452685356140137, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7107679843902588, + "num_tokens": 152285978.0, + "step": 6097 + }, + { + "epoch": 0.6696683505381067, + "grad_norm": 2.274139165878296, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6963953971862793, + "num_tokens": 152315817.0, + "step": 6098 + }, + { + "epoch": 0.6697781682407205, + "grad_norm": 1.9896951913833618, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6885342001914978, + "num_tokens": 152346110.0, + "step": 6099 + }, + { + "epoch": 0.6698879859433341, + "grad_norm": 2.2879929542541504, + "learning_rate": 1e-06, + "loss": 1.0584, + "mean_token_accuracy": 0.6760833859443665, + "num_tokens": 152372136.0, + "step": 6100 + }, + { + "epoch": 0.6699978036459477, + "grad_norm": 2.0979011058807373, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.698574423789978, + "num_tokens": 152398567.0, + "step": 6101 + }, + { + "epoch": 0.6701076213485614, + "grad_norm": 2.516662836074829, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7274527549743652, + "num_tokens": 152417624.0, + "step": 6102 + }, + { + "epoch": 0.670217439051175, + "grad_norm": 2.3353633880615234, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7176454663276672, + "num_tokens": 152439529.0, + "step": 6103 + }, + { + "epoch": 0.6703272567537887, + "grad_norm": 2.1651275157928467, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7053806781768799, + "num_tokens": 152464657.0, + "step": 6104 + }, + { + "epoch": 0.6704370744564023, + "grad_norm": 2.500061273574829, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7420823574066162, + "num_tokens": 152484976.0, + "step": 6105 + }, + { + "epoch": 0.6705468921590161, + "grad_norm": 2.6680471897125244, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7228289842605591, + "num_tokens": 152501613.0, + "step": 6106 + }, + { + "epoch": 0.6706567098616297, + "grad_norm": 2.4038426876068115, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.720577597618103, + "num_tokens": 152523303.0, + "step": 6107 + }, + { + "epoch": 0.6707665275642434, + "grad_norm": 2.3232710361480713, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7044014930725098, + "num_tokens": 152546034.0, + "step": 6108 + }, + { + "epoch": 0.670876345266857, + "grad_norm": 2.2347681522369385, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6983283758163452, + "num_tokens": 152571152.0, + "step": 6109 + }, + { + "epoch": 0.6709861629694707, + "grad_norm": 2.563870906829834, + "learning_rate": 1e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6791836023330688, + "num_tokens": 152591650.0, + "step": 6110 + }, + { + "epoch": 0.6710959806720843, + "grad_norm": 2.151489019393921, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6980111598968506, + "num_tokens": 152618086.0, + "step": 6111 + }, + { + "epoch": 0.671205798374698, + "grad_norm": 2.1798460483551025, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7022136449813843, + "num_tokens": 152645990.0, + "step": 6112 + }, + { + "epoch": 0.6713156160773117, + "grad_norm": 2.171882152557373, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6938264966011047, + "num_tokens": 152672327.0, + "step": 6113 + }, + { + "epoch": 0.6714254337799254, + "grad_norm": 2.328468084335327, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7225050926208496, + "num_tokens": 152693812.0, + "step": 6114 + }, + { + "epoch": 0.671535251482539, + "grad_norm": 2.421943187713623, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7233596444129944, + "num_tokens": 152714895.0, + "step": 6115 + }, + { + "epoch": 0.6716450691851527, + "grad_norm": 2.2217392921447754, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7140418887138367, + "num_tokens": 152738328.0, + "step": 6116 + }, + { + "epoch": 0.6717548868877663, + "grad_norm": 2.5881152153015137, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7232334613800049, + "num_tokens": 152756900.0, + "step": 6117 + }, + { + "epoch": 0.67186470459038, + "grad_norm": 2.210083484649658, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7076468467712402, + "num_tokens": 152782599.0, + "step": 6118 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 2.1444151401519775, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7051838636398315, + "num_tokens": 152807404.0, + "step": 6119 + }, + { + "epoch": 0.6720843399956072, + "grad_norm": 1.9113106727600098, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7113136053085327, + "num_tokens": 152839219.0, + "step": 6120 + }, + { + "epoch": 0.672194157698221, + "grad_norm": 2.109935998916626, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.71211838722229, + "num_tokens": 152865504.0, + "step": 6121 + }, + { + "epoch": 0.6723039754008346, + "grad_norm": 2.178499221801758, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7178351879119873, + "num_tokens": 152889077.0, + "step": 6122 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 2.3792333602905273, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7034960985183716, + "num_tokens": 152912309.0, + "step": 6123 + }, + { + "epoch": 0.6725236108060619, + "grad_norm": 2.391404151916504, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7405911684036255, + "num_tokens": 152931516.0, + "step": 6124 + }, + { + "epoch": 0.6726334285086756, + "grad_norm": 2.1420364379882812, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7078538537025452, + "num_tokens": 152957833.0, + "step": 6125 + }, + { + "epoch": 0.6727432462112892, + "grad_norm": 2.577439308166504, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7268036603927612, + "num_tokens": 152976439.0, + "step": 6126 + }, + { + "epoch": 0.6728530639139029, + "grad_norm": 2.219029664993286, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7299102544784546, + "num_tokens": 153000041.0, + "step": 6127 + }, + { + "epoch": 0.6729628816165166, + "grad_norm": 2.383213758468628, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7316916584968567, + "num_tokens": 153021502.0, + "step": 6128 + }, + { + "epoch": 0.6730726993191303, + "grad_norm": 2.0496864318847656, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7075101733207703, + "num_tokens": 153049361.0, + "step": 6129 + }, + { + "epoch": 0.6731825170217439, + "grad_norm": 2.2741055488586426, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6960416436195374, + "num_tokens": 153074332.0, + "step": 6130 + }, + { + "epoch": 0.6732923347243576, + "grad_norm": 2.337239980697632, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7407512664794922, + "num_tokens": 153094498.0, + "step": 6131 + }, + { + "epoch": 0.6734021524269712, + "grad_norm": 2.401031017303467, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7024691104888916, + "num_tokens": 153117432.0, + "step": 6132 + }, + { + "epoch": 0.6735119701295849, + "grad_norm": 2.6210668087005615, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.720546543598175, + "num_tokens": 153136129.0, + "step": 6133 + }, + { + "epoch": 0.6736217878321985, + "grad_norm": 2.232184886932373, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6938775181770325, + "num_tokens": 153163202.0, + "step": 6134 + }, + { + "epoch": 0.6737316055348123, + "grad_norm": 2.073502540588379, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7193305492401123, + "num_tokens": 153190617.0, + "step": 6135 + }, + { + "epoch": 0.6738414232374259, + "grad_norm": 2.407752275466919, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7172022461891174, + "num_tokens": 153212360.0, + "step": 6136 + }, + { + "epoch": 0.6739512409400396, + "grad_norm": 2.479012966156006, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.687349796295166, + "num_tokens": 153233968.0, + "step": 6137 + }, + { + "epoch": 0.6740610586426532, + "grad_norm": 2.4468963146209717, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7273515462875366, + "num_tokens": 153255028.0, + "step": 6138 + }, + { + "epoch": 0.6741708763452668, + "grad_norm": 2.256995677947998, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6826565265655518, + "num_tokens": 153279288.0, + "step": 6139 + }, + { + "epoch": 0.6742806940478805, + "grad_norm": 2.379005193710327, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7054723501205444, + "num_tokens": 153301964.0, + "step": 6140 + }, + { + "epoch": 0.6743905117504941, + "grad_norm": 2.179389238357544, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7310006618499756, + "num_tokens": 153327842.0, + "step": 6141 + }, + { + "epoch": 0.6745003294531079, + "grad_norm": 2.1338696479797363, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6859650015830994, + "num_tokens": 153358209.0, + "step": 6142 + }, + { + "epoch": 0.6746101471557215, + "grad_norm": 2.1433255672454834, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6878708600997925, + "num_tokens": 153383274.0, + "step": 6143 + }, + { + "epoch": 0.6747199648583352, + "grad_norm": 2.4225680828094482, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7033701539039612, + "num_tokens": 153405678.0, + "step": 6144 + }, + { + "epoch": 0.6748297825609488, + "grad_norm": 2.464442253112793, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7326656579971313, + "num_tokens": 153425076.0, + "step": 6145 + }, + { + "epoch": 0.6749396002635625, + "grad_norm": 2.3735358715057373, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.709764838218689, + "num_tokens": 153447590.0, + "step": 6146 + }, + { + "epoch": 0.6750494179661761, + "grad_norm": 2.015174150466919, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7012014389038086, + "num_tokens": 153478347.0, + "step": 6147 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 2.332827091217041, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7027276754379272, + "num_tokens": 153500359.0, + "step": 6148 + }, + { + "epoch": 0.6752690533714034, + "grad_norm": 2.005176067352295, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7138410210609436, + "num_tokens": 153530352.0, + "step": 6149 + }, + { + "epoch": 0.6753788710740172, + "grad_norm": 2.3610846996307373, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7292814254760742, + "num_tokens": 153552691.0, + "step": 6150 + }, + { + "epoch": 0.6754886887766308, + "grad_norm": 2.3234031200408936, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6954871416091919, + "num_tokens": 153575976.0, + "step": 6151 + }, + { + "epoch": 0.6755985064792445, + "grad_norm": 1.9029831886291504, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7146236896514893, + "num_tokens": 153607500.0, + "step": 6152 + }, + { + "epoch": 0.6757083241818581, + "grad_norm": 2.2845513820648193, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7045347690582275, + "num_tokens": 153631022.0, + "step": 6153 + }, + { + "epoch": 0.6758181418844718, + "grad_norm": 1.9934018850326538, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.6945700645446777, + "num_tokens": 153660843.0, + "step": 6154 + }, + { + "epoch": 0.6759279595870854, + "grad_norm": 2.6070494651794434, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7165696024894714, + "num_tokens": 153679803.0, + "step": 6155 + }, + { + "epoch": 0.676037777289699, + "grad_norm": 2.0799832344055176, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7064064145088196, + "num_tokens": 153706522.0, + "step": 6156 + }, + { + "epoch": 0.6761475949923128, + "grad_norm": 2.4091198444366455, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7039129734039307, + "num_tokens": 153729123.0, + "step": 6157 + }, + { + "epoch": 0.6762574126949265, + "grad_norm": 1.9488413333892822, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6924310326576233, + "num_tokens": 153762271.0, + "step": 6158 + }, + { + "epoch": 0.6763672303975401, + "grad_norm": 2.4129068851470947, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7190344929695129, + "num_tokens": 153782335.0, + "step": 6159 + }, + { + "epoch": 0.6764770481001537, + "grad_norm": 2.463042974472046, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7148029804229736, + "num_tokens": 153802761.0, + "step": 6160 + }, + { + "epoch": 0.6765868658027674, + "grad_norm": 2.1999776363372803, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7010092735290527, + "num_tokens": 153827626.0, + "step": 6161 + }, + { + "epoch": 0.676696683505381, + "grad_norm": 1.9668372869491577, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7103111743927002, + "num_tokens": 153857714.0, + "step": 6162 + }, + { + "epoch": 0.6768065012079947, + "grad_norm": 2.3323965072631836, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7057265639305115, + "num_tokens": 153880315.0, + "step": 6163 + }, + { + "epoch": 0.6769163189106084, + "grad_norm": 2.3573174476623535, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7182177305221558, + "num_tokens": 153903538.0, + "step": 6164 + }, + { + "epoch": 0.6770261366132221, + "grad_norm": 2.5658679008483887, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.7008565068244934, + "num_tokens": 153923905.0, + "step": 6165 + }, + { + "epoch": 0.6771359543158357, + "grad_norm": 2.5714504718780518, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7085753679275513, + "num_tokens": 153944304.0, + "step": 6166 + }, + { + "epoch": 0.6772457720184494, + "grad_norm": 1.8986396789550781, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.716583251953125, + "num_tokens": 153977209.0, + "step": 6167 + }, + { + "epoch": 0.677355589721063, + "grad_norm": 2.0596907138824463, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6801031827926636, + "num_tokens": 154007526.0, + "step": 6168 + }, + { + "epoch": 0.6774654074236767, + "grad_norm": 1.9259849786758423, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6899000406265259, + "num_tokens": 154039884.0, + "step": 6169 + }, + { + "epoch": 0.6775752251262903, + "grad_norm": 2.260183334350586, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.728679895401001, + "num_tokens": 154062718.0, + "step": 6170 + }, + { + "epoch": 0.6776850428289041, + "grad_norm": 2.1460962295532227, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7167330980300903, + "num_tokens": 154087301.0, + "step": 6171 + }, + { + "epoch": 0.6777948605315177, + "grad_norm": 1.8652031421661377, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7035051584243774, + "num_tokens": 154120450.0, + "step": 6172 + }, + { + "epoch": 0.6779046782341314, + "grad_norm": 2.288278341293335, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6879849433898926, + "num_tokens": 154145039.0, + "step": 6173 + }, + { + "epoch": 0.678014495936745, + "grad_norm": 2.0814614295959473, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7072190642356873, + "num_tokens": 154172988.0, + "step": 6174 + }, + { + "epoch": 0.6781243136393587, + "grad_norm": 1.9873522520065308, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7106267213821411, + "num_tokens": 154204085.0, + "step": 6175 + }, + { + "epoch": 0.6782341313419723, + "grad_norm": 2.1760611534118652, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.725734531879425, + "num_tokens": 154228322.0, + "step": 6176 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 1.9206547737121582, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7045040726661682, + "num_tokens": 154259648.0, + "step": 6177 + }, + { + "epoch": 0.6784537667471996, + "grad_norm": 2.19181489944458, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7181615829467773, + "num_tokens": 154283514.0, + "step": 6178 + }, + { + "epoch": 0.6785635844498134, + "grad_norm": 2.197430372238159, + "learning_rate": 1e-06, + "loss": 1.0521, + "mean_token_accuracy": 0.6828727722167969, + "num_tokens": 154310942.0, + "step": 6179 + }, + { + "epoch": 0.678673402152427, + "grad_norm": 2.550015449523926, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7299250364303589, + "num_tokens": 154330304.0, + "step": 6180 + }, + { + "epoch": 0.6787832198550406, + "grad_norm": 1.9543843269348145, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7105035781860352, + "num_tokens": 154361036.0, + "step": 6181 + }, + { + "epoch": 0.6788930375576543, + "grad_norm": 2.2620534896850586, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7067705392837524, + "num_tokens": 154384562.0, + "step": 6182 + }, + { + "epoch": 0.6790028552602679, + "grad_norm": 2.1717591285705566, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7071051597595215, + "num_tokens": 154410385.0, + "step": 6183 + }, + { + "epoch": 0.6791126729628816, + "grad_norm": 2.364314556121826, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7259588241577148, + "num_tokens": 154431952.0, + "step": 6184 + }, + { + "epoch": 0.6792224906654952, + "grad_norm": 2.0401411056518555, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6925041675567627, + "num_tokens": 154462503.0, + "step": 6185 + }, + { + "epoch": 0.679332308368109, + "grad_norm": 2.147364377975464, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6844472885131836, + "num_tokens": 154491294.0, + "step": 6186 + }, + { + "epoch": 0.6794421260707226, + "grad_norm": 2.168442964553833, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7158621549606323, + "num_tokens": 154514753.0, + "step": 6187 + }, + { + "epoch": 0.6795519437733363, + "grad_norm": 2.2117862701416016, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7529865503311157, + "num_tokens": 154536619.0, + "step": 6188 + }, + { + "epoch": 0.6796617614759499, + "grad_norm": 2.144695997238159, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7177282571792603, + "num_tokens": 154563371.0, + "step": 6189 + }, + { + "epoch": 0.6797715791785636, + "grad_norm": 2.395509958267212, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7201187610626221, + "num_tokens": 154586383.0, + "step": 6190 + }, + { + "epoch": 0.6798813968811772, + "grad_norm": 2.5571961402893066, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.710680365562439, + "num_tokens": 154606461.0, + "step": 6191 + }, + { + "epoch": 0.6799912145837909, + "grad_norm": 2.3237950801849365, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7101632356643677, + "num_tokens": 154628735.0, + "step": 6192 + }, + { + "epoch": 0.6801010322864046, + "grad_norm": 2.2771966457366943, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.716017484664917, + "num_tokens": 154651862.0, + "step": 6193 + }, + { + "epoch": 0.6802108499890183, + "grad_norm": 2.192955255508423, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.6968791484832764, + "num_tokens": 154676703.0, + "step": 6194 + }, + { + "epoch": 0.6803206676916319, + "grad_norm": 2.079061269760132, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6981533765792847, + "num_tokens": 154704781.0, + "step": 6195 + }, + { + "epoch": 0.6804304853942456, + "grad_norm": 2.110724925994873, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6990426778793335, + "num_tokens": 154733846.0, + "step": 6196 + }, + { + "epoch": 0.6805403030968592, + "grad_norm": 2.1429286003112793, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.6983578205108643, + "num_tokens": 154758917.0, + "step": 6197 + }, + { + "epoch": 0.6806501207994728, + "grad_norm": 2.118553400039673, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.709587037563324, + "num_tokens": 154785129.0, + "step": 6198 + }, + { + "epoch": 0.6807599385020865, + "grad_norm": 2.8116073608398438, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7033025026321411, + "num_tokens": 154804134.0, + "step": 6199 + }, + { + "epoch": 0.6808697562047002, + "grad_norm": 2.257481575012207, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6961194276809692, + "num_tokens": 154828884.0, + "step": 6200 + }, + { + "epoch": 0.6809795739073139, + "grad_norm": 1.9685665369033813, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6895981431007385, + "num_tokens": 154859844.0, + "step": 6201 + }, + { + "epoch": 0.6810893916099275, + "grad_norm": 2.326718330383301, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7288795709609985, + "num_tokens": 154881951.0, + "step": 6202 + }, + { + "epoch": 0.6811992093125412, + "grad_norm": 2.1260087490081787, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6916786432266235, + "num_tokens": 154910537.0, + "step": 6203 + }, + { + "epoch": 0.6813090270151548, + "grad_norm": 2.203672170639038, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7118561267852783, + "num_tokens": 154935012.0, + "step": 6204 + }, + { + "epoch": 0.6814188447177685, + "grad_norm": 2.1863763332366943, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7050106525421143, + "num_tokens": 154958358.0, + "step": 6205 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 1.9845396280288696, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.68231201171875, + "num_tokens": 154987960.0, + "step": 6206 + }, + { + "epoch": 0.6816384801229959, + "grad_norm": 2.196542739868164, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6862348318099976, + "num_tokens": 155013305.0, + "step": 6207 + }, + { + "epoch": 0.6817482978256095, + "grad_norm": 2.1781044006347656, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7121889591217041, + "num_tokens": 155040418.0, + "step": 6208 + }, + { + "epoch": 0.6818581155282232, + "grad_norm": 2.1774144172668457, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7371847629547119, + "num_tokens": 155064768.0, + "step": 6209 + }, + { + "epoch": 0.6819679332308368, + "grad_norm": 2.1008870601654053, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6933465600013733, + "num_tokens": 155091321.0, + "step": 6210 + }, + { + "epoch": 0.6820777509334505, + "grad_norm": 2.1565561294555664, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6836022734642029, + "num_tokens": 155117597.0, + "step": 6211 + }, + { + "epoch": 0.6821875686360641, + "grad_norm": 2.2976818084716797, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6919568777084351, + "num_tokens": 155142019.0, + "step": 6212 + }, + { + "epoch": 0.6822973863386778, + "grad_norm": 2.3124053478240967, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6896165609359741, + "num_tokens": 155165712.0, + "step": 6213 + }, + { + "epoch": 0.6824072040412914, + "grad_norm": 2.4359452724456787, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7129126191139221, + "num_tokens": 155187053.0, + "step": 6214 + }, + { + "epoch": 0.6825170217439052, + "grad_norm": 2.4686527252197266, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7341135144233704, + "num_tokens": 155206082.0, + "step": 6215 + }, + { + "epoch": 0.6826268394465188, + "grad_norm": 2.6215593814849854, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7333556413650513, + "num_tokens": 155223703.0, + "step": 6216 + }, + { + "epoch": 0.6827366571491325, + "grad_norm": 2.5338215827941895, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7143118977546692, + "num_tokens": 155244666.0, + "step": 6217 + }, + { + "epoch": 0.6828464748517461, + "grad_norm": 2.0599777698516846, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.681240975856781, + "num_tokens": 155275135.0, + "step": 6218 + }, + { + "epoch": 0.6829562925543597, + "grad_norm": 2.4015536308288574, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7204331159591675, + "num_tokens": 155296708.0, + "step": 6219 + }, + { + "epoch": 0.6830661102569734, + "grad_norm": 2.341996192932129, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.702563464641571, + "num_tokens": 155319264.0, + "step": 6220 + }, + { + "epoch": 0.683175927959587, + "grad_norm": 1.9210625886917114, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6959973573684692, + "num_tokens": 155350803.0, + "step": 6221 + }, + { + "epoch": 0.6832857456622008, + "grad_norm": 2.1405439376831055, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7171781063079834, + "num_tokens": 155378502.0, + "step": 6222 + }, + { + "epoch": 0.6833955633648144, + "grad_norm": 2.385429620742798, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.718765377998352, + "num_tokens": 155399701.0, + "step": 6223 + }, + { + "epoch": 0.6835053810674281, + "grad_norm": 2.3768298625946045, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7152042388916016, + "num_tokens": 155422887.0, + "step": 6224 + }, + { + "epoch": 0.6836151987700417, + "grad_norm": 2.372063398361206, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7119537591934204, + "num_tokens": 155444511.0, + "step": 6225 + }, + { + "epoch": 0.6837250164726554, + "grad_norm": 2.2802999019622803, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7030237913131714, + "num_tokens": 155468508.0, + "step": 6226 + }, + { + "epoch": 0.683834834175269, + "grad_norm": 2.161180257797241, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6843419075012207, + "num_tokens": 155497108.0, + "step": 6227 + }, + { + "epoch": 0.6839446518778827, + "grad_norm": 2.2173523902893066, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7214667797088623, + "num_tokens": 155520225.0, + "step": 6228 + }, + { + "epoch": 0.6840544695804964, + "grad_norm": 2.09116268157959, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7108392715454102, + "num_tokens": 155547060.0, + "step": 6229 + }, + { + "epoch": 0.6841642872831101, + "grad_norm": 1.9890094995498657, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7185690999031067, + "num_tokens": 155575561.0, + "step": 6230 + }, + { + "epoch": 0.6842741049857237, + "grad_norm": 2.1643991470336914, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7222238183021545, + "num_tokens": 155602105.0, + "step": 6231 + }, + { + "epoch": 0.6843839226883374, + "grad_norm": 1.962032437324524, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6909266114234924, + "num_tokens": 155634662.0, + "step": 6232 + }, + { + "epoch": 0.684493740390951, + "grad_norm": 2.5864932537078857, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7289239168167114, + "num_tokens": 155653991.0, + "step": 6233 + }, + { + "epoch": 0.6846035580935647, + "grad_norm": 2.243166208267212, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7172631621360779, + "num_tokens": 155677247.0, + "step": 6234 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 2.4740896224975586, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7021748423576355, + "num_tokens": 155699722.0, + "step": 6235 + }, + { + "epoch": 0.6848231934987921, + "grad_norm": 2.220568895339966, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7186751365661621, + "num_tokens": 155722868.0, + "step": 6236 + }, + { + "epoch": 0.6849330112014057, + "grad_norm": 2.286803722381592, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7262994647026062, + "num_tokens": 155745024.0, + "step": 6237 + }, + { + "epoch": 0.6850428289040194, + "grad_norm": 2.1906888484954834, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7400089502334595, + "num_tokens": 155768189.0, + "step": 6238 + }, + { + "epoch": 0.685152646606633, + "grad_norm": 2.442258358001709, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7097794413566589, + "num_tokens": 155790241.0, + "step": 6239 + }, + { + "epoch": 0.6852624643092466, + "grad_norm": 2.3508782386779785, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7190494537353516, + "num_tokens": 155812992.0, + "step": 6240 + }, + { + "epoch": 0.6853722820118603, + "grad_norm": 2.036682605743408, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7128286361694336, + "num_tokens": 155840592.0, + "step": 6241 + }, + { + "epoch": 0.6854820997144739, + "grad_norm": 2.1145293712615967, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7070062160491943, + "num_tokens": 155866336.0, + "step": 6242 + }, + { + "epoch": 0.6855919174170876, + "grad_norm": 1.9419912099838257, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6933014392852783, + "num_tokens": 155897733.0, + "step": 6243 + }, + { + "epoch": 0.6857017351197013, + "grad_norm": 2.2269585132598877, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.729659914970398, + "num_tokens": 155921109.0, + "step": 6244 + }, + { + "epoch": 0.685811552822315, + "grad_norm": 2.291059732437134, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6817797422409058, + "num_tokens": 155945033.0, + "step": 6245 + }, + { + "epoch": 0.6859213705249286, + "grad_norm": 2.084397792816162, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7075464725494385, + "num_tokens": 155975072.0, + "step": 6246 + }, + { + "epoch": 0.6860311882275423, + "grad_norm": 2.3595879077911377, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7238211631774902, + "num_tokens": 155997087.0, + "step": 6247 + }, + { + "epoch": 0.6861410059301559, + "grad_norm": 2.133570909500122, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7320238351821899, + "num_tokens": 156021462.0, + "step": 6248 + }, + { + "epoch": 0.6862508236327696, + "grad_norm": 2.0463221073150635, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7029711604118347, + "num_tokens": 156048784.0, + "step": 6249 + }, + { + "epoch": 0.6863606413353832, + "grad_norm": 2.157761335372925, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7131224870681763, + "num_tokens": 156073164.0, + "step": 6250 + }, + { + "epoch": 0.686470459037997, + "grad_norm": 1.9510045051574707, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6792361736297607, + "num_tokens": 156107239.0, + "step": 6251 + }, + { + "epoch": 0.6865802767406106, + "grad_norm": 2.0385913848876953, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.696126401424408, + "num_tokens": 156136544.0, + "step": 6252 + }, + { + "epoch": 0.6866900944432243, + "grad_norm": 2.375898838043213, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7028810977935791, + "num_tokens": 156158147.0, + "step": 6253 + }, + { + "epoch": 0.6867999121458379, + "grad_norm": 2.21816349029541, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6769185662269592, + "num_tokens": 156185106.0, + "step": 6254 + }, + { + "epoch": 0.6869097298484516, + "grad_norm": 2.190593719482422, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.700313925743103, + "num_tokens": 156212249.0, + "step": 6255 + }, + { + "epoch": 0.6870195475510652, + "grad_norm": 2.2847204208374023, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7229993939399719, + "num_tokens": 156234707.0, + "step": 6256 + }, + { + "epoch": 0.6871293652536788, + "grad_norm": 2.1680667400360107, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7020649909973145, + "num_tokens": 156260406.0, + "step": 6257 + }, + { + "epoch": 0.6872391829562926, + "grad_norm": 1.9902122020721436, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.68453449010849, + "num_tokens": 156290054.0, + "step": 6258 + }, + { + "epoch": 0.6873490006589063, + "grad_norm": 1.9950292110443115, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.697437047958374, + "num_tokens": 156319840.0, + "step": 6259 + }, + { + "epoch": 0.6874588183615199, + "grad_norm": 2.247067928314209, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7376452684402466, + "num_tokens": 156343565.0, + "step": 6260 + }, + { + "epoch": 0.6875686360641335, + "grad_norm": 2.101780414581299, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6929289102554321, + "num_tokens": 156372050.0, + "step": 6261 + }, + { + "epoch": 0.6876784537667472, + "grad_norm": 2.352450132369995, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7119025588035583, + "num_tokens": 156394747.0, + "step": 6262 + }, + { + "epoch": 0.6877882714693608, + "grad_norm": 2.1344711780548096, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.714687705039978, + "num_tokens": 156420873.0, + "step": 6263 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 2.8284122943878174, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7443466186523438, + "num_tokens": 156438404.0, + "step": 6264 + }, + { + "epoch": 0.6880079068745882, + "grad_norm": 2.43638014793396, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7094582319259644, + "num_tokens": 156459214.0, + "step": 6265 + }, + { + "epoch": 0.6881177245772019, + "grad_norm": 2.0459959506988525, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.7070700526237488, + "num_tokens": 156489665.0, + "step": 6266 + }, + { + "epoch": 0.6882275422798155, + "grad_norm": 2.1564064025878906, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7246914505958557, + "num_tokens": 156514400.0, + "step": 6267 + }, + { + "epoch": 0.6883373599824292, + "grad_norm": 2.3946831226348877, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7193292379379272, + "num_tokens": 156537335.0, + "step": 6268 + }, + { + "epoch": 0.6884471776850428, + "grad_norm": 2.1917669773101807, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7046773433685303, + "num_tokens": 156562537.0, + "step": 6269 + }, + { + "epoch": 0.6885569953876565, + "grad_norm": 2.3038697242736816, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6870720386505127, + "num_tokens": 156586752.0, + "step": 6270 + }, + { + "epoch": 0.6886668130902701, + "grad_norm": 2.3331472873687744, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7162665724754333, + "num_tokens": 156609072.0, + "step": 6271 + }, + { + "epoch": 0.6887766307928838, + "grad_norm": 2.313668966293335, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.72319495677948, + "num_tokens": 156631811.0, + "step": 6272 + }, + { + "epoch": 0.6888864484954975, + "grad_norm": 1.909276008605957, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6960839033126831, + "num_tokens": 156663684.0, + "step": 6273 + }, + { + "epoch": 0.6889962661981112, + "grad_norm": 2.271389961242676, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7015706300735474, + "num_tokens": 156687209.0, + "step": 6274 + }, + { + "epoch": 0.6891060839007248, + "grad_norm": 2.3839359283447266, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.721067488193512, + "num_tokens": 156709818.0, + "step": 6275 + }, + { + "epoch": 0.6892159016033385, + "grad_norm": 1.9242479801177979, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7190701961517334, + "num_tokens": 156739149.0, + "step": 6276 + }, + { + "epoch": 0.6893257193059521, + "grad_norm": 2.0499584674835205, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6926591396331787, + "num_tokens": 156767553.0, + "step": 6277 + }, + { + "epoch": 0.6894355370085657, + "grad_norm": 2.0450353622436523, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7046921253204346, + "num_tokens": 156796182.0, + "step": 6278 + }, + { + "epoch": 0.6895453547111794, + "grad_norm": 2.0415797233581543, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6992214918136597, + "num_tokens": 156826911.0, + "step": 6279 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.289106607437134, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7159780263900757, + "num_tokens": 156848963.0, + "step": 6280 + }, + { + "epoch": 0.6897649901164068, + "grad_norm": 2.560150623321533, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7052114009857178, + "num_tokens": 156868984.0, + "step": 6281 + }, + { + "epoch": 0.6898748078190204, + "grad_norm": 2.1138858795166016, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7059733867645264, + "num_tokens": 156894157.0, + "step": 6282 + }, + { + "epoch": 0.6899846255216341, + "grad_norm": 1.7783823013305664, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.683190643787384, + "num_tokens": 156933397.0, + "step": 6283 + }, + { + "epoch": 0.6900944432242477, + "grad_norm": 2.478003978729248, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7337132096290588, + "num_tokens": 156952881.0, + "step": 6284 + }, + { + "epoch": 0.6902042609268614, + "grad_norm": 2.522721767425537, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7244284749031067, + "num_tokens": 156973169.0, + "step": 6285 + }, + { + "epoch": 0.690314078629475, + "grad_norm": 2.1799190044403076, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7109426856040955, + "num_tokens": 156996729.0, + "step": 6286 + }, + { + "epoch": 0.6904238963320888, + "grad_norm": 2.1033427715301514, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7232306003570557, + "num_tokens": 157024412.0, + "step": 6287 + }, + { + "epoch": 0.6905337140347024, + "grad_norm": 2.2662832736968994, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7093428373336792, + "num_tokens": 157046288.0, + "step": 6288 + }, + { + "epoch": 0.6906435317373161, + "grad_norm": 2.326624870300293, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7130237817764282, + "num_tokens": 157070888.0, + "step": 6289 + }, + { + "epoch": 0.6907533494399297, + "grad_norm": 1.8459858894348145, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7132495641708374, + "num_tokens": 157103214.0, + "step": 6290 + }, + { + "epoch": 0.6908631671425434, + "grad_norm": 2.1318249702453613, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.708629310131073, + "num_tokens": 157131499.0, + "step": 6291 + }, + { + "epoch": 0.690972984845157, + "grad_norm": 1.9911280870437622, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6862746477127075, + "num_tokens": 157162991.0, + "step": 6292 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 2.251605987548828, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7293840050697327, + "num_tokens": 157186261.0, + "step": 6293 + }, + { + "epoch": 0.6911926202503844, + "grad_norm": 2.280109405517578, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7158632278442383, + "num_tokens": 157211209.0, + "step": 6294 + }, + { + "epoch": 0.6913024379529981, + "grad_norm": 2.3578593730926514, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7180018424987793, + "num_tokens": 157232108.0, + "step": 6295 + }, + { + "epoch": 0.6914122556556117, + "grad_norm": 2.374485492706299, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7107180953025818, + "num_tokens": 157255548.0, + "step": 6296 + }, + { + "epoch": 0.6915220733582254, + "grad_norm": 2.254831552505493, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7228805422782898, + "num_tokens": 157278844.0, + "step": 6297 + }, + { + "epoch": 0.691631891060839, + "grad_norm": 2.3308064937591553, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7118249535560608, + "num_tokens": 157302506.0, + "step": 6298 + }, + { + "epoch": 0.6917417087634526, + "grad_norm": 2.070781946182251, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7381804585456848, + "num_tokens": 157329101.0, + "step": 6299 + }, + { + "epoch": 0.6918515264660663, + "grad_norm": 2.2690975666046143, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7009081244468689, + "num_tokens": 157354215.0, + "step": 6300 + }, + { + "epoch": 0.6919613441686799, + "grad_norm": 1.766615390777588, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7139899730682373, + "num_tokens": 157392991.0, + "step": 6301 + }, + { + "epoch": 0.6920711618712937, + "grad_norm": 2.529823064804077, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.733599066734314, + "num_tokens": 157413435.0, + "step": 6302 + }, + { + "epoch": 0.6921809795739073, + "grad_norm": 2.2013227939605713, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.718302309513092, + "num_tokens": 157438046.0, + "step": 6303 + }, + { + "epoch": 0.692290797276521, + "grad_norm": 1.846869945526123, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6984952688217163, + "num_tokens": 157470638.0, + "step": 6304 + }, + { + "epoch": 0.6924006149791346, + "grad_norm": 2.046640634536743, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7253894805908203, + "num_tokens": 157496779.0, + "step": 6305 + }, + { + "epoch": 0.6925104326817483, + "grad_norm": 2.203429937362671, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7207728028297424, + "num_tokens": 157522565.0, + "step": 6306 + }, + { + "epoch": 0.6926202503843619, + "grad_norm": 2.2251999378204346, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7157599329948425, + "num_tokens": 157548897.0, + "step": 6307 + }, + { + "epoch": 0.6927300680869756, + "grad_norm": 2.635012149810791, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7126315832138062, + "num_tokens": 157567517.0, + "step": 6308 + }, + { + "epoch": 0.6928398857895893, + "grad_norm": 2.226092576980591, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7062776684761047, + "num_tokens": 157593329.0, + "step": 6309 + }, + { + "epoch": 0.692949703492203, + "grad_norm": 2.573392391204834, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6899824142456055, + "num_tokens": 157614884.0, + "step": 6310 + }, + { + "epoch": 0.6930595211948166, + "grad_norm": 2.5868754386901855, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7220995426177979, + "num_tokens": 157633190.0, + "step": 6311 + }, + { + "epoch": 0.6931693388974303, + "grad_norm": 2.161548376083374, + "learning_rate": 1e-06, + "loss": 1.0996, + "mean_token_accuracy": 0.6813631057739258, + "num_tokens": 157660677.0, + "step": 6312 + }, + { + "epoch": 0.6932791566000439, + "grad_norm": 2.2777466773986816, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7095869779586792, + "num_tokens": 157684276.0, + "step": 6313 + }, + { + "epoch": 0.6933889743026576, + "grad_norm": 2.2642879486083984, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7109087109565735, + "num_tokens": 157709110.0, + "step": 6314 + }, + { + "epoch": 0.6934987920052712, + "grad_norm": 2.179262399673462, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6974741816520691, + "num_tokens": 157734205.0, + "step": 6315 + }, + { + "epoch": 0.693608609707885, + "grad_norm": 2.194382905960083, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6990877985954285, + "num_tokens": 157760191.0, + "step": 6316 + }, + { + "epoch": 0.6937184274104986, + "grad_norm": 2.2230026721954346, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6867225170135498, + "num_tokens": 157784495.0, + "step": 6317 + }, + { + "epoch": 0.6938282451131123, + "grad_norm": 2.1067323684692383, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7006189823150635, + "num_tokens": 157810894.0, + "step": 6318 + }, + { + "epoch": 0.6939380628157259, + "grad_norm": 2.039695978164673, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6934677362442017, + "num_tokens": 157841378.0, + "step": 6319 + }, + { + "epoch": 0.6940478805183395, + "grad_norm": 2.3677024841308594, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6987959146499634, + "num_tokens": 157864626.0, + "step": 6320 + }, + { + "epoch": 0.6941576982209532, + "grad_norm": 2.2729077339172363, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7121771574020386, + "num_tokens": 157889986.0, + "step": 6321 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 2.2191720008850098, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7097747325897217, + "num_tokens": 157914181.0, + "step": 6322 + }, + { + "epoch": 0.6943773336261806, + "grad_norm": 2.290858268737793, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7284099459648132, + "num_tokens": 157935398.0, + "step": 6323 + }, + { + "epoch": 0.6944871513287942, + "grad_norm": 2.4355273246765137, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7144330739974976, + "num_tokens": 157957575.0, + "step": 6324 + }, + { + "epoch": 0.6945969690314079, + "grad_norm": 2.103708505630493, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.740784764289856, + "num_tokens": 157983769.0, + "step": 6325 + }, + { + "epoch": 0.6947067867340215, + "grad_norm": 2.3770153522491455, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7085704207420349, + "num_tokens": 158007415.0, + "step": 6326 + }, + { + "epoch": 0.6948166044366352, + "grad_norm": 2.1734464168548584, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7026599645614624, + "num_tokens": 158033524.0, + "step": 6327 + }, + { + "epoch": 0.6949264221392488, + "grad_norm": 2.2780039310455322, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7118701934814453, + "num_tokens": 158057836.0, + "step": 6328 + }, + { + "epoch": 0.6950362398418625, + "grad_norm": 2.1346795558929443, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6947081089019775, + "num_tokens": 158084859.0, + "step": 6329 + }, + { + "epoch": 0.6951460575444761, + "grad_norm": 2.0102195739746094, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6964200139045715, + "num_tokens": 158113189.0, + "step": 6330 + }, + { + "epoch": 0.6952558752470899, + "grad_norm": 2.0378119945526123, + "learning_rate": 1e-06, + "loss": 1.1328, + "mean_token_accuracy": 0.6651955842971802, + "num_tokens": 158144559.0, + "step": 6331 + }, + { + "epoch": 0.6953656929497035, + "grad_norm": 2.2040534019470215, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7055819034576416, + "num_tokens": 158169799.0, + "step": 6332 + }, + { + "epoch": 0.6954755106523172, + "grad_norm": 2.0858118534088135, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6877202391624451, + "num_tokens": 158198634.0, + "step": 6333 + }, + { + "epoch": 0.6955853283549308, + "grad_norm": 2.2732958793640137, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7280755043029785, + "num_tokens": 158223198.0, + "step": 6334 + }, + { + "epoch": 0.6956951460575445, + "grad_norm": 2.267099618911743, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6823487281799316, + "num_tokens": 158248870.0, + "step": 6335 + }, + { + "epoch": 0.6958049637601581, + "grad_norm": 2.221889019012451, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.718761682510376, + "num_tokens": 158272095.0, + "step": 6336 + }, + { + "epoch": 0.6959147814627717, + "grad_norm": 2.166106939315796, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6860613822937012, + "num_tokens": 158300331.0, + "step": 6337 + }, + { + "epoch": 0.6960245991653855, + "grad_norm": 2.3691418170928955, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7241645455360413, + "num_tokens": 158322497.0, + "step": 6338 + }, + { + "epoch": 0.6961344168679992, + "grad_norm": 2.2592878341674805, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7290312051773071, + "num_tokens": 158343744.0, + "step": 6339 + }, + { + "epoch": 0.6962442345706128, + "grad_norm": 2.103208065032959, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6760233640670776, + "num_tokens": 158372928.0, + "step": 6340 + }, + { + "epoch": 0.6963540522732264, + "grad_norm": 1.97426438331604, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7199972867965698, + "num_tokens": 158402223.0, + "step": 6341 + }, + { + "epoch": 0.6964638699758401, + "grad_norm": 2.167285442352295, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.7006152272224426, + "num_tokens": 158427353.0, + "step": 6342 + }, + { + "epoch": 0.6965736876784537, + "grad_norm": 2.220186710357666, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7133437395095825, + "num_tokens": 158451315.0, + "step": 6343 + }, + { + "epoch": 0.6966835053810674, + "grad_norm": 2.6988773345947266, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7246876955032349, + "num_tokens": 158468889.0, + "step": 6344 + }, + { + "epoch": 0.6967933230836811, + "grad_norm": 2.1197879314422607, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6979719400405884, + "num_tokens": 158495612.0, + "step": 6345 + }, + { + "epoch": 0.6969031407862948, + "grad_norm": 2.5731825828552246, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7167193293571472, + "num_tokens": 158515258.0, + "step": 6346 + }, + { + "epoch": 0.6970129584889084, + "grad_norm": 2.290907382965088, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6825610399246216, + "num_tokens": 158540209.0, + "step": 6347 + }, + { + "epoch": 0.6971227761915221, + "grad_norm": 2.2528469562530518, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7067041993141174, + "num_tokens": 158565884.0, + "step": 6348 + }, + { + "epoch": 0.6972325938941357, + "grad_norm": 2.2444581985473633, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7269038558006287, + "num_tokens": 158591174.0, + "step": 6349 + }, + { + "epoch": 0.6973424115967494, + "grad_norm": 2.403578758239746, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.6975107192993164, + "num_tokens": 158612553.0, + "step": 6350 + }, + { + "epoch": 0.697452229299363, + "grad_norm": 2.25956654548645, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7010517120361328, + "num_tokens": 158636608.0, + "step": 6351 + }, + { + "epoch": 0.6975620470019768, + "grad_norm": 2.114910125732422, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6921448707580566, + "num_tokens": 158664863.0, + "step": 6352 + }, + { + "epoch": 0.6976718647045904, + "grad_norm": 2.2934937477111816, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7281043529510498, + "num_tokens": 158688289.0, + "step": 6353 + }, + { + "epoch": 0.6977816824072041, + "grad_norm": 2.5798637866973877, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7176223993301392, + "num_tokens": 158706913.0, + "step": 6354 + }, + { + "epoch": 0.6978915001098177, + "grad_norm": 2.23652720451355, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7058444023132324, + "num_tokens": 158733448.0, + "step": 6355 + }, + { + "epoch": 0.6980013178124314, + "grad_norm": 2.312211513519287, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7176546454429626, + "num_tokens": 158756223.0, + "step": 6356 + }, + { + "epoch": 0.698111135515045, + "grad_norm": 2.2089011669158936, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7170963287353516, + "num_tokens": 158780744.0, + "step": 6357 + }, + { + "epoch": 0.6982209532176586, + "grad_norm": 2.1419339179992676, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.703711748123169, + "num_tokens": 158808745.0, + "step": 6358 + }, + { + "epoch": 0.6983307709202724, + "grad_norm": 2.331312417984009, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7003494501113892, + "num_tokens": 158832537.0, + "step": 6359 + }, + { + "epoch": 0.698440588622886, + "grad_norm": 2.2739806175231934, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7047969102859497, + "num_tokens": 158856904.0, + "step": 6360 + }, + { + "epoch": 0.6985504063254997, + "grad_norm": 2.4059207439422607, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7240942716598511, + "num_tokens": 158879244.0, + "step": 6361 + }, + { + "epoch": 0.6986602240281133, + "grad_norm": 2.1178340911865234, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7189594507217407, + "num_tokens": 158908004.0, + "step": 6362 + }, + { + "epoch": 0.698770041730727, + "grad_norm": 2.2693278789520264, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7218064069747925, + "num_tokens": 158932697.0, + "step": 6363 + }, + { + "epoch": 0.6988798594333406, + "grad_norm": 2.1502976417541504, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6938067078590393, + "num_tokens": 158960088.0, + "step": 6364 + }, + { + "epoch": 0.6989896771359543, + "grad_norm": 2.1423957347869873, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7099995017051697, + "num_tokens": 158984576.0, + "step": 6365 + }, + { + "epoch": 0.6990994948385679, + "grad_norm": 2.4782960414886475, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7218422889709473, + "num_tokens": 159005315.0, + "step": 6366 + }, + { + "epoch": 0.6992093125411817, + "grad_norm": 2.345046281814575, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.69928377866745, + "num_tokens": 159029245.0, + "step": 6367 + }, + { + "epoch": 0.6993191302437953, + "grad_norm": 2.1736950874328613, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7241246700286865, + "num_tokens": 159053079.0, + "step": 6368 + }, + { + "epoch": 0.699428947946409, + "grad_norm": 2.231257200241089, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.717094898223877, + "num_tokens": 159079194.0, + "step": 6369 + }, + { + "epoch": 0.6995387656490226, + "grad_norm": 2.1621992588043213, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7288627624511719, + "num_tokens": 159105665.0, + "step": 6370 + }, + { + "epoch": 0.6996485833516363, + "grad_norm": 1.9549850225448608, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6818288564682007, + "num_tokens": 159137682.0, + "step": 6371 + }, + { + "epoch": 0.6997584010542499, + "grad_norm": 2.428112745285034, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7145316004753113, + "num_tokens": 159159385.0, + "step": 6372 + }, + { + "epoch": 0.6998682187568636, + "grad_norm": 2.763021230697632, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7365459203720093, + "num_tokens": 159176845.0, + "step": 6373 + }, + { + "epoch": 0.6999780364594773, + "grad_norm": 2.1580886840820312, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7119992971420288, + "num_tokens": 159201533.0, + "step": 6374 + }, + { + "epoch": 0.700087854162091, + "grad_norm": 2.092329263687134, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.702073872089386, + "num_tokens": 159227751.0, + "step": 6375 + }, + { + "epoch": 0.7001976718647046, + "grad_norm": 2.482272148132324, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.715829610824585, + "num_tokens": 159247197.0, + "step": 6376 + }, + { + "epoch": 0.7003074895673183, + "grad_norm": 2.7145047187805176, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7122624516487122, + "num_tokens": 159265197.0, + "step": 6377 + }, + { + "epoch": 0.7004173072699319, + "grad_norm": 2.251612663269043, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.712511420249939, + "num_tokens": 159288632.0, + "step": 6378 + }, + { + "epoch": 0.7005271249725455, + "grad_norm": 1.993036150932312, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7075831890106201, + "num_tokens": 159321595.0, + "step": 6379 + }, + { + "epoch": 0.7006369426751592, + "grad_norm": 2.1936047077178955, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6877896189689636, + "num_tokens": 159348552.0, + "step": 6380 + }, + { + "epoch": 0.700746760377773, + "grad_norm": 2.0601940155029297, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7555899620056152, + "num_tokens": 159377961.0, + "step": 6381 + }, + { + "epoch": 0.7008565780803866, + "grad_norm": 2.3958396911621094, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7071593999862671, + "num_tokens": 159400065.0, + "step": 6382 + }, + { + "epoch": 0.7009663957830002, + "grad_norm": 2.0499472618103027, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6952583193778992, + "num_tokens": 159428295.0, + "step": 6383 + }, + { + "epoch": 0.7010762134856139, + "grad_norm": 2.118802785873413, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7011953592300415, + "num_tokens": 159456238.0, + "step": 6384 + }, + { + "epoch": 0.7011860311882275, + "grad_norm": 2.3369057178497314, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6947935223579407, + "num_tokens": 159483611.0, + "step": 6385 + }, + { + "epoch": 0.7012958488908412, + "grad_norm": 2.247189521789551, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.6994370222091675, + "num_tokens": 159506434.0, + "step": 6386 + }, + { + "epoch": 0.7014056665934548, + "grad_norm": 2.205089807510376, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7105216383934021, + "num_tokens": 159530394.0, + "step": 6387 + }, + { + "epoch": 0.7015154842960686, + "grad_norm": 2.457170009613037, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.715926468372345, + "num_tokens": 159552134.0, + "step": 6388 + }, + { + "epoch": 0.7016253019986822, + "grad_norm": 2.3970401287078857, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7168518304824829, + "num_tokens": 159574136.0, + "step": 6389 + }, + { + "epoch": 0.7017351197012959, + "grad_norm": 2.347768545150757, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7381080389022827, + "num_tokens": 159596791.0, + "step": 6390 + }, + { + "epoch": 0.7018449374039095, + "grad_norm": 2.0267624855041504, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7232563495635986, + "num_tokens": 159625115.0, + "step": 6391 + }, + { + "epoch": 0.7019547551065232, + "grad_norm": 1.9794589281082153, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6940621137619019, + "num_tokens": 159655859.0, + "step": 6392 + }, + { + "epoch": 0.7020645728091368, + "grad_norm": 2.1516785621643066, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7082912921905518, + "num_tokens": 159682189.0, + "step": 6393 + }, + { + "epoch": 0.7021743905117505, + "grad_norm": 2.1482839584350586, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7028040885925293, + "num_tokens": 159709227.0, + "step": 6394 + }, + { + "epoch": 0.7022842082143641, + "grad_norm": 2.0923407077789307, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7003002166748047, + "num_tokens": 159735897.0, + "step": 6395 + }, + { + "epoch": 0.7023940259169779, + "grad_norm": 2.1953206062316895, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6937281489372253, + "num_tokens": 159762795.0, + "step": 6396 + }, + { + "epoch": 0.7025038436195915, + "grad_norm": 2.054837465286255, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7070299386978149, + "num_tokens": 159789525.0, + "step": 6397 + }, + { + "epoch": 0.7026136613222052, + "grad_norm": 1.889779806137085, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6995221972465515, + "num_tokens": 159823541.0, + "step": 6398 + }, + { + "epoch": 0.7027234790248188, + "grad_norm": 2.40378999710083, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7140548229217529, + "num_tokens": 159844908.0, + "step": 6399 + }, + { + "epoch": 0.7028332967274324, + "grad_norm": 2.1427299976348877, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7135004997253418, + "num_tokens": 159869347.0, + "step": 6400 + }, + { + "epoch": 0.7029431144300461, + "grad_norm": 2.3332173824310303, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6896159648895264, + "num_tokens": 159892419.0, + "step": 6401 + }, + { + "epoch": 0.7030529321326597, + "grad_norm": 2.0605580806732178, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7075755596160889, + "num_tokens": 159919919.0, + "step": 6402 + }, + { + "epoch": 0.7031627498352735, + "grad_norm": 2.7010040283203125, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7246845960617065, + "num_tokens": 159937098.0, + "step": 6403 + }, + { + "epoch": 0.7032725675378871, + "grad_norm": 2.252793550491333, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6945976614952087, + "num_tokens": 159962815.0, + "step": 6404 + }, + { + "epoch": 0.7033823852405008, + "grad_norm": 2.217590093612671, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7174556255340576, + "num_tokens": 159987325.0, + "step": 6405 + }, + { + "epoch": 0.7034922029431144, + "grad_norm": 2.5113375186920166, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7106449604034424, + "num_tokens": 160006345.0, + "step": 6406 + }, + { + "epoch": 0.7036020206457281, + "grad_norm": 2.2418127059936523, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7033201456069946, + "num_tokens": 160030157.0, + "step": 6407 + }, + { + "epoch": 0.7037118383483417, + "grad_norm": 2.3662421703338623, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7123306393623352, + "num_tokens": 160051683.0, + "step": 6408 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 2.1917316913604736, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6924176216125488, + "num_tokens": 160077467.0, + "step": 6409 + }, + { + "epoch": 0.7039314737535691, + "grad_norm": 2.106720447540283, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7050171494483948, + "num_tokens": 160103779.0, + "step": 6410 + }, + { + "epoch": 0.7040412914561828, + "grad_norm": 2.477064371109009, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7153030633926392, + "num_tokens": 160125135.0, + "step": 6411 + }, + { + "epoch": 0.7041511091587964, + "grad_norm": 2.186955213546753, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7244135141372681, + "num_tokens": 160150257.0, + "step": 6412 + }, + { + "epoch": 0.7042609268614101, + "grad_norm": 2.2647132873535156, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7356720566749573, + "num_tokens": 160172719.0, + "step": 6413 + }, + { + "epoch": 0.7043707445640237, + "grad_norm": 2.228569507598877, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7242949604988098, + "num_tokens": 160195792.0, + "step": 6414 + }, + { + "epoch": 0.7044805622666374, + "grad_norm": 2.139408826828003, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7206213474273682, + "num_tokens": 160222166.0, + "step": 6415 + }, + { + "epoch": 0.704590379969251, + "grad_norm": 2.2237417697906494, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6984701156616211, + "num_tokens": 160246681.0, + "step": 6416 + }, + { + "epoch": 0.7047001976718648, + "grad_norm": 1.8173309564590454, + "learning_rate": 1e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.6706756949424744, + "num_tokens": 160284262.0, + "step": 6417 + }, + { + "epoch": 0.7048100153744784, + "grad_norm": 2.362910509109497, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7056461572647095, + "num_tokens": 160306727.0, + "step": 6418 + }, + { + "epoch": 0.704919833077092, + "grad_norm": 2.358427047729492, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7158141136169434, + "num_tokens": 160327715.0, + "step": 6419 + }, + { + "epoch": 0.7050296507797057, + "grad_norm": 2.5854249000549316, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6855065822601318, + "num_tokens": 160349896.0, + "step": 6420 + }, + { + "epoch": 0.7051394684823193, + "grad_norm": 2.2759199142456055, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7011279463768005, + "num_tokens": 160375291.0, + "step": 6421 + }, + { + "epoch": 0.705249286184933, + "grad_norm": 2.051295518875122, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7180553674697876, + "num_tokens": 160405026.0, + "step": 6422 + }, + { + "epoch": 0.7053591038875466, + "grad_norm": 2.2634458541870117, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7114285230636597, + "num_tokens": 160430739.0, + "step": 6423 + }, + { + "epoch": 0.7054689215901603, + "grad_norm": 2.4196605682373047, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7117303609848022, + "num_tokens": 160452229.0, + "step": 6424 + }, + { + "epoch": 0.705578739292774, + "grad_norm": 2.4687719345092773, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6960679888725281, + "num_tokens": 160472970.0, + "step": 6425 + }, + { + "epoch": 0.7056885569953877, + "grad_norm": 2.332961082458496, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7205681204795837, + "num_tokens": 160494139.0, + "step": 6426 + }, + { + "epoch": 0.7057983746980013, + "grad_norm": 2.0594773292541504, + "learning_rate": 1e-06, + "loss": 1.0582, + "mean_token_accuracy": 0.680776834487915, + "num_tokens": 160523931.0, + "step": 6427 + }, + { + "epoch": 0.705908192400615, + "grad_norm": 2.074357032775879, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.7041813135147095, + "num_tokens": 160553267.0, + "step": 6428 + }, + { + "epoch": 0.7060180101032286, + "grad_norm": 2.1882784366607666, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7032897472381592, + "num_tokens": 160576923.0, + "step": 6429 + }, + { + "epoch": 0.7061278278058423, + "grad_norm": 2.203644275665283, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7336151599884033, + "num_tokens": 160601170.0, + "step": 6430 + }, + { + "epoch": 0.7062376455084559, + "grad_norm": 2.1053123474121094, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7289893627166748, + "num_tokens": 160625847.0, + "step": 6431 + }, + { + "epoch": 0.7063474632110697, + "grad_norm": 2.128127098083496, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7059658765792847, + "num_tokens": 160654411.0, + "step": 6432 + }, + { + "epoch": 0.7064572809136833, + "grad_norm": 2.2142724990844727, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6990176439285278, + "num_tokens": 160678933.0, + "step": 6433 + }, + { + "epoch": 0.706567098616297, + "grad_norm": 2.070866823196411, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.7065197229385376, + "num_tokens": 160706585.0, + "step": 6434 + }, + { + "epoch": 0.7066769163189106, + "grad_norm": 2.3738577365875244, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7040500640869141, + "num_tokens": 160728721.0, + "step": 6435 + }, + { + "epoch": 0.7067867340215243, + "grad_norm": 1.9902251958847046, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6849104166030884, + "num_tokens": 160761437.0, + "step": 6436 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 2.04710054397583, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6957499384880066, + "num_tokens": 160791338.0, + "step": 6437 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 2.432114839553833, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6862483620643616, + "num_tokens": 160815027.0, + "step": 6438 + }, + { + "epoch": 0.7071161871293653, + "grad_norm": 2.2170462608337402, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6903231739997864, + "num_tokens": 160841081.0, + "step": 6439 + }, + { + "epoch": 0.707226004831979, + "grad_norm": 1.9316169023513794, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7068324089050293, + "num_tokens": 160871790.0, + "step": 6440 + }, + { + "epoch": 0.7073358225345926, + "grad_norm": 2.282874584197998, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.743328332901001, + "num_tokens": 160892190.0, + "step": 6441 + }, + { + "epoch": 0.7074456402372062, + "grad_norm": 2.0895872116088867, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7133138179779053, + "num_tokens": 160918197.0, + "step": 6442 + }, + { + "epoch": 0.7075554579398199, + "grad_norm": 2.3834168910980225, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7033064365386963, + "num_tokens": 160940703.0, + "step": 6443 + }, + { + "epoch": 0.7076652756424335, + "grad_norm": 2.146949291229248, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6971744298934937, + "num_tokens": 160967130.0, + "step": 6444 + }, + { + "epoch": 0.7077750933450472, + "grad_norm": 2.389716863632202, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6911695003509521, + "num_tokens": 160989181.0, + "step": 6445 + }, + { + "epoch": 0.7078849110476609, + "grad_norm": 2.394826650619507, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6961919069290161, + "num_tokens": 161011365.0, + "step": 6446 + }, + { + "epoch": 0.7079947287502746, + "grad_norm": 1.9313632249832153, + "learning_rate": 1e-06, + "loss": 1.0654, + "mean_token_accuracy": 0.6812623143196106, + "num_tokens": 161045203.0, + "step": 6447 + }, + { + "epoch": 0.7081045464528882, + "grad_norm": 1.97152841091156, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.6841782927513123, + "num_tokens": 161078307.0, + "step": 6448 + }, + { + "epoch": 0.7082143641555019, + "grad_norm": 2.387219190597534, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.739905834197998, + "num_tokens": 161098971.0, + "step": 6449 + }, + { + "epoch": 0.7083241818581155, + "grad_norm": 2.1434664726257324, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.7073673009872437, + "num_tokens": 161123293.0, + "step": 6450 + }, + { + "epoch": 0.7084339995607292, + "grad_norm": 2.4890682697296143, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7165169715881348, + "num_tokens": 161142892.0, + "step": 6451 + }, + { + "epoch": 0.7085438172633428, + "grad_norm": 2.057648181915283, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.703822672367096, + "num_tokens": 161172694.0, + "step": 6452 + }, + { + "epoch": 0.7086536349659565, + "grad_norm": 2.3702359199523926, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7033710479736328, + "num_tokens": 161196922.0, + "step": 6453 + }, + { + "epoch": 0.7087634526685702, + "grad_norm": 2.699495315551758, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7117049694061279, + "num_tokens": 161215985.0, + "step": 6454 + }, + { + "epoch": 0.7088732703711839, + "grad_norm": 2.692709445953369, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7282033562660217, + "num_tokens": 161233547.0, + "step": 6455 + }, + { + "epoch": 0.7089830880737975, + "grad_norm": 2.1468255519866943, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7080929279327393, + "num_tokens": 161258440.0, + "step": 6456 + }, + { + "epoch": 0.7090929057764112, + "grad_norm": 2.174715757369995, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7220582962036133, + "num_tokens": 161283420.0, + "step": 6457 + }, + { + "epoch": 0.7092027234790248, + "grad_norm": 2.0971462726593018, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7121642827987671, + "num_tokens": 161311277.0, + "step": 6458 + }, + { + "epoch": 0.7093125411816384, + "grad_norm": 2.1784603595733643, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6964689493179321, + "num_tokens": 161336920.0, + "step": 6459 + }, + { + "epoch": 0.7094223588842521, + "grad_norm": 2.270779609680176, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6806454658508301, + "num_tokens": 161362730.0, + "step": 6460 + }, + { + "epoch": 0.7095321765868658, + "grad_norm": 2.17464280128479, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7176090478897095, + "num_tokens": 161385932.0, + "step": 6461 + }, + { + "epoch": 0.7096419942894795, + "grad_norm": 2.050358533859253, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6957361698150635, + "num_tokens": 161413058.0, + "step": 6462 + }, + { + "epoch": 0.7097518119920931, + "grad_norm": 2.386679172515869, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7079117298126221, + "num_tokens": 161436618.0, + "step": 6463 + }, + { + "epoch": 0.7098616296947068, + "grad_norm": 2.024871587753296, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6953874826431274, + "num_tokens": 161466589.0, + "step": 6464 + }, + { + "epoch": 0.7099714473973204, + "grad_norm": 2.2537341117858887, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6715681552886963, + "num_tokens": 161490605.0, + "step": 6465 + }, + { + "epoch": 0.7100812650999341, + "grad_norm": 2.092435121536255, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6889798641204834, + "num_tokens": 161518820.0, + "step": 6466 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 2.1844139099121094, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6947206854820251, + "num_tokens": 161545800.0, + "step": 6467 + }, + { + "epoch": 0.7103009005051615, + "grad_norm": 2.0626349449157715, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6785262823104858, + "num_tokens": 161576642.0, + "step": 6468 + }, + { + "epoch": 0.7104107182077751, + "grad_norm": 2.0754921436309814, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6887869834899902, + "num_tokens": 161607878.0, + "step": 6469 + }, + { + "epoch": 0.7105205359103888, + "grad_norm": 1.8908978700637817, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.701980471611023, + "num_tokens": 161641235.0, + "step": 6470 + }, + { + "epoch": 0.7106303536130024, + "grad_norm": 2.2513248920440674, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7039114236831665, + "num_tokens": 161666491.0, + "step": 6471 + }, + { + "epoch": 0.7107401713156161, + "grad_norm": 2.3789873123168945, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7041687369346619, + "num_tokens": 161688200.0, + "step": 6472 + }, + { + "epoch": 0.7108499890182297, + "grad_norm": 2.0033912658691406, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7280523777008057, + "num_tokens": 161714711.0, + "step": 6473 + }, + { + "epoch": 0.7109598067208434, + "grad_norm": 2.0718255043029785, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6869496703147888, + "num_tokens": 161743466.0, + "step": 6474 + }, + { + "epoch": 0.7110696244234571, + "grad_norm": 2.2380237579345703, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.709223747253418, + "num_tokens": 161766701.0, + "step": 6475 + }, + { + "epoch": 0.7111794421260708, + "grad_norm": 2.0357577800750732, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7131729125976562, + "num_tokens": 161794056.0, + "step": 6476 + }, + { + "epoch": 0.7112892598286844, + "grad_norm": 2.0602033138275146, + "learning_rate": 1e-06, + "loss": 1.0892, + "mean_token_accuracy": 0.6818886995315552, + "num_tokens": 161824234.0, + "step": 6477 + }, + { + "epoch": 0.711399077531298, + "grad_norm": 2.3136789798736572, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7002743482589722, + "num_tokens": 161849201.0, + "step": 6478 + }, + { + "epoch": 0.7115088952339117, + "grad_norm": 1.9676531553268433, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6909036636352539, + "num_tokens": 161879688.0, + "step": 6479 + }, + { + "epoch": 0.7116187129365253, + "grad_norm": 2.267005681991577, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7097824811935425, + "num_tokens": 161905960.0, + "step": 6480 + }, + { + "epoch": 0.711728530639139, + "grad_norm": 1.9648102521896362, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6948511600494385, + "num_tokens": 161936746.0, + "step": 6481 + }, + { + "epoch": 0.7118383483417526, + "grad_norm": 2.0632359981536865, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7000778913497925, + "num_tokens": 161964075.0, + "step": 6482 + }, + { + "epoch": 0.7119481660443664, + "grad_norm": 2.2773377895355225, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7029333710670471, + "num_tokens": 161987535.0, + "step": 6483 + }, + { + "epoch": 0.71205798374698, + "grad_norm": 2.2672042846679688, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.6974453330039978, + "num_tokens": 162010244.0, + "step": 6484 + }, + { + "epoch": 0.7121678014495937, + "grad_norm": 2.1333913803100586, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6993077993392944, + "num_tokens": 162038752.0, + "step": 6485 + }, + { + "epoch": 0.7122776191522073, + "grad_norm": 2.145298719406128, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7048244476318359, + "num_tokens": 162063374.0, + "step": 6486 + }, + { + "epoch": 0.712387436854821, + "grad_norm": 2.3752241134643555, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7174471020698547, + "num_tokens": 162085815.0, + "step": 6487 + }, + { + "epoch": 0.7124972545574346, + "grad_norm": 2.3682026863098145, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.716689944267273, + "num_tokens": 162108143.0, + "step": 6488 + }, + { + "epoch": 0.7126070722600483, + "grad_norm": 2.5612006187438965, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7259668111801147, + "num_tokens": 162127451.0, + "step": 6489 + }, + { + "epoch": 0.712716889962662, + "grad_norm": 2.1693179607391357, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7080273628234863, + "num_tokens": 162151123.0, + "step": 6490 + }, + { + "epoch": 0.7128267076652757, + "grad_norm": 2.2225379943847656, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.71831876039505, + "num_tokens": 162177401.0, + "step": 6491 + }, + { + "epoch": 0.7129365253678893, + "grad_norm": 2.485553503036499, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.729759931564331, + "num_tokens": 162195737.0, + "step": 6492 + }, + { + "epoch": 0.713046343070503, + "grad_norm": 2.112438678741455, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.714051365852356, + "num_tokens": 162224103.0, + "step": 6493 + }, + { + "epoch": 0.7131561607731166, + "grad_norm": 2.1620185375213623, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6752713918685913, + "num_tokens": 162251378.0, + "step": 6494 + }, + { + "epoch": 0.7132659784757303, + "grad_norm": 2.2340121269226074, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6966913342475891, + "num_tokens": 162277553.0, + "step": 6495 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 2.130687713623047, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6959595680236816, + "num_tokens": 162306069.0, + "step": 6496 + }, + { + "epoch": 0.7134856138809577, + "grad_norm": 2.2511744499206543, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7288132905960083, + "num_tokens": 162330800.0, + "step": 6497 + }, + { + "epoch": 0.7135954315835713, + "grad_norm": 2.298593759536743, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7077072858810425, + "num_tokens": 162356002.0, + "step": 6498 + }, + { + "epoch": 0.713705249286185, + "grad_norm": 2.2950243949890137, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6992440223693848, + "num_tokens": 162380572.0, + "step": 6499 + }, + { + "epoch": 0.7138150669887986, + "grad_norm": 2.0015289783477783, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7130470275878906, + "num_tokens": 162408950.0, + "step": 6500 + }, + { + "epoch": 0.7139248846914122, + "grad_norm": 2.1321310997009277, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6990505456924438, + "num_tokens": 162436381.0, + "step": 6501 + }, + { + "epoch": 0.7140347023940259, + "grad_norm": 2.154041290283203, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.697106122970581, + "num_tokens": 162463138.0, + "step": 6502 + }, + { + "epoch": 0.7141445200966395, + "grad_norm": 2.1713075637817383, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.708017110824585, + "num_tokens": 162487489.0, + "step": 6503 + }, + { + "epoch": 0.7142543377992533, + "grad_norm": 2.375248432159424, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7166464328765869, + "num_tokens": 162509839.0, + "step": 6504 + }, + { + "epoch": 0.7143641555018669, + "grad_norm": 1.8766742944717407, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7078803777694702, + "num_tokens": 162541240.0, + "step": 6505 + }, + { + "epoch": 0.7144739732044806, + "grad_norm": 2.4618093967437744, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.736993134021759, + "num_tokens": 162561367.0, + "step": 6506 + }, + { + "epoch": 0.7145837909070942, + "grad_norm": 2.134183168411255, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7052400708198547, + "num_tokens": 162587518.0, + "step": 6507 + }, + { + "epoch": 0.7146936086097079, + "grad_norm": 2.0137810707092285, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6953839063644409, + "num_tokens": 162615604.0, + "step": 6508 + }, + { + "epoch": 0.7148034263123215, + "grad_norm": 2.0868961811065674, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.701555609703064, + "num_tokens": 162642947.0, + "step": 6509 + }, + { + "epoch": 0.7149132440149352, + "grad_norm": 1.8684958219528198, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7058557271957397, + "num_tokens": 162675967.0, + "step": 6510 + }, + { + "epoch": 0.7150230617175488, + "grad_norm": 2.3315203189849854, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7027693390846252, + "num_tokens": 162700054.0, + "step": 6511 + }, + { + "epoch": 0.7151328794201626, + "grad_norm": 2.3210244178771973, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6806520223617554, + "num_tokens": 162727613.0, + "step": 6512 + }, + { + "epoch": 0.7152426971227762, + "grad_norm": 2.272524833679199, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.692205548286438, + "num_tokens": 162754077.0, + "step": 6513 + }, + { + "epoch": 0.7153525148253899, + "grad_norm": 2.2906785011291504, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6878098845481873, + "num_tokens": 162779931.0, + "step": 6514 + }, + { + "epoch": 0.7154623325280035, + "grad_norm": 2.1311471462249756, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6972935199737549, + "num_tokens": 162807295.0, + "step": 6515 + }, + { + "epoch": 0.7155721502306172, + "grad_norm": 2.3009161949157715, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7208134531974792, + "num_tokens": 162829576.0, + "step": 6516 + }, + { + "epoch": 0.7156819679332308, + "grad_norm": 2.069445848464966, + "learning_rate": 1e-06, + "loss": 1.1217, + "mean_token_accuracy": 0.6609092950820923, + "num_tokens": 162858779.0, + "step": 6517 + }, + { + "epoch": 0.7157917856358444, + "grad_norm": 2.231964111328125, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.709863543510437, + "num_tokens": 162882417.0, + "step": 6518 + }, + { + "epoch": 0.7159016033384582, + "grad_norm": 2.312467098236084, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7169898748397827, + "num_tokens": 162905198.0, + "step": 6519 + }, + { + "epoch": 0.7160114210410718, + "grad_norm": 2.464954376220703, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7197939157485962, + "num_tokens": 162926396.0, + "step": 6520 + }, + { + "epoch": 0.7161212387436855, + "grad_norm": 1.9260746240615845, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7235570549964905, + "num_tokens": 162957413.0, + "step": 6521 + }, + { + "epoch": 0.7162310564462991, + "grad_norm": 2.3082942962646484, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7158313393592834, + "num_tokens": 162980230.0, + "step": 6522 + }, + { + "epoch": 0.7163408741489128, + "grad_norm": 2.1390914916992188, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7191038727760315, + "num_tokens": 163006007.0, + "step": 6523 + }, + { + "epoch": 0.7164506918515264, + "grad_norm": 2.223708391189575, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7094658613204956, + "num_tokens": 163031445.0, + "step": 6524 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 2.4194884300231934, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7264459729194641, + "num_tokens": 163052508.0, + "step": 6525 + }, + { + "epoch": 0.7166703272567538, + "grad_norm": 2.5486552715301514, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7222698926925659, + "num_tokens": 163071631.0, + "step": 6526 + }, + { + "epoch": 0.7167801449593675, + "grad_norm": 2.2207577228546143, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7045912146568298, + "num_tokens": 163095204.0, + "step": 6527 + }, + { + "epoch": 0.7168899626619811, + "grad_norm": 2.291841745376587, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7067442536354065, + "num_tokens": 163117840.0, + "step": 6528 + }, + { + "epoch": 0.7169997803645948, + "grad_norm": 1.9467442035675049, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6822435855865479, + "num_tokens": 163150112.0, + "step": 6529 + }, + { + "epoch": 0.7171095980672084, + "grad_norm": 2.2104532718658447, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7089637517929077, + "num_tokens": 163175047.0, + "step": 6530 + }, + { + "epoch": 0.7172194157698221, + "grad_norm": 2.1878528594970703, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7117443084716797, + "num_tokens": 163198712.0, + "step": 6531 + }, + { + "epoch": 0.7173292334724357, + "grad_norm": 2.4190287590026855, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6908021569252014, + "num_tokens": 163221467.0, + "step": 6532 + }, + { + "epoch": 0.7174390511750495, + "grad_norm": 2.4491374492645264, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7161386013031006, + "num_tokens": 163244707.0, + "step": 6533 + }, + { + "epoch": 0.7175488688776631, + "grad_norm": 2.3118293285369873, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7042710185050964, + "num_tokens": 163269097.0, + "step": 6534 + }, + { + "epoch": 0.7176586865802768, + "grad_norm": 2.3696470260620117, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7457388043403625, + "num_tokens": 163289754.0, + "step": 6535 + }, + { + "epoch": 0.7177685042828904, + "grad_norm": 2.313373327255249, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.711656391620636, + "num_tokens": 163313799.0, + "step": 6536 + }, + { + "epoch": 0.717878321985504, + "grad_norm": 2.282029628753662, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7107083201408386, + "num_tokens": 163338112.0, + "step": 6537 + }, + { + "epoch": 0.7179881396881177, + "grad_norm": 2.5738236904144287, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7162811756134033, + "num_tokens": 163356246.0, + "step": 6538 + }, + { + "epoch": 0.7180979573907313, + "grad_norm": 2.395617961883545, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7082529664039612, + "num_tokens": 163378332.0, + "step": 6539 + }, + { + "epoch": 0.7182077750933451, + "grad_norm": 2.746027946472168, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7225214242935181, + "num_tokens": 163393955.0, + "step": 6540 + }, + { + "epoch": 0.7183175927959587, + "grad_norm": 2.343747138977051, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6891685724258423, + "num_tokens": 163417788.0, + "step": 6541 + }, + { + "epoch": 0.7184274104985724, + "grad_norm": 2.2391059398651123, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7124539017677307, + "num_tokens": 163441951.0, + "step": 6542 + }, + { + "epoch": 0.718537228201186, + "grad_norm": 2.130408525466919, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7352292537689209, + "num_tokens": 163466051.0, + "step": 6543 + }, + { + "epoch": 0.7186470459037997, + "grad_norm": 2.4958608150482178, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7119163274765015, + "num_tokens": 163487687.0, + "step": 6544 + }, + { + "epoch": 0.7187568636064133, + "grad_norm": 2.4240474700927734, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7325809001922607, + "num_tokens": 163508418.0, + "step": 6545 + }, + { + "epoch": 0.718866681309027, + "grad_norm": 2.173656463623047, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6785812377929688, + "num_tokens": 163533603.0, + "step": 6546 + }, + { + "epoch": 0.7189764990116406, + "grad_norm": 2.7391884326934814, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7190840244293213, + "num_tokens": 163552246.0, + "step": 6547 + }, + { + "epoch": 0.7190863167142544, + "grad_norm": 2.170231580734253, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6900985836982727, + "num_tokens": 163579802.0, + "step": 6548 + }, + { + "epoch": 0.719196134416868, + "grad_norm": 2.379845142364502, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7191018462181091, + "num_tokens": 163602331.0, + "step": 6549 + }, + { + "epoch": 0.7193059521194817, + "grad_norm": 2.2201497554779053, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7082358598709106, + "num_tokens": 163627531.0, + "step": 6550 + }, + { + "epoch": 0.7194157698220953, + "grad_norm": 2.1427109241485596, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7143261432647705, + "num_tokens": 163653609.0, + "step": 6551 + }, + { + "epoch": 0.719525587524709, + "grad_norm": 2.249887704849243, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7041006088256836, + "num_tokens": 163678436.0, + "step": 6552 + }, + { + "epoch": 0.7196354052273226, + "grad_norm": 2.028125286102295, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6963083744049072, + "num_tokens": 163706722.0, + "step": 6553 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 2.2043912410736084, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6948056221008301, + "num_tokens": 163732096.0, + "step": 6554 + }, + { + "epoch": 0.71985504063255, + "grad_norm": 2.4558522701263428, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.6989567279815674, + "num_tokens": 163754128.0, + "step": 6555 + }, + { + "epoch": 0.7199648583351637, + "grad_norm": 2.2153358459472656, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7151307463645935, + "num_tokens": 163778263.0, + "step": 6556 + }, + { + "epoch": 0.7200746760377773, + "grad_norm": 2.083193063735962, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7223037481307983, + "num_tokens": 163804747.0, + "step": 6557 + }, + { + "epoch": 0.720184493740391, + "grad_norm": 2.401951313018799, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7207643389701843, + "num_tokens": 163827804.0, + "step": 6558 + }, + { + "epoch": 0.7202943114430046, + "grad_norm": 2.5560245513916016, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7193413376808167, + "num_tokens": 163846642.0, + "step": 6559 + }, + { + "epoch": 0.7204041291456182, + "grad_norm": 2.3187828063964844, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7183760404586792, + "num_tokens": 163870291.0, + "step": 6560 + }, + { + "epoch": 0.7205139468482319, + "grad_norm": 2.2541310787200928, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7162021398544312, + "num_tokens": 163894161.0, + "step": 6561 + }, + { + "epoch": 0.7206237645508456, + "grad_norm": 2.2209696769714355, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7274419069290161, + "num_tokens": 163915713.0, + "step": 6562 + }, + { + "epoch": 0.7207335822534593, + "grad_norm": 2.461726665496826, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7089557647705078, + "num_tokens": 163935901.0, + "step": 6563 + }, + { + "epoch": 0.7208433999560729, + "grad_norm": 2.0376532077789307, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7211135625839233, + "num_tokens": 163964782.0, + "step": 6564 + }, + { + "epoch": 0.7209532176586866, + "grad_norm": 2.111480712890625, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7264679670333862, + "num_tokens": 163992312.0, + "step": 6565 + }, + { + "epoch": 0.7210630353613002, + "grad_norm": 2.3205151557922363, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7021124362945557, + "num_tokens": 164015881.0, + "step": 6566 + }, + { + "epoch": 0.7211728530639139, + "grad_norm": 2.1248507499694824, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7104129791259766, + "num_tokens": 164042304.0, + "step": 6567 + }, + { + "epoch": 0.7212826707665275, + "grad_norm": 2.174821615219116, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6900737881660461, + "num_tokens": 164070836.0, + "step": 6568 + }, + { + "epoch": 0.7213924884691413, + "grad_norm": 2.4294660091400146, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7238147258758545, + "num_tokens": 164091716.0, + "step": 6569 + }, + { + "epoch": 0.7215023061717549, + "grad_norm": 2.1854817867279053, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.691680371761322, + "num_tokens": 164119482.0, + "step": 6570 + }, + { + "epoch": 0.7216121238743686, + "grad_norm": 2.2125356197357178, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7157702445983887, + "num_tokens": 164144227.0, + "step": 6571 + }, + { + "epoch": 0.7217219415769822, + "grad_norm": 2.14367413520813, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7088379859924316, + "num_tokens": 164169230.0, + "step": 6572 + }, + { + "epoch": 0.7218317592795959, + "grad_norm": 2.456324577331543, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7142069339752197, + "num_tokens": 164190874.0, + "step": 6573 + }, + { + "epoch": 0.7219415769822095, + "grad_norm": 2.0076136589050293, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7084181308746338, + "num_tokens": 164220004.0, + "step": 6574 + }, + { + "epoch": 0.7220513946848232, + "grad_norm": 2.36690354347229, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7025811076164246, + "num_tokens": 164242583.0, + "step": 6575 + }, + { + "epoch": 0.7221612123874368, + "grad_norm": 2.3053624629974365, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7085312604904175, + "num_tokens": 164267274.0, + "step": 6576 + }, + { + "epoch": 0.7222710300900506, + "grad_norm": 2.1038787364959717, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6987725496292114, + "num_tokens": 164295373.0, + "step": 6577 + }, + { + "epoch": 0.7223808477926642, + "grad_norm": 2.3329710960388184, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7124557495117188, + "num_tokens": 164319055.0, + "step": 6578 + }, + { + "epoch": 0.7224906654952779, + "grad_norm": 2.3125617504119873, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6963292956352234, + "num_tokens": 164344395.0, + "step": 6579 + }, + { + "epoch": 0.7226004831978915, + "grad_norm": 2.101060390472412, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.714179515838623, + "num_tokens": 164372222.0, + "step": 6580 + }, + { + "epoch": 0.7227103009005051, + "grad_norm": 2.278135061264038, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6826084852218628, + "num_tokens": 164397577.0, + "step": 6581 + }, + { + "epoch": 0.7228201186031188, + "grad_norm": 2.3299012184143066, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7037932872772217, + "num_tokens": 164420227.0, + "step": 6582 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 2.456667423248291, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7047930955886841, + "num_tokens": 164439826.0, + "step": 6583 + }, + { + "epoch": 0.7230397540083462, + "grad_norm": 2.1907896995544434, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7057284116744995, + "num_tokens": 164465740.0, + "step": 6584 + }, + { + "epoch": 0.7231495717109598, + "grad_norm": 2.004110336303711, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6873575448989868, + "num_tokens": 164495404.0, + "step": 6585 + }, + { + "epoch": 0.7232593894135735, + "grad_norm": 2.3368613719940186, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7020772695541382, + "num_tokens": 164518420.0, + "step": 6586 + }, + { + "epoch": 0.7233692071161871, + "grad_norm": 2.021314859390259, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6956608295440674, + "num_tokens": 164547281.0, + "step": 6587 + }, + { + "epoch": 0.7234790248188008, + "grad_norm": 2.299516201019287, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7226461172103882, + "num_tokens": 164570886.0, + "step": 6588 + }, + { + "epoch": 0.7235888425214144, + "grad_norm": 2.2216920852661133, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7085162997245789, + "num_tokens": 164596693.0, + "step": 6589 + }, + { + "epoch": 0.7236986602240281, + "grad_norm": 2.2649834156036377, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7191400527954102, + "num_tokens": 164618364.0, + "step": 6590 + }, + { + "epoch": 0.7238084779266418, + "grad_norm": 2.3527064323425293, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7012004852294922, + "num_tokens": 164641794.0, + "step": 6591 + }, + { + "epoch": 0.7239182956292555, + "grad_norm": 2.198918104171753, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7025542259216309, + "num_tokens": 164666438.0, + "step": 6592 + }, + { + "epoch": 0.7240281133318691, + "grad_norm": 2.275190830230713, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.724511981010437, + "num_tokens": 164690183.0, + "step": 6593 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 2.066962480545044, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.705295741558075, + "num_tokens": 164717767.0, + "step": 6594 + }, + { + "epoch": 0.7242477487370964, + "grad_norm": 2.7285730838775635, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7185635566711426, + "num_tokens": 164735478.0, + "step": 6595 + }, + { + "epoch": 0.72435756643971, + "grad_norm": 2.280003309249878, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7037839889526367, + "num_tokens": 164758142.0, + "step": 6596 + }, + { + "epoch": 0.7244673841423237, + "grad_norm": 2.374091148376465, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7180648446083069, + "num_tokens": 164780877.0, + "step": 6597 + }, + { + "epoch": 0.7245772018449375, + "grad_norm": 2.0361058712005615, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7099789977073669, + "num_tokens": 164808793.0, + "step": 6598 + }, + { + "epoch": 0.7246870195475511, + "grad_norm": 2.0153732299804688, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.692642331123352, + "num_tokens": 164838314.0, + "step": 6599 + }, + { + "epoch": 0.7247968372501647, + "grad_norm": 2.2808613777160645, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7272772789001465, + "num_tokens": 164862133.0, + "step": 6600 + }, + { + "epoch": 0.7249066549527784, + "grad_norm": 2.335012912750244, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7006120085716248, + "num_tokens": 164887560.0, + "step": 6601 + }, + { + "epoch": 0.725016472655392, + "grad_norm": 2.2372865676879883, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7085927128791809, + "num_tokens": 164911341.0, + "step": 6602 + }, + { + "epoch": 0.7251262903580057, + "grad_norm": 2.1461517810821533, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6873712539672852, + "num_tokens": 164937796.0, + "step": 6603 + }, + { + "epoch": 0.7252361080606193, + "grad_norm": 2.1522796154022217, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7356346845626831, + "num_tokens": 164960426.0, + "step": 6604 + }, + { + "epoch": 0.725345925763233, + "grad_norm": 2.3539376258850098, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7310330867767334, + "num_tokens": 164982100.0, + "step": 6605 + }, + { + "epoch": 0.7254557434658467, + "grad_norm": 2.0397849082946777, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7087894678115845, + "num_tokens": 165011455.0, + "step": 6606 + }, + { + "epoch": 0.7255655611684604, + "grad_norm": 2.4359095096588135, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7262335419654846, + "num_tokens": 165032911.0, + "step": 6607 + }, + { + "epoch": 0.725675378871074, + "grad_norm": 2.159881353378296, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.71940678358078, + "num_tokens": 165058430.0, + "step": 6608 + }, + { + "epoch": 0.7257851965736877, + "grad_norm": 2.028360605239868, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6941962242126465, + "num_tokens": 165088752.0, + "step": 6609 + }, + { + "epoch": 0.7258950142763013, + "grad_norm": 2.460900068283081, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7286781072616577, + "num_tokens": 165106777.0, + "step": 6610 + }, + { + "epoch": 0.726004831978915, + "grad_norm": 2.1478238105773926, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7327551245689392, + "num_tokens": 165131594.0, + "step": 6611 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 2.2851779460906982, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7029051780700684, + "num_tokens": 165155163.0, + "step": 6612 + }, + { + "epoch": 0.7262244673841424, + "grad_norm": 2.243242025375366, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7021619081497192, + "num_tokens": 165180071.0, + "step": 6613 + }, + { + "epoch": 0.726334285086756, + "grad_norm": 2.1551332473754883, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6807940602302551, + "num_tokens": 165205886.0, + "step": 6614 + }, + { + "epoch": 0.7264441027893697, + "grad_norm": 2.6604368686676025, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7057092189788818, + "num_tokens": 165226263.0, + "step": 6615 + }, + { + "epoch": 0.7265539204919833, + "grad_norm": 2.0521433353424072, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7150917053222656, + "num_tokens": 165254964.0, + "step": 6616 + }, + { + "epoch": 0.726663738194597, + "grad_norm": 2.432908535003662, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7197306156158447, + "num_tokens": 165276160.0, + "step": 6617 + }, + { + "epoch": 0.7267735558972106, + "grad_norm": 2.2932939529418945, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.688721776008606, + "num_tokens": 165299387.0, + "step": 6618 + }, + { + "epoch": 0.7268833735998242, + "grad_norm": 2.348176956176758, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6919305324554443, + "num_tokens": 165322529.0, + "step": 6619 + }, + { + "epoch": 0.726993191302438, + "grad_norm": 1.9908477067947388, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7192374467849731, + "num_tokens": 165350900.0, + "step": 6620 + }, + { + "epoch": 0.7271030090050516, + "grad_norm": 1.9799628257751465, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7175524234771729, + "num_tokens": 165382257.0, + "step": 6621 + }, + { + "epoch": 0.7272128267076653, + "grad_norm": 2.0558812618255615, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7118738293647766, + "num_tokens": 165410194.0, + "step": 6622 + }, + { + "epoch": 0.7273226444102789, + "grad_norm": 2.242565393447876, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7006717920303345, + "num_tokens": 165437495.0, + "step": 6623 + }, + { + "epoch": 0.7274324621128926, + "grad_norm": 2.2079050540924072, + "learning_rate": 1e-06, + "loss": 1.0777, + "mean_token_accuracy": 0.6746788024902344, + "num_tokens": 165462406.0, + "step": 6624 + }, + { + "epoch": 0.7275422798155062, + "grad_norm": 2.3832943439483643, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6973069906234741, + "num_tokens": 165484694.0, + "step": 6625 + }, + { + "epoch": 0.7276520975181199, + "grad_norm": 2.3419594764709473, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7292295694351196, + "num_tokens": 165505055.0, + "step": 6626 + }, + { + "epoch": 0.7277619152207336, + "grad_norm": 1.9355429410934448, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7009048461914062, + "num_tokens": 165535931.0, + "step": 6627 + }, + { + "epoch": 0.7278717329233473, + "grad_norm": 2.4106788635253906, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7215561866760254, + "num_tokens": 165558100.0, + "step": 6628 + }, + { + "epoch": 0.7279815506259609, + "grad_norm": 2.0710577964782715, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6980851888656616, + "num_tokens": 165586708.0, + "step": 6629 + }, + { + "epoch": 0.7280913683285746, + "grad_norm": 2.354525327682495, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7111421823501587, + "num_tokens": 165608163.0, + "step": 6630 + }, + { + "epoch": 0.7282011860311882, + "grad_norm": 1.9122787714004517, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7090949416160583, + "num_tokens": 165639355.0, + "step": 6631 + }, + { + "epoch": 0.7283110037338019, + "grad_norm": 2.142948865890503, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6955151557922363, + "num_tokens": 165662865.0, + "step": 6632 + }, + { + "epoch": 0.7284208214364155, + "grad_norm": 2.280945062637329, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7438991069793701, + "num_tokens": 165683786.0, + "step": 6633 + }, + { + "epoch": 0.7285306391390292, + "grad_norm": 2.356504201889038, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7194985151290894, + "num_tokens": 165706944.0, + "step": 6634 + }, + { + "epoch": 0.7286404568416429, + "grad_norm": 2.327679395675659, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7225062847137451, + "num_tokens": 165729235.0, + "step": 6635 + }, + { + "epoch": 0.7287502745442566, + "grad_norm": 2.0036630630493164, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6932830214500427, + "num_tokens": 165759023.0, + "step": 6636 + }, + { + "epoch": 0.7288600922468702, + "grad_norm": 2.3928191661834717, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7157306671142578, + "num_tokens": 165780211.0, + "step": 6637 + }, + { + "epoch": 0.7289699099494839, + "grad_norm": 1.9613456726074219, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7141076326370239, + "num_tokens": 165809046.0, + "step": 6638 + }, + { + "epoch": 0.7290797276520975, + "grad_norm": 2.1786649227142334, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7225290536880493, + "num_tokens": 165832742.0, + "step": 6639 + }, + { + "epoch": 0.7291895453547111, + "grad_norm": 2.1737449169158936, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7080179452896118, + "num_tokens": 165857299.0, + "step": 6640 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 2.3952972888946533, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7125734090805054, + "num_tokens": 165880736.0, + "step": 6641 + }, + { + "epoch": 0.7294091807599385, + "grad_norm": 2.142453193664551, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.691487193107605, + "num_tokens": 165906312.0, + "step": 6642 + }, + { + "epoch": 0.7295189984625522, + "grad_norm": 1.9779256582260132, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6984702944755554, + "num_tokens": 165935928.0, + "step": 6643 + }, + { + "epoch": 0.7296288161651658, + "grad_norm": 2.0480988025665283, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.702198326587677, + "num_tokens": 165963059.0, + "step": 6644 + }, + { + "epoch": 0.7297386338677795, + "grad_norm": 2.106332778930664, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7043152451515198, + "num_tokens": 165991645.0, + "step": 6645 + }, + { + "epoch": 0.7298484515703931, + "grad_norm": 2.007106065750122, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7147192358970642, + "num_tokens": 166020244.0, + "step": 6646 + }, + { + "epoch": 0.7299582692730068, + "grad_norm": 2.132887840270996, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7031060457229614, + "num_tokens": 166043834.0, + "step": 6647 + }, + { + "epoch": 0.7300680869756204, + "grad_norm": 2.546217203140259, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7149984836578369, + "num_tokens": 166063959.0, + "step": 6648 + }, + { + "epoch": 0.7301779046782342, + "grad_norm": 2.5014095306396484, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7481313943862915, + "num_tokens": 166081943.0, + "step": 6649 + }, + { + "epoch": 0.7302877223808478, + "grad_norm": 2.287116527557373, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.727400004863739, + "num_tokens": 166103392.0, + "step": 6650 + }, + { + "epoch": 0.7303975400834615, + "grad_norm": 2.018517255783081, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6788537502288818, + "num_tokens": 166135989.0, + "step": 6651 + }, + { + "epoch": 0.7305073577860751, + "grad_norm": 2.186868906021118, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7198432683944702, + "num_tokens": 166159080.0, + "step": 6652 + }, + { + "epoch": 0.7306171754886888, + "grad_norm": 2.3406543731689453, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7124776840209961, + "num_tokens": 166180597.0, + "step": 6653 + }, + { + "epoch": 0.7307269931913024, + "grad_norm": 2.137845993041992, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.726152777671814, + "num_tokens": 166204990.0, + "step": 6654 + }, + { + "epoch": 0.7308368108939161, + "grad_norm": 2.3239989280700684, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7004210948944092, + "num_tokens": 166228560.0, + "step": 6655 + }, + { + "epoch": 0.7309466285965298, + "grad_norm": 2.288698196411133, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7061231136322021, + "num_tokens": 166250981.0, + "step": 6656 + }, + { + "epoch": 0.7310564462991435, + "grad_norm": 2.314361333847046, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6924378871917725, + "num_tokens": 166276162.0, + "step": 6657 + }, + { + "epoch": 0.7311662640017571, + "grad_norm": 2.2661571502685547, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6928635835647583, + "num_tokens": 166302201.0, + "step": 6658 + }, + { + "epoch": 0.7312760817043708, + "grad_norm": 2.082958459854126, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7049536108970642, + "num_tokens": 166331532.0, + "step": 6659 + }, + { + "epoch": 0.7313858994069844, + "grad_norm": 2.229992389678955, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7015634179115295, + "num_tokens": 166357699.0, + "step": 6660 + }, + { + "epoch": 0.731495717109598, + "grad_norm": 2.0653812885284424, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6987128257751465, + "num_tokens": 166387178.0, + "step": 6661 + }, + { + "epoch": 0.7316055348122117, + "grad_norm": 2.394070863723755, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7248274683952332, + "num_tokens": 166409198.0, + "step": 6662 + }, + { + "epoch": 0.7317153525148253, + "grad_norm": 2.114053726196289, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6952987313270569, + "num_tokens": 166435961.0, + "step": 6663 + }, + { + "epoch": 0.7318251702174391, + "grad_norm": 2.2726123332977295, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7324304580688477, + "num_tokens": 166459123.0, + "step": 6664 + }, + { + "epoch": 0.7319349879200527, + "grad_norm": 2.1900579929351807, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6985587477684021, + "num_tokens": 166487374.0, + "step": 6665 + }, + { + "epoch": 0.7320448056226664, + "grad_norm": 2.5677504539489746, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7391047477722168, + "num_tokens": 166505897.0, + "step": 6666 + }, + { + "epoch": 0.73215462332528, + "grad_norm": 2.7067975997924805, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7290898561477661, + "num_tokens": 166522198.0, + "step": 6667 + }, + { + "epoch": 0.7322644410278937, + "grad_norm": 1.9448528289794922, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7003053426742554, + "num_tokens": 166551441.0, + "step": 6668 + }, + { + "epoch": 0.7323742587305073, + "grad_norm": 2.133281707763672, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6915135383605957, + "num_tokens": 166579218.0, + "step": 6669 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 2.247373104095459, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7177200317382812, + "num_tokens": 166602668.0, + "step": 6670 + }, + { + "epoch": 0.7325938941357347, + "grad_norm": 2.107954740524292, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7037114500999451, + "num_tokens": 166629136.0, + "step": 6671 + }, + { + "epoch": 0.7327037118383484, + "grad_norm": 2.2574145793914795, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7140809893608093, + "num_tokens": 166652778.0, + "step": 6672 + }, + { + "epoch": 0.732813529540962, + "grad_norm": 2.0116140842437744, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7281876802444458, + "num_tokens": 166681594.0, + "step": 6673 + }, + { + "epoch": 0.7329233472435757, + "grad_norm": 2.033787488937378, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6802821159362793, + "num_tokens": 166712057.0, + "step": 6674 + }, + { + "epoch": 0.7330331649461893, + "grad_norm": 2.4630563259124756, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7177249789237976, + "num_tokens": 166733856.0, + "step": 6675 + }, + { + "epoch": 0.733142982648803, + "grad_norm": 2.3528764247894287, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7164713740348816, + "num_tokens": 166756261.0, + "step": 6676 + }, + { + "epoch": 0.7332528003514166, + "grad_norm": 2.1146557331085205, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7139669060707092, + "num_tokens": 166782513.0, + "step": 6677 + }, + { + "epoch": 0.7333626180540304, + "grad_norm": 2.1079158782958984, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7232526540756226, + "num_tokens": 166808427.0, + "step": 6678 + }, + { + "epoch": 0.733472435756644, + "grad_norm": 2.430785894393921, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7119386196136475, + "num_tokens": 166828336.0, + "step": 6679 + }, + { + "epoch": 0.7335822534592576, + "grad_norm": 2.1104962825775146, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6807587742805481, + "num_tokens": 166855882.0, + "step": 6680 + }, + { + "epoch": 0.7336920711618713, + "grad_norm": 2.0806493759155273, + "learning_rate": 1e-06, + "loss": 1.0544, + "mean_token_accuracy": 0.6835775971412659, + "num_tokens": 166884900.0, + "step": 6681 + }, + { + "epoch": 0.7338018888644849, + "grad_norm": 1.9050096273422241, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.712157130241394, + "num_tokens": 166916187.0, + "step": 6682 + }, + { + "epoch": 0.7339117065670986, + "grad_norm": 1.8315279483795166, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6950724124908447, + "num_tokens": 166952171.0, + "step": 6683 + }, + { + "epoch": 0.7340215242697122, + "grad_norm": 2.289818286895752, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6951172947883606, + "num_tokens": 166976973.0, + "step": 6684 + }, + { + "epoch": 0.734131341972326, + "grad_norm": 2.2196786403656006, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6970062851905823, + "num_tokens": 167002326.0, + "step": 6685 + }, + { + "epoch": 0.7342411596749396, + "grad_norm": 2.0087735652923584, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7238067388534546, + "num_tokens": 167031552.0, + "step": 6686 + }, + { + "epoch": 0.7343509773775533, + "grad_norm": 2.1403462886810303, + "learning_rate": 1e-06, + "loss": 1.0622, + "mean_token_accuracy": 0.6730384826660156, + "num_tokens": 167059008.0, + "step": 6687 + }, + { + "epoch": 0.7344607950801669, + "grad_norm": 2.3335070610046387, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7145742177963257, + "num_tokens": 167084543.0, + "step": 6688 + }, + { + "epoch": 0.7345706127827806, + "grad_norm": 2.0272421836853027, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.691965639591217, + "num_tokens": 167112651.0, + "step": 6689 + }, + { + "epoch": 0.7346804304853942, + "grad_norm": 2.161569356918335, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7080295085906982, + "num_tokens": 167139350.0, + "step": 6690 + }, + { + "epoch": 0.7347902481880079, + "grad_norm": 2.337815761566162, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7196045517921448, + "num_tokens": 167161462.0, + "step": 6691 + }, + { + "epoch": 0.7349000658906216, + "grad_norm": 2.088165760040283, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.690585732460022, + "num_tokens": 167189454.0, + "step": 6692 + }, + { + "epoch": 0.7350098835932353, + "grad_norm": 2.024522304534912, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7042866349220276, + "num_tokens": 167217982.0, + "step": 6693 + }, + { + "epoch": 0.7351197012958489, + "grad_norm": 2.095914125442505, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7243927121162415, + "num_tokens": 167245931.0, + "step": 6694 + }, + { + "epoch": 0.7352295189984626, + "grad_norm": 2.493074417114258, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7131948471069336, + "num_tokens": 167265157.0, + "step": 6695 + }, + { + "epoch": 0.7353393367010762, + "grad_norm": 2.3140764236450195, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6865510940551758, + "num_tokens": 167290335.0, + "step": 6696 + }, + { + "epoch": 0.7354491544036899, + "grad_norm": 2.236623525619507, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7030152678489685, + "num_tokens": 167314684.0, + "step": 6697 + }, + { + "epoch": 0.7355589721063035, + "grad_norm": 2.206284523010254, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6968345642089844, + "num_tokens": 167342180.0, + "step": 6698 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 2.236328125, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7170702219009399, + "num_tokens": 167367387.0, + "step": 6699 + }, + { + "epoch": 0.7357786075115309, + "grad_norm": 2.3708646297454834, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7232052683830261, + "num_tokens": 167387881.0, + "step": 6700 + }, + { + "epoch": 0.7358884252141445, + "grad_norm": 2.3079161643981934, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7117552757263184, + "num_tokens": 167410540.0, + "step": 6701 + }, + { + "epoch": 0.7359982429167582, + "grad_norm": 1.9074254035949707, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7202074527740479, + "num_tokens": 167442853.0, + "step": 6702 + }, + { + "epoch": 0.7361080606193718, + "grad_norm": 1.8724274635314941, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7122083902359009, + "num_tokens": 167474400.0, + "step": 6703 + }, + { + "epoch": 0.7362178783219855, + "grad_norm": 2.4873881340026855, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7312079668045044, + "num_tokens": 167494958.0, + "step": 6704 + }, + { + "epoch": 0.7363276960245991, + "grad_norm": 2.5894675254821777, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7376192808151245, + "num_tokens": 167514494.0, + "step": 6705 + }, + { + "epoch": 0.7364375137272128, + "grad_norm": 2.5696704387664795, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7090556621551514, + "num_tokens": 167534467.0, + "step": 6706 + }, + { + "epoch": 0.7365473314298265, + "grad_norm": 2.1905670166015625, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7323947548866272, + "num_tokens": 167558293.0, + "step": 6707 + }, + { + "epoch": 0.7366571491324402, + "grad_norm": 2.3869378566741943, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7158653140068054, + "num_tokens": 167579017.0, + "step": 6708 + }, + { + "epoch": 0.7367669668350538, + "grad_norm": 2.141416072845459, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6919956803321838, + "num_tokens": 167607877.0, + "step": 6709 + }, + { + "epoch": 0.7368767845376675, + "grad_norm": 2.4553444385528564, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.6957787871360779, + "num_tokens": 167629619.0, + "step": 6710 + }, + { + "epoch": 0.7369866022402811, + "grad_norm": 2.4407613277435303, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7202991247177124, + "num_tokens": 167649758.0, + "step": 6711 + }, + { + "epoch": 0.7370964199428948, + "grad_norm": 2.089893341064453, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6898791790008545, + "num_tokens": 167678201.0, + "step": 6712 + }, + { + "epoch": 0.7372062376455084, + "grad_norm": 2.152083396911621, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7050920724868774, + "num_tokens": 167703138.0, + "step": 6713 + }, + { + "epoch": 0.7373160553481222, + "grad_norm": 2.0814156532287598, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7191796898841858, + "num_tokens": 167729078.0, + "step": 6714 + }, + { + "epoch": 0.7374258730507358, + "grad_norm": 2.1961557865142822, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7117174863815308, + "num_tokens": 167754709.0, + "step": 6715 + }, + { + "epoch": 0.7375356907533495, + "grad_norm": 2.0637366771698, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6986627578735352, + "num_tokens": 167781828.0, + "step": 6716 + }, + { + "epoch": 0.7376455084559631, + "grad_norm": 2.5020759105682373, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6899484395980835, + "num_tokens": 167803986.0, + "step": 6717 + }, + { + "epoch": 0.7377553261585768, + "grad_norm": 2.275887966156006, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.718259334564209, + "num_tokens": 167826600.0, + "step": 6718 + }, + { + "epoch": 0.7378651438611904, + "grad_norm": 2.337447166442871, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7074018716812134, + "num_tokens": 167849854.0, + "step": 6719 + }, + { + "epoch": 0.737974961563804, + "grad_norm": 2.3261258602142334, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7248114943504333, + "num_tokens": 167872492.0, + "step": 6720 + }, + { + "epoch": 0.7380847792664178, + "grad_norm": 2.27286434173584, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7110899090766907, + "num_tokens": 167895872.0, + "step": 6721 + }, + { + "epoch": 0.7381945969690314, + "grad_norm": 2.298588275909424, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7005270719528198, + "num_tokens": 167917994.0, + "step": 6722 + }, + { + "epoch": 0.7383044146716451, + "grad_norm": 2.254572629928589, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7105737924575806, + "num_tokens": 167942259.0, + "step": 6723 + }, + { + "epoch": 0.7384142323742587, + "grad_norm": 2.3109233379364014, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7135200500488281, + "num_tokens": 167965599.0, + "step": 6724 + }, + { + "epoch": 0.7385240500768724, + "grad_norm": 2.3982858657836914, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7419170141220093, + "num_tokens": 167984055.0, + "step": 6725 + }, + { + "epoch": 0.738633867779486, + "grad_norm": 2.195659875869751, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6984010934829712, + "num_tokens": 168009798.0, + "step": 6726 + }, + { + "epoch": 0.7387436854820997, + "grad_norm": 2.279200792312622, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7215596437454224, + "num_tokens": 168032478.0, + "step": 6727 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 1.9559619426727295, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7180538773536682, + "num_tokens": 168063044.0, + "step": 6728 + }, + { + "epoch": 0.7389633208873271, + "grad_norm": 2.5803678035736084, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7242568731307983, + "num_tokens": 168081885.0, + "step": 6729 + }, + { + "epoch": 0.7390731385899407, + "grad_norm": 2.091341495513916, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7049419283866882, + "num_tokens": 168108825.0, + "step": 6730 + }, + { + "epoch": 0.7391829562925544, + "grad_norm": 2.1194660663604736, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7114300727844238, + "num_tokens": 168134409.0, + "step": 6731 + }, + { + "epoch": 0.739292773995168, + "grad_norm": 2.4638266563415527, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7045871019363403, + "num_tokens": 168154408.0, + "step": 6732 + }, + { + "epoch": 0.7394025916977817, + "grad_norm": 2.307328701019287, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7243898510932922, + "num_tokens": 168176912.0, + "step": 6733 + }, + { + "epoch": 0.7395124094003953, + "grad_norm": 2.400026798248291, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6954285502433777, + "num_tokens": 168199999.0, + "step": 6734 + }, + { + "epoch": 0.739622227103009, + "grad_norm": 2.1997852325439453, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7322262525558472, + "num_tokens": 168224745.0, + "step": 6735 + }, + { + "epoch": 0.7397320448056227, + "grad_norm": 2.2529337406158447, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.693604588508606, + "num_tokens": 168253175.0, + "step": 6736 + }, + { + "epoch": 0.7398418625082364, + "grad_norm": 2.8996174335479736, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7078137993812561, + "num_tokens": 168269562.0, + "step": 6737 + }, + { + "epoch": 0.73995168021085, + "grad_norm": 2.334460496902466, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7091699838638306, + "num_tokens": 168292028.0, + "step": 6738 + }, + { + "epoch": 0.7400614979134637, + "grad_norm": 2.2167866230010986, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7135665416717529, + "num_tokens": 168318174.0, + "step": 6739 + }, + { + "epoch": 0.7401713156160773, + "grad_norm": 2.184736728668213, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7209386229515076, + "num_tokens": 168343959.0, + "step": 6740 + }, + { + "epoch": 0.7402811333186909, + "grad_norm": 2.346804141998291, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7479361891746521, + "num_tokens": 168364892.0, + "step": 6741 + }, + { + "epoch": 0.7403909510213046, + "grad_norm": 2.4882211685180664, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7579270601272583, + "num_tokens": 168383647.0, + "step": 6742 + }, + { + "epoch": 0.7405007687239183, + "grad_norm": 1.9894963502883911, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.712634801864624, + "num_tokens": 168413249.0, + "step": 6743 + }, + { + "epoch": 0.740610586426532, + "grad_norm": 1.9945416450500488, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6976603269577026, + "num_tokens": 168444222.0, + "step": 6744 + }, + { + "epoch": 0.7407204041291456, + "grad_norm": 2.7719199657440186, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7275404930114746, + "num_tokens": 168460279.0, + "step": 6745 + }, + { + "epoch": 0.7408302218317593, + "grad_norm": 2.2518625259399414, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7228747010231018, + "num_tokens": 168485919.0, + "step": 6746 + }, + { + "epoch": 0.7409400395343729, + "grad_norm": 2.237863302230835, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7082182168960571, + "num_tokens": 168511415.0, + "step": 6747 + }, + { + "epoch": 0.7410498572369866, + "grad_norm": 2.015082836151123, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6844363808631897, + "num_tokens": 168542622.0, + "step": 6748 + }, + { + "epoch": 0.7411596749396002, + "grad_norm": 2.0708940029144287, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6904786229133606, + "num_tokens": 168571935.0, + "step": 6749 + }, + { + "epoch": 0.741269492642214, + "grad_norm": 2.45808482170105, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7087659239768982, + "num_tokens": 168592179.0, + "step": 6750 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 2.3375635147094727, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.693108081817627, + "num_tokens": 168615625.0, + "step": 6751 + }, + { + "epoch": 0.7414891280474413, + "grad_norm": 2.3411896228790283, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7391059398651123, + "num_tokens": 168637161.0, + "step": 6752 + }, + { + "epoch": 0.7415989457500549, + "grad_norm": 2.39054274559021, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7074378728866577, + "num_tokens": 168658733.0, + "step": 6753 + }, + { + "epoch": 0.7417087634526686, + "grad_norm": 2.5244600772857666, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6904350519180298, + "num_tokens": 168681655.0, + "step": 6754 + }, + { + "epoch": 0.7418185811552822, + "grad_norm": 2.133847713470459, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7012113332748413, + "num_tokens": 168707372.0, + "step": 6755 + }, + { + "epoch": 0.7419283988578959, + "grad_norm": 2.122199773788452, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7143110036849976, + "num_tokens": 168733624.0, + "step": 6756 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 2.111213445663452, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.697481632232666, + "num_tokens": 168761398.0, + "step": 6757 + }, + { + "epoch": 0.7421480342631233, + "grad_norm": 2.233818531036377, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6908799409866333, + "num_tokens": 168786164.0, + "step": 6758 + }, + { + "epoch": 0.7422578519657369, + "grad_norm": 2.158715009689331, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6870535612106323, + "num_tokens": 168811553.0, + "step": 6759 + }, + { + "epoch": 0.7423676696683505, + "grad_norm": 2.092592716217041, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7060254812240601, + "num_tokens": 168838700.0, + "step": 6760 + }, + { + "epoch": 0.7424774873709642, + "grad_norm": 2.5410799980163574, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.698472261428833, + "num_tokens": 168859491.0, + "step": 6761 + }, + { + "epoch": 0.7425873050735778, + "grad_norm": 1.9794882535934448, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6987416744232178, + "num_tokens": 168888190.0, + "step": 6762 + }, + { + "epoch": 0.7426971227761915, + "grad_norm": 2.221898078918457, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7025445699691772, + "num_tokens": 168910997.0, + "step": 6763 + }, + { + "epoch": 0.7428069404788051, + "grad_norm": 2.2783684730529785, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6841537952423096, + "num_tokens": 168936914.0, + "step": 6764 + }, + { + "epoch": 0.7429167581814189, + "grad_norm": 2.139587163925171, + "learning_rate": 1e-06, + "loss": 1.0738, + "mean_token_accuracy": 0.6787093281745911, + "num_tokens": 168965214.0, + "step": 6765 + }, + { + "epoch": 0.7430265758840325, + "grad_norm": 2.2058908939361572, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.711027979850769, + "num_tokens": 168989752.0, + "step": 6766 + }, + { + "epoch": 0.7431363935866462, + "grad_norm": 2.0904672145843506, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6912068128585815, + "num_tokens": 169017651.0, + "step": 6767 + }, + { + "epoch": 0.7432462112892598, + "grad_norm": 2.093776226043701, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6955634355545044, + "num_tokens": 169045289.0, + "step": 6768 + }, + { + "epoch": 0.7433560289918735, + "grad_norm": 2.289332866668701, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.728956937789917, + "num_tokens": 169068344.0, + "step": 6769 + }, + { + "epoch": 0.7434658466944871, + "grad_norm": 2.2549540996551514, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7005051374435425, + "num_tokens": 169093428.0, + "step": 6770 + }, + { + "epoch": 0.7435756643971008, + "grad_norm": 2.2446792125701904, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7250557541847229, + "num_tokens": 169116785.0, + "step": 6771 + }, + { + "epoch": 0.7436854820997145, + "grad_norm": 2.0882256031036377, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7283849716186523, + "num_tokens": 169142829.0, + "step": 6772 + }, + { + "epoch": 0.7437952998023282, + "grad_norm": 2.2176501750946045, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7259070873260498, + "num_tokens": 169166272.0, + "step": 6773 + }, + { + "epoch": 0.7439051175049418, + "grad_norm": 2.014078378677368, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7098851203918457, + "num_tokens": 169195541.0, + "step": 6774 + }, + { + "epoch": 0.7440149352075555, + "grad_norm": 2.2757060527801514, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7167572975158691, + "num_tokens": 169217817.0, + "step": 6775 + }, + { + "epoch": 0.7441247529101691, + "grad_norm": 2.168901205062866, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7002896070480347, + "num_tokens": 169243143.0, + "step": 6776 + }, + { + "epoch": 0.7442345706127828, + "grad_norm": 2.3666226863861084, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7215160727500916, + "num_tokens": 169264290.0, + "step": 6777 + }, + { + "epoch": 0.7443443883153964, + "grad_norm": 2.494097948074341, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7087782025337219, + "num_tokens": 169284480.0, + "step": 6778 + }, + { + "epoch": 0.7444542060180102, + "grad_norm": 2.410695791244507, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.721551239490509, + "num_tokens": 169305209.0, + "step": 6779 + }, + { + "epoch": 0.7445640237206238, + "grad_norm": 2.3048651218414307, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7358612418174744, + "num_tokens": 169328039.0, + "step": 6780 + }, + { + "epoch": 0.7446738414232374, + "grad_norm": 2.216305732727051, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.6987081170082092, + "num_tokens": 169354308.0, + "step": 6781 + }, + { + "epoch": 0.7447836591258511, + "grad_norm": 2.311765670776367, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.688564121723175, + "num_tokens": 169376602.0, + "step": 6782 + }, + { + "epoch": 0.7448934768284647, + "grad_norm": 2.5711565017700195, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7298791408538818, + "num_tokens": 169395112.0, + "step": 6783 + }, + { + "epoch": 0.7450032945310784, + "grad_norm": 2.2156050205230713, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6930201053619385, + "num_tokens": 169420481.0, + "step": 6784 + }, + { + "epoch": 0.745113112233692, + "grad_norm": 2.1249287128448486, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6971036195755005, + "num_tokens": 169446649.0, + "step": 6785 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 2.342872142791748, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6983506083488464, + "num_tokens": 169469971.0, + "step": 6786 + }, + { + "epoch": 0.7453327476389194, + "grad_norm": 2.540198564529419, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7027351260185242, + "num_tokens": 169489561.0, + "step": 6787 + }, + { + "epoch": 0.7454425653415331, + "grad_norm": 1.9221935272216797, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6934504508972168, + "num_tokens": 169521075.0, + "step": 6788 + }, + { + "epoch": 0.7455523830441467, + "grad_norm": 2.2025046348571777, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7032789587974548, + "num_tokens": 169545155.0, + "step": 6789 + }, + { + "epoch": 0.7456622007467604, + "grad_norm": 2.5548272132873535, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7221202254295349, + "num_tokens": 169564185.0, + "step": 6790 + }, + { + "epoch": 0.745772018449374, + "grad_norm": 2.5414469242095947, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7274194359779358, + "num_tokens": 169583257.0, + "step": 6791 + }, + { + "epoch": 0.7458818361519877, + "grad_norm": 2.078630208969116, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7092075943946838, + "num_tokens": 169610933.0, + "step": 6792 + }, + { + "epoch": 0.7459916538546013, + "grad_norm": 1.880533218383789, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6965419054031372, + "num_tokens": 169645511.0, + "step": 6793 + }, + { + "epoch": 0.7461014715572151, + "grad_norm": 2.192467451095581, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6907569766044617, + "num_tokens": 169672234.0, + "step": 6794 + }, + { + "epoch": 0.7462112892598287, + "grad_norm": 2.440995454788208, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6941020488739014, + "num_tokens": 169696238.0, + "step": 6795 + }, + { + "epoch": 0.7463211069624424, + "grad_norm": 2.490438938140869, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7203195095062256, + "num_tokens": 169718367.0, + "step": 6796 + }, + { + "epoch": 0.746430924665056, + "grad_norm": 2.026958465576172, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7484242916107178, + "num_tokens": 169743204.0, + "step": 6797 + }, + { + "epoch": 0.7465407423676697, + "grad_norm": 2.217557430267334, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7338953018188477, + "num_tokens": 169766549.0, + "step": 6798 + }, + { + "epoch": 0.7466505600702833, + "grad_norm": 2.0533576011657715, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7195885181427002, + "num_tokens": 169794229.0, + "step": 6799 + }, + { + "epoch": 0.7467603777728969, + "grad_norm": 2.2125699520111084, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7321720123291016, + "num_tokens": 169817858.0, + "step": 6800 + }, + { + "epoch": 0.7468701954755107, + "grad_norm": 2.353499412536621, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.699791669845581, + "num_tokens": 169840549.0, + "step": 6801 + }, + { + "epoch": 0.7469800131781243, + "grad_norm": 2.4162755012512207, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7025213241577148, + "num_tokens": 169864057.0, + "step": 6802 + }, + { + "epoch": 0.747089830880738, + "grad_norm": 2.1708569526672363, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7273330688476562, + "num_tokens": 169888199.0, + "step": 6803 + }, + { + "epoch": 0.7471996485833516, + "grad_norm": 2.684680223464966, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7120497226715088, + "num_tokens": 169907658.0, + "step": 6804 + }, + { + "epoch": 0.7473094662859653, + "grad_norm": 2.3488781452178955, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6948912143707275, + "num_tokens": 169930395.0, + "step": 6805 + }, + { + "epoch": 0.7474192839885789, + "grad_norm": 2.5421245098114014, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.730057418346405, + "num_tokens": 169951524.0, + "step": 6806 + }, + { + "epoch": 0.7475291016911926, + "grad_norm": 2.090545415878296, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.699285626411438, + "num_tokens": 169979846.0, + "step": 6807 + }, + { + "epoch": 0.7476389193938063, + "grad_norm": 2.2164673805236816, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.681138277053833, + "num_tokens": 170006024.0, + "step": 6808 + }, + { + "epoch": 0.74774873709642, + "grad_norm": 2.257540702819824, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7325315475463867, + "num_tokens": 170030893.0, + "step": 6809 + }, + { + "epoch": 0.7478585547990336, + "grad_norm": 1.9869263172149658, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7109488844871521, + "num_tokens": 170060415.0, + "step": 6810 + }, + { + "epoch": 0.7479683725016473, + "grad_norm": 2.4757814407348633, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7394506931304932, + "num_tokens": 170079155.0, + "step": 6811 + }, + { + "epoch": 0.7480781902042609, + "grad_norm": 2.430933713912964, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7214956879615784, + "num_tokens": 170099399.0, + "step": 6812 + }, + { + "epoch": 0.7481880079068746, + "grad_norm": 2.232308864593506, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.706102728843689, + "num_tokens": 170126570.0, + "step": 6813 + }, + { + "epoch": 0.7482978256094882, + "grad_norm": 2.1008214950561523, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7178765535354614, + "num_tokens": 170152831.0, + "step": 6814 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 2.05275821685791, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7104060649871826, + "num_tokens": 170180752.0, + "step": 6815 + }, + { + "epoch": 0.7485174610147156, + "grad_norm": 2.535733222961426, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.6993433237075806, + "num_tokens": 170201303.0, + "step": 6816 + }, + { + "epoch": 0.7486272787173293, + "grad_norm": 2.433279037475586, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6984555721282959, + "num_tokens": 170222615.0, + "step": 6817 + }, + { + "epoch": 0.7487370964199429, + "grad_norm": 1.995936632156372, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6945242285728455, + "num_tokens": 170251157.0, + "step": 6818 + }, + { + "epoch": 0.7488469141225566, + "grad_norm": 2.2541213035583496, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7168946266174316, + "num_tokens": 170272666.0, + "step": 6819 + }, + { + "epoch": 0.7489567318251702, + "grad_norm": 2.4191336631774902, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7076243758201599, + "num_tokens": 170294699.0, + "step": 6820 + }, + { + "epoch": 0.7490665495277838, + "grad_norm": 2.5439250469207764, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.703406035900116, + "num_tokens": 170315856.0, + "step": 6821 + }, + { + "epoch": 0.7491763672303975, + "grad_norm": 2.2082340717315674, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.687722384929657, + "num_tokens": 170344775.0, + "step": 6822 + }, + { + "epoch": 0.7492861849330112, + "grad_norm": 2.356771945953369, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.693698525428772, + "num_tokens": 170368234.0, + "step": 6823 + }, + { + "epoch": 0.7493960026356249, + "grad_norm": 2.072298526763916, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7162041068077087, + "num_tokens": 170395460.0, + "step": 6824 + }, + { + "epoch": 0.7495058203382385, + "grad_norm": 2.3662590980529785, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7180031538009644, + "num_tokens": 170418835.0, + "step": 6825 + }, + { + "epoch": 0.7496156380408522, + "grad_norm": 2.2782208919525146, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.71014404296875, + "num_tokens": 170444342.0, + "step": 6826 + }, + { + "epoch": 0.7497254557434658, + "grad_norm": 2.19769287109375, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.709588885307312, + "num_tokens": 170470740.0, + "step": 6827 + }, + { + "epoch": 0.7498352734460795, + "grad_norm": 2.1548361778259277, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7018342018127441, + "num_tokens": 170497179.0, + "step": 6828 + }, + { + "epoch": 0.7499450911486931, + "grad_norm": 2.1819331645965576, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7198997139930725, + "num_tokens": 170522486.0, + "step": 6829 + }, + { + "epoch": 0.7500549088513069, + "grad_norm": 2.021043300628662, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7260676622390747, + "num_tokens": 170549421.0, + "step": 6830 + }, + { + "epoch": 0.7501647265539205, + "grad_norm": 2.3438572883605957, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6815704107284546, + "num_tokens": 170572409.0, + "step": 6831 + }, + { + "epoch": 0.7502745442565342, + "grad_norm": 1.9645586013793945, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7087898254394531, + "num_tokens": 170600313.0, + "step": 6832 + }, + { + "epoch": 0.7503843619591478, + "grad_norm": 2.319533348083496, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7086452841758728, + "num_tokens": 170622889.0, + "step": 6833 + }, + { + "epoch": 0.7504941796617615, + "grad_norm": 2.2141594886779785, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7389509081840515, + "num_tokens": 170646386.0, + "step": 6834 + }, + { + "epoch": 0.7506039973643751, + "grad_norm": 2.3298065662384033, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7066954374313354, + "num_tokens": 170668463.0, + "step": 6835 + }, + { + "epoch": 0.7507138150669888, + "grad_norm": 2.108427047729492, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7032678127288818, + "num_tokens": 170697848.0, + "step": 6836 + }, + { + "epoch": 0.7508236327696025, + "grad_norm": 2.3256821632385254, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6850674152374268, + "num_tokens": 170722435.0, + "step": 6837 + }, + { + "epoch": 0.7509334504722162, + "grad_norm": 2.2580392360687256, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7145447134971619, + "num_tokens": 170748293.0, + "step": 6838 + }, + { + "epoch": 0.7510432681748298, + "grad_norm": 2.0554049015045166, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7161649465560913, + "num_tokens": 170777224.0, + "step": 6839 + }, + { + "epoch": 0.7511530858774434, + "grad_norm": 2.1895196437835693, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7085646390914917, + "num_tokens": 170800550.0, + "step": 6840 + }, + { + "epoch": 0.7512629035800571, + "grad_norm": 2.26849365234375, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7146245241165161, + "num_tokens": 170823043.0, + "step": 6841 + }, + { + "epoch": 0.7513727212826707, + "grad_norm": 2.2812588214874268, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7301632165908813, + "num_tokens": 170845696.0, + "step": 6842 + }, + { + "epoch": 0.7514825389852844, + "grad_norm": 1.8558955192565918, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7056541442871094, + "num_tokens": 170879678.0, + "step": 6843 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 2.3294332027435303, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7071715593338013, + "num_tokens": 170903283.0, + "step": 6844 + }, + { + "epoch": 0.7517021743905118, + "grad_norm": 2.1493453979492188, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.7077364921569824, + "num_tokens": 170929758.0, + "step": 6845 + }, + { + "epoch": 0.7518119920931254, + "grad_norm": 1.8246679306030273, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7147204279899597, + "num_tokens": 170962689.0, + "step": 6846 + }, + { + "epoch": 0.7519218097957391, + "grad_norm": 2.0513713359832764, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6957216858863831, + "num_tokens": 170991713.0, + "step": 6847 + }, + { + "epoch": 0.7520316274983527, + "grad_norm": 2.161376714706421, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7034417390823364, + "num_tokens": 171019115.0, + "step": 6848 + }, + { + "epoch": 0.7521414452009664, + "grad_norm": 1.9421292543411255, + "learning_rate": 1e-06, + "loss": 1.0557, + "mean_token_accuracy": 0.6866406202316284, + "num_tokens": 171050066.0, + "step": 6849 + }, + { + "epoch": 0.75225126290358, + "grad_norm": 2.001817226409912, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7021850943565369, + "num_tokens": 171078704.0, + "step": 6850 + }, + { + "epoch": 0.7523610806061937, + "grad_norm": 2.038552761077881, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6855456829071045, + "num_tokens": 171108077.0, + "step": 6851 + }, + { + "epoch": 0.7524708983088074, + "grad_norm": 2.226594924926758, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7062363028526306, + "num_tokens": 171132906.0, + "step": 6852 + }, + { + "epoch": 0.7525807160114211, + "grad_norm": 2.345327615737915, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.715718150138855, + "num_tokens": 171157628.0, + "step": 6853 + }, + { + "epoch": 0.7526905337140347, + "grad_norm": 2.1313693523406982, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.7021239995956421, + "num_tokens": 171183127.0, + "step": 6854 + }, + { + "epoch": 0.7528003514166484, + "grad_norm": 2.0340332984924316, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7255613803863525, + "num_tokens": 171209970.0, + "step": 6855 + }, + { + "epoch": 0.752910169119262, + "grad_norm": 2.1423189640045166, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7095094323158264, + "num_tokens": 171238112.0, + "step": 6856 + }, + { + "epoch": 0.7530199868218757, + "grad_norm": 2.240192174911499, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7022743225097656, + "num_tokens": 171261366.0, + "step": 6857 + }, + { + "epoch": 0.7531298045244893, + "grad_norm": 2.1679725646972656, + "learning_rate": 1e-06, + "loss": 1.0538, + "mean_token_accuracy": 0.6776293516159058, + "num_tokens": 171288509.0, + "step": 6858 + }, + { + "epoch": 0.7532396222271031, + "grad_norm": 1.9995530843734741, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7103813886642456, + "num_tokens": 171316814.0, + "step": 6859 + }, + { + "epoch": 0.7533494399297167, + "grad_norm": 2.271972417831421, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7113144397735596, + "num_tokens": 171339688.0, + "step": 6860 + }, + { + "epoch": 0.7534592576323303, + "grad_norm": 2.4288339614868164, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.703862190246582, + "num_tokens": 171362786.0, + "step": 6861 + }, + { + "epoch": 0.753569075334944, + "grad_norm": 2.1129095554351807, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7106203436851501, + "num_tokens": 171390709.0, + "step": 6862 + }, + { + "epoch": 0.7536788930375576, + "grad_norm": 1.9969407320022583, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7282453775405884, + "num_tokens": 171420553.0, + "step": 6863 + }, + { + "epoch": 0.7537887107401713, + "grad_norm": 2.1972429752349854, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6953533291816711, + "num_tokens": 171445863.0, + "step": 6864 + }, + { + "epoch": 0.7538985284427849, + "grad_norm": 2.0893869400024414, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7141398191452026, + "num_tokens": 171472124.0, + "step": 6865 + }, + { + "epoch": 0.7540083461453987, + "grad_norm": 2.1256072521209717, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7214305400848389, + "num_tokens": 171498740.0, + "step": 6866 + }, + { + "epoch": 0.7541181638480123, + "grad_norm": 2.5438644886016846, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7004876732826233, + "num_tokens": 171519034.0, + "step": 6867 + }, + { + "epoch": 0.754227981550626, + "grad_norm": 2.1519951820373535, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7248609066009521, + "num_tokens": 171543621.0, + "step": 6868 + }, + { + "epoch": 0.7543377992532396, + "grad_norm": 2.03289794921875, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7050011157989502, + "num_tokens": 171573422.0, + "step": 6869 + }, + { + "epoch": 0.7544476169558533, + "grad_norm": 1.9740760326385498, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7055488228797913, + "num_tokens": 171602441.0, + "step": 6870 + }, + { + "epoch": 0.7545574346584669, + "grad_norm": 2.5394623279571533, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7149628400802612, + "num_tokens": 171621048.0, + "step": 6871 + }, + { + "epoch": 0.7546672523610806, + "grad_norm": 2.2339413166046143, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.710357666015625, + "num_tokens": 171647020.0, + "step": 6872 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 2.1287038326263428, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7063149213790894, + "num_tokens": 171672335.0, + "step": 6873 + }, + { + "epoch": 0.754886887766308, + "grad_norm": 2.007882595062256, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.714736819267273, + "num_tokens": 171700554.0, + "step": 6874 + }, + { + "epoch": 0.7549967054689216, + "grad_norm": 2.2608234882354736, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7043100595474243, + "num_tokens": 171724110.0, + "step": 6875 + }, + { + "epoch": 0.7551065231715353, + "grad_norm": 2.3047332763671875, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6984578967094421, + "num_tokens": 171749792.0, + "step": 6876 + }, + { + "epoch": 0.7552163408741489, + "grad_norm": 2.4278013706207275, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7008134722709656, + "num_tokens": 171770454.0, + "step": 6877 + }, + { + "epoch": 0.7553261585767626, + "grad_norm": 2.275104284286499, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6869803667068481, + "num_tokens": 171795342.0, + "step": 6878 + }, + { + "epoch": 0.7554359762793762, + "grad_norm": 1.95220148563385, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.721861720085144, + "num_tokens": 171826687.0, + "step": 6879 + }, + { + "epoch": 0.7555457939819898, + "grad_norm": 2.7006919384002686, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7228730320930481, + "num_tokens": 171843597.0, + "step": 6880 + }, + { + "epoch": 0.7556556116846036, + "grad_norm": 2.1605327129364014, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7033632397651672, + "num_tokens": 171870210.0, + "step": 6881 + }, + { + "epoch": 0.7557654293872172, + "grad_norm": 2.016000509262085, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7156945466995239, + "num_tokens": 171897519.0, + "step": 6882 + }, + { + "epoch": 0.7558752470898309, + "grad_norm": 2.248164176940918, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7042343616485596, + "num_tokens": 171923810.0, + "step": 6883 + }, + { + "epoch": 0.7559850647924445, + "grad_norm": 2.192798137664795, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7082681655883789, + "num_tokens": 171947916.0, + "step": 6884 + }, + { + "epoch": 0.7560948824950582, + "grad_norm": 2.4193708896636963, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7173891067504883, + "num_tokens": 171968174.0, + "step": 6885 + }, + { + "epoch": 0.7562047001976718, + "grad_norm": 1.9971429109573364, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6972553133964539, + "num_tokens": 171998203.0, + "step": 6886 + }, + { + "epoch": 0.7563145179002855, + "grad_norm": 2.2088356018066406, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.720365047454834, + "num_tokens": 172021532.0, + "step": 6887 + }, + { + "epoch": 0.7564243356028992, + "grad_norm": 2.0159051418304443, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6993197798728943, + "num_tokens": 172051329.0, + "step": 6888 + }, + { + "epoch": 0.7565341533055129, + "grad_norm": 2.058260440826416, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6927658319473267, + "num_tokens": 172079373.0, + "step": 6889 + }, + { + "epoch": 0.7566439710081265, + "grad_norm": 2.3656702041625977, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7065045833587646, + "num_tokens": 172103210.0, + "step": 6890 + }, + { + "epoch": 0.7567537887107402, + "grad_norm": 1.9632368087768555, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7300251722335815, + "num_tokens": 172131757.0, + "step": 6891 + }, + { + "epoch": 0.7568636064133538, + "grad_norm": 2.408956527709961, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7074776291847229, + "num_tokens": 172151580.0, + "step": 6892 + }, + { + "epoch": 0.7569734241159675, + "grad_norm": 2.3332579135894775, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7065513730049133, + "num_tokens": 172174809.0, + "step": 6893 + }, + { + "epoch": 0.7570832418185811, + "grad_norm": 2.549625873565674, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7255890965461731, + "num_tokens": 172193099.0, + "step": 6894 + }, + { + "epoch": 0.7571930595211949, + "grad_norm": 2.1182849407196045, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7214533090591431, + "num_tokens": 172216825.0, + "step": 6895 + }, + { + "epoch": 0.7573028772238085, + "grad_norm": 2.491272211074829, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7302650213241577, + "num_tokens": 172236474.0, + "step": 6896 + }, + { + "epoch": 0.7574126949264222, + "grad_norm": 2.2551567554473877, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7007933259010315, + "num_tokens": 172259733.0, + "step": 6897 + }, + { + "epoch": 0.7575225126290358, + "grad_norm": 2.3533103466033936, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7250229716300964, + "num_tokens": 172280729.0, + "step": 6898 + }, + { + "epoch": 0.7576323303316495, + "grad_norm": 1.9333487749099731, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6858387589454651, + "num_tokens": 172312837.0, + "step": 6899 + }, + { + "epoch": 0.7577421480342631, + "grad_norm": 2.0813121795654297, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6970572471618652, + "num_tokens": 172340223.0, + "step": 6900 + }, + { + "epoch": 0.7578519657368767, + "grad_norm": 1.9109071493148804, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7084171772003174, + "num_tokens": 172373227.0, + "step": 6901 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 2.6102182865142822, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7054274082183838, + "num_tokens": 172390739.0, + "step": 6902 + }, + { + "epoch": 0.7580716011421041, + "grad_norm": 2.0830795764923096, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7123064994812012, + "num_tokens": 172416831.0, + "step": 6903 + }, + { + "epoch": 0.7581814188447178, + "grad_norm": 2.1195197105407715, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.710723876953125, + "num_tokens": 172443313.0, + "step": 6904 + }, + { + "epoch": 0.7582912365473314, + "grad_norm": 2.000046730041504, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7164349555969238, + "num_tokens": 172471890.0, + "step": 6905 + }, + { + "epoch": 0.7584010542499451, + "grad_norm": 2.104814052581787, + "learning_rate": 1e-06, + "loss": 1.0975, + "mean_token_accuracy": 0.6720576882362366, + "num_tokens": 172498660.0, + "step": 6906 + }, + { + "epoch": 0.7585108719525587, + "grad_norm": 2.383355140686035, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7354793548583984, + "num_tokens": 172519896.0, + "step": 6907 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 2.1164700984954834, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6930263638496399, + "num_tokens": 172547612.0, + "step": 6908 + }, + { + "epoch": 0.758730507357786, + "grad_norm": 2.1650686264038086, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7113280296325684, + "num_tokens": 172572224.0, + "step": 6909 + }, + { + "epoch": 0.7588403250603998, + "grad_norm": 2.354619264602661, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.727575421333313, + "num_tokens": 172593361.0, + "step": 6910 + }, + { + "epoch": 0.7589501427630134, + "grad_norm": 1.9699522256851196, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7392715215682983, + "num_tokens": 172619938.0, + "step": 6911 + }, + { + "epoch": 0.7590599604656271, + "grad_norm": 2.043116331100464, + "learning_rate": 1e-06, + "loss": 1.1195, + "mean_token_accuracy": 0.6638891696929932, + "num_tokens": 172651875.0, + "step": 6912 + }, + { + "epoch": 0.7591697781682407, + "grad_norm": 2.6091620922088623, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7115411162376404, + "num_tokens": 172672402.0, + "step": 6913 + }, + { + "epoch": 0.7592795958708544, + "grad_norm": 2.085425853729248, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7077749967575073, + "num_tokens": 172699299.0, + "step": 6914 + }, + { + "epoch": 0.759389413573468, + "grad_norm": 2.1906228065490723, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7050198316574097, + "num_tokens": 172724884.0, + "step": 6915 + }, + { + "epoch": 0.7594992312760817, + "grad_norm": 2.3311288356781006, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6952676773071289, + "num_tokens": 172748124.0, + "step": 6916 + }, + { + "epoch": 0.7596090489786954, + "grad_norm": 1.955379843711853, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7021398544311523, + "num_tokens": 172779121.0, + "step": 6917 + }, + { + "epoch": 0.7597188666813091, + "grad_norm": 2.156944751739502, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6896800398826599, + "num_tokens": 172805458.0, + "step": 6918 + }, + { + "epoch": 0.7598286843839227, + "grad_norm": 2.179157018661499, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7296193838119507, + "num_tokens": 172830664.0, + "step": 6919 + }, + { + "epoch": 0.7599385020865363, + "grad_norm": 2.093906879425049, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7029073238372803, + "num_tokens": 172857427.0, + "step": 6920 + }, + { + "epoch": 0.76004831978915, + "grad_norm": 2.0639145374298096, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.6712530851364136, + "num_tokens": 172890262.0, + "step": 6921 + }, + { + "epoch": 0.7601581374917636, + "grad_norm": 2.254359722137451, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7326388955116272, + "num_tokens": 172914644.0, + "step": 6922 + }, + { + "epoch": 0.7602679551943773, + "grad_norm": 2.250307559967041, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7353510856628418, + "num_tokens": 172937542.0, + "step": 6923 + }, + { + "epoch": 0.760377772896991, + "grad_norm": 2.211118459701538, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6945419907569885, + "num_tokens": 172964990.0, + "step": 6924 + }, + { + "epoch": 0.7604875905996047, + "grad_norm": 2.789715528488159, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7289042472839355, + "num_tokens": 172981787.0, + "step": 6925 + }, + { + "epoch": 0.7605974083022183, + "grad_norm": 2.451540470123291, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7255372405052185, + "num_tokens": 173002145.0, + "step": 6926 + }, + { + "epoch": 0.760707226004832, + "grad_norm": 2.2082509994506836, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7067636251449585, + "num_tokens": 173026936.0, + "step": 6927 + }, + { + "epoch": 0.7608170437074456, + "grad_norm": 1.911399245262146, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6957243084907532, + "num_tokens": 173060739.0, + "step": 6928 + }, + { + "epoch": 0.7609268614100593, + "grad_norm": 2.5690271854400635, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7272192239761353, + "num_tokens": 173077852.0, + "step": 6929 + }, + { + "epoch": 0.7610366791126729, + "grad_norm": 2.511317253112793, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6928400993347168, + "num_tokens": 173099596.0, + "step": 6930 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 2.3223676681518555, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7085850238800049, + "num_tokens": 173123078.0, + "step": 6931 + }, + { + "epoch": 0.7612563145179003, + "grad_norm": 2.1710023880004883, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6943714618682861, + "num_tokens": 173149384.0, + "step": 6932 + }, + { + "epoch": 0.761366132220514, + "grad_norm": 2.2197890281677246, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7120414972305298, + "num_tokens": 173172656.0, + "step": 6933 + }, + { + "epoch": 0.7614759499231276, + "grad_norm": 2.218973398208618, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7241067886352539, + "num_tokens": 173196226.0, + "step": 6934 + }, + { + "epoch": 0.7615857676257413, + "grad_norm": 2.2030019760131836, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.70839923620224, + "num_tokens": 173220355.0, + "step": 6935 + }, + { + "epoch": 0.7616955853283549, + "grad_norm": 1.7965470552444458, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6994390487670898, + "num_tokens": 173256223.0, + "step": 6936 + }, + { + "epoch": 0.7618054030309686, + "grad_norm": 2.4642393589019775, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7158301472663879, + "num_tokens": 173276728.0, + "step": 6937 + }, + { + "epoch": 0.7619152207335822, + "grad_norm": 2.4275147914886475, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7367189526557922, + "num_tokens": 173296836.0, + "step": 6938 + }, + { + "epoch": 0.762025038436196, + "grad_norm": 2.0661497116088867, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7283333539962769, + "num_tokens": 173324949.0, + "step": 6939 + }, + { + "epoch": 0.7621348561388096, + "grad_norm": 2.199798583984375, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7058137655258179, + "num_tokens": 173350928.0, + "step": 6940 + }, + { + "epoch": 0.7622446738414232, + "grad_norm": 2.307356357574463, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.726067066192627, + "num_tokens": 173372698.0, + "step": 6941 + }, + { + "epoch": 0.7623544915440369, + "grad_norm": 2.077829599380493, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7357646822929382, + "num_tokens": 173398693.0, + "step": 6942 + }, + { + "epoch": 0.7624643092466505, + "grad_norm": 2.090238332748413, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7423173189163208, + "num_tokens": 173424254.0, + "step": 6943 + }, + { + "epoch": 0.7625741269492642, + "grad_norm": 2.253661632537842, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6931350231170654, + "num_tokens": 173448528.0, + "step": 6944 + }, + { + "epoch": 0.7626839446518778, + "grad_norm": 2.096421957015991, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.712644100189209, + "num_tokens": 173474827.0, + "step": 6945 + }, + { + "epoch": 0.7627937623544916, + "grad_norm": 2.0914249420166016, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7108948230743408, + "num_tokens": 173503101.0, + "step": 6946 + }, + { + "epoch": 0.7629035800571052, + "grad_norm": 2.5617425441741943, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7129011154174805, + "num_tokens": 173521769.0, + "step": 6947 + }, + { + "epoch": 0.7630133977597189, + "grad_norm": 2.1638386249542236, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6757580041885376, + "num_tokens": 173550356.0, + "step": 6948 + }, + { + "epoch": 0.7631232154623325, + "grad_norm": 2.377479076385498, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7361404299736023, + "num_tokens": 173569991.0, + "step": 6949 + }, + { + "epoch": 0.7632330331649462, + "grad_norm": 2.2976443767547607, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7127652168273926, + "num_tokens": 173592635.0, + "step": 6950 + }, + { + "epoch": 0.7633428508675598, + "grad_norm": 2.1402535438537598, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6895081400871277, + "num_tokens": 173623990.0, + "step": 6951 + }, + { + "epoch": 0.7634526685701735, + "grad_norm": 2.2196950912475586, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6956605911254883, + "num_tokens": 173649484.0, + "step": 6952 + }, + { + "epoch": 0.7635624862727872, + "grad_norm": 2.422312021255493, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7335374355316162, + "num_tokens": 173670812.0, + "step": 6953 + }, + { + "epoch": 0.7636723039754009, + "grad_norm": 2.2160377502441406, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6886637210845947, + "num_tokens": 173696526.0, + "step": 6954 + }, + { + "epoch": 0.7637821216780145, + "grad_norm": 2.09017014503479, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7347031831741333, + "num_tokens": 173721047.0, + "step": 6955 + }, + { + "epoch": 0.7638919393806282, + "grad_norm": 2.0083518028259277, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6976603269577026, + "num_tokens": 173749126.0, + "step": 6956 + }, + { + "epoch": 0.7640017570832418, + "grad_norm": 2.3793327808380127, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7042381763458252, + "num_tokens": 173770342.0, + "step": 6957 + }, + { + "epoch": 0.7641115747858555, + "grad_norm": 2.1608874797821045, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6979783773422241, + "num_tokens": 173796587.0, + "step": 6958 + }, + { + "epoch": 0.7642213924884691, + "grad_norm": 2.1490094661712646, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.72544264793396, + "num_tokens": 173820581.0, + "step": 6959 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 2.4443159103393555, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7228643298149109, + "num_tokens": 173840386.0, + "step": 6960 + }, + { + "epoch": 0.7644410278936965, + "grad_norm": 2.725566864013672, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7139850854873657, + "num_tokens": 173858682.0, + "step": 6961 + }, + { + "epoch": 0.7645508455963101, + "grad_norm": 2.3945653438568115, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6941036581993103, + "num_tokens": 173880265.0, + "step": 6962 + }, + { + "epoch": 0.7646606632989238, + "grad_norm": 1.9745545387268066, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.7034478187561035, + "num_tokens": 173911246.0, + "step": 6963 + }, + { + "epoch": 0.7647704810015374, + "grad_norm": 2.2740564346313477, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7085160613059998, + "num_tokens": 173934036.0, + "step": 6964 + }, + { + "epoch": 0.7648802987041511, + "grad_norm": 2.219223737716675, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7185776233673096, + "num_tokens": 173958246.0, + "step": 6965 + }, + { + "epoch": 0.7649901164067647, + "grad_norm": 2.1307342052459717, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7044715881347656, + "num_tokens": 173987886.0, + "step": 6966 + }, + { + "epoch": 0.7650999341093784, + "grad_norm": 2.424309015274048, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7309658527374268, + "num_tokens": 174008639.0, + "step": 6967 + }, + { + "epoch": 0.7652097518119921, + "grad_norm": 2.2100493907928467, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6974513530731201, + "num_tokens": 174034607.0, + "step": 6968 + }, + { + "epoch": 0.7653195695146058, + "grad_norm": 2.0997722148895264, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7205179929733276, + "num_tokens": 174061468.0, + "step": 6969 + }, + { + "epoch": 0.7654293872172194, + "grad_norm": 1.938918113708496, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6880199909210205, + "num_tokens": 174091493.0, + "step": 6970 + }, + { + "epoch": 0.7655392049198331, + "grad_norm": 2.3142199516296387, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6977163553237915, + "num_tokens": 174115152.0, + "step": 6971 + }, + { + "epoch": 0.7656490226224467, + "grad_norm": 2.323301076889038, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7207540273666382, + "num_tokens": 174136898.0, + "step": 6972 + }, + { + "epoch": 0.7657588403250604, + "grad_norm": 2.2813832759857178, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7297884821891785, + "num_tokens": 174158368.0, + "step": 6973 + }, + { + "epoch": 0.765868658027674, + "grad_norm": 2.3168070316314697, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7046028971672058, + "num_tokens": 174180833.0, + "step": 6974 + }, + { + "epoch": 0.7659784757302878, + "grad_norm": 2.0002431869506836, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7126586437225342, + "num_tokens": 174211326.0, + "step": 6975 + }, + { + "epoch": 0.7660882934329014, + "grad_norm": 2.240396738052368, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6987485289573669, + "num_tokens": 174235719.0, + "step": 6976 + }, + { + "epoch": 0.7661981111355151, + "grad_norm": 2.2770142555236816, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7035731077194214, + "num_tokens": 174259222.0, + "step": 6977 + }, + { + "epoch": 0.7663079288381287, + "grad_norm": 2.497807741165161, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7345235347747803, + "num_tokens": 174277463.0, + "step": 6978 + }, + { + "epoch": 0.7664177465407424, + "grad_norm": 2.4900665283203125, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7027373909950256, + "num_tokens": 174298792.0, + "step": 6979 + }, + { + "epoch": 0.766527564243356, + "grad_norm": 2.1063928604125977, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6862081289291382, + "num_tokens": 174326375.0, + "step": 6980 + }, + { + "epoch": 0.7666373819459696, + "grad_norm": 2.163989782333374, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7228021621704102, + "num_tokens": 174351271.0, + "step": 6981 + }, + { + "epoch": 0.7667471996485834, + "grad_norm": 2.189399003982544, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6783262491226196, + "num_tokens": 174378689.0, + "step": 6982 + }, + { + "epoch": 0.766857017351197, + "grad_norm": 2.675872325897217, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7113094329833984, + "num_tokens": 174396824.0, + "step": 6983 + }, + { + "epoch": 0.7669668350538107, + "grad_norm": 2.023505687713623, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7063788175582886, + "num_tokens": 174426915.0, + "step": 6984 + }, + { + "epoch": 0.7670766527564243, + "grad_norm": 2.0894052982330322, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.730457067489624, + "num_tokens": 174453048.0, + "step": 6985 + }, + { + "epoch": 0.767186470459038, + "grad_norm": 2.427818775177002, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7174403667449951, + "num_tokens": 174473758.0, + "step": 6986 + }, + { + "epoch": 0.7672962881616516, + "grad_norm": 2.5823824405670166, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7178279161453247, + "num_tokens": 174492576.0, + "step": 6987 + }, + { + "epoch": 0.7674061058642653, + "grad_norm": 1.9379931688308716, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6952699422836304, + "num_tokens": 174525106.0, + "step": 6988 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 2.2889082431793213, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7043007016181946, + "num_tokens": 174548075.0, + "step": 6989 + }, + { + "epoch": 0.7676257412694927, + "grad_norm": 2.0569803714752197, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7025852203369141, + "num_tokens": 174579284.0, + "step": 6990 + }, + { + "epoch": 0.7677355589721063, + "grad_norm": 2.2398390769958496, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7019884586334229, + "num_tokens": 174603190.0, + "step": 6991 + }, + { + "epoch": 0.76784537667472, + "grad_norm": 1.9757180213928223, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6822479963302612, + "num_tokens": 174637045.0, + "step": 6992 + }, + { + "epoch": 0.7679551943773336, + "grad_norm": 2.1521151065826416, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7219855785369873, + "num_tokens": 174663930.0, + "step": 6993 + }, + { + "epoch": 0.7680650120799473, + "grad_norm": 2.371030569076538, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.730600893497467, + "num_tokens": 174684486.0, + "step": 6994 + }, + { + "epoch": 0.7681748297825609, + "grad_norm": 2.1081976890563965, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7049630284309387, + "num_tokens": 174711356.0, + "step": 6995 + }, + { + "epoch": 0.7682846474851747, + "grad_norm": 2.0407981872558594, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6898249387741089, + "num_tokens": 174741121.0, + "step": 6996 + }, + { + "epoch": 0.7683944651877883, + "grad_norm": 2.287661075592041, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6963552236557007, + "num_tokens": 174765183.0, + "step": 6997 + }, + { + "epoch": 0.768504282890402, + "grad_norm": 2.2067394256591797, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6881589293479919, + "num_tokens": 174790331.0, + "step": 6998 + }, + { + "epoch": 0.7686141005930156, + "grad_norm": 2.11824107170105, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7136873006820679, + "num_tokens": 174818437.0, + "step": 6999 + }, + { + "epoch": 0.7687239182956292, + "grad_norm": 2.5516316890716553, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7277669906616211, + "num_tokens": 174839785.0, + "step": 7000 + }, + { + "epoch": 0.7688337359982429, + "grad_norm": 2.3199026584625244, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7121456265449524, + "num_tokens": 174861003.0, + "step": 7001 + }, + { + "epoch": 0.7689435537008565, + "grad_norm": 2.1778204441070557, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7018600702285767, + "num_tokens": 174887207.0, + "step": 7002 + }, + { + "epoch": 0.7690533714034702, + "grad_norm": 2.121021032333374, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7040320634841919, + "num_tokens": 174913888.0, + "step": 7003 + }, + { + "epoch": 0.7691631891060839, + "grad_norm": 2.081392526626587, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7290404438972473, + "num_tokens": 174940822.0, + "step": 7004 + }, + { + "epoch": 0.7692730068086976, + "grad_norm": 2.335599422454834, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7139899134635925, + "num_tokens": 174962098.0, + "step": 7005 + }, + { + "epoch": 0.7693828245113112, + "grad_norm": 2.431372880935669, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7158258557319641, + "num_tokens": 174982358.0, + "step": 7006 + }, + { + "epoch": 0.7694926422139249, + "grad_norm": 2.077481508255005, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7250020503997803, + "num_tokens": 175008910.0, + "step": 7007 + }, + { + "epoch": 0.7696024599165385, + "grad_norm": 2.2633962631225586, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7042104601860046, + "num_tokens": 175033794.0, + "step": 7008 + }, + { + "epoch": 0.7697122776191522, + "grad_norm": 2.2337069511413574, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7313786745071411, + "num_tokens": 175057652.0, + "step": 7009 + }, + { + "epoch": 0.7698220953217658, + "grad_norm": 2.08453369140625, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.701392412185669, + "num_tokens": 175084536.0, + "step": 7010 + }, + { + "epoch": 0.7699319130243796, + "grad_norm": 2.1591954231262207, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7047922611236572, + "num_tokens": 175109625.0, + "step": 7011 + }, + { + "epoch": 0.7700417307269932, + "grad_norm": 2.240999698638916, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7020699977874756, + "num_tokens": 175135023.0, + "step": 7012 + }, + { + "epoch": 0.7701515484296069, + "grad_norm": 2.465311050415039, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7286964654922485, + "num_tokens": 175154554.0, + "step": 7013 + }, + { + "epoch": 0.7702613661322205, + "grad_norm": 2.5188331604003906, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7174326777458191, + "num_tokens": 175174014.0, + "step": 7014 + }, + { + "epoch": 0.7703711838348342, + "grad_norm": 2.0492801666259766, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6931166648864746, + "num_tokens": 175201437.0, + "step": 7015 + }, + { + "epoch": 0.7704810015374478, + "grad_norm": 2.1508986949920654, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7212631106376648, + "num_tokens": 175226564.0, + "step": 7016 + }, + { + "epoch": 0.7705908192400615, + "grad_norm": 2.3413679599761963, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7098066210746765, + "num_tokens": 175248594.0, + "step": 7017 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 2.390291452407837, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7113426923751831, + "num_tokens": 175268557.0, + "step": 7018 + }, + { + "epoch": 0.7708104546452889, + "grad_norm": 2.0971364974975586, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.691296398639679, + "num_tokens": 175296770.0, + "step": 7019 + }, + { + "epoch": 0.7709202723479025, + "grad_norm": 2.039422035217285, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6842043399810791, + "num_tokens": 175326105.0, + "step": 7020 + }, + { + "epoch": 0.7710300900505161, + "grad_norm": 2.050555944442749, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7186471819877625, + "num_tokens": 175351285.0, + "step": 7021 + }, + { + "epoch": 0.7711399077531298, + "grad_norm": 2.1538002490997314, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6900859475135803, + "num_tokens": 175379720.0, + "step": 7022 + }, + { + "epoch": 0.7712497254557434, + "grad_norm": 2.4689924716949463, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.712760865688324, + "num_tokens": 175400782.0, + "step": 7023 + }, + { + "epoch": 0.7713595431583571, + "grad_norm": 2.2786242961883545, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7224462032318115, + "num_tokens": 175425357.0, + "step": 7024 + }, + { + "epoch": 0.7714693608609708, + "grad_norm": 2.122514486312866, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6958186626434326, + "num_tokens": 175453759.0, + "step": 7025 + }, + { + "epoch": 0.7715791785635845, + "grad_norm": 2.2126166820526123, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7011383175849915, + "num_tokens": 175478815.0, + "step": 7026 + }, + { + "epoch": 0.7716889962661981, + "grad_norm": 2.3204009532928467, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7113062739372253, + "num_tokens": 175504707.0, + "step": 7027 + }, + { + "epoch": 0.7717988139688118, + "grad_norm": 2.414545774459839, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.703936755657196, + "num_tokens": 175525815.0, + "step": 7028 + }, + { + "epoch": 0.7719086316714254, + "grad_norm": 2.3287975788116455, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.723639190196991, + "num_tokens": 175550427.0, + "step": 7029 + }, + { + "epoch": 0.7720184493740391, + "grad_norm": 2.5433897972106934, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.710250973701477, + "num_tokens": 175570910.0, + "step": 7030 + }, + { + "epoch": 0.7721282670766527, + "grad_norm": 2.313227891921997, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7375684976577759, + "num_tokens": 175593123.0, + "step": 7031 + }, + { + "epoch": 0.7722380847792664, + "grad_norm": 2.359013080596924, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6894212365150452, + "num_tokens": 175617389.0, + "step": 7032 + }, + { + "epoch": 0.7723479024818801, + "grad_norm": 2.579882860183716, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7125202417373657, + "num_tokens": 175636648.0, + "step": 7033 + }, + { + "epoch": 0.7724577201844938, + "grad_norm": 2.23429274559021, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6953802704811096, + "num_tokens": 175661692.0, + "step": 7034 + }, + { + "epoch": 0.7725675378871074, + "grad_norm": 2.415515661239624, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7261698246002197, + "num_tokens": 175682644.0, + "step": 7035 + }, + { + "epoch": 0.7726773555897211, + "grad_norm": 2.0247960090637207, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7279257774353027, + "num_tokens": 175709517.0, + "step": 7036 + }, + { + "epoch": 0.7727871732923347, + "grad_norm": 2.2387659549713135, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6962759494781494, + "num_tokens": 175736458.0, + "step": 7037 + }, + { + "epoch": 0.7728969909949484, + "grad_norm": 2.566117525100708, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7125765681266785, + "num_tokens": 175757275.0, + "step": 7038 + }, + { + "epoch": 0.773006808697562, + "grad_norm": 2.2808759212493896, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7055639028549194, + "num_tokens": 175780802.0, + "step": 7039 + }, + { + "epoch": 0.7731166264001758, + "grad_norm": 2.401463508605957, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6983669400215149, + "num_tokens": 175804560.0, + "step": 7040 + }, + { + "epoch": 0.7732264441027894, + "grad_norm": 2.3873746395111084, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7044219374656677, + "num_tokens": 175827692.0, + "step": 7041 + }, + { + "epoch": 0.773336261805403, + "grad_norm": 2.0284478664398193, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7146620154380798, + "num_tokens": 175855827.0, + "step": 7042 + }, + { + "epoch": 0.7734460795080167, + "grad_norm": 2.2047674655914307, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7206624150276184, + "num_tokens": 175878848.0, + "step": 7043 + }, + { + "epoch": 0.7735558972106303, + "grad_norm": 2.314014196395874, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7265796661376953, + "num_tokens": 175904207.0, + "step": 7044 + }, + { + "epoch": 0.773665714913244, + "grad_norm": 2.0840165615081787, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6877220273017883, + "num_tokens": 175932662.0, + "step": 7045 + }, + { + "epoch": 0.7737755326158576, + "grad_norm": 2.0429937839508057, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7024205327033997, + "num_tokens": 175959261.0, + "step": 7046 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 2.1455323696136475, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7119203209877014, + "num_tokens": 175985995.0, + "step": 7047 + }, + { + "epoch": 0.773995168021085, + "grad_norm": 2.3563075065612793, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6867740154266357, + "num_tokens": 176011716.0, + "step": 7048 + }, + { + "epoch": 0.7741049857236987, + "grad_norm": 2.341329336166382, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7163935899734497, + "num_tokens": 176034821.0, + "step": 7049 + }, + { + "epoch": 0.7742148034263123, + "grad_norm": 2.359387159347534, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7334376573562622, + "num_tokens": 176056116.0, + "step": 7050 + }, + { + "epoch": 0.774324621128926, + "grad_norm": 2.2332839965820312, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7125782370567322, + "num_tokens": 176080744.0, + "step": 7051 + }, + { + "epoch": 0.7744344388315396, + "grad_norm": 2.186440944671631, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7179524302482605, + "num_tokens": 176105779.0, + "step": 7052 + }, + { + "epoch": 0.7745442565341533, + "grad_norm": 2.6796815395355225, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7153216600418091, + "num_tokens": 176124398.0, + "step": 7053 + }, + { + "epoch": 0.774654074236767, + "grad_norm": 2.3508572578430176, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.713276743888855, + "num_tokens": 176147928.0, + "step": 7054 + }, + { + "epoch": 0.7747638919393807, + "grad_norm": 2.325615167617798, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7283845543861389, + "num_tokens": 176171299.0, + "step": 7055 + }, + { + "epoch": 0.7748737096419943, + "grad_norm": 2.2202892303466797, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7105967998504639, + "num_tokens": 176196840.0, + "step": 7056 + }, + { + "epoch": 0.774983527344608, + "grad_norm": 2.3583362102508545, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7120906710624695, + "num_tokens": 176218762.0, + "step": 7057 + }, + { + "epoch": 0.7750933450472216, + "grad_norm": 2.3505055904388428, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7040915489196777, + "num_tokens": 176242624.0, + "step": 7058 + }, + { + "epoch": 0.7752031627498353, + "grad_norm": 2.0182204246520996, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6946840882301331, + "num_tokens": 176273161.0, + "step": 7059 + }, + { + "epoch": 0.7753129804524489, + "grad_norm": 2.577552318572998, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7206301689147949, + "num_tokens": 176292219.0, + "step": 7060 + }, + { + "epoch": 0.7754227981550625, + "grad_norm": 2.7007462978363037, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7333933711051941, + "num_tokens": 176310605.0, + "step": 7061 + }, + { + "epoch": 0.7755326158576763, + "grad_norm": 2.3231186866760254, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7198585271835327, + "num_tokens": 176332787.0, + "step": 7062 + }, + { + "epoch": 0.77564243356029, + "grad_norm": 2.5299482345581055, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7327812910079956, + "num_tokens": 176351340.0, + "step": 7063 + }, + { + "epoch": 0.7757522512629036, + "grad_norm": 2.0026557445526123, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7080023884773254, + "num_tokens": 176379333.0, + "step": 7064 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 2.1873321533203125, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7047072649002075, + "num_tokens": 176403438.0, + "step": 7065 + }, + { + "epoch": 0.7759718866681309, + "grad_norm": 2.154881477355957, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.6996318697929382, + "num_tokens": 176427725.0, + "step": 7066 + }, + { + "epoch": 0.7760817043707445, + "grad_norm": 2.3466219902038574, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7334962487220764, + "num_tokens": 176449735.0, + "step": 7067 + }, + { + "epoch": 0.7761915220733582, + "grad_norm": 1.9660402536392212, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6994955539703369, + "num_tokens": 176479551.0, + "step": 7068 + }, + { + "epoch": 0.7763013397759719, + "grad_norm": 2.2094178199768066, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7093240022659302, + "num_tokens": 176504117.0, + "step": 7069 + }, + { + "epoch": 0.7764111574785856, + "grad_norm": 2.4307713508605957, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7221692800521851, + "num_tokens": 176523555.0, + "step": 7070 + }, + { + "epoch": 0.7765209751811992, + "grad_norm": 2.2293143272399902, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7119738459587097, + "num_tokens": 176547231.0, + "step": 7071 + }, + { + "epoch": 0.7766307928838129, + "grad_norm": 2.5813610553741455, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7074897885322571, + "num_tokens": 176567844.0, + "step": 7072 + }, + { + "epoch": 0.7767406105864265, + "grad_norm": 2.384188175201416, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6887573599815369, + "num_tokens": 176591211.0, + "step": 7073 + }, + { + "epoch": 0.7768504282890402, + "grad_norm": 2.1875736713409424, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7209957838058472, + "num_tokens": 176615256.0, + "step": 7074 + }, + { + "epoch": 0.7769602459916538, + "grad_norm": 2.441753387451172, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7179852724075317, + "num_tokens": 176635362.0, + "step": 7075 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 2.0151145458221436, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7221031188964844, + "num_tokens": 176663329.0, + "step": 7076 + }, + { + "epoch": 0.7771798813968812, + "grad_norm": 2.1366262435913086, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6951586008071899, + "num_tokens": 176688074.0, + "step": 7077 + }, + { + "epoch": 0.7772896990994949, + "grad_norm": 1.942276120185852, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6889086365699768, + "num_tokens": 176720626.0, + "step": 7078 + }, + { + "epoch": 0.7773995168021085, + "grad_norm": 2.2620227336883545, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.721411406993866, + "num_tokens": 176742635.0, + "step": 7079 + }, + { + "epoch": 0.7775093345047221, + "grad_norm": 2.0866849422454834, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6927685141563416, + "num_tokens": 176770050.0, + "step": 7080 + }, + { + "epoch": 0.7776191522073358, + "grad_norm": 2.0349960327148438, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7100118398666382, + "num_tokens": 176797419.0, + "step": 7081 + }, + { + "epoch": 0.7777289699099494, + "grad_norm": 2.4483015537261963, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7033262252807617, + "num_tokens": 176818514.0, + "step": 7082 + }, + { + "epoch": 0.7778387876125632, + "grad_norm": 1.9157323837280273, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7111313343048096, + "num_tokens": 176849685.0, + "step": 7083 + }, + { + "epoch": 0.7779486053151768, + "grad_norm": 2.129063367843628, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6922982335090637, + "num_tokens": 176876935.0, + "step": 7084 + }, + { + "epoch": 0.7780584230177905, + "grad_norm": 2.764401912689209, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7267489433288574, + "num_tokens": 176893027.0, + "step": 7085 + }, + { + "epoch": 0.7781682407204041, + "grad_norm": 2.2565226554870605, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7117884159088135, + "num_tokens": 176917116.0, + "step": 7086 + }, + { + "epoch": 0.7782780584230178, + "grad_norm": 2.305490255355835, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7020276188850403, + "num_tokens": 176941343.0, + "step": 7087 + }, + { + "epoch": 0.7783878761256314, + "grad_norm": 2.0025224685668945, + "learning_rate": 1e-06, + "loss": 1.0952, + "mean_token_accuracy": 0.6792680025100708, + "num_tokens": 176972311.0, + "step": 7088 + }, + { + "epoch": 0.7784976938282451, + "grad_norm": 2.4433603286743164, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.712971568107605, + "num_tokens": 176993354.0, + "step": 7089 + }, + { + "epoch": 0.7786075115308587, + "grad_norm": 2.674372911453247, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7540779113769531, + "num_tokens": 177008404.0, + "step": 7090 + }, + { + "epoch": 0.7787173292334725, + "grad_norm": 2.445241689682007, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7007589340209961, + "num_tokens": 177030896.0, + "step": 7091 + }, + { + "epoch": 0.7788271469360861, + "grad_norm": 2.125882625579834, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6867285370826721, + "num_tokens": 177059138.0, + "step": 7092 + }, + { + "epoch": 0.7789369646386998, + "grad_norm": 1.9206031560897827, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6826545000076294, + "num_tokens": 177093403.0, + "step": 7093 + }, + { + "epoch": 0.7790467823413134, + "grad_norm": 2.470277786254883, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6884551644325256, + "num_tokens": 177113896.0, + "step": 7094 + }, + { + "epoch": 0.7791566000439271, + "grad_norm": 2.3884236812591553, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7176584005355835, + "num_tokens": 177135626.0, + "step": 7095 + }, + { + "epoch": 0.7792664177465407, + "grad_norm": 2.4023525714874268, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6940500736236572, + "num_tokens": 177159556.0, + "step": 7096 + }, + { + "epoch": 0.7793762354491544, + "grad_norm": 2.2758450508117676, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7075210809707642, + "num_tokens": 177181465.0, + "step": 7097 + }, + { + "epoch": 0.7794860531517681, + "grad_norm": 2.2817742824554443, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7217346429824829, + "num_tokens": 177204306.0, + "step": 7098 + }, + { + "epoch": 0.7795958708543818, + "grad_norm": 2.053694725036621, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7058734893798828, + "num_tokens": 177233629.0, + "step": 7099 + }, + { + "epoch": 0.7797056885569954, + "grad_norm": 2.1622254848480225, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7105463743209839, + "num_tokens": 177260512.0, + "step": 7100 + }, + { + "epoch": 0.779815506259609, + "grad_norm": 2.1975343227386475, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6925216317176819, + "num_tokens": 177287446.0, + "step": 7101 + }, + { + "epoch": 0.7799253239622227, + "grad_norm": 2.1941137313842773, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7147315144538879, + "num_tokens": 177314758.0, + "step": 7102 + }, + { + "epoch": 0.7800351416648363, + "grad_norm": 2.29882550239563, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7123339176177979, + "num_tokens": 177337622.0, + "step": 7103 + }, + { + "epoch": 0.78014495936745, + "grad_norm": 2.201186418533325, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.705777645111084, + "num_tokens": 177362398.0, + "step": 7104 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 2.090517997741699, + "learning_rate": 1e-06, + "loss": 1.0702, + "mean_token_accuracy": 0.6790578365325928, + "num_tokens": 177391522.0, + "step": 7105 + }, + { + "epoch": 0.7803645947726774, + "grad_norm": 1.9203563928604126, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7065916657447815, + "num_tokens": 177423045.0, + "step": 7106 + }, + { + "epoch": 0.780474412475291, + "grad_norm": 2.329911947250366, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7026016712188721, + "num_tokens": 177445226.0, + "step": 7107 + }, + { + "epoch": 0.7805842301779047, + "grad_norm": 1.9840058088302612, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7128523588180542, + "num_tokens": 177470593.0, + "step": 7108 + }, + { + "epoch": 0.7806940478805183, + "grad_norm": 2.016057252883911, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7155263423919678, + "num_tokens": 177497008.0, + "step": 7109 + }, + { + "epoch": 0.780803865583132, + "grad_norm": 2.4325084686279297, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7219448685646057, + "num_tokens": 177516837.0, + "step": 7110 + }, + { + "epoch": 0.7809136832857456, + "grad_norm": 2.1769871711730957, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6915568709373474, + "num_tokens": 177542443.0, + "step": 7111 + }, + { + "epoch": 0.7810235009883594, + "grad_norm": 1.9696168899536133, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7007724046707153, + "num_tokens": 177572348.0, + "step": 7112 + }, + { + "epoch": 0.781133318690973, + "grad_norm": 2.215869188308716, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7204368114471436, + "num_tokens": 177597519.0, + "step": 7113 + }, + { + "epoch": 0.7812431363935867, + "grad_norm": 2.2523255348205566, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7179393768310547, + "num_tokens": 177620552.0, + "step": 7114 + }, + { + "epoch": 0.7813529540962003, + "grad_norm": 2.298096179962158, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7090343832969666, + "num_tokens": 177643920.0, + "step": 7115 + }, + { + "epoch": 0.781462771798814, + "grad_norm": 2.18218731880188, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7100214958190918, + "num_tokens": 177668578.0, + "step": 7116 + }, + { + "epoch": 0.7815725895014276, + "grad_norm": 2.185245990753174, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6963151097297668, + "num_tokens": 177694635.0, + "step": 7117 + }, + { + "epoch": 0.7816824072040413, + "grad_norm": 2.5531439781188965, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.731867790222168, + "num_tokens": 177713607.0, + "step": 7118 + }, + { + "epoch": 0.7817922249066549, + "grad_norm": 2.0670979022979736, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7062980532646179, + "num_tokens": 177742154.0, + "step": 7119 + }, + { + "epoch": 0.7819020426092687, + "grad_norm": 2.163252830505371, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7059187293052673, + "num_tokens": 177768391.0, + "step": 7120 + }, + { + "epoch": 0.7820118603118823, + "grad_norm": 2.3104867935180664, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7077102065086365, + "num_tokens": 177791808.0, + "step": 7121 + }, + { + "epoch": 0.782121678014496, + "grad_norm": 2.3188652992248535, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.6966189742088318, + "num_tokens": 177815364.0, + "step": 7122 + }, + { + "epoch": 0.7822314957171096, + "grad_norm": 2.0369200706481934, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7015055418014526, + "num_tokens": 177845865.0, + "step": 7123 + }, + { + "epoch": 0.7823413134197232, + "grad_norm": 2.0857291221618652, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7022973895072937, + "num_tokens": 177874084.0, + "step": 7124 + }, + { + "epoch": 0.7824511311223369, + "grad_norm": 2.036839723587036, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6918375492095947, + "num_tokens": 177903899.0, + "step": 7125 + }, + { + "epoch": 0.7825609488249505, + "grad_norm": 2.0694897174835205, + "learning_rate": 1e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6729307770729065, + "num_tokens": 177931058.0, + "step": 7126 + }, + { + "epoch": 0.7826707665275643, + "grad_norm": 2.350687265396118, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7303478717803955, + "num_tokens": 177951662.0, + "step": 7127 + }, + { + "epoch": 0.7827805842301779, + "grad_norm": 2.130586624145508, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6733394861221313, + "num_tokens": 177978848.0, + "step": 7128 + }, + { + "epoch": 0.7828904019327916, + "grad_norm": 2.2014384269714355, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6956834197044373, + "num_tokens": 178004041.0, + "step": 7129 + }, + { + "epoch": 0.7830002196354052, + "grad_norm": 2.255524158477783, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7336626052856445, + "num_tokens": 178025332.0, + "step": 7130 + }, + { + "epoch": 0.7831100373380189, + "grad_norm": 2.148528575897217, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6935088634490967, + "num_tokens": 178052675.0, + "step": 7131 + }, + { + "epoch": 0.7832198550406325, + "grad_norm": 2.2056541442871094, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7054975032806396, + "num_tokens": 178079087.0, + "step": 7132 + }, + { + "epoch": 0.7833296727432462, + "grad_norm": 2.1414711475372314, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7023558616638184, + "num_tokens": 178105604.0, + "step": 7133 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 2.276655912399292, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7144544124603271, + "num_tokens": 178130212.0, + "step": 7134 + }, + { + "epoch": 0.7835493081484736, + "grad_norm": 2.3110039234161377, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7246601581573486, + "num_tokens": 178152053.0, + "step": 7135 + }, + { + "epoch": 0.7836591258510872, + "grad_norm": 2.337590456008911, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6866493225097656, + "num_tokens": 178173427.0, + "step": 7136 + }, + { + "epoch": 0.7837689435537009, + "grad_norm": 2.0528724193573, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6894508600234985, + "num_tokens": 178203584.0, + "step": 7137 + }, + { + "epoch": 0.7838787612563145, + "grad_norm": 2.815732717514038, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7099893689155579, + "num_tokens": 178222913.0, + "step": 7138 + }, + { + "epoch": 0.7839885789589282, + "grad_norm": 2.359930992126465, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7369333505630493, + "num_tokens": 178243156.0, + "step": 7139 + }, + { + "epoch": 0.7840983966615418, + "grad_norm": 2.1262781620025635, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7284853458404541, + "num_tokens": 178268681.0, + "step": 7140 + }, + { + "epoch": 0.7842082143641556, + "grad_norm": 1.9407850503921509, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6904192566871643, + "num_tokens": 178301176.0, + "step": 7141 + }, + { + "epoch": 0.7843180320667692, + "grad_norm": 2.0964529514312744, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.706885576248169, + "num_tokens": 178327305.0, + "step": 7142 + }, + { + "epoch": 0.7844278497693828, + "grad_norm": 2.191016912460327, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7063393592834473, + "num_tokens": 178354366.0, + "step": 7143 + }, + { + "epoch": 0.7845376674719965, + "grad_norm": 2.200519561767578, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7080358266830444, + "num_tokens": 178378422.0, + "step": 7144 + }, + { + "epoch": 0.7846474851746101, + "grad_norm": 2.008976459503174, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7049221992492676, + "num_tokens": 178405750.0, + "step": 7145 + }, + { + "epoch": 0.7847573028772238, + "grad_norm": 2.384901762008667, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7037010192871094, + "num_tokens": 178428446.0, + "step": 7146 + }, + { + "epoch": 0.7848671205798374, + "grad_norm": 2.4471824169158936, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7145038843154907, + "num_tokens": 178452016.0, + "step": 7147 + }, + { + "epoch": 0.7849769382824512, + "grad_norm": 2.274942398071289, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7098963260650635, + "num_tokens": 178476078.0, + "step": 7148 + }, + { + "epoch": 0.7850867559850648, + "grad_norm": 2.011115312576294, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7079389095306396, + "num_tokens": 178505816.0, + "step": 7149 + }, + { + "epoch": 0.7851965736876785, + "grad_norm": 2.1070399284362793, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7013258337974548, + "num_tokens": 178533352.0, + "step": 7150 + }, + { + "epoch": 0.7853063913902921, + "grad_norm": 2.1893739700317383, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6852400302886963, + "num_tokens": 178558792.0, + "step": 7151 + }, + { + "epoch": 0.7854162090929058, + "grad_norm": 2.393812417984009, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7266083359718323, + "num_tokens": 178579749.0, + "step": 7152 + }, + { + "epoch": 0.7855260267955194, + "grad_norm": 2.0669474601745605, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7039151787757874, + "num_tokens": 178607874.0, + "step": 7153 + }, + { + "epoch": 0.7856358444981331, + "grad_norm": 2.2107391357421875, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7115007638931274, + "num_tokens": 178634077.0, + "step": 7154 + }, + { + "epoch": 0.7857456622007467, + "grad_norm": 2.471112012863159, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6932165622711182, + "num_tokens": 178655370.0, + "step": 7155 + }, + { + "epoch": 0.7858554799033605, + "grad_norm": 2.198190450668335, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.714097261428833, + "num_tokens": 178680263.0, + "step": 7156 + }, + { + "epoch": 0.7859652976059741, + "grad_norm": 2.171924114227295, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7027885317802429, + "num_tokens": 178706559.0, + "step": 7157 + }, + { + "epoch": 0.7860751153085878, + "grad_norm": 2.2399682998657227, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7509560585021973, + "num_tokens": 178726985.0, + "step": 7158 + }, + { + "epoch": 0.7861849330112014, + "grad_norm": 2.6451833248138428, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.699899435043335, + "num_tokens": 178746302.0, + "step": 7159 + }, + { + "epoch": 0.786294750713815, + "grad_norm": 2.3652853965759277, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7220258116722107, + "num_tokens": 178769079.0, + "step": 7160 + }, + { + "epoch": 0.7864045684164287, + "grad_norm": 2.139730453491211, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6957030296325684, + "num_tokens": 178794593.0, + "step": 7161 + }, + { + "epoch": 0.7865143861190423, + "grad_norm": 2.1650209426879883, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7352359294891357, + "num_tokens": 178817664.0, + "step": 7162 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 2.31152081489563, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6871189475059509, + "num_tokens": 178842866.0, + "step": 7163 + }, + { + "epoch": 0.7867340215242697, + "grad_norm": 2.099757671356201, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7059730291366577, + "num_tokens": 178870993.0, + "step": 7164 + }, + { + "epoch": 0.7868438392268834, + "grad_norm": 2.071767807006836, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7099051475524902, + "num_tokens": 178898669.0, + "step": 7165 + }, + { + "epoch": 0.786953656929497, + "grad_norm": 2.0689475536346436, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7241566777229309, + "num_tokens": 178923606.0, + "step": 7166 + }, + { + "epoch": 0.7870634746321107, + "grad_norm": 2.4431660175323486, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.71027672290802, + "num_tokens": 178946297.0, + "step": 7167 + }, + { + "epoch": 0.7871732923347243, + "grad_norm": 2.4848482608795166, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7295883893966675, + "num_tokens": 178965965.0, + "step": 7168 + }, + { + "epoch": 0.787283110037338, + "grad_norm": 2.128453254699707, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7118570804595947, + "num_tokens": 178993057.0, + "step": 7169 + }, + { + "epoch": 0.7873929277399517, + "grad_norm": 2.121020555496216, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7075821161270142, + "num_tokens": 179019979.0, + "step": 7170 + }, + { + "epoch": 0.7875027454425654, + "grad_norm": 2.2276828289031982, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7135204076766968, + "num_tokens": 179043227.0, + "step": 7171 + }, + { + "epoch": 0.787612563145179, + "grad_norm": 2.5526838302612305, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7254068851470947, + "num_tokens": 179062137.0, + "step": 7172 + }, + { + "epoch": 0.7877223808477927, + "grad_norm": 2.054098606109619, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7019835710525513, + "num_tokens": 179090026.0, + "step": 7173 + }, + { + "epoch": 0.7878321985504063, + "grad_norm": 2.240203857421875, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.706250011920929, + "num_tokens": 179115706.0, + "step": 7174 + }, + { + "epoch": 0.78794201625302, + "grad_norm": 2.5565359592437744, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7013899087905884, + "num_tokens": 179137150.0, + "step": 7175 + }, + { + "epoch": 0.7880518339556336, + "grad_norm": 2.3869903087615967, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6697816848754883, + "num_tokens": 179161239.0, + "step": 7176 + }, + { + "epoch": 0.7881616516582474, + "grad_norm": 2.4515089988708496, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7096562385559082, + "num_tokens": 179181902.0, + "step": 7177 + }, + { + "epoch": 0.788271469360861, + "grad_norm": 2.1666297912597656, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7115942239761353, + "num_tokens": 179207606.0, + "step": 7178 + }, + { + "epoch": 0.7883812870634747, + "grad_norm": 2.2249083518981934, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6968633532524109, + "num_tokens": 179234774.0, + "step": 7179 + }, + { + "epoch": 0.7884911047660883, + "grad_norm": 2.379103183746338, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7101287841796875, + "num_tokens": 179257656.0, + "step": 7180 + }, + { + "epoch": 0.788600922468702, + "grad_norm": 2.052396297454834, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7286615967750549, + "num_tokens": 179282302.0, + "step": 7181 + }, + { + "epoch": 0.7887107401713156, + "grad_norm": 2.200831413269043, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.71268630027771, + "num_tokens": 179306651.0, + "step": 7182 + }, + { + "epoch": 0.7888205578739292, + "grad_norm": 2.326070785522461, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7022053003311157, + "num_tokens": 179329235.0, + "step": 7183 + }, + { + "epoch": 0.7889303755765429, + "grad_norm": 2.501624345779419, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7108612060546875, + "num_tokens": 179351090.0, + "step": 7184 + }, + { + "epoch": 0.7890401932791566, + "grad_norm": 2.3705852031707764, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.6955029964447021, + "num_tokens": 179375589.0, + "step": 7185 + }, + { + "epoch": 0.7891500109817703, + "grad_norm": 1.8977278470993042, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7013688683509827, + "num_tokens": 179406838.0, + "step": 7186 + }, + { + "epoch": 0.7892598286843839, + "grad_norm": 2.2330665588378906, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7258394956588745, + "num_tokens": 179430266.0, + "step": 7187 + }, + { + "epoch": 0.7893696463869976, + "grad_norm": 2.4652411937713623, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7150667905807495, + "num_tokens": 179451202.0, + "step": 7188 + }, + { + "epoch": 0.7894794640896112, + "grad_norm": 2.283188819885254, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7149373292922974, + "num_tokens": 179473752.0, + "step": 7189 + }, + { + "epoch": 0.7895892817922249, + "grad_norm": 2.239398956298828, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6942795515060425, + "num_tokens": 179498974.0, + "step": 7190 + }, + { + "epoch": 0.7896990994948385, + "grad_norm": 2.436174154281616, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7110140323638916, + "num_tokens": 179520466.0, + "step": 7191 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 2.0266571044921875, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7089568376541138, + "num_tokens": 179551239.0, + "step": 7192 + }, + { + "epoch": 0.7899187349000659, + "grad_norm": 2.021341323852539, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7045179605484009, + "num_tokens": 179579184.0, + "step": 7193 + }, + { + "epoch": 0.7900285526026796, + "grad_norm": 2.25234055519104, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7288299798965454, + "num_tokens": 179602366.0, + "step": 7194 + }, + { + "epoch": 0.7901383703052932, + "grad_norm": 1.9601359367370605, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.7022004723548889, + "num_tokens": 179633675.0, + "step": 7195 + }, + { + "epoch": 0.7902481880079069, + "grad_norm": 2.588367223739624, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.741385817527771, + "num_tokens": 179652849.0, + "step": 7196 + }, + { + "epoch": 0.7903580057105205, + "grad_norm": 2.0888960361480713, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.715631902217865, + "num_tokens": 179678503.0, + "step": 7197 + }, + { + "epoch": 0.7904678234131342, + "grad_norm": 2.2715706825256348, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7180080413818359, + "num_tokens": 179700927.0, + "step": 7198 + }, + { + "epoch": 0.7905776411157479, + "grad_norm": 2.1797375679016113, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7216736078262329, + "num_tokens": 179725778.0, + "step": 7199 + }, + { + "epoch": 0.7906874588183616, + "grad_norm": 2.279067039489746, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7273379564285278, + "num_tokens": 179749070.0, + "step": 7200 + }, + { + "epoch": 0.7907972765209752, + "grad_norm": 2.3206419944763184, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.729022204875946, + "num_tokens": 179772106.0, + "step": 7201 + }, + { + "epoch": 0.7909070942235888, + "grad_norm": 2.076171636581421, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7131137847900391, + "num_tokens": 179797355.0, + "step": 7202 + }, + { + "epoch": 0.7910169119262025, + "grad_norm": 2.1193161010742188, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7234674692153931, + "num_tokens": 179821510.0, + "step": 7203 + }, + { + "epoch": 0.7911267296288161, + "grad_norm": 2.2862775325775146, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7106566429138184, + "num_tokens": 179845266.0, + "step": 7204 + }, + { + "epoch": 0.7912365473314298, + "grad_norm": 2.276845932006836, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6925085783004761, + "num_tokens": 179868945.0, + "step": 7205 + }, + { + "epoch": 0.7913463650340435, + "grad_norm": 2.052190065383911, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7221616506576538, + "num_tokens": 179894107.0, + "step": 7206 + }, + { + "epoch": 0.7914561827366572, + "grad_norm": 2.1956770420074463, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6861469745635986, + "num_tokens": 179920385.0, + "step": 7207 + }, + { + "epoch": 0.7915660004392708, + "grad_norm": 2.260380744934082, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7132080793380737, + "num_tokens": 179945397.0, + "step": 7208 + }, + { + "epoch": 0.7916758181418845, + "grad_norm": 2.2706453800201416, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6945878267288208, + "num_tokens": 179970902.0, + "step": 7209 + }, + { + "epoch": 0.7917856358444981, + "grad_norm": 2.331822395324707, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7257769107818604, + "num_tokens": 179992219.0, + "step": 7210 + }, + { + "epoch": 0.7918954535471118, + "grad_norm": 2.211794376373291, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.722298264503479, + "num_tokens": 180016982.0, + "step": 7211 + }, + { + "epoch": 0.7920052712497254, + "grad_norm": 2.3172028064727783, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7206117510795593, + "num_tokens": 180038210.0, + "step": 7212 + }, + { + "epoch": 0.7921150889523391, + "grad_norm": 2.010326862335205, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7188315391540527, + "num_tokens": 180065348.0, + "step": 7213 + }, + { + "epoch": 0.7922249066549528, + "grad_norm": 2.235675573348999, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.70055091381073, + "num_tokens": 180088465.0, + "step": 7214 + }, + { + "epoch": 0.7923347243575665, + "grad_norm": 2.40920352935791, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7138186693191528, + "num_tokens": 180109219.0, + "step": 7215 + }, + { + "epoch": 0.7924445420601801, + "grad_norm": 2.3668041229248047, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6896716356277466, + "num_tokens": 180130900.0, + "step": 7216 + }, + { + "epoch": 0.7925543597627938, + "grad_norm": 1.9405200481414795, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7170142531394958, + "num_tokens": 180159335.0, + "step": 7217 + }, + { + "epoch": 0.7926641774654074, + "grad_norm": 2.200000762939453, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6963834762573242, + "num_tokens": 180183568.0, + "step": 7218 + }, + { + "epoch": 0.792773995168021, + "grad_norm": 1.878446340560913, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6939330101013184, + "num_tokens": 180217221.0, + "step": 7219 + }, + { + "epoch": 0.7928838128706347, + "grad_norm": 2.0769095420837402, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6933995485305786, + "num_tokens": 180245081.0, + "step": 7220 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 1.9801698923110962, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7119913101196289, + "num_tokens": 180274678.0, + "step": 7221 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 2.0375113487243652, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.7033886313438416, + "num_tokens": 180305341.0, + "step": 7222 + }, + { + "epoch": 0.7932132659784757, + "grad_norm": 2.6199498176574707, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7163540124893188, + "num_tokens": 180325100.0, + "step": 7223 + }, + { + "epoch": 0.7933230836810894, + "grad_norm": 2.128957986831665, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6852518320083618, + "num_tokens": 180355265.0, + "step": 7224 + }, + { + "epoch": 0.793432901383703, + "grad_norm": 2.2911972999572754, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7165592908859253, + "num_tokens": 180377540.0, + "step": 7225 + }, + { + "epoch": 0.7935427190863167, + "grad_norm": 2.549739122390747, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7148362994194031, + "num_tokens": 180397208.0, + "step": 7226 + }, + { + "epoch": 0.7936525367889303, + "grad_norm": 2.244429588317871, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7057783603668213, + "num_tokens": 180423084.0, + "step": 7227 + }, + { + "epoch": 0.7937623544915441, + "grad_norm": 2.3084967136383057, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.702818751335144, + "num_tokens": 180448411.0, + "step": 7228 + }, + { + "epoch": 0.7938721721941577, + "grad_norm": 2.2774219512939453, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7167959213256836, + "num_tokens": 180471859.0, + "step": 7229 + }, + { + "epoch": 0.7939819898967714, + "grad_norm": 2.227466106414795, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6870641708374023, + "num_tokens": 180496891.0, + "step": 7230 + }, + { + "epoch": 0.794091807599385, + "grad_norm": 2.733952283859253, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7346415519714355, + "num_tokens": 180514205.0, + "step": 7231 + }, + { + "epoch": 0.7942016253019987, + "grad_norm": 2.032634973526001, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.706458330154419, + "num_tokens": 180543423.0, + "step": 7232 + }, + { + "epoch": 0.7943114430046123, + "grad_norm": 2.291055202484131, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7287498712539673, + "num_tokens": 180566608.0, + "step": 7233 + }, + { + "epoch": 0.794421260707226, + "grad_norm": 2.246631383895874, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7044777274131775, + "num_tokens": 180589477.0, + "step": 7234 + }, + { + "epoch": 0.7945310784098397, + "grad_norm": 2.146401882171631, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7018622159957886, + "num_tokens": 180615733.0, + "step": 7235 + }, + { + "epoch": 0.7946408961124534, + "grad_norm": 2.195049524307251, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7220379114151001, + "num_tokens": 180644077.0, + "step": 7236 + }, + { + "epoch": 0.794750713815067, + "grad_norm": 2.445023536682129, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7165398597717285, + "num_tokens": 180667286.0, + "step": 7237 + }, + { + "epoch": 0.7948605315176807, + "grad_norm": 1.9440078735351562, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7064594030380249, + "num_tokens": 180698182.0, + "step": 7238 + }, + { + "epoch": 0.7949703492202943, + "grad_norm": 2.1062705516815186, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7204055190086365, + "num_tokens": 180725363.0, + "step": 7239 + }, + { + "epoch": 0.795080166922908, + "grad_norm": 2.122995376586914, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.708710253238678, + "num_tokens": 180752845.0, + "step": 7240 + }, + { + "epoch": 0.7951899846255216, + "grad_norm": 2.593276262283325, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7353965044021606, + "num_tokens": 180770420.0, + "step": 7241 + }, + { + "epoch": 0.7952998023281352, + "grad_norm": 2.2556562423706055, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7227047085762024, + "num_tokens": 180792286.0, + "step": 7242 + }, + { + "epoch": 0.795409620030749, + "grad_norm": 2.3849241733551025, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7003189325332642, + "num_tokens": 180814778.0, + "step": 7243 + }, + { + "epoch": 0.7955194377333626, + "grad_norm": 2.570572853088379, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6855436563491821, + "num_tokens": 180834638.0, + "step": 7244 + }, + { + "epoch": 0.7956292554359763, + "grad_norm": 2.2641348838806152, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7149375677108765, + "num_tokens": 180860236.0, + "step": 7245 + }, + { + "epoch": 0.7957390731385899, + "grad_norm": 2.1575770378112793, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7112258076667786, + "num_tokens": 180885054.0, + "step": 7246 + }, + { + "epoch": 0.7958488908412036, + "grad_norm": 2.0325074195861816, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6992213726043701, + "num_tokens": 180913807.0, + "step": 7247 + }, + { + "epoch": 0.7959587085438172, + "grad_norm": 2.1449527740478516, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7268427610397339, + "num_tokens": 180938139.0, + "step": 7248 + }, + { + "epoch": 0.7960685262464309, + "grad_norm": 2.4606451988220215, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7297764420509338, + "num_tokens": 180957268.0, + "step": 7249 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 2.21140456199646, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7060659527778625, + "num_tokens": 180980948.0, + "step": 7250 + }, + { + "epoch": 0.7962881616516583, + "grad_norm": 2.01389479637146, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6960660815238953, + "num_tokens": 181011364.0, + "step": 7251 + }, + { + "epoch": 0.7963979793542719, + "grad_norm": 2.1204733848571777, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6993271112442017, + "num_tokens": 181038543.0, + "step": 7252 + }, + { + "epoch": 0.7965077970568856, + "grad_norm": 1.9753806591033936, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7002005577087402, + "num_tokens": 181067902.0, + "step": 7253 + }, + { + "epoch": 0.7966176147594992, + "grad_norm": 2.2590224742889404, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6860026121139526, + "num_tokens": 181092901.0, + "step": 7254 + }, + { + "epoch": 0.7967274324621129, + "grad_norm": 2.579741954803467, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7071759104728699, + "num_tokens": 181112143.0, + "step": 7255 + }, + { + "epoch": 0.7968372501647265, + "grad_norm": 2.065587043762207, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.708274245262146, + "num_tokens": 181139814.0, + "step": 7256 + }, + { + "epoch": 0.7969470678673403, + "grad_norm": 2.2411017417907715, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7536492943763733, + "num_tokens": 181162783.0, + "step": 7257 + }, + { + "epoch": 0.7970568855699539, + "grad_norm": 1.8101228475570679, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6920678019523621, + "num_tokens": 181198371.0, + "step": 7258 + }, + { + "epoch": 0.7971667032725676, + "grad_norm": 2.0920403003692627, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6964568495750427, + "num_tokens": 181226169.0, + "step": 7259 + }, + { + "epoch": 0.7972765209751812, + "grad_norm": 2.183530807495117, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.706261932849884, + "num_tokens": 181251973.0, + "step": 7260 + }, + { + "epoch": 0.7973863386777948, + "grad_norm": 2.2471816539764404, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.728317379951477, + "num_tokens": 181276420.0, + "step": 7261 + }, + { + "epoch": 0.7974961563804085, + "grad_norm": 2.1368050575256348, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.698904812335968, + "num_tokens": 181302543.0, + "step": 7262 + }, + { + "epoch": 0.7976059740830221, + "grad_norm": 2.3294570446014404, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6871271133422852, + "num_tokens": 181326422.0, + "step": 7263 + }, + { + "epoch": 0.7977157917856359, + "grad_norm": 1.9683234691619873, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7090973258018494, + "num_tokens": 181356825.0, + "step": 7264 + }, + { + "epoch": 0.7978256094882495, + "grad_norm": 1.9583408832550049, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.715210497379303, + "num_tokens": 181386165.0, + "step": 7265 + }, + { + "epoch": 0.7979354271908632, + "grad_norm": 2.4302847385406494, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7311371564865112, + "num_tokens": 181406773.0, + "step": 7266 + }, + { + "epoch": 0.7980452448934768, + "grad_norm": 2.278411626815796, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7057849168777466, + "num_tokens": 181429868.0, + "step": 7267 + }, + { + "epoch": 0.7981550625960905, + "grad_norm": 2.397088050842285, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6712684631347656, + "num_tokens": 181454227.0, + "step": 7268 + }, + { + "epoch": 0.7982648802987041, + "grad_norm": 2.678515672683716, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7193300724029541, + "num_tokens": 181472364.0, + "step": 7269 + }, + { + "epoch": 0.7983746980013178, + "grad_norm": 2.125797986984253, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.697442889213562, + "num_tokens": 181498285.0, + "step": 7270 + }, + { + "epoch": 0.7984845157039314, + "grad_norm": 2.4565813541412354, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7324733734130859, + "num_tokens": 181517395.0, + "step": 7271 + }, + { + "epoch": 0.7985943334065452, + "grad_norm": 2.4694972038269043, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7374676465988159, + "num_tokens": 181536839.0, + "step": 7272 + }, + { + "epoch": 0.7987041511091588, + "grad_norm": 2.1956818103790283, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7142678499221802, + "num_tokens": 181563140.0, + "step": 7273 + }, + { + "epoch": 0.7988139688117725, + "grad_norm": 2.3364202976226807, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7260771989822388, + "num_tokens": 181584131.0, + "step": 7274 + }, + { + "epoch": 0.7989237865143861, + "grad_norm": 2.3981330394744873, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7266343235969543, + "num_tokens": 181604613.0, + "step": 7275 + }, + { + "epoch": 0.7990336042169998, + "grad_norm": 2.0830955505371094, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6828414797782898, + "num_tokens": 181633453.0, + "step": 7276 + }, + { + "epoch": 0.7991434219196134, + "grad_norm": 2.0324485301971436, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7117420434951782, + "num_tokens": 181659768.0, + "step": 7277 + }, + { + "epoch": 0.799253239622227, + "grad_norm": 2.2605838775634766, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7148962020874023, + "num_tokens": 181682897.0, + "step": 7278 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 2.182443380355835, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6931121349334717, + "num_tokens": 181709512.0, + "step": 7279 + }, + { + "epoch": 0.7994728750274545, + "grad_norm": 2.3858954906463623, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7068462371826172, + "num_tokens": 181730121.0, + "step": 7280 + }, + { + "epoch": 0.7995826927300681, + "grad_norm": 2.2413532733917236, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7243497371673584, + "num_tokens": 181752150.0, + "step": 7281 + }, + { + "epoch": 0.7996925104326817, + "grad_norm": 2.1236307621002197, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7318059206008911, + "num_tokens": 181778854.0, + "step": 7282 + }, + { + "epoch": 0.7998023281352954, + "grad_norm": 2.312795877456665, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7119313478469849, + "num_tokens": 181801850.0, + "step": 7283 + }, + { + "epoch": 0.799912145837909, + "grad_norm": 2.1218020915985107, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7057027816772461, + "num_tokens": 181828502.0, + "step": 7284 + }, + { + "epoch": 0.8000219635405227, + "grad_norm": 2.0149624347686768, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6955005526542664, + "num_tokens": 181857921.0, + "step": 7285 + }, + { + "epoch": 0.8001317812431364, + "grad_norm": 2.228905200958252, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7048249244689941, + "num_tokens": 181882762.0, + "step": 7286 + }, + { + "epoch": 0.8002415989457501, + "grad_norm": 2.170008420944214, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7004507780075073, + "num_tokens": 181907787.0, + "step": 7287 + }, + { + "epoch": 0.8003514166483637, + "grad_norm": 2.6330506801605225, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7092971205711365, + "num_tokens": 181925531.0, + "step": 7288 + }, + { + "epoch": 0.8004612343509774, + "grad_norm": 2.2076687812805176, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7328827977180481, + "num_tokens": 181949223.0, + "step": 7289 + }, + { + "epoch": 0.800571052053591, + "grad_norm": 2.2617099285125732, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6996724605560303, + "num_tokens": 181972705.0, + "step": 7290 + }, + { + "epoch": 0.8006808697562047, + "grad_norm": 2.152144432067871, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7170124053955078, + "num_tokens": 181999846.0, + "step": 7291 + }, + { + "epoch": 0.8007906874588183, + "grad_norm": 1.929884672164917, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6920605897903442, + "num_tokens": 182032101.0, + "step": 7292 + }, + { + "epoch": 0.8009005051614321, + "grad_norm": 2.236207962036133, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7157971858978271, + "num_tokens": 182055051.0, + "step": 7293 + }, + { + "epoch": 0.8010103228640457, + "grad_norm": 1.997100830078125, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7048087120056152, + "num_tokens": 182087687.0, + "step": 7294 + }, + { + "epoch": 0.8011201405666594, + "grad_norm": 1.871663212776184, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6975643634796143, + "num_tokens": 182120844.0, + "step": 7295 + }, + { + "epoch": 0.801229958269273, + "grad_norm": 2.039166212081909, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7383027076721191, + "num_tokens": 182148006.0, + "step": 7296 + }, + { + "epoch": 0.8013397759718867, + "grad_norm": 2.1817352771759033, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6844474673271179, + "num_tokens": 182176425.0, + "step": 7297 + }, + { + "epoch": 0.8014495936745003, + "grad_norm": 2.4415230751037598, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7236326932907104, + "num_tokens": 182197477.0, + "step": 7298 + }, + { + "epoch": 0.801559411377114, + "grad_norm": 2.209057331085205, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7071079015731812, + "num_tokens": 182221335.0, + "step": 7299 + }, + { + "epoch": 0.8016692290797276, + "grad_norm": 1.9313565492630005, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6879035234451294, + "num_tokens": 182253407.0, + "step": 7300 + }, + { + "epoch": 0.8017790467823414, + "grad_norm": 2.133857488632202, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.703624963760376, + "num_tokens": 182280729.0, + "step": 7301 + }, + { + "epoch": 0.801888864484955, + "grad_norm": 2.1663670539855957, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6812707185745239, + "num_tokens": 182307933.0, + "step": 7302 + }, + { + "epoch": 0.8019986821875686, + "grad_norm": 2.3288815021514893, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7053849101066589, + "num_tokens": 182331746.0, + "step": 7303 + }, + { + "epoch": 0.8021084998901823, + "grad_norm": 2.486619234085083, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6982040405273438, + "num_tokens": 182352053.0, + "step": 7304 + }, + { + "epoch": 0.8022183175927959, + "grad_norm": 2.5911805629730225, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.733687162399292, + "num_tokens": 182370698.0, + "step": 7305 + }, + { + "epoch": 0.8023281352954096, + "grad_norm": 2.303903341293335, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6924995183944702, + "num_tokens": 182394077.0, + "step": 7306 + }, + { + "epoch": 0.8024379529980232, + "grad_norm": 2.3166098594665527, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7328149080276489, + "num_tokens": 182413796.0, + "step": 7307 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 2.165579319000244, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.711384117603302, + "num_tokens": 182438065.0, + "step": 7308 + }, + { + "epoch": 0.8026575884032506, + "grad_norm": 2.591968536376953, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7106937170028687, + "num_tokens": 182457651.0, + "step": 7309 + }, + { + "epoch": 0.8027674061058643, + "grad_norm": 2.416266918182373, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7263692617416382, + "num_tokens": 182477549.0, + "step": 7310 + }, + { + "epoch": 0.8028772238084779, + "grad_norm": 2.2898693084716797, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7171779870986938, + "num_tokens": 182500818.0, + "step": 7311 + }, + { + "epoch": 0.8029870415110916, + "grad_norm": 2.0465078353881836, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7160584926605225, + "num_tokens": 182528481.0, + "step": 7312 + }, + { + "epoch": 0.8030968592137052, + "grad_norm": 2.2576699256896973, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7106711864471436, + "num_tokens": 182551419.0, + "step": 7313 + }, + { + "epoch": 0.8032066769163189, + "grad_norm": 2.3859570026397705, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7147912383079529, + "num_tokens": 182572622.0, + "step": 7314 + }, + { + "epoch": 0.8033164946189326, + "grad_norm": 2.1410417556762695, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7305341958999634, + "num_tokens": 182597184.0, + "step": 7315 + }, + { + "epoch": 0.8034263123215463, + "grad_norm": 2.4036247730255127, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7164289355278015, + "num_tokens": 182616927.0, + "step": 7316 + }, + { + "epoch": 0.8035361300241599, + "grad_norm": 2.1526451110839844, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6899145841598511, + "num_tokens": 182642317.0, + "step": 7317 + }, + { + "epoch": 0.8036459477267736, + "grad_norm": 2.203237771987915, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7199431657791138, + "num_tokens": 182666180.0, + "step": 7318 + }, + { + "epoch": 0.8037557654293872, + "grad_norm": 2.153052568435669, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7226204872131348, + "num_tokens": 182691731.0, + "step": 7319 + }, + { + "epoch": 0.8038655831320008, + "grad_norm": 2.4233880043029785, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7011473178863525, + "num_tokens": 182712923.0, + "step": 7320 + }, + { + "epoch": 0.8039754008346145, + "grad_norm": 2.471883773803711, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7315077781677246, + "num_tokens": 182732363.0, + "step": 7321 + }, + { + "epoch": 0.8040852185372283, + "grad_norm": 1.9925017356872559, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7042833566665649, + "num_tokens": 182762024.0, + "step": 7322 + }, + { + "epoch": 0.8041950362398419, + "grad_norm": 2.1818387508392334, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7284332513809204, + "num_tokens": 182786200.0, + "step": 7323 + }, + { + "epoch": 0.8043048539424555, + "grad_norm": 2.2235748767852783, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.697851300239563, + "num_tokens": 182811437.0, + "step": 7324 + }, + { + "epoch": 0.8044146716450692, + "grad_norm": 1.8699597120285034, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6925574541091919, + "num_tokens": 182846193.0, + "step": 7325 + }, + { + "epoch": 0.8045244893476828, + "grad_norm": 2.0372226238250732, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7243373394012451, + "num_tokens": 182874253.0, + "step": 7326 + }, + { + "epoch": 0.8046343070502965, + "grad_norm": 2.0606327056884766, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.718467116355896, + "num_tokens": 182901155.0, + "step": 7327 + }, + { + "epoch": 0.8047441247529101, + "grad_norm": 2.102762222290039, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.704612135887146, + "num_tokens": 182927444.0, + "step": 7328 + }, + { + "epoch": 0.8048539424555239, + "grad_norm": 2.125239372253418, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7006654143333435, + "num_tokens": 182954124.0, + "step": 7329 + }, + { + "epoch": 0.8049637601581375, + "grad_norm": 2.6296238899230957, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7206631898880005, + "num_tokens": 182974204.0, + "step": 7330 + }, + { + "epoch": 0.8050735778607512, + "grad_norm": 2.1503894329071045, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7069848775863647, + "num_tokens": 183001422.0, + "step": 7331 + }, + { + "epoch": 0.8051833955633648, + "grad_norm": 2.2181596755981445, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7121024131774902, + "num_tokens": 183027231.0, + "step": 7332 + }, + { + "epoch": 0.8052932132659785, + "grad_norm": 1.923258662223816, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7054111957550049, + "num_tokens": 183058326.0, + "step": 7333 + }, + { + "epoch": 0.8054030309685921, + "grad_norm": 2.2306549549102783, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7039582133293152, + "num_tokens": 183084565.0, + "step": 7334 + }, + { + "epoch": 0.8055128486712058, + "grad_norm": 1.9787157773971558, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6833949685096741, + "num_tokens": 183115720.0, + "step": 7335 + }, + { + "epoch": 0.8056226663738194, + "grad_norm": 1.9883313179016113, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7014568448066711, + "num_tokens": 183143799.0, + "step": 7336 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 2.516730546951294, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7360904216766357, + "num_tokens": 183162073.0, + "step": 7337 + }, + { + "epoch": 0.8058423017790468, + "grad_norm": 1.8968251943588257, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7000672221183777, + "num_tokens": 183192583.0, + "step": 7338 + }, + { + "epoch": 0.8059521194816605, + "grad_norm": 2.1743905544281006, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7171924114227295, + "num_tokens": 183216856.0, + "step": 7339 + }, + { + "epoch": 0.8060619371842741, + "grad_norm": 2.231288194656372, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.709962010383606, + "num_tokens": 183241063.0, + "step": 7340 + }, + { + "epoch": 0.8061717548868877, + "grad_norm": 2.2144949436187744, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7061721086502075, + "num_tokens": 183266265.0, + "step": 7341 + }, + { + "epoch": 0.8062815725895014, + "grad_norm": 2.2149035930633545, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7132576704025269, + "num_tokens": 183291564.0, + "step": 7342 + }, + { + "epoch": 0.806391390292115, + "grad_norm": 2.2250921726226807, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7071508169174194, + "num_tokens": 183314716.0, + "step": 7343 + }, + { + "epoch": 0.8065012079947288, + "grad_norm": 2.276304006576538, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7185806632041931, + "num_tokens": 183337511.0, + "step": 7344 + }, + { + "epoch": 0.8066110256973424, + "grad_norm": 2.304637908935547, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6830963492393494, + "num_tokens": 183362641.0, + "step": 7345 + }, + { + "epoch": 0.8067208433999561, + "grad_norm": 2.198634386062622, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7202625870704651, + "num_tokens": 183387742.0, + "step": 7346 + }, + { + "epoch": 0.8068306611025697, + "grad_norm": 2.2621707916259766, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6877306699752808, + "num_tokens": 183411554.0, + "step": 7347 + }, + { + "epoch": 0.8069404788051834, + "grad_norm": 1.8767540454864502, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.688012421131134, + "num_tokens": 183445386.0, + "step": 7348 + }, + { + "epoch": 0.807050296507797, + "grad_norm": 2.2895021438598633, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7037689685821533, + "num_tokens": 183468912.0, + "step": 7349 + }, + { + "epoch": 0.8071601142104107, + "grad_norm": 1.9551458358764648, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7041233777999878, + "num_tokens": 183496918.0, + "step": 7350 + }, + { + "epoch": 0.8072699319130244, + "grad_norm": 2.2735595703125, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6879468560218811, + "num_tokens": 183522274.0, + "step": 7351 + }, + { + "epoch": 0.8073797496156381, + "grad_norm": 2.322537899017334, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7072296142578125, + "num_tokens": 183546102.0, + "step": 7352 + }, + { + "epoch": 0.8074895673182517, + "grad_norm": 2.1161231994628906, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7187103033065796, + "num_tokens": 183571640.0, + "step": 7353 + }, + { + "epoch": 0.8075993850208654, + "grad_norm": 2.268177032470703, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6911133527755737, + "num_tokens": 183595433.0, + "step": 7354 + }, + { + "epoch": 0.807709202723479, + "grad_norm": 2.2943081855773926, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7410687804222107, + "num_tokens": 183617079.0, + "step": 7355 + }, + { + "epoch": 0.8078190204260927, + "grad_norm": 2.0933780670166016, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6926251649856567, + "num_tokens": 183645222.0, + "step": 7356 + }, + { + "epoch": 0.8079288381287063, + "grad_norm": 2.081845283508301, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.733486533164978, + "num_tokens": 183670443.0, + "step": 7357 + }, + { + "epoch": 0.8080386558313201, + "grad_norm": 2.2126247882843018, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7053048610687256, + "num_tokens": 183693517.0, + "step": 7358 + }, + { + "epoch": 0.8081484735339337, + "grad_norm": 2.4328203201293945, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7138094902038574, + "num_tokens": 183715132.0, + "step": 7359 + }, + { + "epoch": 0.8082582912365474, + "grad_norm": 2.1870319843292236, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7292805314064026, + "num_tokens": 183737971.0, + "step": 7360 + }, + { + "epoch": 0.808368108939161, + "grad_norm": 2.0560505390167236, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7085046172142029, + "num_tokens": 183768943.0, + "step": 7361 + }, + { + "epoch": 0.8084779266417746, + "grad_norm": 2.27816104888916, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7183170914649963, + "num_tokens": 183790867.0, + "step": 7362 + }, + { + "epoch": 0.8085877443443883, + "grad_norm": 2.1985044479370117, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.70838463306427, + "num_tokens": 183815917.0, + "step": 7363 + }, + { + "epoch": 0.8086975620470019, + "grad_norm": 2.312077283859253, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7054175734519958, + "num_tokens": 183838967.0, + "step": 7364 + }, + { + "epoch": 0.8088073797496156, + "grad_norm": 2.2337207794189453, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7201496958732605, + "num_tokens": 183862757.0, + "step": 7365 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 2.0481536388397217, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.697799801826477, + "num_tokens": 183892801.0, + "step": 7366 + }, + { + "epoch": 0.809027015154843, + "grad_norm": 2.3592939376831055, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7231276035308838, + "num_tokens": 183914697.0, + "step": 7367 + }, + { + "epoch": 0.8091368328574566, + "grad_norm": 2.460639238357544, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7078726291656494, + "num_tokens": 183936476.0, + "step": 7368 + }, + { + "epoch": 0.8092466505600703, + "grad_norm": 2.0260374546051025, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7006946802139282, + "num_tokens": 183966587.0, + "step": 7369 + }, + { + "epoch": 0.8093564682626839, + "grad_norm": 2.3525969982147217, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7115678787231445, + "num_tokens": 183991807.0, + "step": 7370 + }, + { + "epoch": 0.8094662859652976, + "grad_norm": 2.2957844734191895, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7024359703063965, + "num_tokens": 184017400.0, + "step": 7371 + }, + { + "epoch": 0.8095761036679112, + "grad_norm": 2.29388689994812, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7289047241210938, + "num_tokens": 184041221.0, + "step": 7372 + }, + { + "epoch": 0.809685921370525, + "grad_norm": 2.171146869659424, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6936179995536804, + "num_tokens": 184068280.0, + "step": 7373 + }, + { + "epoch": 0.8097957390731386, + "grad_norm": 2.384859085083008, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7081011533737183, + "num_tokens": 184090464.0, + "step": 7374 + }, + { + "epoch": 0.8099055567757523, + "grad_norm": 2.080305576324463, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.6980112791061401, + "num_tokens": 184118048.0, + "step": 7375 + }, + { + "epoch": 0.8100153744783659, + "grad_norm": 2.229017972946167, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7158752679824829, + "num_tokens": 184142633.0, + "step": 7376 + }, + { + "epoch": 0.8101251921809796, + "grad_norm": 2.0668883323669434, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.7083773612976074, + "num_tokens": 184169242.0, + "step": 7377 + }, + { + "epoch": 0.8102350098835932, + "grad_norm": 2.251875400543213, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7096141576766968, + "num_tokens": 184192928.0, + "step": 7378 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 2.229172945022583, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6924362182617188, + "num_tokens": 184217612.0, + "step": 7379 + }, + { + "epoch": 0.8104546452888206, + "grad_norm": 2.252344846725464, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6974759101867676, + "num_tokens": 184241212.0, + "step": 7380 + }, + { + "epoch": 0.8105644629914343, + "grad_norm": 2.0897793769836426, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6931114196777344, + "num_tokens": 184267301.0, + "step": 7381 + }, + { + "epoch": 0.8106742806940479, + "grad_norm": 2.217372417449951, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6962639093399048, + "num_tokens": 184291190.0, + "step": 7382 + }, + { + "epoch": 0.8107840983966615, + "grad_norm": 2.4966373443603516, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7178812026977539, + "num_tokens": 184311137.0, + "step": 7383 + }, + { + "epoch": 0.8108939160992752, + "grad_norm": 2.161890745162964, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7088806629180908, + "num_tokens": 184337063.0, + "step": 7384 + }, + { + "epoch": 0.8110037338018888, + "grad_norm": 2.322127103805542, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7493248581886292, + "num_tokens": 184360764.0, + "step": 7385 + }, + { + "epoch": 0.8111135515045025, + "grad_norm": 2.220357656478882, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7138180136680603, + "num_tokens": 184384860.0, + "step": 7386 + }, + { + "epoch": 0.8112233692071162, + "grad_norm": 2.2306981086730957, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6897355914115906, + "num_tokens": 184411121.0, + "step": 7387 + }, + { + "epoch": 0.8113331869097299, + "grad_norm": 2.0984392166137695, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7116891145706177, + "num_tokens": 184437522.0, + "step": 7388 + }, + { + "epoch": 0.8114430046123435, + "grad_norm": 2.0950303077697754, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7065322995185852, + "num_tokens": 184466972.0, + "step": 7389 + }, + { + "epoch": 0.8115528223149572, + "grad_norm": 2.3362584114074707, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6986404657363892, + "num_tokens": 184491357.0, + "step": 7390 + }, + { + "epoch": 0.8116626400175708, + "grad_norm": 2.2846877574920654, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7203859090805054, + "num_tokens": 184513537.0, + "step": 7391 + }, + { + "epoch": 0.8117724577201845, + "grad_norm": 2.095393180847168, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7048028707504272, + "num_tokens": 184541071.0, + "step": 7392 + }, + { + "epoch": 0.8118822754227981, + "grad_norm": 2.2656006813049316, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7140602469444275, + "num_tokens": 184565015.0, + "step": 7393 + }, + { + "epoch": 0.8119920931254118, + "grad_norm": 2.5373826026916504, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7166328430175781, + "num_tokens": 184583678.0, + "step": 7394 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 2.0537431240081787, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.707951545715332, + "num_tokens": 184610979.0, + "step": 7395 + }, + { + "epoch": 0.8122117285306392, + "grad_norm": 2.215362787246704, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7113988995552063, + "num_tokens": 184633505.0, + "step": 7396 + }, + { + "epoch": 0.8123215462332528, + "grad_norm": 2.241554021835327, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7145429849624634, + "num_tokens": 184657021.0, + "step": 7397 + }, + { + "epoch": 0.8124313639358665, + "grad_norm": 2.1099600791931152, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7023967504501343, + "num_tokens": 184683132.0, + "step": 7398 + }, + { + "epoch": 0.8125411816384801, + "grad_norm": 2.465592384338379, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7307133078575134, + "num_tokens": 184703660.0, + "step": 7399 + }, + { + "epoch": 0.8126509993410937, + "grad_norm": 2.4738316535949707, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6881871223449707, + "num_tokens": 184724876.0, + "step": 7400 + }, + { + "epoch": 0.8127608170437074, + "grad_norm": 2.5320827960968018, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7365989089012146, + "num_tokens": 184744110.0, + "step": 7401 + }, + { + "epoch": 0.8128706347463212, + "grad_norm": 2.262753486633301, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7131949663162231, + "num_tokens": 184768586.0, + "step": 7402 + }, + { + "epoch": 0.8129804524489348, + "grad_norm": 2.085451364517212, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7152898907661438, + "num_tokens": 184795623.0, + "step": 7403 + }, + { + "epoch": 0.8130902701515484, + "grad_norm": 2.264470100402832, + "learning_rate": 1e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.683998703956604, + "num_tokens": 184820548.0, + "step": 7404 + }, + { + "epoch": 0.8132000878541621, + "grad_norm": 1.975555181503296, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7104765772819519, + "num_tokens": 184850944.0, + "step": 7405 + }, + { + "epoch": 0.8133099055567757, + "grad_norm": 2.0669474601745605, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7089448571205139, + "num_tokens": 184878083.0, + "step": 7406 + }, + { + "epoch": 0.8134197232593894, + "grad_norm": 1.9637142419815063, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7040572166442871, + "num_tokens": 184910110.0, + "step": 7407 + }, + { + "epoch": 0.813529540962003, + "grad_norm": 2.2885830402374268, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6831738948822021, + "num_tokens": 184935268.0, + "step": 7408 + }, + { + "epoch": 0.8136393586646168, + "grad_norm": 2.1931676864624023, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7051087617874146, + "num_tokens": 184960667.0, + "step": 7409 + }, + { + "epoch": 0.8137491763672304, + "grad_norm": 2.15908145904541, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7218969464302063, + "num_tokens": 184986600.0, + "step": 7410 + }, + { + "epoch": 0.8138589940698441, + "grad_norm": 2.480215072631836, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7126774191856384, + "num_tokens": 185007016.0, + "step": 7411 + }, + { + "epoch": 0.8139688117724577, + "grad_norm": 2.340851068496704, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7404017448425293, + "num_tokens": 185029181.0, + "step": 7412 + }, + { + "epoch": 0.8140786294750714, + "grad_norm": 2.387671947479248, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7135944366455078, + "num_tokens": 185052566.0, + "step": 7413 + }, + { + "epoch": 0.814188447177685, + "grad_norm": 2.1674346923828125, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7255688905715942, + "num_tokens": 185078016.0, + "step": 7414 + }, + { + "epoch": 0.8142982648802987, + "grad_norm": 2.3913071155548096, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7054803371429443, + "num_tokens": 185099916.0, + "step": 7415 + }, + { + "epoch": 0.8144080825829124, + "grad_norm": 2.3248586654663086, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7117912769317627, + "num_tokens": 185122153.0, + "step": 7416 + }, + { + "epoch": 0.8145179002855261, + "grad_norm": 2.759230613708496, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7168084979057312, + "num_tokens": 185140320.0, + "step": 7417 + }, + { + "epoch": 0.8146277179881397, + "grad_norm": 2.6298530101776123, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.70632004737854, + "num_tokens": 185160695.0, + "step": 7418 + }, + { + "epoch": 0.8147375356907534, + "grad_norm": 2.17958927154541, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7401683330535889, + "num_tokens": 185184150.0, + "step": 7419 + }, + { + "epoch": 0.814847353393367, + "grad_norm": 1.8975402116775513, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7025592923164368, + "num_tokens": 185216674.0, + "step": 7420 + }, + { + "epoch": 0.8149571710959806, + "grad_norm": 2.363180637359619, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7364015579223633, + "num_tokens": 185238438.0, + "step": 7421 + }, + { + "epoch": 0.8150669887985943, + "grad_norm": 2.1200966835021973, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6906351447105408, + "num_tokens": 185266139.0, + "step": 7422 + }, + { + "epoch": 0.8151768065012079, + "grad_norm": 2.0373964309692383, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6782835125923157, + "num_tokens": 185296056.0, + "step": 7423 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 2.6451025009155273, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7225313186645508, + "num_tokens": 185313634.0, + "step": 7424 + }, + { + "epoch": 0.8153964419064353, + "grad_norm": 1.923734426498413, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6885334253311157, + "num_tokens": 185344928.0, + "step": 7425 + }, + { + "epoch": 0.815506259609049, + "grad_norm": 2.535848617553711, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7326585650444031, + "num_tokens": 185364097.0, + "step": 7426 + }, + { + "epoch": 0.8156160773116626, + "grad_norm": 2.227060079574585, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7307858467102051, + "num_tokens": 185388910.0, + "step": 7427 + }, + { + "epoch": 0.8157258950142763, + "grad_norm": 2.066969871520996, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.7054409980773926, + "num_tokens": 185420008.0, + "step": 7428 + }, + { + "epoch": 0.8158357127168899, + "grad_norm": 2.556572198867798, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7185893058776855, + "num_tokens": 185439163.0, + "step": 7429 + }, + { + "epoch": 0.8159455304195036, + "grad_norm": 2.2020492553710938, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6934657692909241, + "num_tokens": 185464105.0, + "step": 7430 + }, + { + "epoch": 0.8160553481221173, + "grad_norm": 2.1288866996765137, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7108050584793091, + "num_tokens": 185490899.0, + "step": 7431 + }, + { + "epoch": 0.816165165824731, + "grad_norm": 2.1481244564056396, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7094931602478027, + "num_tokens": 185516225.0, + "step": 7432 + }, + { + "epoch": 0.8162749835273446, + "grad_norm": 2.4010634422302246, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7140707969665527, + "num_tokens": 185538682.0, + "step": 7433 + }, + { + "epoch": 0.8163848012299583, + "grad_norm": 2.0698401927948, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6849722862243652, + "num_tokens": 185566314.0, + "step": 7434 + }, + { + "epoch": 0.8164946189325719, + "grad_norm": 2.058168411254883, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.714780867099762, + "num_tokens": 185595442.0, + "step": 7435 + }, + { + "epoch": 0.8166044366351856, + "grad_norm": 1.9355909824371338, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6966711282730103, + "num_tokens": 185627826.0, + "step": 7436 + }, + { + "epoch": 0.8167142543377992, + "grad_norm": 2.121913433074951, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7072603106498718, + "num_tokens": 185652081.0, + "step": 7437 + }, + { + "epoch": 0.816824072040413, + "grad_norm": 2.000659227371216, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6926106810569763, + "num_tokens": 185682027.0, + "step": 7438 + }, + { + "epoch": 0.8169338897430266, + "grad_norm": 2.3764259815216064, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7247980833053589, + "num_tokens": 185702614.0, + "step": 7439 + }, + { + "epoch": 0.8170437074456403, + "grad_norm": 2.2055583000183105, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7095791101455688, + "num_tokens": 185728512.0, + "step": 7440 + }, + { + "epoch": 0.8171535251482539, + "grad_norm": 1.9576382637023926, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6980504989624023, + "num_tokens": 185759382.0, + "step": 7441 + }, + { + "epoch": 0.8172633428508675, + "grad_norm": 2.110326051712036, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7275750637054443, + "num_tokens": 185785289.0, + "step": 7442 + }, + { + "epoch": 0.8173731605534812, + "grad_norm": 2.4162092208862305, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7094546556472778, + "num_tokens": 185806174.0, + "step": 7443 + }, + { + "epoch": 0.8174829782560948, + "grad_norm": 2.587890625, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7309144735336304, + "num_tokens": 185825686.0, + "step": 7444 + }, + { + "epoch": 0.8175927959587086, + "grad_norm": 2.0394985675811768, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6968608498573303, + "num_tokens": 185854836.0, + "step": 7445 + }, + { + "epoch": 0.8177026136613222, + "grad_norm": 2.285064697265625, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.708122968673706, + "num_tokens": 185878953.0, + "step": 7446 + }, + { + "epoch": 0.8178124313639359, + "grad_norm": 2.2172036170959473, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7365762591362, + "num_tokens": 185904013.0, + "step": 7447 + }, + { + "epoch": 0.8179222490665495, + "grad_norm": 2.2206835746765137, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6802741289138794, + "num_tokens": 185929062.0, + "step": 7448 + }, + { + "epoch": 0.8180320667691632, + "grad_norm": 2.073131799697876, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.706647515296936, + "num_tokens": 185958450.0, + "step": 7449 + }, + { + "epoch": 0.8181418844717768, + "grad_norm": 2.4143309593200684, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6905194520950317, + "num_tokens": 185982309.0, + "step": 7450 + }, + { + "epoch": 0.8182517021743905, + "grad_norm": 2.286257028579712, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.730014443397522, + "num_tokens": 186005208.0, + "step": 7451 + }, + { + "epoch": 0.8183615198770041, + "grad_norm": 1.9136384725570679, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6963574886322021, + "num_tokens": 186036598.0, + "step": 7452 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 2.2456510066986084, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.69818115234375, + "num_tokens": 186063241.0, + "step": 7453 + }, + { + "epoch": 0.8185811552822315, + "grad_norm": 2.205925941467285, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7342376708984375, + "num_tokens": 186086828.0, + "step": 7454 + }, + { + "epoch": 0.8186909729848452, + "grad_norm": 2.3496859073638916, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7006473541259766, + "num_tokens": 186108875.0, + "step": 7455 + }, + { + "epoch": 0.8188007906874588, + "grad_norm": 2.007720708847046, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7062445878982544, + "num_tokens": 186136561.0, + "step": 7456 + }, + { + "epoch": 0.8189106083900725, + "grad_norm": 2.028656244277954, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.688434362411499, + "num_tokens": 186164708.0, + "step": 7457 + }, + { + "epoch": 0.8190204260926861, + "grad_norm": 2.2106878757476807, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7085046172142029, + "num_tokens": 186191619.0, + "step": 7458 + }, + { + "epoch": 0.8191302437952998, + "grad_norm": 2.102033853530884, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7261794805526733, + "num_tokens": 186217563.0, + "step": 7459 + }, + { + "epoch": 0.8192400614979135, + "grad_norm": 2.439578056335449, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7439273595809937, + "num_tokens": 186236173.0, + "step": 7460 + }, + { + "epoch": 0.8193498792005272, + "grad_norm": 2.1156463623046875, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7039294242858887, + "num_tokens": 186261687.0, + "step": 7461 + }, + { + "epoch": 0.8194596969031408, + "grad_norm": 2.2637012004852295, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6877257823944092, + "num_tokens": 186286887.0, + "step": 7462 + }, + { + "epoch": 0.8195695146057544, + "grad_norm": 2.0102357864379883, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7006075382232666, + "num_tokens": 186315244.0, + "step": 7463 + }, + { + "epoch": 0.8196793323083681, + "grad_norm": 2.211005449295044, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6737910509109497, + "num_tokens": 186341385.0, + "step": 7464 + }, + { + "epoch": 0.8197891500109817, + "grad_norm": 2.1691811084747314, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7125762701034546, + "num_tokens": 186364172.0, + "step": 7465 + }, + { + "epoch": 0.8198989677135954, + "grad_norm": 2.194268226623535, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7064660787582397, + "num_tokens": 186387416.0, + "step": 7466 + }, + { + "epoch": 0.8200087854162091, + "grad_norm": 2.3782174587249756, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7089940905570984, + "num_tokens": 186409733.0, + "step": 7467 + }, + { + "epoch": 0.8201186031188228, + "grad_norm": 2.219031810760498, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7167720198631287, + "num_tokens": 186433311.0, + "step": 7468 + }, + { + "epoch": 0.8202284208214364, + "grad_norm": 2.067187547683716, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6944706439971924, + "num_tokens": 186462493.0, + "step": 7469 + }, + { + "epoch": 0.8203382385240501, + "grad_norm": 2.0643532276153564, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.6964013576507568, + "num_tokens": 186490980.0, + "step": 7470 + }, + { + "epoch": 0.8204480562266637, + "grad_norm": 2.3805975914001465, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7170728445053101, + "num_tokens": 186513295.0, + "step": 7471 + }, + { + "epoch": 0.8205578739292774, + "grad_norm": 2.1301145553588867, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7209647297859192, + "num_tokens": 186538913.0, + "step": 7472 + }, + { + "epoch": 0.820667691631891, + "grad_norm": 2.266897439956665, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7187166213989258, + "num_tokens": 186562053.0, + "step": 7473 + }, + { + "epoch": 0.8207775093345048, + "grad_norm": 2.421304941177368, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7184100151062012, + "num_tokens": 186581663.0, + "step": 7474 + }, + { + "epoch": 0.8208873270371184, + "grad_norm": 2.525139808654785, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7412269115447998, + "num_tokens": 186602048.0, + "step": 7475 + }, + { + "epoch": 0.8209971447397321, + "grad_norm": 2.1530873775482178, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7294309139251709, + "num_tokens": 186627032.0, + "step": 7476 + }, + { + "epoch": 0.8211069624423457, + "grad_norm": 2.2585625648498535, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7029471397399902, + "num_tokens": 186651889.0, + "step": 7477 + }, + { + "epoch": 0.8212167801449594, + "grad_norm": 2.3475472927093506, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7014082670211792, + "num_tokens": 186675008.0, + "step": 7478 + }, + { + "epoch": 0.821326597847573, + "grad_norm": 2.10215163230896, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7042554616928101, + "num_tokens": 186701682.0, + "step": 7479 + }, + { + "epoch": 0.8214364155501866, + "grad_norm": 2.1382927894592285, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7006075382232666, + "num_tokens": 186729346.0, + "step": 7480 + }, + { + "epoch": 0.8215462332528004, + "grad_norm": 1.9795856475830078, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6990524530410767, + "num_tokens": 186758708.0, + "step": 7481 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 2.06616473197937, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7100269198417664, + "num_tokens": 186786118.0, + "step": 7482 + }, + { + "epoch": 0.8217658686580277, + "grad_norm": 2.1336588859558105, + "learning_rate": 1e-06, + "loss": 1.0545, + "mean_token_accuracy": 0.6799366474151611, + "num_tokens": 186815728.0, + "step": 7483 + }, + { + "epoch": 0.8218756863606413, + "grad_norm": 2.1676340103149414, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7040044069290161, + "num_tokens": 186841934.0, + "step": 7484 + }, + { + "epoch": 0.821985504063255, + "grad_norm": 2.1559011936187744, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7069620490074158, + "num_tokens": 186868215.0, + "step": 7485 + }, + { + "epoch": 0.8220953217658686, + "grad_norm": 2.1121089458465576, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6942011117935181, + "num_tokens": 186893831.0, + "step": 7486 + }, + { + "epoch": 0.8222051394684823, + "grad_norm": 2.5499191284179688, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7211799621582031, + "num_tokens": 186913834.0, + "step": 7487 + }, + { + "epoch": 0.8223149571710959, + "grad_norm": 2.2947638034820557, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7347313165664673, + "num_tokens": 186937033.0, + "step": 7488 + }, + { + "epoch": 0.8224247748737097, + "grad_norm": 2.2854251861572266, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7010232210159302, + "num_tokens": 186962067.0, + "step": 7489 + }, + { + "epoch": 0.8225345925763233, + "grad_norm": 1.9243923425674438, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6923855543136597, + "num_tokens": 186993417.0, + "step": 7490 + }, + { + "epoch": 0.822644410278937, + "grad_norm": 2.0201947689056396, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7249388694763184, + "num_tokens": 187022183.0, + "step": 7491 + }, + { + "epoch": 0.8227542279815506, + "grad_norm": 2.3447036743164062, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7017700672149658, + "num_tokens": 187044492.0, + "step": 7492 + }, + { + "epoch": 0.8228640456841643, + "grad_norm": 2.1175613403320312, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6983139514923096, + "num_tokens": 187072868.0, + "step": 7493 + }, + { + "epoch": 0.8229738633867779, + "grad_norm": 2.2231075763702393, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7045779824256897, + "num_tokens": 187096091.0, + "step": 7494 + }, + { + "epoch": 0.8230836810893916, + "grad_norm": 2.3029797077178955, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6878116130828857, + "num_tokens": 187121541.0, + "step": 7495 + }, + { + "epoch": 0.8231934987920053, + "grad_norm": 2.1320059299468994, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6996333003044128, + "num_tokens": 187147001.0, + "step": 7496 + }, + { + "epoch": 0.823303316494619, + "grad_norm": 2.276113271713257, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7082266807556152, + "num_tokens": 187168860.0, + "step": 7497 + }, + { + "epoch": 0.8234131341972326, + "grad_norm": 2.3067574501037598, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7253009080886841, + "num_tokens": 187191577.0, + "step": 7498 + }, + { + "epoch": 0.8235229518998463, + "grad_norm": 2.238741397857666, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7246717810630798, + "num_tokens": 187212650.0, + "step": 7499 + }, + { + "epoch": 0.8236327696024599, + "grad_norm": 2.0790441036224365, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7098349928855896, + "num_tokens": 187239471.0, + "step": 7500 + }, + { + "epoch": 0.8237425873050735, + "grad_norm": 2.2607781887054443, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7221994400024414, + "num_tokens": 187261619.0, + "step": 7501 + }, + { + "epoch": 0.8238524050076872, + "grad_norm": 2.6490478515625, + "learning_rate": 1e-06, + "loss": 0.7756, + "mean_token_accuracy": 0.7550935745239258, + "num_tokens": 187278957.0, + "step": 7502 + }, + { + "epoch": 0.823962222710301, + "grad_norm": 2.181596040725708, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7033458948135376, + "num_tokens": 187303871.0, + "step": 7503 + }, + { + "epoch": 0.8240720404129146, + "grad_norm": 2.5345489978790283, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7432489395141602, + "num_tokens": 187322905.0, + "step": 7504 + }, + { + "epoch": 0.8241818581155282, + "grad_norm": 2.2510488033294678, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7046822309494019, + "num_tokens": 187345789.0, + "step": 7505 + }, + { + "epoch": 0.8242916758181419, + "grad_norm": 2.1298155784606934, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7269397377967834, + "num_tokens": 187369780.0, + "step": 7506 + }, + { + "epoch": 0.8244014935207555, + "grad_norm": 2.49251127243042, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7188234329223633, + "num_tokens": 187390439.0, + "step": 7507 + }, + { + "epoch": 0.8245113112233692, + "grad_norm": 2.0873610973358154, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7316005825996399, + "num_tokens": 187417470.0, + "step": 7508 + }, + { + "epoch": 0.8246211289259828, + "grad_norm": 2.298110008239746, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7076993584632874, + "num_tokens": 187442278.0, + "step": 7509 + }, + { + "epoch": 0.8247309466285966, + "grad_norm": 2.2802438735961914, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.713590145111084, + "num_tokens": 187465900.0, + "step": 7510 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 2.1364312171936035, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6958521604537964, + "num_tokens": 187493146.0, + "step": 7511 + }, + { + "epoch": 0.8249505820338239, + "grad_norm": 2.252932071685791, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6951049566268921, + "num_tokens": 187517741.0, + "step": 7512 + }, + { + "epoch": 0.8250603997364375, + "grad_norm": 2.47202730178833, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.724533200263977, + "num_tokens": 187537363.0, + "step": 7513 + }, + { + "epoch": 0.8251702174390512, + "grad_norm": 2.198287010192871, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.6736006736755371, + "num_tokens": 187565642.0, + "step": 7514 + }, + { + "epoch": 0.8252800351416648, + "grad_norm": 2.4845073223114014, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.710258960723877, + "num_tokens": 187586047.0, + "step": 7515 + }, + { + "epoch": 0.8253898528442785, + "grad_norm": 2.2278246879577637, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7101704478263855, + "num_tokens": 187613596.0, + "step": 7516 + }, + { + "epoch": 0.8254996705468921, + "grad_norm": 2.0275988578796387, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7061318159103394, + "num_tokens": 187641327.0, + "step": 7517 + }, + { + "epoch": 0.8256094882495059, + "grad_norm": 2.162261962890625, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6957484483718872, + "num_tokens": 187666275.0, + "step": 7518 + }, + { + "epoch": 0.8257193059521195, + "grad_norm": 2.2273261547088623, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.705215573310852, + "num_tokens": 187691369.0, + "step": 7519 + }, + { + "epoch": 0.8258291236547332, + "grad_norm": 2.2352097034454346, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7014933824539185, + "num_tokens": 187715156.0, + "step": 7520 + }, + { + "epoch": 0.8259389413573468, + "grad_norm": 2.1150906085968018, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.706484317779541, + "num_tokens": 187741831.0, + "step": 7521 + }, + { + "epoch": 0.8260487590599604, + "grad_norm": 2.295320987701416, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6950082778930664, + "num_tokens": 187765881.0, + "step": 7522 + }, + { + "epoch": 0.8261585767625741, + "grad_norm": 2.251225233078003, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7111146450042725, + "num_tokens": 187789537.0, + "step": 7523 + }, + { + "epoch": 0.8262683944651877, + "grad_norm": 2.260972738265991, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7314358949661255, + "num_tokens": 187812446.0, + "step": 7524 + }, + { + "epoch": 0.8263782121678015, + "grad_norm": 2.6210861206054688, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7132192254066467, + "num_tokens": 187831056.0, + "step": 7525 + }, + { + "epoch": 0.8264880298704151, + "grad_norm": 2.2356278896331787, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7094897031784058, + "num_tokens": 187856803.0, + "step": 7526 + }, + { + "epoch": 0.8265978475730288, + "grad_norm": 2.143465280532837, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.704250693321228, + "num_tokens": 187884008.0, + "step": 7527 + }, + { + "epoch": 0.8267076652756424, + "grad_norm": 2.369853973388672, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7334479093551636, + "num_tokens": 187908190.0, + "step": 7528 + }, + { + "epoch": 0.8268174829782561, + "grad_norm": 1.9408787488937378, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7031580805778503, + "num_tokens": 187940389.0, + "step": 7529 + }, + { + "epoch": 0.8269273006808697, + "grad_norm": 2.2443668842315674, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6998850107192993, + "num_tokens": 187963521.0, + "step": 7530 + }, + { + "epoch": 0.8270371183834834, + "grad_norm": 2.0652878284454346, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7163599729537964, + "num_tokens": 187991209.0, + "step": 7531 + }, + { + "epoch": 0.8271469360860971, + "grad_norm": 2.0444724559783936, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7171157598495483, + "num_tokens": 188017848.0, + "step": 7532 + }, + { + "epoch": 0.8272567537887108, + "grad_norm": 2.1814398765563965, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6951812505722046, + "num_tokens": 188044343.0, + "step": 7533 + }, + { + "epoch": 0.8273665714913244, + "grad_norm": 2.1804580688476562, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7005617022514343, + "num_tokens": 188069115.0, + "step": 7534 + }, + { + "epoch": 0.8274763891939381, + "grad_norm": 2.577700138092041, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7219257354736328, + "num_tokens": 188087346.0, + "step": 7535 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 2.0872457027435303, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7160667181015015, + "num_tokens": 188112547.0, + "step": 7536 + }, + { + "epoch": 0.8276960245991654, + "grad_norm": 2.677645683288574, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7328429222106934, + "num_tokens": 188130684.0, + "step": 7537 + }, + { + "epoch": 0.827805842301779, + "grad_norm": 2.060584783554077, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7124563455581665, + "num_tokens": 188160230.0, + "step": 7538 + }, + { + "epoch": 0.8279156600043928, + "grad_norm": 1.994194746017456, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7211589813232422, + "num_tokens": 188187522.0, + "step": 7539 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 2.074838399887085, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.702438235282898, + "num_tokens": 188213934.0, + "step": 7540 + }, + { + "epoch": 0.82813529540962, + "grad_norm": 2.5372488498687744, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7213402986526489, + "num_tokens": 188233320.0, + "step": 7541 + }, + { + "epoch": 0.8282451131122337, + "grad_norm": 2.267673969268799, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.726821780204773, + "num_tokens": 188255019.0, + "step": 7542 + }, + { + "epoch": 0.8283549308148473, + "grad_norm": 1.924855351448059, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7237069010734558, + "num_tokens": 188285803.0, + "step": 7543 + }, + { + "epoch": 0.828464748517461, + "grad_norm": 2.256855010986328, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.706883430480957, + "num_tokens": 188310376.0, + "step": 7544 + }, + { + "epoch": 0.8285745662200746, + "grad_norm": 2.4792251586914062, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7225189208984375, + "num_tokens": 188330345.0, + "step": 7545 + }, + { + "epoch": 0.8286843839226883, + "grad_norm": 2.186908483505249, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6871556043624878, + "num_tokens": 188356863.0, + "step": 7546 + }, + { + "epoch": 0.828794201625302, + "grad_norm": 1.99515962600708, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7269944548606873, + "num_tokens": 188383724.0, + "step": 7547 + }, + { + "epoch": 0.8289040193279157, + "grad_norm": 2.2434451580047607, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.6958649754524231, + "num_tokens": 188408779.0, + "step": 7548 + }, + { + "epoch": 0.8290138370305293, + "grad_norm": 1.9913123846054077, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6980499029159546, + "num_tokens": 188439059.0, + "step": 7549 + }, + { + "epoch": 0.829123654733143, + "grad_norm": 2.349827527999878, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7008498907089233, + "num_tokens": 188461273.0, + "step": 7550 + }, + { + "epoch": 0.8292334724357566, + "grad_norm": 2.2798101902008057, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7149690389633179, + "num_tokens": 188485404.0, + "step": 7551 + }, + { + "epoch": 0.8293432901383703, + "grad_norm": 2.251030206680298, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7065064311027527, + "num_tokens": 188509624.0, + "step": 7552 + }, + { + "epoch": 0.8294531078409839, + "grad_norm": 1.9990839958190918, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7341363430023193, + "num_tokens": 188536992.0, + "step": 7553 + }, + { + "epoch": 0.8295629255435977, + "grad_norm": 2.047236442565918, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7191201448440552, + "num_tokens": 188566348.0, + "step": 7554 + }, + { + "epoch": 0.8296727432462113, + "grad_norm": 1.987475872039795, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7011584639549255, + "num_tokens": 188596801.0, + "step": 7555 + }, + { + "epoch": 0.829782560948825, + "grad_norm": 2.3154170513153076, + "learning_rate": 1e-06, + "loss": 0.7882, + "mean_token_accuracy": 0.7525942325592041, + "num_tokens": 188617888.0, + "step": 7556 + }, + { + "epoch": 0.8298923786514386, + "grad_norm": 1.9292007684707642, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6955714225769043, + "num_tokens": 188650784.0, + "step": 7557 + }, + { + "epoch": 0.8300021963540523, + "grad_norm": 2.4454939365386963, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7354745864868164, + "num_tokens": 188670609.0, + "step": 7558 + }, + { + "epoch": 0.8301120140566659, + "grad_norm": 2.079136848449707, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.699974775314331, + "num_tokens": 188698373.0, + "step": 7559 + }, + { + "epoch": 0.8302218317592795, + "grad_norm": 2.0618138313293457, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7104291319847107, + "num_tokens": 188728276.0, + "step": 7560 + }, + { + "epoch": 0.8303316494618933, + "grad_norm": 2.3441450595855713, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7140452265739441, + "num_tokens": 188751382.0, + "step": 7561 + }, + { + "epoch": 0.830441467164507, + "grad_norm": 2.3651981353759766, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6772335767745972, + "num_tokens": 188775248.0, + "step": 7562 + }, + { + "epoch": 0.8305512848671206, + "grad_norm": 2.092745542526245, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6891118884086609, + "num_tokens": 188803673.0, + "step": 7563 + }, + { + "epoch": 0.8306611025697342, + "grad_norm": 2.2271976470947266, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7081723809242249, + "num_tokens": 188826522.0, + "step": 7564 + }, + { + "epoch": 0.8307709202723479, + "grad_norm": 2.138552665710449, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7186766862869263, + "num_tokens": 188852086.0, + "step": 7565 + }, + { + "epoch": 0.8308807379749615, + "grad_norm": 2.276642322540283, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7078887820243835, + "num_tokens": 188875857.0, + "step": 7566 + }, + { + "epoch": 0.8309905556775752, + "grad_norm": 1.9226365089416504, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6890478730201721, + "num_tokens": 188908765.0, + "step": 7567 + }, + { + "epoch": 0.8311003733801889, + "grad_norm": 2.2733025550842285, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7464385032653809, + "num_tokens": 188931047.0, + "step": 7568 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 2.2530691623687744, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6964321732521057, + "num_tokens": 188955577.0, + "step": 7569 + }, + { + "epoch": 0.8313200087854162, + "grad_norm": 2.650139093399048, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7306394577026367, + "num_tokens": 188971678.0, + "step": 7570 + }, + { + "epoch": 0.8314298264880299, + "grad_norm": 2.0928683280944824, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6939904689788818, + "num_tokens": 189000938.0, + "step": 7571 + }, + { + "epoch": 0.8315396441906435, + "grad_norm": 2.1860902309417725, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6924447417259216, + "num_tokens": 189026212.0, + "step": 7572 + }, + { + "epoch": 0.8316494618932572, + "grad_norm": 2.337430238723755, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6926753520965576, + "num_tokens": 189050775.0, + "step": 7573 + }, + { + "epoch": 0.8317592795958708, + "grad_norm": 2.2235515117645264, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7170957326889038, + "num_tokens": 189075614.0, + "step": 7574 + }, + { + "epoch": 0.8318690972984845, + "grad_norm": 2.1691629886627197, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6833282709121704, + "num_tokens": 189102964.0, + "step": 7575 + }, + { + "epoch": 0.8319789150010982, + "grad_norm": 2.2409982681274414, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7155799865722656, + "num_tokens": 189129509.0, + "step": 7576 + }, + { + "epoch": 0.8320887327037119, + "grad_norm": 2.281177282333374, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7006626725196838, + "num_tokens": 189153004.0, + "step": 7577 + }, + { + "epoch": 0.8321985504063255, + "grad_norm": 2.2743608951568604, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7112557291984558, + "num_tokens": 189177573.0, + "step": 7578 + }, + { + "epoch": 0.8323083681089392, + "grad_norm": 2.1829769611358643, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.710586428642273, + "num_tokens": 189203154.0, + "step": 7579 + }, + { + "epoch": 0.8324181858115528, + "grad_norm": 2.1117823123931885, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7160846590995789, + "num_tokens": 189227953.0, + "step": 7580 + }, + { + "epoch": 0.8325280035141664, + "grad_norm": 2.327214479446411, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7114979028701782, + "num_tokens": 189248800.0, + "step": 7581 + }, + { + "epoch": 0.8326378212167801, + "grad_norm": 2.194185495376587, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.7011891007423401, + "num_tokens": 189274339.0, + "step": 7582 + }, + { + "epoch": 0.8327476389193939, + "grad_norm": 2.3213143348693848, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7276365756988525, + "num_tokens": 189296220.0, + "step": 7583 + }, + { + "epoch": 0.8328574566220075, + "grad_norm": 2.0053346157073975, + "learning_rate": 1e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.667579174041748, + "num_tokens": 189327701.0, + "step": 7584 + }, + { + "epoch": 0.8329672743246211, + "grad_norm": 2.5760457515716553, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7307729721069336, + "num_tokens": 189347028.0, + "step": 7585 + }, + { + "epoch": 0.8330770920272348, + "grad_norm": 2.2209722995758057, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6870906949043274, + "num_tokens": 189370912.0, + "step": 7586 + }, + { + "epoch": 0.8331869097298484, + "grad_norm": 2.1399145126342773, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.690865695476532, + "num_tokens": 189397159.0, + "step": 7587 + }, + { + "epoch": 0.8332967274324621, + "grad_norm": 2.003267288208008, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6950093507766724, + "num_tokens": 189425638.0, + "step": 7588 + }, + { + "epoch": 0.8334065451350757, + "grad_norm": 2.035104274749756, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6978800892829895, + "num_tokens": 189453976.0, + "step": 7589 + }, + { + "epoch": 0.8335163628376895, + "grad_norm": 2.036513090133667, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7045182585716248, + "num_tokens": 189482414.0, + "step": 7590 + }, + { + "epoch": 0.8336261805403031, + "grad_norm": 2.172593116760254, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6982132196426392, + "num_tokens": 189508053.0, + "step": 7591 + }, + { + "epoch": 0.8337359982429168, + "grad_norm": 2.1118197441101074, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7090514898300171, + "num_tokens": 189535353.0, + "step": 7592 + }, + { + "epoch": 0.8338458159455304, + "grad_norm": 2.218754291534424, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7082844972610474, + "num_tokens": 189560626.0, + "step": 7593 + }, + { + "epoch": 0.8339556336481441, + "grad_norm": 2.2785868644714355, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7193048596382141, + "num_tokens": 189582011.0, + "step": 7594 + }, + { + "epoch": 0.8340654513507577, + "grad_norm": 2.1962685585021973, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.709713339805603, + "num_tokens": 189606049.0, + "step": 7595 + }, + { + "epoch": 0.8341752690533714, + "grad_norm": 2.0439577102661133, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7089478373527527, + "num_tokens": 189633406.0, + "step": 7596 + }, + { + "epoch": 0.8342850867559851, + "grad_norm": 1.9084709882736206, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.708373486995697, + "num_tokens": 189663391.0, + "step": 7597 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 2.0625314712524414, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6960084438323975, + "num_tokens": 189691675.0, + "step": 7598 + }, + { + "epoch": 0.8345047221612124, + "grad_norm": 2.520019292831421, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7319448590278625, + "num_tokens": 189711640.0, + "step": 7599 + }, + { + "epoch": 0.834614539863826, + "grad_norm": 2.1201674938201904, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7049835920333862, + "num_tokens": 189738708.0, + "step": 7600 + }, + { + "epoch": 0.8347243575664397, + "grad_norm": 2.2487967014312744, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6875615119934082, + "num_tokens": 189763934.0, + "step": 7601 + }, + { + "epoch": 0.8348341752690533, + "grad_norm": 2.3113811016082764, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.720512866973877, + "num_tokens": 189786940.0, + "step": 7602 + }, + { + "epoch": 0.834943992971667, + "grad_norm": 2.343324661254883, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7265995144844055, + "num_tokens": 189808603.0, + "step": 7603 + }, + { + "epoch": 0.8350538106742806, + "grad_norm": 2.0029642581939697, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6980690956115723, + "num_tokens": 189837587.0, + "step": 7604 + }, + { + "epoch": 0.8351636283768944, + "grad_norm": 2.2195637226104736, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7086228132247925, + "num_tokens": 189861618.0, + "step": 7605 + }, + { + "epoch": 0.835273446079508, + "grad_norm": 2.306049108505249, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7187199592590332, + "num_tokens": 189883606.0, + "step": 7606 + }, + { + "epoch": 0.8353832637821217, + "grad_norm": 2.1518142223358154, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6872826218605042, + "num_tokens": 189911933.0, + "step": 7607 + }, + { + "epoch": 0.8354930814847353, + "grad_norm": 2.259732723236084, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.7019697427749634, + "num_tokens": 189937068.0, + "step": 7608 + }, + { + "epoch": 0.835602899187349, + "grad_norm": 2.2290384769439697, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6915457844734192, + "num_tokens": 189961535.0, + "step": 7609 + }, + { + "epoch": 0.8357127168899626, + "grad_norm": 2.146756887435913, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7093802690505981, + "num_tokens": 189988244.0, + "step": 7610 + }, + { + "epoch": 0.8358225345925763, + "grad_norm": 2.042896270751953, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7147597074508667, + "num_tokens": 190015193.0, + "step": 7611 + }, + { + "epoch": 0.83593235229519, + "grad_norm": 2.482356071472168, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7024953961372375, + "num_tokens": 190037920.0, + "step": 7612 + }, + { + "epoch": 0.8360421699978037, + "grad_norm": 2.2823755741119385, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6932768225669861, + "num_tokens": 190062039.0, + "step": 7613 + }, + { + "epoch": 0.8361519877004173, + "grad_norm": 2.5176210403442383, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7097630500793457, + "num_tokens": 190082926.0, + "step": 7614 + }, + { + "epoch": 0.836261805403031, + "grad_norm": 2.122990131378174, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.707755982875824, + "num_tokens": 190110499.0, + "step": 7615 + }, + { + "epoch": 0.8363716231056446, + "grad_norm": 2.021512031555176, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7168632745742798, + "num_tokens": 190140054.0, + "step": 7616 + }, + { + "epoch": 0.8364814408082583, + "grad_norm": 2.264462471008301, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7068036794662476, + "num_tokens": 190164934.0, + "step": 7617 + }, + { + "epoch": 0.8365912585108719, + "grad_norm": 2.3600852489471436, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.755731999874115, + "num_tokens": 190185762.0, + "step": 7618 + }, + { + "epoch": 0.8367010762134857, + "grad_norm": 2.286177396774292, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7207744121551514, + "num_tokens": 190208301.0, + "step": 7619 + }, + { + "epoch": 0.8368108939160993, + "grad_norm": 2.1171276569366455, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7177889943122864, + "num_tokens": 190234471.0, + "step": 7620 + }, + { + "epoch": 0.836920711618713, + "grad_norm": 2.4771454334259033, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7005446553230286, + "num_tokens": 190257289.0, + "step": 7621 + }, + { + "epoch": 0.8370305293213266, + "grad_norm": 2.14546537399292, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7156786322593689, + "num_tokens": 190282847.0, + "step": 7622 + }, + { + "epoch": 0.8371403470239402, + "grad_norm": 2.6677091121673584, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6967596411705017, + "num_tokens": 190305983.0, + "step": 7623 + }, + { + "epoch": 0.8372501647265539, + "grad_norm": 2.522491931915283, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7221322655677795, + "num_tokens": 190326354.0, + "step": 7624 + }, + { + "epoch": 0.8373599824291675, + "grad_norm": 2.4082322120666504, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7255439758300781, + "num_tokens": 190348342.0, + "step": 7625 + }, + { + "epoch": 0.8374698001317813, + "grad_norm": 2.2752931118011475, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7306029796600342, + "num_tokens": 190371522.0, + "step": 7626 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 2.390901565551758, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7349528670310974, + "num_tokens": 190391452.0, + "step": 7627 + }, + { + "epoch": 0.8376894355370086, + "grad_norm": 2.239243507385254, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7011083364486694, + "num_tokens": 190415598.0, + "step": 7628 + }, + { + "epoch": 0.8377992532396222, + "grad_norm": 2.0108625888824463, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.739363968372345, + "num_tokens": 190444444.0, + "step": 7629 + }, + { + "epoch": 0.8379090709422359, + "grad_norm": 2.464665651321411, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7229121327400208, + "num_tokens": 190464133.0, + "step": 7630 + }, + { + "epoch": 0.8380188886448495, + "grad_norm": 2.180992841720581, + "learning_rate": 1e-06, + "loss": 1.084, + "mean_token_accuracy": 0.670942485332489, + "num_tokens": 190490772.0, + "step": 7631 + }, + { + "epoch": 0.8381287063474632, + "grad_norm": 2.212120294570923, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.724746823310852, + "num_tokens": 190514613.0, + "step": 7632 + }, + { + "epoch": 0.8382385240500769, + "grad_norm": 2.0974719524383545, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7195894122123718, + "num_tokens": 190542074.0, + "step": 7633 + }, + { + "epoch": 0.8383483417526906, + "grad_norm": 2.3499765396118164, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.708547830581665, + "num_tokens": 190565721.0, + "step": 7634 + }, + { + "epoch": 0.8384581594553042, + "grad_norm": 2.222376585006714, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7087173461914062, + "num_tokens": 190591910.0, + "step": 7635 + }, + { + "epoch": 0.8385679771579179, + "grad_norm": 1.9651143550872803, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7071301341056824, + "num_tokens": 190622881.0, + "step": 7636 + }, + { + "epoch": 0.8386777948605315, + "grad_norm": 1.9898056983947754, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7057249546051025, + "num_tokens": 190652694.0, + "step": 7637 + }, + { + "epoch": 0.8387876125631452, + "grad_norm": 1.9964741468429565, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7166090607643127, + "num_tokens": 190682190.0, + "step": 7638 + }, + { + "epoch": 0.8388974302657588, + "grad_norm": 2.155837059020996, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7207711338996887, + "num_tokens": 190707403.0, + "step": 7639 + }, + { + "epoch": 0.8390072479683724, + "grad_norm": 2.3725993633270264, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7005578279495239, + "num_tokens": 190729124.0, + "step": 7640 + }, + { + "epoch": 0.8391170656709862, + "grad_norm": 2.1885414123535156, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7350197434425354, + "num_tokens": 190755056.0, + "step": 7641 + }, + { + "epoch": 0.8392268833735999, + "grad_norm": 2.181821584701538, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7167664766311646, + "num_tokens": 190782975.0, + "step": 7642 + }, + { + "epoch": 0.8393367010762135, + "grad_norm": 2.2227628231048584, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7063316702842712, + "num_tokens": 190806583.0, + "step": 7643 + }, + { + "epoch": 0.8394465187788271, + "grad_norm": 2.1015350818634033, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6986186504364014, + "num_tokens": 190833259.0, + "step": 7644 + }, + { + "epoch": 0.8395563364814408, + "grad_norm": 2.0583600997924805, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7107508182525635, + "num_tokens": 190862028.0, + "step": 7645 + }, + { + "epoch": 0.8396661541840544, + "grad_norm": 2.319458484649658, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7085922956466675, + "num_tokens": 190884916.0, + "step": 7646 + }, + { + "epoch": 0.8397759718866681, + "grad_norm": 2.0754270553588867, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6996584534645081, + "num_tokens": 190914038.0, + "step": 7647 + }, + { + "epoch": 0.8398857895892818, + "grad_norm": 2.156911849975586, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7227412462234497, + "num_tokens": 190941547.0, + "step": 7648 + }, + { + "epoch": 0.8399956072918955, + "grad_norm": 1.9913959503173828, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6951018571853638, + "num_tokens": 190971577.0, + "step": 7649 + }, + { + "epoch": 0.8401054249945091, + "grad_norm": 2.365186929702759, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7113682627677917, + "num_tokens": 190994213.0, + "step": 7650 + }, + { + "epoch": 0.8402152426971228, + "grad_norm": 2.192431688308716, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7150859832763672, + "num_tokens": 191020294.0, + "step": 7651 + }, + { + "epoch": 0.8403250603997364, + "grad_norm": 2.1654250621795654, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6919283866882324, + "num_tokens": 191047518.0, + "step": 7652 + }, + { + "epoch": 0.8404348781023501, + "grad_norm": 2.3784334659576416, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7227905988693237, + "num_tokens": 191067885.0, + "step": 7653 + }, + { + "epoch": 0.8405446958049637, + "grad_norm": 2.0115087032318115, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6914200782775879, + "num_tokens": 191095509.0, + "step": 7654 + }, + { + "epoch": 0.8406545135075775, + "grad_norm": 2.2634449005126953, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7467919588088989, + "num_tokens": 191118569.0, + "step": 7655 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 2.515468120574951, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6990455389022827, + "num_tokens": 191139456.0, + "step": 7656 + }, + { + "epoch": 0.8408741489128048, + "grad_norm": 2.25351619720459, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6875066757202148, + "num_tokens": 191163928.0, + "step": 7657 + }, + { + "epoch": 0.8409839666154184, + "grad_norm": 2.218092679977417, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7230228185653687, + "num_tokens": 191186468.0, + "step": 7658 + }, + { + "epoch": 0.8410937843180321, + "grad_norm": 2.3539769649505615, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.692933201789856, + "num_tokens": 191210658.0, + "step": 7659 + }, + { + "epoch": 0.8412036020206457, + "grad_norm": 2.1597630977630615, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6941895484924316, + "num_tokens": 191238561.0, + "step": 7660 + }, + { + "epoch": 0.8413134197232593, + "grad_norm": 2.1526412963867188, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6827611923217773, + "num_tokens": 191264865.0, + "step": 7661 + }, + { + "epoch": 0.8414232374258731, + "grad_norm": 2.620851516723633, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7112545967102051, + "num_tokens": 191282852.0, + "step": 7662 + }, + { + "epoch": 0.8415330551284868, + "grad_norm": 2.146289110183716, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7184195518493652, + "num_tokens": 191311046.0, + "step": 7663 + }, + { + "epoch": 0.8416428728311004, + "grad_norm": 2.3439781665802, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7174739837646484, + "num_tokens": 191331769.0, + "step": 7664 + }, + { + "epoch": 0.841752690533714, + "grad_norm": 2.084470272064209, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7092026472091675, + "num_tokens": 191358860.0, + "step": 7665 + }, + { + "epoch": 0.8418625082363277, + "grad_norm": 2.149820327758789, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6972255706787109, + "num_tokens": 191384578.0, + "step": 7666 + }, + { + "epoch": 0.8419723259389413, + "grad_norm": 2.2242395877838135, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7188724279403687, + "num_tokens": 191408894.0, + "step": 7667 + }, + { + "epoch": 0.842082143641555, + "grad_norm": 2.4450411796569824, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.7061353325843811, + "num_tokens": 191431819.0, + "step": 7668 + }, + { + "epoch": 0.8421919613441686, + "grad_norm": 2.0778911113739014, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7001879215240479, + "num_tokens": 191459213.0, + "step": 7669 + }, + { + "epoch": 0.8423017790467824, + "grad_norm": 2.3912012577056885, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7093641757965088, + "num_tokens": 191481751.0, + "step": 7670 + }, + { + "epoch": 0.842411596749396, + "grad_norm": 2.120486259460449, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7187296748161316, + "num_tokens": 191507951.0, + "step": 7671 + }, + { + "epoch": 0.8425214144520097, + "grad_norm": 2.342458724975586, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.696447491645813, + "num_tokens": 191532641.0, + "step": 7672 + }, + { + "epoch": 0.8426312321546233, + "grad_norm": 2.1096949577331543, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7008106708526611, + "num_tokens": 191561071.0, + "step": 7673 + }, + { + "epoch": 0.842741049857237, + "grad_norm": 2.434375524520874, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7217196226119995, + "num_tokens": 191584050.0, + "step": 7674 + }, + { + "epoch": 0.8428508675598506, + "grad_norm": 2.1175012588500977, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.739260196685791, + "num_tokens": 191608741.0, + "step": 7675 + }, + { + "epoch": 0.8429606852624643, + "grad_norm": 2.2506144046783447, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6938586235046387, + "num_tokens": 191634357.0, + "step": 7676 + }, + { + "epoch": 0.843070502965078, + "grad_norm": 2.5007455348968506, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6913689374923706, + "num_tokens": 191655593.0, + "step": 7677 + }, + { + "epoch": 0.8431803206676917, + "grad_norm": 2.1676299571990967, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7259501814842224, + "num_tokens": 191678875.0, + "step": 7678 + }, + { + "epoch": 0.8432901383703053, + "grad_norm": 2.2651913166046143, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7273573875427246, + "num_tokens": 191701454.0, + "step": 7679 + }, + { + "epoch": 0.843399956072919, + "grad_norm": 2.123783588409424, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7232828140258789, + "num_tokens": 191726717.0, + "step": 7680 + }, + { + "epoch": 0.8435097737755326, + "grad_norm": 2.3792943954467773, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.707115113735199, + "num_tokens": 191746889.0, + "step": 7681 + }, + { + "epoch": 0.8436195914781462, + "grad_norm": 2.613518476486206, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.7562925815582275, + "num_tokens": 191763753.0, + "step": 7682 + }, + { + "epoch": 0.8437294091807599, + "grad_norm": 2.4617207050323486, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7160063982009888, + "num_tokens": 191784078.0, + "step": 7683 + }, + { + "epoch": 0.8438392268833736, + "grad_norm": 1.9623528718948364, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.707288920879364, + "num_tokens": 191814542.0, + "step": 7684 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 2.302565574645996, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6957767009735107, + "num_tokens": 191838723.0, + "step": 7685 + }, + { + "epoch": 0.8440588622886009, + "grad_norm": 2.4711153507232666, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7051715850830078, + "num_tokens": 191859180.0, + "step": 7686 + }, + { + "epoch": 0.8441686799912146, + "grad_norm": 2.1010749340057373, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6955276727676392, + "num_tokens": 191885916.0, + "step": 7687 + }, + { + "epoch": 0.8442784976938282, + "grad_norm": 2.3149592876434326, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7106600999832153, + "num_tokens": 191909367.0, + "step": 7688 + }, + { + "epoch": 0.8443883153964419, + "grad_norm": 2.188051223754883, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7315874695777893, + "num_tokens": 191934885.0, + "step": 7689 + }, + { + "epoch": 0.8444981330990555, + "grad_norm": 2.4161410331726074, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7115821242332458, + "num_tokens": 191955179.0, + "step": 7690 + }, + { + "epoch": 0.8446079508016693, + "grad_norm": 2.3206231594085693, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.681526243686676, + "num_tokens": 191979309.0, + "step": 7691 + }, + { + "epoch": 0.8447177685042829, + "grad_norm": 2.1693146228790283, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7099642157554626, + "num_tokens": 192005293.0, + "step": 7692 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 2.0546326637268066, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7028775215148926, + "num_tokens": 192033592.0, + "step": 7693 + }, + { + "epoch": 0.8449374039095102, + "grad_norm": 2.098247528076172, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7233883142471313, + "num_tokens": 192060608.0, + "step": 7694 + }, + { + "epoch": 0.8450472216121239, + "grad_norm": 2.72882080078125, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7284375429153442, + "num_tokens": 192077288.0, + "step": 7695 + }, + { + "epoch": 0.8451570393147375, + "grad_norm": 2.127690076828003, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7324151992797852, + "num_tokens": 192102527.0, + "step": 7696 + }, + { + "epoch": 0.8452668570173512, + "grad_norm": 2.3431007862091064, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.718239426612854, + "num_tokens": 192125154.0, + "step": 7697 + }, + { + "epoch": 0.8453766747199648, + "grad_norm": 2.163677930831909, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7191786170005798, + "num_tokens": 192150842.0, + "step": 7698 + }, + { + "epoch": 0.8454864924225786, + "grad_norm": 2.02236008644104, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.68682861328125, + "num_tokens": 192181297.0, + "step": 7699 + }, + { + "epoch": 0.8455963101251922, + "grad_norm": 2.1597161293029785, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7097872495651245, + "num_tokens": 192206802.0, + "step": 7700 + }, + { + "epoch": 0.8457061278278059, + "grad_norm": 2.0765671730041504, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.700179934501648, + "num_tokens": 192234492.0, + "step": 7701 + }, + { + "epoch": 0.8458159455304195, + "grad_norm": 2.2848403453826904, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7264266610145569, + "num_tokens": 192256121.0, + "step": 7702 + }, + { + "epoch": 0.8459257632330331, + "grad_norm": 2.1139473915100098, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.724841296672821, + "num_tokens": 192282191.0, + "step": 7703 + }, + { + "epoch": 0.8460355809356468, + "grad_norm": 2.3133411407470703, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7002412676811218, + "num_tokens": 192305851.0, + "step": 7704 + }, + { + "epoch": 0.8461453986382604, + "grad_norm": 1.929060459136963, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7035324573516846, + "num_tokens": 192337873.0, + "step": 7705 + }, + { + "epoch": 0.8462552163408742, + "grad_norm": 1.7941268682479858, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6897240877151489, + "num_tokens": 192374842.0, + "step": 7706 + }, + { + "epoch": 0.8463650340434878, + "grad_norm": 2.1774065494537354, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7062084674835205, + "num_tokens": 192398008.0, + "step": 7707 + }, + { + "epoch": 0.8464748517461015, + "grad_norm": 1.9604721069335938, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7096424102783203, + "num_tokens": 192426953.0, + "step": 7708 + }, + { + "epoch": 0.8465846694487151, + "grad_norm": 2.26973032951355, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7176400423049927, + "num_tokens": 192449632.0, + "step": 7709 + }, + { + "epoch": 0.8466944871513288, + "grad_norm": 2.2901856899261475, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7150523662567139, + "num_tokens": 192473484.0, + "step": 7710 + }, + { + "epoch": 0.8468043048539424, + "grad_norm": 2.18332576751709, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7210783362388611, + "num_tokens": 192497466.0, + "step": 7711 + }, + { + "epoch": 0.8469141225565561, + "grad_norm": 1.9697681665420532, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7002787590026855, + "num_tokens": 192529304.0, + "step": 7712 + }, + { + "epoch": 0.8470239402591698, + "grad_norm": 2.3428256511688232, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7002394795417786, + "num_tokens": 192552953.0, + "step": 7713 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 2.1899874210357666, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7144578695297241, + "num_tokens": 192576716.0, + "step": 7714 + }, + { + "epoch": 0.8472435756643971, + "grad_norm": 2.3028147220611572, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6952658295631409, + "num_tokens": 192600830.0, + "step": 7715 + }, + { + "epoch": 0.8473533933670108, + "grad_norm": 2.0248382091522217, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7244162559509277, + "num_tokens": 192629233.0, + "step": 7716 + }, + { + "epoch": 0.8474632110696244, + "grad_norm": 2.4340853691101074, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.69295334815979, + "num_tokens": 192653816.0, + "step": 7717 + }, + { + "epoch": 0.8475730287722381, + "grad_norm": 2.1299309730529785, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6885522603988647, + "num_tokens": 192680520.0, + "step": 7718 + }, + { + "epoch": 0.8476828464748517, + "grad_norm": 2.035733222961426, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7230789661407471, + "num_tokens": 192708737.0, + "step": 7719 + }, + { + "epoch": 0.8477926641774655, + "grad_norm": 2.3170108795166016, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7202533483505249, + "num_tokens": 192730747.0, + "step": 7720 + }, + { + "epoch": 0.8479024818800791, + "grad_norm": 2.2348082065582275, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7007490396499634, + "num_tokens": 192756729.0, + "step": 7721 + }, + { + "epoch": 0.8480122995826928, + "grad_norm": 2.3413805961608887, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7216305732727051, + "num_tokens": 192777634.0, + "step": 7722 + }, + { + "epoch": 0.8481221172853064, + "grad_norm": 2.220912218093872, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6927559971809387, + "num_tokens": 192804605.0, + "step": 7723 + }, + { + "epoch": 0.84823193498792, + "grad_norm": 2.0736172199249268, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7020348310470581, + "num_tokens": 192830287.0, + "step": 7724 + }, + { + "epoch": 0.8483417526905337, + "grad_norm": 2.1311850547790527, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7021379470825195, + "num_tokens": 192856676.0, + "step": 7725 + }, + { + "epoch": 0.8484515703931473, + "grad_norm": 2.354212760925293, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7010709643363953, + "num_tokens": 192879747.0, + "step": 7726 + }, + { + "epoch": 0.848561388095761, + "grad_norm": 2.157020092010498, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6797858476638794, + "num_tokens": 192907987.0, + "step": 7727 + }, + { + "epoch": 0.8486712057983747, + "grad_norm": 2.0504658222198486, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.703439474105835, + "num_tokens": 192937678.0, + "step": 7728 + }, + { + "epoch": 0.8487810235009884, + "grad_norm": 2.1551995277404785, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7004920840263367, + "num_tokens": 192964303.0, + "step": 7729 + }, + { + "epoch": 0.848890841203602, + "grad_norm": 2.0054309368133545, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7113767266273499, + "num_tokens": 192992986.0, + "step": 7730 + }, + { + "epoch": 0.8490006589062157, + "grad_norm": 2.0676825046539307, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6929166913032532, + "num_tokens": 193022048.0, + "step": 7731 + }, + { + "epoch": 0.8491104766088293, + "grad_norm": 2.230024576187134, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7195379137992859, + "num_tokens": 193044735.0, + "step": 7732 + }, + { + "epoch": 0.849220294311443, + "grad_norm": 2.5217666625976562, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7047456502914429, + "num_tokens": 193064667.0, + "step": 7733 + }, + { + "epoch": 0.8493301120140566, + "grad_norm": 2.720082998275757, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7185649871826172, + "num_tokens": 193080849.0, + "step": 7734 + }, + { + "epoch": 0.8494399297166704, + "grad_norm": 2.147869348526001, + "learning_rate": 1e-06, + "loss": 1.077, + "mean_token_accuracy": 0.6810795664787292, + "num_tokens": 193108337.0, + "step": 7735 + }, + { + "epoch": 0.849549747419284, + "grad_norm": 2.1093289852142334, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6956373453140259, + "num_tokens": 193135292.0, + "step": 7736 + }, + { + "epoch": 0.8496595651218977, + "grad_norm": 2.3225085735321045, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7009313106536865, + "num_tokens": 193158274.0, + "step": 7737 + }, + { + "epoch": 0.8497693828245113, + "grad_norm": 2.4590513706207275, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7323458194732666, + "num_tokens": 193177702.0, + "step": 7738 + }, + { + "epoch": 0.849879200527125, + "grad_norm": 2.2573795318603516, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7357983589172363, + "num_tokens": 193199598.0, + "step": 7739 + }, + { + "epoch": 0.8499890182297386, + "grad_norm": 2.012840747833252, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6951025724411011, + "num_tokens": 193229054.0, + "step": 7740 + }, + { + "epoch": 0.8500988359323522, + "grad_norm": 2.317028760910034, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7046018838882446, + "num_tokens": 193252220.0, + "step": 7741 + }, + { + "epoch": 0.850208653634966, + "grad_norm": 2.2276225090026855, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7304647564888, + "num_tokens": 193273744.0, + "step": 7742 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 1.9948703050613403, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7179192304611206, + "num_tokens": 193302855.0, + "step": 7743 + }, + { + "epoch": 0.8504282890401933, + "grad_norm": 2.0407090187072754, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7261360287666321, + "num_tokens": 193332845.0, + "step": 7744 + }, + { + "epoch": 0.8505381067428069, + "grad_norm": 2.3449394702911377, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6854877471923828, + "num_tokens": 193356799.0, + "step": 7745 + }, + { + "epoch": 0.8506479244454206, + "grad_norm": 1.9170925617218018, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6900274157524109, + "num_tokens": 193388147.0, + "step": 7746 + }, + { + "epoch": 0.8507577421480342, + "grad_norm": 2.392665147781372, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7064737677574158, + "num_tokens": 193409024.0, + "step": 7747 + }, + { + "epoch": 0.8508675598506479, + "grad_norm": 2.387965202331543, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.727536678314209, + "num_tokens": 193428413.0, + "step": 7748 + }, + { + "epoch": 0.8509773775532616, + "grad_norm": 2.5641491413116455, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7161945700645447, + "num_tokens": 193446287.0, + "step": 7749 + }, + { + "epoch": 0.8510871952558753, + "grad_norm": 1.9660688638687134, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7109602689743042, + "num_tokens": 193475994.0, + "step": 7750 + }, + { + "epoch": 0.8511970129584889, + "grad_norm": 2.1824634075164795, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6935151219367981, + "num_tokens": 193503734.0, + "step": 7751 + }, + { + "epoch": 0.8513068306611026, + "grad_norm": 2.3102738857269287, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.6997296810150146, + "num_tokens": 193527780.0, + "step": 7752 + }, + { + "epoch": 0.8514166483637162, + "grad_norm": 1.9076085090637207, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6813676953315735, + "num_tokens": 193561144.0, + "step": 7753 + }, + { + "epoch": 0.8515264660663299, + "grad_norm": 2.209416151046753, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7065662145614624, + "num_tokens": 193586798.0, + "step": 7754 + }, + { + "epoch": 0.8516362837689435, + "grad_norm": 2.592163324356079, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7267878651618958, + "num_tokens": 193605444.0, + "step": 7755 + }, + { + "epoch": 0.8517461014715572, + "grad_norm": 2.37943434715271, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7264889478683472, + "num_tokens": 193625716.0, + "step": 7756 + }, + { + "epoch": 0.8518559191741709, + "grad_norm": 2.1680920124053955, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7229958772659302, + "num_tokens": 193650686.0, + "step": 7757 + }, + { + "epoch": 0.8519657368767846, + "grad_norm": 2.0935304164886475, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7034814357757568, + "num_tokens": 193678885.0, + "step": 7758 + }, + { + "epoch": 0.8520755545793982, + "grad_norm": 2.500976085662842, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7001700401306152, + "num_tokens": 193701048.0, + "step": 7759 + }, + { + "epoch": 0.8521853722820119, + "grad_norm": 2.200256586074829, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7046385407447815, + "num_tokens": 193728813.0, + "step": 7760 + }, + { + "epoch": 0.8522951899846255, + "grad_norm": 2.485506296157837, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6940432786941528, + "num_tokens": 193750306.0, + "step": 7761 + }, + { + "epoch": 0.8524050076872391, + "grad_norm": 2.309601068496704, + "learning_rate": 1e-06, + "loss": 1.0591, + "mean_token_accuracy": 0.6747898459434509, + "num_tokens": 193775346.0, + "step": 7762 + }, + { + "epoch": 0.8525148253898528, + "grad_norm": 2.1401925086975098, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7084813117980957, + "num_tokens": 193799904.0, + "step": 7763 + }, + { + "epoch": 0.8526246430924665, + "grad_norm": 2.2042076587677, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.701294481754303, + "num_tokens": 193824487.0, + "step": 7764 + }, + { + "epoch": 0.8527344607950802, + "grad_norm": 2.1604161262512207, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.704757571220398, + "num_tokens": 193849317.0, + "step": 7765 + }, + { + "epoch": 0.8528442784976938, + "grad_norm": 2.3055295944213867, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6810786128044128, + "num_tokens": 193874461.0, + "step": 7766 + }, + { + "epoch": 0.8529540962003075, + "grad_norm": 2.5711333751678467, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6925411820411682, + "num_tokens": 193894758.0, + "step": 7767 + }, + { + "epoch": 0.8530639139029211, + "grad_norm": 2.211693286895752, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6766846179962158, + "num_tokens": 193923112.0, + "step": 7768 + }, + { + "epoch": 0.8531737316055348, + "grad_norm": 2.6043262481689453, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7187986969947815, + "num_tokens": 193944644.0, + "step": 7769 + }, + { + "epoch": 0.8532835493081484, + "grad_norm": 2.1683826446533203, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7206223011016846, + "num_tokens": 193968796.0, + "step": 7770 + }, + { + "epoch": 0.8533933670107622, + "grad_norm": 2.260782480239868, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7198178768157959, + "num_tokens": 193991843.0, + "step": 7771 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 2.1464614868164062, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7119965553283691, + "num_tokens": 194020288.0, + "step": 7772 + }, + { + "epoch": 0.8536130024159895, + "grad_norm": 2.0652637481689453, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7427507638931274, + "num_tokens": 194047285.0, + "step": 7773 + }, + { + "epoch": 0.8537228201186031, + "grad_norm": 2.272442579269409, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7096231579780579, + "num_tokens": 194071400.0, + "step": 7774 + }, + { + "epoch": 0.8538326378212168, + "grad_norm": 2.231962203979492, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7253029346466064, + "num_tokens": 194097062.0, + "step": 7775 + }, + { + "epoch": 0.8539424555238304, + "grad_norm": 2.4568612575531006, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7006348371505737, + "num_tokens": 194118479.0, + "step": 7776 + }, + { + "epoch": 0.8540522732264441, + "grad_norm": 2.211792469024658, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.714751124382019, + "num_tokens": 194143521.0, + "step": 7777 + }, + { + "epoch": 0.8541620909290578, + "grad_norm": 2.341501474380493, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.74222731590271, + "num_tokens": 194163284.0, + "step": 7778 + }, + { + "epoch": 0.8542719086316715, + "grad_norm": 2.4224607944488525, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6990739107131958, + "num_tokens": 194185076.0, + "step": 7779 + }, + { + "epoch": 0.8543817263342851, + "grad_norm": 1.9484318494796753, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7038443684577942, + "num_tokens": 194216910.0, + "step": 7780 + }, + { + "epoch": 0.8544915440368988, + "grad_norm": 2.1101505756378174, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.7008285522460938, + "num_tokens": 194244913.0, + "step": 7781 + }, + { + "epoch": 0.8546013617395124, + "grad_norm": 1.9760651588439941, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.698630690574646, + "num_tokens": 194272123.0, + "step": 7782 + }, + { + "epoch": 0.854711179442126, + "grad_norm": 2.0339784622192383, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.713212251663208, + "num_tokens": 194299395.0, + "step": 7783 + }, + { + "epoch": 0.8548209971447397, + "grad_norm": 2.0450830459594727, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6917139291763306, + "num_tokens": 194327290.0, + "step": 7784 + }, + { + "epoch": 0.8549308148473534, + "grad_norm": 2.3343236446380615, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7214223742485046, + "num_tokens": 194347970.0, + "step": 7785 + }, + { + "epoch": 0.8550406325499671, + "grad_norm": 2.484034538269043, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7009701728820801, + "num_tokens": 194369375.0, + "step": 7786 + }, + { + "epoch": 0.8551504502525807, + "grad_norm": 2.1061134338378906, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.7049592137336731, + "num_tokens": 194395530.0, + "step": 7787 + }, + { + "epoch": 0.8552602679551944, + "grad_norm": 2.0687713623046875, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7088715434074402, + "num_tokens": 194424277.0, + "step": 7788 + }, + { + "epoch": 0.855370085657808, + "grad_norm": 2.365327835083008, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.726218044757843, + "num_tokens": 194445337.0, + "step": 7789 + }, + { + "epoch": 0.8554799033604217, + "grad_norm": 2.115629196166992, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7084405422210693, + "num_tokens": 194470925.0, + "step": 7790 + }, + { + "epoch": 0.8555897210630353, + "grad_norm": 2.104551076889038, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7173510193824768, + "num_tokens": 194494614.0, + "step": 7791 + }, + { + "epoch": 0.855699538765649, + "grad_norm": 2.3504750728607178, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7215150594711304, + "num_tokens": 194516641.0, + "step": 7792 + }, + { + "epoch": 0.8558093564682627, + "grad_norm": 2.232393741607666, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.70265793800354, + "num_tokens": 194540780.0, + "step": 7793 + }, + { + "epoch": 0.8559191741708764, + "grad_norm": 2.17653751373291, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6895267963409424, + "num_tokens": 194564454.0, + "step": 7794 + }, + { + "epoch": 0.85602899187349, + "grad_norm": 1.9381877183914185, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7086429595947266, + "num_tokens": 194595092.0, + "step": 7795 + }, + { + "epoch": 0.8561388095761037, + "grad_norm": 2.397097110748291, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6989720463752747, + "num_tokens": 194618427.0, + "step": 7796 + }, + { + "epoch": 0.8562486272787173, + "grad_norm": 2.787182092666626, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.701823353767395, + "num_tokens": 194635821.0, + "step": 7797 + }, + { + "epoch": 0.856358444981331, + "grad_norm": 2.5359714031219482, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.724097490310669, + "num_tokens": 194655314.0, + "step": 7798 + }, + { + "epoch": 0.8564682626839446, + "grad_norm": 2.654364824295044, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7201623320579529, + "num_tokens": 194673650.0, + "step": 7799 + }, + { + "epoch": 0.8565780803865584, + "grad_norm": 2.524237871170044, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7308050394058228, + "num_tokens": 194691087.0, + "step": 7800 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 2.365072250366211, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7235875725746155, + "num_tokens": 194713469.0, + "step": 7801 + }, + { + "epoch": 0.8567977157917857, + "grad_norm": 2.15328311920166, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6849357485771179, + "num_tokens": 194740324.0, + "step": 7802 + }, + { + "epoch": 0.8569075334943993, + "grad_norm": 2.4832611083984375, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7105571627616882, + "num_tokens": 194762625.0, + "step": 7803 + }, + { + "epoch": 0.8570173511970129, + "grad_norm": 2.160856008529663, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7164415121078491, + "num_tokens": 194789291.0, + "step": 7804 + }, + { + "epoch": 0.8571271688996266, + "grad_norm": 2.2239327430725098, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6910197734832764, + "num_tokens": 194813626.0, + "step": 7805 + }, + { + "epoch": 0.8572369866022402, + "grad_norm": 1.9962718486785889, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7036778926849365, + "num_tokens": 194842249.0, + "step": 7806 + }, + { + "epoch": 0.857346804304854, + "grad_norm": 2.12937068939209, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.700789213180542, + "num_tokens": 194868165.0, + "step": 7807 + }, + { + "epoch": 0.8574566220074676, + "grad_norm": 2.205310821533203, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7047756910324097, + "num_tokens": 194895068.0, + "step": 7808 + }, + { + "epoch": 0.8575664397100813, + "grad_norm": 2.129666328430176, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6949378252029419, + "num_tokens": 194921283.0, + "step": 7809 + }, + { + "epoch": 0.8576762574126949, + "grad_norm": 2.1133174896240234, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7240135073661804, + "num_tokens": 194947738.0, + "step": 7810 + }, + { + "epoch": 0.8577860751153086, + "grad_norm": 2.3619544506073, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7151151895523071, + "num_tokens": 194969317.0, + "step": 7811 + }, + { + "epoch": 0.8578958928179222, + "grad_norm": 2.3082969188690186, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7107003927230835, + "num_tokens": 194993212.0, + "step": 7812 + }, + { + "epoch": 0.8580057105205359, + "grad_norm": 2.2628893852233887, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.722125768661499, + "num_tokens": 195016043.0, + "step": 7813 + }, + { + "epoch": 0.8581155282231496, + "grad_norm": 2.25877046585083, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6878595352172852, + "num_tokens": 195044494.0, + "step": 7814 + }, + { + "epoch": 0.8582253459257633, + "grad_norm": 2.2528696060180664, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6976209878921509, + "num_tokens": 195067765.0, + "step": 7815 + }, + { + "epoch": 0.8583351636283769, + "grad_norm": 2.0394082069396973, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6989222764968872, + "num_tokens": 195098687.0, + "step": 7816 + }, + { + "epoch": 0.8584449813309906, + "grad_norm": 2.12261700630188, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.698976457118988, + "num_tokens": 195123831.0, + "step": 7817 + }, + { + "epoch": 0.8585547990336042, + "grad_norm": 2.0814614295959473, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7452214956283569, + "num_tokens": 195148700.0, + "step": 7818 + }, + { + "epoch": 0.8586646167362179, + "grad_norm": 2.2151620388031006, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7140721678733826, + "num_tokens": 195174538.0, + "step": 7819 + }, + { + "epoch": 0.8587744344388315, + "grad_norm": 2.170861005783081, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7165055274963379, + "num_tokens": 195200466.0, + "step": 7820 + }, + { + "epoch": 0.8588842521414451, + "grad_norm": 2.4051058292388916, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7057719826698303, + "num_tokens": 195221999.0, + "step": 7821 + }, + { + "epoch": 0.8589940698440589, + "grad_norm": 2.097248077392578, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6860947608947754, + "num_tokens": 195248866.0, + "step": 7822 + }, + { + "epoch": 0.8591038875466726, + "grad_norm": 2.2001938819885254, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.719211220741272, + "num_tokens": 195273173.0, + "step": 7823 + }, + { + "epoch": 0.8592137052492862, + "grad_norm": 2.2502593994140625, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7009055018424988, + "num_tokens": 195296178.0, + "step": 7824 + }, + { + "epoch": 0.8593235229518998, + "grad_norm": 2.151630401611328, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7082540988922119, + "num_tokens": 195321079.0, + "step": 7825 + }, + { + "epoch": 0.8594333406545135, + "grad_norm": 2.4710536003112793, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7080949544906616, + "num_tokens": 195343384.0, + "step": 7826 + }, + { + "epoch": 0.8595431583571271, + "grad_norm": 2.0848286151885986, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7180562615394592, + "num_tokens": 195367379.0, + "step": 7827 + }, + { + "epoch": 0.8596529760597408, + "grad_norm": 2.167957067489624, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7144067287445068, + "num_tokens": 195390864.0, + "step": 7828 + }, + { + "epoch": 0.8597627937623545, + "grad_norm": 2.3437371253967285, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7482571005821228, + "num_tokens": 195411232.0, + "step": 7829 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 2.2557172775268555, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7042128443717957, + "num_tokens": 195435385.0, + "step": 7830 + }, + { + "epoch": 0.8599824291675818, + "grad_norm": 2.128103256225586, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7260931134223938, + "num_tokens": 195460711.0, + "step": 7831 + }, + { + "epoch": 0.8600922468701955, + "grad_norm": 2.0423054695129395, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6916371583938599, + "num_tokens": 195490270.0, + "step": 7832 + }, + { + "epoch": 0.8602020645728091, + "grad_norm": 2.108229398727417, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.69059818983078, + "num_tokens": 195517318.0, + "step": 7833 + }, + { + "epoch": 0.8603118822754228, + "grad_norm": 2.2974629402160645, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7069330215454102, + "num_tokens": 195540770.0, + "step": 7834 + }, + { + "epoch": 0.8604216999780364, + "grad_norm": 2.2072105407714844, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7015919089317322, + "num_tokens": 195564400.0, + "step": 7835 + }, + { + "epoch": 0.8605315176806502, + "grad_norm": 2.1182608604431152, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7076247334480286, + "num_tokens": 195590476.0, + "step": 7836 + }, + { + "epoch": 0.8606413353832638, + "grad_norm": 2.237039566040039, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7031912803649902, + "num_tokens": 195615009.0, + "step": 7837 + }, + { + "epoch": 0.8607511530858775, + "grad_norm": 2.2360167503356934, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6748589873313904, + "num_tokens": 195643545.0, + "step": 7838 + }, + { + "epoch": 0.8608609707884911, + "grad_norm": 1.9776617288589478, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7022954821586609, + "num_tokens": 195674042.0, + "step": 7839 + }, + { + "epoch": 0.8609707884911048, + "grad_norm": 2.0828309059143066, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.690055251121521, + "num_tokens": 195702329.0, + "step": 7840 + }, + { + "epoch": 0.8610806061937184, + "grad_norm": 2.075251340866089, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6958093643188477, + "num_tokens": 195730952.0, + "step": 7841 + }, + { + "epoch": 0.861190423896332, + "grad_norm": 2.1592841148376465, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7115373015403748, + "num_tokens": 195758449.0, + "step": 7842 + }, + { + "epoch": 0.8613002415989458, + "grad_norm": 2.351828098297119, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7393902540206909, + "num_tokens": 195779270.0, + "step": 7843 + }, + { + "epoch": 0.8614100593015594, + "grad_norm": 2.077589273452759, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6990212202072144, + "num_tokens": 195805120.0, + "step": 7844 + }, + { + "epoch": 0.8615198770041731, + "grad_norm": 2.3329339027404785, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7115359306335449, + "num_tokens": 195827241.0, + "step": 7845 + }, + { + "epoch": 0.8616296947067867, + "grad_norm": 2.413837194442749, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7207792401313782, + "num_tokens": 195848568.0, + "step": 7846 + }, + { + "epoch": 0.8617395124094004, + "grad_norm": 2.6495306491851807, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7157109975814819, + "num_tokens": 195867461.0, + "step": 7847 + }, + { + "epoch": 0.861849330112014, + "grad_norm": 2.166419506072998, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6985504031181335, + "num_tokens": 195895122.0, + "step": 7848 + }, + { + "epoch": 0.8619591478146277, + "grad_norm": 2.416877269744873, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7046393156051636, + "num_tokens": 195916340.0, + "step": 7849 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.984912395477295, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6999378204345703, + "num_tokens": 195947208.0, + "step": 7850 + }, + { + "epoch": 0.8621787832198551, + "grad_norm": 1.9089807271957397, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6965705156326294, + "num_tokens": 195978121.0, + "step": 7851 + }, + { + "epoch": 0.8622886009224687, + "grad_norm": 2.380284309387207, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7208133935928345, + "num_tokens": 196000213.0, + "step": 7852 + }, + { + "epoch": 0.8623984186250824, + "grad_norm": 2.3315060138702393, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7172739505767822, + "num_tokens": 196022404.0, + "step": 7853 + }, + { + "epoch": 0.862508236327696, + "grad_norm": 2.22234845161438, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.706041693687439, + "num_tokens": 196047482.0, + "step": 7854 + }, + { + "epoch": 0.8626180540303097, + "grad_norm": 2.6469316482543945, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7226902842521667, + "num_tokens": 196065627.0, + "step": 7855 + }, + { + "epoch": 0.8627278717329233, + "grad_norm": 2.3044519424438477, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7147065997123718, + "num_tokens": 196088129.0, + "step": 7856 + }, + { + "epoch": 0.862837689435537, + "grad_norm": 2.1635324954986572, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7100081443786621, + "num_tokens": 196114192.0, + "step": 7857 + }, + { + "epoch": 0.8629475071381507, + "grad_norm": 2.344571352005005, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7113579511642456, + "num_tokens": 196135502.0, + "step": 7858 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 2.532838821411133, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7108571529388428, + "num_tokens": 196157191.0, + "step": 7859 + }, + { + "epoch": 0.863167142543378, + "grad_norm": 2.0102663040161133, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7116785049438477, + "num_tokens": 196183705.0, + "step": 7860 + }, + { + "epoch": 0.8632769602459917, + "grad_norm": 2.496664047241211, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.728303074836731, + "num_tokens": 196204203.0, + "step": 7861 + }, + { + "epoch": 0.8633867779486053, + "grad_norm": 2.601830244064331, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6956090927124023, + "num_tokens": 196223834.0, + "step": 7862 + }, + { + "epoch": 0.863496595651219, + "grad_norm": 2.4084408283233643, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7253104448318481, + "num_tokens": 196246117.0, + "step": 7863 + }, + { + "epoch": 0.8636064133538326, + "grad_norm": 2.3390753269195557, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7288757562637329, + "num_tokens": 196267950.0, + "step": 7864 + }, + { + "epoch": 0.8637162310564463, + "grad_norm": 2.3761279582977295, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7011032104492188, + "num_tokens": 196289997.0, + "step": 7865 + }, + { + "epoch": 0.86382604875906, + "grad_norm": 2.2354652881622314, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7052053213119507, + "num_tokens": 196313130.0, + "step": 7866 + }, + { + "epoch": 0.8639358664616736, + "grad_norm": 2.236454963684082, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7172974944114685, + "num_tokens": 196336887.0, + "step": 7867 + }, + { + "epoch": 0.8640456841642873, + "grad_norm": 2.7628660202026367, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6914754509925842, + "num_tokens": 196353902.0, + "step": 7868 + }, + { + "epoch": 0.8641555018669009, + "grad_norm": 2.2494595050811768, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.691722571849823, + "num_tokens": 196380448.0, + "step": 7869 + }, + { + "epoch": 0.8642653195695146, + "grad_norm": 2.1539268493652344, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6896994113922119, + "num_tokens": 196407058.0, + "step": 7870 + }, + { + "epoch": 0.8643751372721282, + "grad_norm": 2.185563087463379, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7487155199050903, + "num_tokens": 196430321.0, + "step": 7871 + }, + { + "epoch": 0.864484954974742, + "grad_norm": 2.305701732635498, + "learning_rate": 1e-06, + "loss": 0.7318, + "mean_token_accuracy": 0.7542898654937744, + "num_tokens": 196451113.0, + "step": 7872 + }, + { + "epoch": 0.8645947726773556, + "grad_norm": 2.0584940910339355, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6963775157928467, + "num_tokens": 196478702.0, + "step": 7873 + }, + { + "epoch": 0.8647045903799693, + "grad_norm": 2.0592684745788574, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7246430516242981, + "num_tokens": 196504948.0, + "step": 7874 + }, + { + "epoch": 0.8648144080825829, + "grad_norm": 2.474525213241577, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6989678144454956, + "num_tokens": 196524075.0, + "step": 7875 + }, + { + "epoch": 0.8649242257851966, + "grad_norm": 2.2803406715393066, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7125604152679443, + "num_tokens": 196547309.0, + "step": 7876 + }, + { + "epoch": 0.8650340434878102, + "grad_norm": 2.158092737197876, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7256374359130859, + "num_tokens": 196572219.0, + "step": 7877 + }, + { + "epoch": 0.8651438611904239, + "grad_norm": 2.4414010047912598, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7340291738510132, + "num_tokens": 196592681.0, + "step": 7878 + }, + { + "epoch": 0.8652536788930375, + "grad_norm": 2.222921133041382, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6841126680374146, + "num_tokens": 196618866.0, + "step": 7879 + }, + { + "epoch": 0.8653634965956513, + "grad_norm": 2.1837120056152344, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6891456842422485, + "num_tokens": 196646394.0, + "step": 7880 + }, + { + "epoch": 0.8654733142982649, + "grad_norm": 2.3114547729492188, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7070140838623047, + "num_tokens": 196670718.0, + "step": 7881 + }, + { + "epoch": 0.8655831320008786, + "grad_norm": 2.029000997543335, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7030543684959412, + "num_tokens": 196698546.0, + "step": 7882 + }, + { + "epoch": 0.8656929497034922, + "grad_norm": 2.142638921737671, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6996648907661438, + "num_tokens": 196724275.0, + "step": 7883 + }, + { + "epoch": 0.8658027674061058, + "grad_norm": 2.4813129901885986, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6867021918296814, + "num_tokens": 196745493.0, + "step": 7884 + }, + { + "epoch": 0.8659125851087195, + "grad_norm": 1.8694299459457397, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7060903310775757, + "num_tokens": 196777329.0, + "step": 7885 + }, + { + "epoch": 0.8660224028113331, + "grad_norm": 2.074625253677368, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6962957978248596, + "num_tokens": 196805597.0, + "step": 7886 + }, + { + "epoch": 0.8661322205139469, + "grad_norm": 2.0948944091796875, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6820932030677795, + "num_tokens": 196835153.0, + "step": 7887 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 2.024484634399414, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6981006860733032, + "num_tokens": 196867367.0, + "step": 7888 + }, + { + "epoch": 0.8663518559191742, + "grad_norm": 1.9602471590042114, + "learning_rate": 1e-06, + "loss": 1.0879, + "mean_token_accuracy": 0.6784287095069885, + "num_tokens": 196897854.0, + "step": 7889 + }, + { + "epoch": 0.8664616736217878, + "grad_norm": 2.2795350551605225, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.727159857749939, + "num_tokens": 196920693.0, + "step": 7890 + }, + { + "epoch": 0.8665714913244015, + "grad_norm": 2.266397714614868, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6939905285835266, + "num_tokens": 196945681.0, + "step": 7891 + }, + { + "epoch": 0.8666813090270151, + "grad_norm": 1.970642328262329, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7291865348815918, + "num_tokens": 196973636.0, + "step": 7892 + }, + { + "epoch": 0.8667911267296288, + "grad_norm": 2.2147531509399414, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7101441621780396, + "num_tokens": 196996666.0, + "step": 7893 + }, + { + "epoch": 0.8669009444322425, + "grad_norm": 2.048213481903076, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7033131122589111, + "num_tokens": 197026697.0, + "step": 7894 + }, + { + "epoch": 0.8670107621348562, + "grad_norm": 2.2559945583343506, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7237569689750671, + "num_tokens": 197050907.0, + "step": 7895 + }, + { + "epoch": 0.8671205798374698, + "grad_norm": 2.26774525642395, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6936216354370117, + "num_tokens": 197075536.0, + "step": 7896 + }, + { + "epoch": 0.8672303975400835, + "grad_norm": 1.9508298635482788, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6935569643974304, + "num_tokens": 197105991.0, + "step": 7897 + }, + { + "epoch": 0.8673402152426971, + "grad_norm": 1.96298086643219, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6960015296936035, + "num_tokens": 197140169.0, + "step": 7898 + }, + { + "epoch": 0.8674500329453108, + "grad_norm": 2.28348445892334, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6818547248840332, + "num_tokens": 197165017.0, + "step": 7899 + }, + { + "epoch": 0.8675598506479244, + "grad_norm": 2.1824445724487305, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.722977340221405, + "num_tokens": 197188896.0, + "step": 7900 + }, + { + "epoch": 0.8676696683505382, + "grad_norm": 2.30377459526062, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7218468189239502, + "num_tokens": 197213220.0, + "step": 7901 + }, + { + "epoch": 0.8677794860531518, + "grad_norm": 2.3612923622131348, + "learning_rate": 1e-06, + "loss": 1.0331, + "mean_token_accuracy": 0.700182318687439, + "num_tokens": 197236027.0, + "step": 7902 + }, + { + "epoch": 0.8678893037557655, + "grad_norm": 2.349574565887451, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7235773801803589, + "num_tokens": 197258901.0, + "step": 7903 + }, + { + "epoch": 0.8679991214583791, + "grad_norm": 1.9654631614685059, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6946886777877808, + "num_tokens": 197290437.0, + "step": 7904 + }, + { + "epoch": 0.8681089391609927, + "grad_norm": 2.338811159133911, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6938797831535339, + "num_tokens": 197314269.0, + "step": 7905 + }, + { + "epoch": 0.8682187568636064, + "grad_norm": 2.3860034942626953, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.6993814706802368, + "num_tokens": 197336717.0, + "step": 7906 + }, + { + "epoch": 0.86832857456622, + "grad_norm": 2.0673680305480957, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7045013904571533, + "num_tokens": 197364404.0, + "step": 7907 + }, + { + "epoch": 0.8684383922688337, + "grad_norm": 1.86776864528656, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7113909125328064, + "num_tokens": 197396497.0, + "step": 7908 + }, + { + "epoch": 0.8685482099714474, + "grad_norm": 2.1688215732574463, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6903195381164551, + "num_tokens": 197422409.0, + "step": 7909 + }, + { + "epoch": 0.8686580276740611, + "grad_norm": 2.6811656951904297, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7049750089645386, + "num_tokens": 197442956.0, + "step": 7910 + }, + { + "epoch": 0.8687678453766747, + "grad_norm": 2.216859817504883, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7160720825195312, + "num_tokens": 197467172.0, + "step": 7911 + }, + { + "epoch": 0.8688776630792884, + "grad_norm": 2.5864386558532715, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.714108943939209, + "num_tokens": 197486158.0, + "step": 7912 + }, + { + "epoch": 0.868987480781902, + "grad_norm": 2.104965925216675, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7049557566642761, + "num_tokens": 197513439.0, + "step": 7913 + }, + { + "epoch": 0.8690972984845157, + "grad_norm": 2.3012588024139404, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7052580118179321, + "num_tokens": 197538225.0, + "step": 7914 + }, + { + "epoch": 0.8692071161871293, + "grad_norm": 2.301582098007202, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7220844030380249, + "num_tokens": 197563168.0, + "step": 7915 + }, + { + "epoch": 0.8693169338897431, + "grad_norm": 1.980179786682129, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6743733286857605, + "num_tokens": 197594044.0, + "step": 7916 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 2.4362447261810303, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.702083945274353, + "num_tokens": 197618573.0, + "step": 7917 + }, + { + "epoch": 0.8695365692949704, + "grad_norm": 2.0384860038757324, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7035548686981201, + "num_tokens": 197647036.0, + "step": 7918 + }, + { + "epoch": 0.869646386997584, + "grad_norm": 2.1289377212524414, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.704422116279602, + "num_tokens": 197674866.0, + "step": 7919 + }, + { + "epoch": 0.8697562047001977, + "grad_norm": 2.176034450531006, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6941951513290405, + "num_tokens": 197701432.0, + "step": 7920 + }, + { + "epoch": 0.8698660224028113, + "grad_norm": 2.095855474472046, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7264552116394043, + "num_tokens": 197727906.0, + "step": 7921 + }, + { + "epoch": 0.869975840105425, + "grad_norm": 2.154120922088623, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.7123011350631714, + "num_tokens": 197753146.0, + "step": 7922 + }, + { + "epoch": 0.8700856578080387, + "grad_norm": 2.324592351913452, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7113773822784424, + "num_tokens": 197776371.0, + "step": 7923 + }, + { + "epoch": 0.8701954755106523, + "grad_norm": 2.5448718070983887, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7443355321884155, + "num_tokens": 197795687.0, + "step": 7924 + }, + { + "epoch": 0.870305293213266, + "grad_norm": 2.092087984085083, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6924728155136108, + "num_tokens": 197823176.0, + "step": 7925 + }, + { + "epoch": 0.8704151109158796, + "grad_norm": 2.3579444885253906, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7261988520622253, + "num_tokens": 197846763.0, + "step": 7926 + }, + { + "epoch": 0.8705249286184933, + "grad_norm": 2.3967902660369873, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.7030366659164429, + "num_tokens": 197870883.0, + "step": 7927 + }, + { + "epoch": 0.8706347463211069, + "grad_norm": 2.019193649291992, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6963587999343872, + "num_tokens": 197900349.0, + "step": 7928 + }, + { + "epoch": 0.8707445640237206, + "grad_norm": 2.333405017852783, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7058525085449219, + "num_tokens": 197921792.0, + "step": 7929 + }, + { + "epoch": 0.8708543817263343, + "grad_norm": 2.284515857696533, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7293427586555481, + "num_tokens": 197945758.0, + "step": 7930 + }, + { + "epoch": 0.870964199428948, + "grad_norm": 2.108414649963379, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7139424681663513, + "num_tokens": 197972696.0, + "step": 7931 + }, + { + "epoch": 0.8710740171315616, + "grad_norm": 2.709326982498169, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7300912141799927, + "num_tokens": 197989217.0, + "step": 7932 + }, + { + "epoch": 0.8711838348341753, + "grad_norm": 2.013456106185913, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7094660997390747, + "num_tokens": 198022037.0, + "step": 7933 + }, + { + "epoch": 0.8712936525367889, + "grad_norm": 2.44185471534729, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7270747423171997, + "num_tokens": 198042765.0, + "step": 7934 + }, + { + "epoch": 0.8714034702394026, + "grad_norm": 2.142465829849243, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7217069268226624, + "num_tokens": 198069464.0, + "step": 7935 + }, + { + "epoch": 0.8715132879420162, + "grad_norm": 2.0676755905151367, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6966941952705383, + "num_tokens": 198099575.0, + "step": 7936 + }, + { + "epoch": 0.87162310564463, + "grad_norm": 2.3563106060028076, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7139518857002258, + "num_tokens": 198121695.0, + "step": 7937 + }, + { + "epoch": 0.8717329233472436, + "grad_norm": 2.5689265727996826, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7302384376525879, + "num_tokens": 198140782.0, + "step": 7938 + }, + { + "epoch": 0.8718427410498573, + "grad_norm": 2.149211883544922, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7213592529296875, + "num_tokens": 198166206.0, + "step": 7939 + }, + { + "epoch": 0.8719525587524709, + "grad_norm": 2.128490447998047, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.723017692565918, + "num_tokens": 198191585.0, + "step": 7940 + }, + { + "epoch": 0.8720623764550846, + "grad_norm": 2.2252089977264404, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6961207985877991, + "num_tokens": 198217435.0, + "step": 7941 + }, + { + "epoch": 0.8721721941576982, + "grad_norm": 2.3094751834869385, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6949342489242554, + "num_tokens": 198242714.0, + "step": 7942 + }, + { + "epoch": 0.8722820118603118, + "grad_norm": 2.362360715866089, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7093364000320435, + "num_tokens": 198263744.0, + "step": 7943 + }, + { + "epoch": 0.8723918295629255, + "grad_norm": 2.1032397747039795, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7203088998794556, + "num_tokens": 198288288.0, + "step": 7944 + }, + { + "epoch": 0.8725016472655392, + "grad_norm": 2.0995419025421143, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7004716396331787, + "num_tokens": 198316361.0, + "step": 7945 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 2.035423994064331, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6967012882232666, + "num_tokens": 198346057.0, + "step": 7946 + }, + { + "epoch": 0.8727212826707665, + "grad_norm": 2.106412410736084, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6855610013008118, + "num_tokens": 198374202.0, + "step": 7947 + }, + { + "epoch": 0.8728311003733802, + "grad_norm": 2.312263250350952, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7167137861251831, + "num_tokens": 198396211.0, + "step": 7948 + }, + { + "epoch": 0.8729409180759938, + "grad_norm": 2.145489454269409, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7142199873924255, + "num_tokens": 198422568.0, + "step": 7949 + }, + { + "epoch": 0.8730507357786075, + "grad_norm": 2.19278883934021, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.702525794506073, + "num_tokens": 198450458.0, + "step": 7950 + }, + { + "epoch": 0.8731605534812211, + "grad_norm": 2.424377918243408, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7134464383125305, + "num_tokens": 198471593.0, + "step": 7951 + }, + { + "epoch": 0.8732703711838349, + "grad_norm": 2.074673891067505, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7028166055679321, + "num_tokens": 198498601.0, + "step": 7952 + }, + { + "epoch": 0.8733801888864485, + "grad_norm": 2.116196632385254, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.702689528465271, + "num_tokens": 198525741.0, + "step": 7953 + }, + { + "epoch": 0.8734900065890622, + "grad_norm": 2.2305307388305664, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.7052879333496094, + "num_tokens": 198550588.0, + "step": 7954 + }, + { + "epoch": 0.8735998242916758, + "grad_norm": 2.2224478721618652, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7012746334075928, + "num_tokens": 198575483.0, + "step": 7955 + }, + { + "epoch": 0.8737096419942895, + "grad_norm": 2.0266029834747314, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6853051781654358, + "num_tokens": 198606213.0, + "step": 7956 + }, + { + "epoch": 0.8738194596969031, + "grad_norm": 2.358942985534668, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6925559043884277, + "num_tokens": 198630399.0, + "step": 7957 + }, + { + "epoch": 0.8739292773995168, + "grad_norm": 1.994680404663086, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7142183184623718, + "num_tokens": 198658327.0, + "step": 7958 + }, + { + "epoch": 0.8740390951021305, + "grad_norm": 1.9464209079742432, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7096730470657349, + "num_tokens": 198688376.0, + "step": 7959 + }, + { + "epoch": 0.8741489128047442, + "grad_norm": 2.1220664978027344, + "learning_rate": 1e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6750074625015259, + "num_tokens": 198717725.0, + "step": 7960 + }, + { + "epoch": 0.8742587305073578, + "grad_norm": 2.4405617713928223, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7172610759735107, + "num_tokens": 198739291.0, + "step": 7961 + }, + { + "epoch": 0.8743685482099715, + "grad_norm": 2.0922985076904297, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7094451189041138, + "num_tokens": 198767416.0, + "step": 7962 + }, + { + "epoch": 0.8744783659125851, + "grad_norm": 2.084895610809326, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7027119398117065, + "num_tokens": 198794280.0, + "step": 7963 + }, + { + "epoch": 0.8745881836151987, + "grad_norm": 2.63262939453125, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7218194603919983, + "num_tokens": 198814420.0, + "step": 7964 + }, + { + "epoch": 0.8746980013178124, + "grad_norm": 2.164921283721924, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7257798910140991, + "num_tokens": 198838718.0, + "step": 7965 + }, + { + "epoch": 0.8748078190204261, + "grad_norm": 2.0861518383026123, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7121780514717102, + "num_tokens": 198866074.0, + "step": 7966 + }, + { + "epoch": 0.8749176367230398, + "grad_norm": 2.186244249343872, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7032392621040344, + "num_tokens": 198890624.0, + "step": 7967 + }, + { + "epoch": 0.8750274544256534, + "grad_norm": 2.038038969039917, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7147835493087769, + "num_tokens": 198921293.0, + "step": 7968 + }, + { + "epoch": 0.8751372721282671, + "grad_norm": 2.5040526390075684, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7225195169448853, + "num_tokens": 198940150.0, + "step": 7969 + }, + { + "epoch": 0.8752470898308807, + "grad_norm": 2.4680535793304443, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.722797155380249, + "num_tokens": 198960178.0, + "step": 7970 + }, + { + "epoch": 0.8753569075334944, + "grad_norm": 2.1572115421295166, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7181315422058105, + "num_tokens": 198987252.0, + "step": 7971 + }, + { + "epoch": 0.875466725236108, + "grad_norm": 2.2187931537628174, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6968146562576294, + "num_tokens": 199014064.0, + "step": 7972 + }, + { + "epoch": 0.8755765429387217, + "grad_norm": 1.9911870956420898, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7063671350479126, + "num_tokens": 199044581.0, + "step": 7973 + }, + { + "epoch": 0.8756863606413354, + "grad_norm": 2.1557457447052, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7038151025772095, + "num_tokens": 199072169.0, + "step": 7974 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 2.2902934551239014, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6869370937347412, + "num_tokens": 199095912.0, + "step": 7975 + }, + { + "epoch": 0.8759059960465627, + "grad_norm": 2.0315027236938477, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7217413187026978, + "num_tokens": 199123679.0, + "step": 7976 + }, + { + "epoch": 0.8760158137491764, + "grad_norm": 2.3381457328796387, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7101191282272339, + "num_tokens": 199147797.0, + "step": 7977 + }, + { + "epoch": 0.87612563145179, + "grad_norm": 2.590895175933838, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7242529988288879, + "num_tokens": 199165504.0, + "step": 7978 + }, + { + "epoch": 0.8762354491544037, + "grad_norm": 2.3514227867126465, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7013252973556519, + "num_tokens": 199187079.0, + "step": 7979 + }, + { + "epoch": 0.8763452668570173, + "grad_norm": 2.3600921630859375, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7186755537986755, + "num_tokens": 199212870.0, + "step": 7980 + }, + { + "epoch": 0.8764550845596311, + "grad_norm": 2.0469281673431396, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.6915850639343262, + "num_tokens": 199241133.0, + "step": 7981 + }, + { + "epoch": 0.8765649022622447, + "grad_norm": 2.1851601600646973, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7244583964347839, + "num_tokens": 199264837.0, + "step": 7982 + }, + { + "epoch": 0.8766747199648584, + "grad_norm": 2.138815402984619, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7106952667236328, + "num_tokens": 199292278.0, + "step": 7983 + }, + { + "epoch": 0.876784537667472, + "grad_norm": 2.376981735229492, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7086696624755859, + "num_tokens": 199313646.0, + "step": 7984 + }, + { + "epoch": 0.8768943553700856, + "grad_norm": 1.9842545986175537, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.6933467984199524, + "num_tokens": 199344297.0, + "step": 7985 + }, + { + "epoch": 0.8770041730726993, + "grad_norm": 2.0572192668914795, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7075293064117432, + "num_tokens": 199369889.0, + "step": 7986 + }, + { + "epoch": 0.8771139907753129, + "grad_norm": 2.3796446323394775, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6922562122344971, + "num_tokens": 199393919.0, + "step": 7987 + }, + { + "epoch": 0.8772238084779267, + "grad_norm": 1.9916048049926758, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6943593621253967, + "num_tokens": 199423966.0, + "step": 7988 + }, + { + "epoch": 0.8773336261805403, + "grad_norm": 2.082576274871826, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6953370571136475, + "num_tokens": 199451989.0, + "step": 7989 + }, + { + "epoch": 0.877443443883154, + "grad_norm": 2.3388476371765137, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7176622152328491, + "num_tokens": 199473630.0, + "step": 7990 + }, + { + "epoch": 0.8775532615857676, + "grad_norm": 2.259319543838501, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7016715407371521, + "num_tokens": 199496482.0, + "step": 7991 + }, + { + "epoch": 0.8776630792883813, + "grad_norm": 2.242527723312378, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.711783230304718, + "num_tokens": 199521880.0, + "step": 7992 + }, + { + "epoch": 0.8777728969909949, + "grad_norm": 2.103410482406616, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7294890880584717, + "num_tokens": 199546721.0, + "step": 7993 + }, + { + "epoch": 0.8778827146936086, + "grad_norm": 1.9418010711669922, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6799879670143127, + "num_tokens": 199581643.0, + "step": 7994 + }, + { + "epoch": 0.8779925323962223, + "grad_norm": 2.2397780418395996, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6961789131164551, + "num_tokens": 199606706.0, + "step": 7995 + }, + { + "epoch": 0.878102350098836, + "grad_norm": 1.9337165355682373, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6899853944778442, + "num_tokens": 199640525.0, + "step": 7996 + }, + { + "epoch": 0.8782121678014496, + "grad_norm": 2.223362684249878, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7165035605430603, + "num_tokens": 199664726.0, + "step": 7997 + }, + { + "epoch": 0.8783219855040633, + "grad_norm": 2.328472375869751, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6914680600166321, + "num_tokens": 199686866.0, + "step": 7998 + }, + { + "epoch": 0.8784318032066769, + "grad_norm": 2.09031343460083, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6905319690704346, + "num_tokens": 199716103.0, + "step": 7999 + }, + { + "epoch": 0.8785416209092906, + "grad_norm": 2.6560237407684326, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7415369749069214, + "num_tokens": 199733964.0, + "step": 8000 + }, + { + "epoch": 0.8786514386119042, + "grad_norm": 2.001966953277588, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.6979285478591919, + "num_tokens": 199763741.0, + "step": 8001 + }, + { + "epoch": 0.8787612563145178, + "grad_norm": 2.2975990772247314, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7258737087249756, + "num_tokens": 199786761.0, + "step": 8002 + }, + { + "epoch": 0.8788710740171316, + "grad_norm": 2.549795627593994, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7220879793167114, + "num_tokens": 199806104.0, + "step": 8003 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 2.06387996673584, + "learning_rate": 1e-06, + "loss": 1.0609, + "mean_token_accuracy": 0.6855480670928955, + "num_tokens": 199835630.0, + "step": 8004 + }, + { + "epoch": 0.8790907094223589, + "grad_norm": 2.1458351612091064, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7221500277519226, + "num_tokens": 199860439.0, + "step": 8005 + }, + { + "epoch": 0.8792005271249725, + "grad_norm": 2.5938494205474854, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.737527072429657, + "num_tokens": 199878251.0, + "step": 8006 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 2.327813148498535, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6970758438110352, + "num_tokens": 199901783.0, + "step": 8007 + }, + { + "epoch": 0.8794201625301998, + "grad_norm": 2.0767743587493896, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7154473066329956, + "num_tokens": 199928418.0, + "step": 8008 + }, + { + "epoch": 0.8795299802328135, + "grad_norm": 2.1919236183166504, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7417325973510742, + "num_tokens": 199951325.0, + "step": 8009 + }, + { + "epoch": 0.8796397979354272, + "grad_norm": 2.0776267051696777, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7115849852561951, + "num_tokens": 199977165.0, + "step": 8010 + }, + { + "epoch": 0.8797496156380409, + "grad_norm": 2.315385103225708, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7362197637557983, + "num_tokens": 199999855.0, + "step": 8011 + }, + { + "epoch": 0.8798594333406545, + "grad_norm": 2.182234764099121, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7239373326301575, + "num_tokens": 200026869.0, + "step": 8012 + }, + { + "epoch": 0.8799692510432682, + "grad_norm": 2.3106610774993896, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.734288215637207, + "num_tokens": 200048800.0, + "step": 8013 + }, + { + "epoch": 0.8800790687458818, + "grad_norm": 2.1477854251861572, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7092580795288086, + "num_tokens": 200077047.0, + "step": 8014 + }, + { + "epoch": 0.8801888864484955, + "grad_norm": 2.2097156047821045, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7007679343223572, + "num_tokens": 200101712.0, + "step": 8015 + }, + { + "epoch": 0.8802987041511091, + "grad_norm": 2.036376953125, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.703106164932251, + "num_tokens": 200130371.0, + "step": 8016 + }, + { + "epoch": 0.8804085218537229, + "grad_norm": 2.5287744998931885, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7157272100448608, + "num_tokens": 200151381.0, + "step": 8017 + }, + { + "epoch": 0.8805183395563365, + "grad_norm": 2.161367416381836, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6929254531860352, + "num_tokens": 200176282.0, + "step": 8018 + }, + { + "epoch": 0.8806281572589502, + "grad_norm": 2.201826333999634, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7058002352714539, + "num_tokens": 200201664.0, + "step": 8019 + }, + { + "epoch": 0.8807379749615638, + "grad_norm": 2.0307865142822266, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7098944187164307, + "num_tokens": 200230414.0, + "step": 8020 + }, + { + "epoch": 0.8808477926641775, + "grad_norm": 2.1906120777130127, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7095874547958374, + "num_tokens": 200253786.0, + "step": 8021 + }, + { + "epoch": 0.8809576103667911, + "grad_norm": 2.4264519214630127, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7374168634414673, + "num_tokens": 200274870.0, + "step": 8022 + }, + { + "epoch": 0.8810674280694047, + "grad_norm": 2.1138992309570312, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7075986862182617, + "num_tokens": 200302392.0, + "step": 8023 + }, + { + "epoch": 0.8811772457720185, + "grad_norm": 2.0517919063568115, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7010395526885986, + "num_tokens": 200332018.0, + "step": 8024 + }, + { + "epoch": 0.8812870634746321, + "grad_norm": 2.191227912902832, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7285647392272949, + "num_tokens": 200356806.0, + "step": 8025 + }, + { + "epoch": 0.8813968811772458, + "grad_norm": 2.2522597312927246, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.713445782661438, + "num_tokens": 200380703.0, + "step": 8026 + }, + { + "epoch": 0.8815066988798594, + "grad_norm": 1.8828752040863037, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6978325247764587, + "num_tokens": 200410723.0, + "step": 8027 + }, + { + "epoch": 0.8816165165824731, + "grad_norm": 2.0832908153533936, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7181060314178467, + "num_tokens": 200436365.0, + "step": 8028 + }, + { + "epoch": 0.8817263342850867, + "grad_norm": 2.219984292984009, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7115834355354309, + "num_tokens": 200460703.0, + "step": 8029 + }, + { + "epoch": 0.8818361519877004, + "grad_norm": 2.054180383682251, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7220873236656189, + "num_tokens": 200486877.0, + "step": 8030 + }, + { + "epoch": 0.881945969690314, + "grad_norm": 2.336733341217041, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.720141589641571, + "num_tokens": 200508471.0, + "step": 8031 + }, + { + "epoch": 0.8820557873929278, + "grad_norm": 2.6683921813964844, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7226955890655518, + "num_tokens": 200527169.0, + "step": 8032 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 2.100822925567627, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7230022549629211, + "num_tokens": 200553994.0, + "step": 8033 + }, + { + "epoch": 0.8822754227981551, + "grad_norm": 2.2941715717315674, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7214647531509399, + "num_tokens": 200577176.0, + "step": 8034 + }, + { + "epoch": 0.8823852405007687, + "grad_norm": 2.203341484069824, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7172104716300964, + "num_tokens": 200600728.0, + "step": 8035 + }, + { + "epoch": 0.8824950582033824, + "grad_norm": 2.5967283248901367, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7093392610549927, + "num_tokens": 200619807.0, + "step": 8036 + }, + { + "epoch": 0.882604875905996, + "grad_norm": 1.935524344444275, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7066998481750488, + "num_tokens": 200649777.0, + "step": 8037 + }, + { + "epoch": 0.8827146936086097, + "grad_norm": 2.2740674018859863, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7048647403717041, + "num_tokens": 200672253.0, + "step": 8038 + }, + { + "epoch": 0.8828245113112234, + "grad_norm": 2.0374674797058105, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7027207016944885, + "num_tokens": 200701941.0, + "step": 8039 + }, + { + "epoch": 0.8829343290138371, + "grad_norm": 2.118804693222046, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.712437629699707, + "num_tokens": 200729367.0, + "step": 8040 + }, + { + "epoch": 0.8830441467164507, + "grad_norm": 2.1807711124420166, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7219285368919373, + "num_tokens": 200753350.0, + "step": 8041 + }, + { + "epoch": 0.8831539644190644, + "grad_norm": 2.2789523601531982, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6959378123283386, + "num_tokens": 200778932.0, + "step": 8042 + }, + { + "epoch": 0.883263782121678, + "grad_norm": 2.393963575363159, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7341457009315491, + "num_tokens": 200800148.0, + "step": 8043 + }, + { + "epoch": 0.8833735998242916, + "grad_norm": 2.105304002761841, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7225264310836792, + "num_tokens": 200825637.0, + "step": 8044 + }, + { + "epoch": 0.8834834175269053, + "grad_norm": 2.299464464187622, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6974852085113525, + "num_tokens": 200849349.0, + "step": 8045 + }, + { + "epoch": 0.883593235229519, + "grad_norm": 2.4211795330047607, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7080581188201904, + "num_tokens": 200873151.0, + "step": 8046 + }, + { + "epoch": 0.8837030529321327, + "grad_norm": 2.1409425735473633, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7019864320755005, + "num_tokens": 200897720.0, + "step": 8047 + }, + { + "epoch": 0.8838128706347463, + "grad_norm": 1.9399912357330322, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7252756953239441, + "num_tokens": 200927937.0, + "step": 8048 + }, + { + "epoch": 0.88392268833736, + "grad_norm": 2.392286777496338, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7007664442062378, + "num_tokens": 200951039.0, + "step": 8049 + }, + { + "epoch": 0.8840325060399736, + "grad_norm": 2.089501142501831, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7086216807365417, + "num_tokens": 200976936.0, + "step": 8050 + }, + { + "epoch": 0.8841423237425873, + "grad_norm": 2.4241719245910645, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7426666021347046, + "num_tokens": 200996356.0, + "step": 8051 + }, + { + "epoch": 0.8842521414452009, + "grad_norm": 2.0657002925872803, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7196127772331238, + "num_tokens": 201022757.0, + "step": 8052 + }, + { + "epoch": 0.8843619591478147, + "grad_norm": 2.330228328704834, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7019338011741638, + "num_tokens": 201045258.0, + "step": 8053 + }, + { + "epoch": 0.8844717768504283, + "grad_norm": 2.3259520530700684, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7124133110046387, + "num_tokens": 201069386.0, + "step": 8054 + }, + { + "epoch": 0.884581594553042, + "grad_norm": 2.37841534614563, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.703515887260437, + "num_tokens": 201092510.0, + "step": 8055 + }, + { + "epoch": 0.8846914122556556, + "grad_norm": 2.3547823429107666, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7241209149360657, + "num_tokens": 201113687.0, + "step": 8056 + }, + { + "epoch": 0.8848012299582693, + "grad_norm": 2.264223575592041, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7114638090133667, + "num_tokens": 201137800.0, + "step": 8057 + }, + { + "epoch": 0.8849110476608829, + "grad_norm": 1.7691407203674316, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6991672515869141, + "num_tokens": 201173987.0, + "step": 8058 + }, + { + "epoch": 0.8850208653634966, + "grad_norm": 2.3738975524902344, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7118101716041565, + "num_tokens": 201196079.0, + "step": 8059 + }, + { + "epoch": 0.8851306830661102, + "grad_norm": 2.095979928970337, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6907947063446045, + "num_tokens": 201223226.0, + "step": 8060 + }, + { + "epoch": 0.885240500768724, + "grad_norm": 2.3839519023895264, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6956892609596252, + "num_tokens": 201248266.0, + "step": 8061 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 2.118743419647217, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7079836130142212, + "num_tokens": 201275339.0, + "step": 8062 + }, + { + "epoch": 0.8854601361739513, + "grad_norm": 1.9481180906295776, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7148764133453369, + "num_tokens": 201304211.0, + "step": 8063 + }, + { + "epoch": 0.8855699538765649, + "grad_norm": 2.247180461883545, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7167631387710571, + "num_tokens": 201329288.0, + "step": 8064 + }, + { + "epoch": 0.8856797715791785, + "grad_norm": 2.2176523208618164, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6850228309631348, + "num_tokens": 201355328.0, + "step": 8065 + }, + { + "epoch": 0.8857895892817922, + "grad_norm": 2.113494873046875, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.712857723236084, + "num_tokens": 201381873.0, + "step": 8066 + }, + { + "epoch": 0.8858994069844058, + "grad_norm": 2.1355056762695312, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7057121992111206, + "num_tokens": 201408531.0, + "step": 8067 + }, + { + "epoch": 0.8860092246870196, + "grad_norm": 2.2446866035461426, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7348921298980713, + "num_tokens": 201432259.0, + "step": 8068 + }, + { + "epoch": 0.8861190423896332, + "grad_norm": 2.8554563522338867, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6966560482978821, + "num_tokens": 201451032.0, + "step": 8069 + }, + { + "epoch": 0.8862288600922469, + "grad_norm": 2.416398525238037, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.740271806716919, + "num_tokens": 201471045.0, + "step": 8070 + }, + { + "epoch": 0.8863386777948605, + "grad_norm": 2.1902341842651367, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7372434735298157, + "num_tokens": 201497605.0, + "step": 8071 + }, + { + "epoch": 0.8864484954974742, + "grad_norm": 2.545821189880371, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.710807204246521, + "num_tokens": 201518215.0, + "step": 8072 + }, + { + "epoch": 0.8865583132000878, + "grad_norm": 2.0959556102752686, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7213878631591797, + "num_tokens": 201545689.0, + "step": 8073 + }, + { + "epoch": 0.8866681309027015, + "grad_norm": 2.168489694595337, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7324295043945312, + "num_tokens": 201572542.0, + "step": 8074 + }, + { + "epoch": 0.8867779486053152, + "grad_norm": 2.2849626541137695, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7024022340774536, + "num_tokens": 201596385.0, + "step": 8075 + }, + { + "epoch": 0.8868877663079289, + "grad_norm": 2.067654609680176, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7277068495750427, + "num_tokens": 201621556.0, + "step": 8076 + }, + { + "epoch": 0.8869975840105425, + "grad_norm": 2.3690383434295654, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7142422199249268, + "num_tokens": 201644458.0, + "step": 8077 + }, + { + "epoch": 0.8871074017131562, + "grad_norm": 1.9754457473754883, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6945239305496216, + "num_tokens": 201675233.0, + "step": 8078 + }, + { + "epoch": 0.8872172194157698, + "grad_norm": 2.4326560497283936, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7079005837440491, + "num_tokens": 201696498.0, + "step": 8079 + }, + { + "epoch": 0.8873270371183835, + "grad_norm": 2.3786299228668213, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7000075578689575, + "num_tokens": 201716947.0, + "step": 8080 + }, + { + "epoch": 0.8874368548209971, + "grad_norm": 2.3233702182769775, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6809552311897278, + "num_tokens": 201740028.0, + "step": 8081 + }, + { + "epoch": 0.8875466725236109, + "grad_norm": 2.506537437438965, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7099442481994629, + "num_tokens": 201760303.0, + "step": 8082 + }, + { + "epoch": 0.8876564902262245, + "grad_norm": 2.4597246646881104, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7203630208969116, + "num_tokens": 201780894.0, + "step": 8083 + }, + { + "epoch": 0.8877663079288381, + "grad_norm": 2.1185007095336914, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.707348644733429, + "num_tokens": 201807653.0, + "step": 8084 + }, + { + "epoch": 0.8878761256314518, + "grad_norm": 2.245654344558716, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.707743227481842, + "num_tokens": 201831056.0, + "step": 8085 + }, + { + "epoch": 0.8879859433340654, + "grad_norm": 2.412184000015259, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7105706334114075, + "num_tokens": 201852988.0, + "step": 8086 + }, + { + "epoch": 0.8880957610366791, + "grad_norm": 2.091141939163208, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6931065320968628, + "num_tokens": 201881448.0, + "step": 8087 + }, + { + "epoch": 0.8882055787392927, + "grad_norm": 2.5379838943481445, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7163154482841492, + "num_tokens": 201902651.0, + "step": 8088 + }, + { + "epoch": 0.8883153964419065, + "grad_norm": 2.4571073055267334, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7196232676506042, + "num_tokens": 201924462.0, + "step": 8089 + }, + { + "epoch": 0.8884252141445201, + "grad_norm": 2.039088487625122, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6973403692245483, + "num_tokens": 201956019.0, + "step": 8090 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 1.8664644956588745, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6872774958610535, + "num_tokens": 201989704.0, + "step": 8091 + }, + { + "epoch": 0.8886448495497474, + "grad_norm": 2.1425774097442627, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7023248076438904, + "num_tokens": 202016103.0, + "step": 8092 + }, + { + "epoch": 0.8887546672523611, + "grad_norm": 2.127574920654297, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7021744847297668, + "num_tokens": 202044029.0, + "step": 8093 + }, + { + "epoch": 0.8888644849549747, + "grad_norm": 1.9639896154403687, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7162955403327942, + "num_tokens": 202073582.0, + "step": 8094 + }, + { + "epoch": 0.8889743026575884, + "grad_norm": 2.032721757888794, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7045820355415344, + "num_tokens": 202102595.0, + "step": 8095 + }, + { + "epoch": 0.889084120360202, + "grad_norm": 2.3666982650756836, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7156115770339966, + "num_tokens": 202125853.0, + "step": 8096 + }, + { + "epoch": 0.8891939380628158, + "grad_norm": 2.144834518432617, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7180496454238892, + "num_tokens": 202152083.0, + "step": 8097 + }, + { + "epoch": 0.8893037557654294, + "grad_norm": 2.1904256343841553, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6909987926483154, + "num_tokens": 202178492.0, + "step": 8098 + }, + { + "epoch": 0.8894135734680431, + "grad_norm": 2.2863998413085938, + "learning_rate": 1e-06, + "loss": 0.7605, + "mean_token_accuracy": 0.756045937538147, + "num_tokens": 202198880.0, + "step": 8099 + }, + { + "epoch": 0.8895233911706567, + "grad_norm": 1.9907817840576172, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6831102967262268, + "num_tokens": 202233231.0, + "step": 8100 + }, + { + "epoch": 0.8896332088732704, + "grad_norm": 2.0400326251983643, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7033751010894775, + "num_tokens": 202260731.0, + "step": 8101 + }, + { + "epoch": 0.889743026575884, + "grad_norm": 2.4240453243255615, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6979409456253052, + "num_tokens": 202283409.0, + "step": 8102 + }, + { + "epoch": 0.8898528442784976, + "grad_norm": 1.969769835472107, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.6974940299987793, + "num_tokens": 202314644.0, + "step": 8103 + }, + { + "epoch": 0.8899626619811114, + "grad_norm": 2.327915668487549, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7031360864639282, + "num_tokens": 202338045.0, + "step": 8104 + }, + { + "epoch": 0.890072479683725, + "grad_norm": 2.107391595840454, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6833522319793701, + "num_tokens": 202366075.0, + "step": 8105 + }, + { + "epoch": 0.8901822973863387, + "grad_norm": 2.3046813011169434, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6982946991920471, + "num_tokens": 202394500.0, + "step": 8106 + }, + { + "epoch": 0.8902921150889523, + "grad_norm": 2.200477123260498, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.712342381477356, + "num_tokens": 202419812.0, + "step": 8107 + }, + { + "epoch": 0.890401932791566, + "grad_norm": 2.697624683380127, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7246742248535156, + "num_tokens": 202437403.0, + "step": 8108 + }, + { + "epoch": 0.8905117504941796, + "grad_norm": 2.025012254714966, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6967929005622864, + "num_tokens": 202464870.0, + "step": 8109 + }, + { + "epoch": 0.8906215681967933, + "grad_norm": 2.173407793045044, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7084833383560181, + "num_tokens": 202490078.0, + "step": 8110 + }, + { + "epoch": 0.890731385899407, + "grad_norm": 2.3803229331970215, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7027855515480042, + "num_tokens": 202515259.0, + "step": 8111 + }, + { + "epoch": 0.8908412036020207, + "grad_norm": 2.0834054946899414, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7259724736213684, + "num_tokens": 202541650.0, + "step": 8112 + }, + { + "epoch": 0.8909510213046343, + "grad_norm": 2.508653163909912, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.731591522693634, + "num_tokens": 202560799.0, + "step": 8113 + }, + { + "epoch": 0.891060839007248, + "grad_norm": 2.5723214149475098, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7240982055664062, + "num_tokens": 202580185.0, + "step": 8114 + }, + { + "epoch": 0.8911706567098616, + "grad_norm": 1.9179306030273438, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7005293965339661, + "num_tokens": 202610406.0, + "step": 8115 + }, + { + "epoch": 0.8912804744124753, + "grad_norm": 1.9789811372756958, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7326299548149109, + "num_tokens": 202638838.0, + "step": 8116 + }, + { + "epoch": 0.8913902921150889, + "grad_norm": 2.3094401359558105, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7301658391952515, + "num_tokens": 202660625.0, + "step": 8117 + }, + { + "epoch": 0.8915001098177027, + "grad_norm": 2.2482473850250244, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.6968581676483154, + "num_tokens": 202684434.0, + "step": 8118 + }, + { + "epoch": 0.8916099275203163, + "grad_norm": 2.2172763347625732, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7151963710784912, + "num_tokens": 202710749.0, + "step": 8119 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 2.164228677749634, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.712622880935669, + "num_tokens": 202736584.0, + "step": 8120 + }, + { + "epoch": 0.8918295629255436, + "grad_norm": 2.020003080368042, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7029500007629395, + "num_tokens": 202766497.0, + "step": 8121 + }, + { + "epoch": 0.8919393806281573, + "grad_norm": 2.048433542251587, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7041061520576477, + "num_tokens": 202793974.0, + "step": 8122 + }, + { + "epoch": 0.8920491983307709, + "grad_norm": 2.217263698577881, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7007405757904053, + "num_tokens": 202818127.0, + "step": 8123 + }, + { + "epoch": 0.8921590160333845, + "grad_norm": 2.5552430152893066, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7142379283905029, + "num_tokens": 202836470.0, + "step": 8124 + }, + { + "epoch": 0.8922688337359982, + "grad_norm": 2.141632080078125, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7278969883918762, + "num_tokens": 202862467.0, + "step": 8125 + }, + { + "epoch": 0.892378651438612, + "grad_norm": 2.292912483215332, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7117623686790466, + "num_tokens": 202884509.0, + "step": 8126 + }, + { + "epoch": 0.8924884691412256, + "grad_norm": 2.273029327392578, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7248908281326294, + "num_tokens": 202906266.0, + "step": 8127 + }, + { + "epoch": 0.8925982868438392, + "grad_norm": 2.409921646118164, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7183831930160522, + "num_tokens": 202927979.0, + "step": 8128 + }, + { + "epoch": 0.8927081045464529, + "grad_norm": 2.5680832862854004, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7450630068778992, + "num_tokens": 202946155.0, + "step": 8129 + }, + { + "epoch": 0.8928179222490665, + "grad_norm": 2.126591682434082, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7120155692100525, + "num_tokens": 202971514.0, + "step": 8130 + }, + { + "epoch": 0.8929277399516802, + "grad_norm": 2.72371244430542, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7139312624931335, + "num_tokens": 202991527.0, + "step": 8131 + }, + { + "epoch": 0.8930375576542938, + "grad_norm": 1.7959566116333008, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7032486200332642, + "num_tokens": 203028304.0, + "step": 8132 + }, + { + "epoch": 0.8931473753569076, + "grad_norm": 2.466451406478882, + "learning_rate": 1e-06, + "loss": 1.0968, + "mean_token_accuracy": 0.6745115518569946, + "num_tokens": 203054751.0, + "step": 8133 + }, + { + "epoch": 0.8932571930595212, + "grad_norm": 2.1040892601013184, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6841185092926025, + "num_tokens": 203083564.0, + "step": 8134 + }, + { + "epoch": 0.8933670107621349, + "grad_norm": 2.070549249649048, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6825730800628662, + "num_tokens": 203111441.0, + "step": 8135 + }, + { + "epoch": 0.8934768284647485, + "grad_norm": 2.201704263687134, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7170802354812622, + "num_tokens": 203136747.0, + "step": 8136 + }, + { + "epoch": 0.8935866461673622, + "grad_norm": 2.1851847171783447, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.708735466003418, + "num_tokens": 203161334.0, + "step": 8137 + }, + { + "epoch": 0.8936964638699758, + "grad_norm": 2.09567928314209, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7141565084457397, + "num_tokens": 203188477.0, + "step": 8138 + }, + { + "epoch": 0.8938062815725895, + "grad_norm": 2.1827073097229004, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7008728981018066, + "num_tokens": 203214458.0, + "step": 8139 + }, + { + "epoch": 0.8939160992752032, + "grad_norm": 2.402573347091675, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.710768461227417, + "num_tokens": 203237511.0, + "step": 8140 + }, + { + "epoch": 0.8940259169778169, + "grad_norm": 2.2665092945098877, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6993343830108643, + "num_tokens": 203261029.0, + "step": 8141 + }, + { + "epoch": 0.8941357346804305, + "grad_norm": 2.4402987957000732, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7128205895423889, + "num_tokens": 203281335.0, + "step": 8142 + }, + { + "epoch": 0.8942455523830442, + "grad_norm": 2.3932888507843018, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7081094980239868, + "num_tokens": 203303890.0, + "step": 8143 + }, + { + "epoch": 0.8943553700856578, + "grad_norm": 1.9686641693115234, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6938450336456299, + "num_tokens": 203336454.0, + "step": 8144 + }, + { + "epoch": 0.8944651877882714, + "grad_norm": 1.9256610870361328, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6967751979827881, + "num_tokens": 203367679.0, + "step": 8145 + }, + { + "epoch": 0.8945750054908851, + "grad_norm": 1.9467015266418457, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7158764004707336, + "num_tokens": 203396415.0, + "step": 8146 + }, + { + "epoch": 0.8946848231934988, + "grad_norm": 2.3224668502807617, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6969054937362671, + "num_tokens": 203421689.0, + "step": 8147 + }, + { + "epoch": 0.8947946408961125, + "grad_norm": 2.4422690868377686, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.716321587562561, + "num_tokens": 203442856.0, + "step": 8148 + }, + { + "epoch": 0.8949044585987261, + "grad_norm": 2.590561866760254, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7065690755844116, + "num_tokens": 203461780.0, + "step": 8149 + }, + { + "epoch": 0.8950142763013398, + "grad_norm": 2.2834227085113525, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7053923606872559, + "num_tokens": 203485680.0, + "step": 8150 + }, + { + "epoch": 0.8951240940039534, + "grad_norm": 2.1522185802459717, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.705869197845459, + "num_tokens": 203513256.0, + "step": 8151 + }, + { + "epoch": 0.8952339117065671, + "grad_norm": 2.3208634853363037, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7073830366134644, + "num_tokens": 203537238.0, + "step": 8152 + }, + { + "epoch": 0.8953437294091807, + "grad_norm": 2.216902732849121, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.713772177696228, + "num_tokens": 203562357.0, + "step": 8153 + }, + { + "epoch": 0.8954535471117944, + "grad_norm": 2.0861754417419434, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.711264967918396, + "num_tokens": 203589020.0, + "step": 8154 + }, + { + "epoch": 0.8955633648144081, + "grad_norm": 2.0870370864868164, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.712611734867096, + "num_tokens": 203614272.0, + "step": 8155 + }, + { + "epoch": 0.8956731825170218, + "grad_norm": 2.14898419380188, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7246499061584473, + "num_tokens": 203638580.0, + "step": 8156 + }, + { + "epoch": 0.8957830002196354, + "grad_norm": 2.242022752761841, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.725229024887085, + "num_tokens": 203662762.0, + "step": 8157 + }, + { + "epoch": 0.8958928179222491, + "grad_norm": 2.273678779602051, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7279224991798401, + "num_tokens": 203684441.0, + "step": 8158 + }, + { + "epoch": 0.8960026356248627, + "grad_norm": 2.320190906524658, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7156411409378052, + "num_tokens": 203708271.0, + "step": 8159 + }, + { + "epoch": 0.8961124533274764, + "grad_norm": 2.112544298171997, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.698517382144928, + "num_tokens": 203734942.0, + "step": 8160 + }, + { + "epoch": 0.89622227103009, + "grad_norm": 2.4721930027008057, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7012101411819458, + "num_tokens": 203754879.0, + "step": 8161 + }, + { + "epoch": 0.8963320887327038, + "grad_norm": 2.0567195415496826, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.711988091468811, + "num_tokens": 203782377.0, + "step": 8162 + }, + { + "epoch": 0.8964419064353174, + "grad_norm": 2.572836399078369, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7257814407348633, + "num_tokens": 203801601.0, + "step": 8163 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 2.3127424716949463, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7112842202186584, + "num_tokens": 203824042.0, + "step": 8164 + }, + { + "epoch": 0.8966615418405447, + "grad_norm": 2.089491844177246, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6982380151748657, + "num_tokens": 203851073.0, + "step": 8165 + }, + { + "epoch": 0.8967713595431583, + "grad_norm": 2.301254987716675, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7265375256538391, + "num_tokens": 203874190.0, + "step": 8166 + }, + { + "epoch": 0.896881177245772, + "grad_norm": 2.006162643432617, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7111897468566895, + "num_tokens": 203902447.0, + "step": 8167 + }, + { + "epoch": 0.8969909949483856, + "grad_norm": 1.9289004802703857, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7023463249206543, + "num_tokens": 203934337.0, + "step": 8168 + }, + { + "epoch": 0.8971008126509994, + "grad_norm": 2.2596352100372314, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7114641070365906, + "num_tokens": 203957596.0, + "step": 8169 + }, + { + "epoch": 0.897210630353613, + "grad_norm": 2.093141794204712, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7133963108062744, + "num_tokens": 203982447.0, + "step": 8170 + }, + { + "epoch": 0.8973204480562267, + "grad_norm": 2.1235604286193848, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6938992738723755, + "num_tokens": 204011409.0, + "step": 8171 + }, + { + "epoch": 0.8974302657588403, + "grad_norm": 2.2299084663391113, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7311775088310242, + "num_tokens": 204034428.0, + "step": 8172 + }, + { + "epoch": 0.897540083461454, + "grad_norm": 2.450155735015869, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6990752220153809, + "num_tokens": 204055274.0, + "step": 8173 + }, + { + "epoch": 0.8976499011640676, + "grad_norm": 2.257016897201538, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7163563370704651, + "num_tokens": 204079577.0, + "step": 8174 + }, + { + "epoch": 0.8977597188666813, + "grad_norm": 2.1864168643951416, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.688119649887085, + "num_tokens": 204105896.0, + "step": 8175 + }, + { + "epoch": 0.897869536569295, + "grad_norm": 2.3009400367736816, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6982955932617188, + "num_tokens": 204128974.0, + "step": 8176 + }, + { + "epoch": 0.8979793542719087, + "grad_norm": 2.0798885822296143, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7065276503562927, + "num_tokens": 204155368.0, + "step": 8177 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 2.2466676235198975, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6926718950271606, + "num_tokens": 204181417.0, + "step": 8178 + }, + { + "epoch": 0.898198989677136, + "grad_norm": 2.578364372253418, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7096073031425476, + "num_tokens": 204200677.0, + "step": 8179 + }, + { + "epoch": 0.8983088073797496, + "grad_norm": 2.234696388244629, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.6979851722717285, + "num_tokens": 204225457.0, + "step": 8180 + }, + { + "epoch": 0.8984186250823633, + "grad_norm": 2.3524091243743896, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7365013957023621, + "num_tokens": 204248386.0, + "step": 8181 + }, + { + "epoch": 0.8985284427849769, + "grad_norm": 2.257020950317383, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7140292525291443, + "num_tokens": 204272240.0, + "step": 8182 + }, + { + "epoch": 0.8986382604875905, + "grad_norm": 2.0705058574676514, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6929994821548462, + "num_tokens": 204301241.0, + "step": 8183 + }, + { + "epoch": 0.8987480781902043, + "grad_norm": 2.012134313583374, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6909089088439941, + "num_tokens": 204331960.0, + "step": 8184 + }, + { + "epoch": 0.898857895892818, + "grad_norm": 2.403848648071289, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7037450671195984, + "num_tokens": 204354999.0, + "step": 8185 + }, + { + "epoch": 0.8989677135954316, + "grad_norm": 2.0355327129364014, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6877492070198059, + "num_tokens": 204385505.0, + "step": 8186 + }, + { + "epoch": 0.8990775312980452, + "grad_norm": 2.521423578262329, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7059637904167175, + "num_tokens": 204406094.0, + "step": 8187 + }, + { + "epoch": 0.8991873490006589, + "grad_norm": 2.030082941055298, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7009294033050537, + "num_tokens": 204436210.0, + "step": 8188 + }, + { + "epoch": 0.8992971667032725, + "grad_norm": 2.210559129714966, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.7008076906204224, + "num_tokens": 204462382.0, + "step": 8189 + }, + { + "epoch": 0.8994069844058862, + "grad_norm": 2.145033836364746, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.698685348033905, + "num_tokens": 204490191.0, + "step": 8190 + }, + { + "epoch": 0.8995168021084999, + "grad_norm": 2.246805429458618, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7280972003936768, + "num_tokens": 204513540.0, + "step": 8191 + }, + { + "epoch": 0.8996266198111136, + "grad_norm": 2.2370643615722656, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6913595199584961, + "num_tokens": 204538285.0, + "step": 8192 + }, + { + "epoch": 0.8997364375137272, + "grad_norm": 2.2483713626861572, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7496751546859741, + "num_tokens": 204561033.0, + "step": 8193 + }, + { + "epoch": 0.8998462552163409, + "grad_norm": 2.153041124343872, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.701221764087677, + "num_tokens": 204587771.0, + "step": 8194 + }, + { + "epoch": 0.8999560729189545, + "grad_norm": 2.0962703227996826, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7085994482040405, + "num_tokens": 204614643.0, + "step": 8195 + }, + { + "epoch": 0.9000658906215682, + "grad_norm": 2.464141607284546, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7036740183830261, + "num_tokens": 204637382.0, + "step": 8196 + }, + { + "epoch": 0.9001757083241818, + "grad_norm": 2.1009950637817383, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7152149677276611, + "num_tokens": 204664307.0, + "step": 8197 + }, + { + "epoch": 0.9002855260267956, + "grad_norm": 2.3627243041992188, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6991627216339111, + "num_tokens": 204687091.0, + "step": 8198 + }, + { + "epoch": 0.9003953437294092, + "grad_norm": 1.942710280418396, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7086811065673828, + "num_tokens": 204716879.0, + "step": 8199 + }, + { + "epoch": 0.9005051614320229, + "grad_norm": 2.125235080718994, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6963392496109009, + "num_tokens": 204744330.0, + "step": 8200 + }, + { + "epoch": 0.9006149791346365, + "grad_norm": 2.3906972408294678, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7096759080886841, + "num_tokens": 204766371.0, + "step": 8201 + }, + { + "epoch": 0.9007247968372502, + "grad_norm": 2.285876750946045, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7093493938446045, + "num_tokens": 204790514.0, + "step": 8202 + }, + { + "epoch": 0.9008346145398638, + "grad_norm": 2.100367307662964, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6929383277893066, + "num_tokens": 204818627.0, + "step": 8203 + }, + { + "epoch": 0.9009444322424774, + "grad_norm": 2.0766184329986572, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7135251760482788, + "num_tokens": 204846638.0, + "step": 8204 + }, + { + "epoch": 0.9010542499450912, + "grad_norm": 2.4538609981536865, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7328338623046875, + "num_tokens": 204866097.0, + "step": 8205 + }, + { + "epoch": 0.9011640676477048, + "grad_norm": 2.170729160308838, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.705641508102417, + "num_tokens": 204890898.0, + "step": 8206 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 2.0821259021759033, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7265440225601196, + "num_tokens": 204918394.0, + "step": 8207 + }, + { + "epoch": 0.9013837030529321, + "grad_norm": 2.382035732269287, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7198325991630554, + "num_tokens": 204940224.0, + "step": 8208 + }, + { + "epoch": 0.9014935207555458, + "grad_norm": 2.011516809463501, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6969819068908691, + "num_tokens": 204971297.0, + "step": 8209 + }, + { + "epoch": 0.9016033384581594, + "grad_norm": 2.0580615997314453, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7123005986213684, + "num_tokens": 204999572.0, + "step": 8210 + }, + { + "epoch": 0.9017131561607731, + "grad_norm": 1.9754351377487183, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6954993009567261, + "num_tokens": 205029312.0, + "step": 8211 + }, + { + "epoch": 0.9018229738633867, + "grad_norm": 2.4253454208374023, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.724950909614563, + "num_tokens": 205048974.0, + "step": 8212 + }, + { + "epoch": 0.9019327915660005, + "grad_norm": 3.2888991832733154, + "learning_rate": 1e-06, + "loss": 0.7549, + "mean_token_accuracy": 0.7489614486694336, + "num_tokens": 205061383.0, + "step": 8213 + }, + { + "epoch": 0.9020426092686141, + "grad_norm": 2.413299083709717, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6945352554321289, + "num_tokens": 205084588.0, + "step": 8214 + }, + { + "epoch": 0.9021524269712278, + "grad_norm": 2.4831955432891846, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7254141569137573, + "num_tokens": 205103748.0, + "step": 8215 + }, + { + "epoch": 0.9022622446738414, + "grad_norm": 2.3703346252441406, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7137596011161804, + "num_tokens": 205125832.0, + "step": 8216 + }, + { + "epoch": 0.9023720623764551, + "grad_norm": 2.163738250732422, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6875019669532776, + "num_tokens": 205151519.0, + "step": 8217 + }, + { + "epoch": 0.9024818800790687, + "grad_norm": 2.0833098888397217, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6913930773735046, + "num_tokens": 205179093.0, + "step": 8218 + }, + { + "epoch": 0.9025916977816824, + "grad_norm": 2.3995041847229004, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.724539577960968, + "num_tokens": 205199532.0, + "step": 8219 + }, + { + "epoch": 0.9027015154842961, + "grad_norm": 2.240795850753784, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6836727857589722, + "num_tokens": 205223937.0, + "step": 8220 + }, + { + "epoch": 0.9028113331869098, + "grad_norm": 2.1841933727264404, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7317410111427307, + "num_tokens": 205248911.0, + "step": 8221 + }, + { + "epoch": 0.9029211508895234, + "grad_norm": 2.425426483154297, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.727497935295105, + "num_tokens": 205269019.0, + "step": 8222 + }, + { + "epoch": 0.903030968592137, + "grad_norm": 2.1453778743743896, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6960653066635132, + "num_tokens": 205297171.0, + "step": 8223 + }, + { + "epoch": 0.9031407862947507, + "grad_norm": 2.2000958919525146, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7269783616065979, + "num_tokens": 205321074.0, + "step": 8224 + }, + { + "epoch": 0.9032506039973643, + "grad_norm": 2.4037728309631348, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7164778709411621, + "num_tokens": 205342426.0, + "step": 8225 + }, + { + "epoch": 0.903360421699978, + "grad_norm": 2.514587640762329, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7043994665145874, + "num_tokens": 205362994.0, + "step": 8226 + }, + { + "epoch": 0.9034702394025917, + "grad_norm": 2.0907576084136963, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6932480931282043, + "num_tokens": 205392009.0, + "step": 8227 + }, + { + "epoch": 0.9035800571052054, + "grad_norm": 2.3261678218841553, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6980708837509155, + "num_tokens": 205416254.0, + "step": 8228 + }, + { + "epoch": 0.903689874807819, + "grad_norm": 2.3306241035461426, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7152596116065979, + "num_tokens": 205439125.0, + "step": 8229 + }, + { + "epoch": 0.9037996925104327, + "grad_norm": 2.1834232807159424, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.706299901008606, + "num_tokens": 205465826.0, + "step": 8230 + }, + { + "epoch": 0.9039095102130463, + "grad_norm": 2.483759880065918, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7143158316612244, + "num_tokens": 205485743.0, + "step": 8231 + }, + { + "epoch": 0.90401932791566, + "grad_norm": 2.0430126190185547, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6833969354629517, + "num_tokens": 205515300.0, + "step": 8232 + }, + { + "epoch": 0.9041291456182736, + "grad_norm": 2.0872058868408203, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7095616459846497, + "num_tokens": 205541468.0, + "step": 8233 + }, + { + "epoch": 0.9042389633208874, + "grad_norm": 2.3024542331695557, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.733090341091156, + "num_tokens": 205563015.0, + "step": 8234 + }, + { + "epoch": 0.904348781023501, + "grad_norm": 2.1427853107452393, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6976794600486755, + "num_tokens": 205587804.0, + "step": 8235 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 2.292332172393799, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7215983271598816, + "num_tokens": 205610176.0, + "step": 8236 + }, + { + "epoch": 0.9045684164287283, + "grad_norm": 2.4025235176086426, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7107992768287659, + "num_tokens": 205633577.0, + "step": 8237 + }, + { + "epoch": 0.904678234131342, + "grad_norm": 2.192629337310791, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.6997345685958862, + "num_tokens": 205658189.0, + "step": 8238 + }, + { + "epoch": 0.9047880518339556, + "grad_norm": 2.349691152572632, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7131510972976685, + "num_tokens": 205681437.0, + "step": 8239 + }, + { + "epoch": 0.9048978695365693, + "grad_norm": 1.9805842638015747, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.71015465259552, + "num_tokens": 205712217.0, + "step": 8240 + }, + { + "epoch": 0.9050076872391829, + "grad_norm": 2.2727489471435547, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7094037532806396, + "num_tokens": 205738068.0, + "step": 8241 + }, + { + "epoch": 0.9051175049417967, + "grad_norm": 2.291680335998535, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6877981424331665, + "num_tokens": 205764860.0, + "step": 8242 + }, + { + "epoch": 0.9052273226444103, + "grad_norm": 2.293396234512329, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7183796167373657, + "num_tokens": 205789372.0, + "step": 8243 + }, + { + "epoch": 0.905337140347024, + "grad_norm": 2.5364315509796143, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7216359376907349, + "num_tokens": 205809586.0, + "step": 8244 + }, + { + "epoch": 0.9054469580496376, + "grad_norm": 2.5429840087890625, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6995089054107666, + "num_tokens": 205830668.0, + "step": 8245 + }, + { + "epoch": 0.9055567757522512, + "grad_norm": 2.292607307434082, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6899032592773438, + "num_tokens": 205854051.0, + "step": 8246 + }, + { + "epoch": 0.9056665934548649, + "grad_norm": 2.1596872806549072, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7406328320503235, + "num_tokens": 205880133.0, + "step": 8247 + }, + { + "epoch": 0.9057764111574785, + "grad_norm": 2.2193169593811035, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7168935537338257, + "num_tokens": 205905082.0, + "step": 8248 + }, + { + "epoch": 0.9058862288600923, + "grad_norm": 2.355935573577881, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.72342848777771, + "num_tokens": 205926044.0, + "step": 8249 + }, + { + "epoch": 0.9059960465627059, + "grad_norm": 2.463890790939331, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7038659453392029, + "num_tokens": 205946662.0, + "step": 8250 + }, + { + "epoch": 0.9061058642653196, + "grad_norm": 2.105604410171509, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7162301540374756, + "num_tokens": 205973936.0, + "step": 8251 + }, + { + "epoch": 0.9062156819679332, + "grad_norm": 2.2333500385284424, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7096400260925293, + "num_tokens": 205997473.0, + "step": 8252 + }, + { + "epoch": 0.9063254996705469, + "grad_norm": 2.0692944526672363, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6998064517974854, + "num_tokens": 206027101.0, + "step": 8253 + }, + { + "epoch": 0.9064353173731605, + "grad_norm": 2.2360999584198, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6918678879737854, + "num_tokens": 206052968.0, + "step": 8254 + }, + { + "epoch": 0.9065451350757742, + "grad_norm": 2.0912909507751465, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6857359409332275, + "num_tokens": 206082605.0, + "step": 8255 + }, + { + "epoch": 0.9066549527783879, + "grad_norm": 2.056894540786743, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7110573053359985, + "num_tokens": 206107530.0, + "step": 8256 + }, + { + "epoch": 0.9067647704810016, + "grad_norm": 2.446361780166626, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7243236899375916, + "num_tokens": 206128617.0, + "step": 8257 + }, + { + "epoch": 0.9068745881836152, + "grad_norm": 2.00667667388916, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.707120954990387, + "num_tokens": 206156852.0, + "step": 8258 + }, + { + "epoch": 0.9069844058862289, + "grad_norm": 2.4508371353149414, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7269392013549805, + "num_tokens": 206177862.0, + "step": 8259 + }, + { + "epoch": 0.9070942235888425, + "grad_norm": 2.570108652114868, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7299046516418457, + "num_tokens": 206197553.0, + "step": 8260 + }, + { + "epoch": 0.9072040412914562, + "grad_norm": 2.2800655364990234, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7237924337387085, + "num_tokens": 206218912.0, + "step": 8261 + }, + { + "epoch": 0.9073138589940698, + "grad_norm": 2.362330198287964, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7334771156311035, + "num_tokens": 206240887.0, + "step": 8262 + }, + { + "epoch": 0.9074236766966836, + "grad_norm": 2.1019341945648193, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7085607647895813, + "num_tokens": 206266812.0, + "step": 8263 + }, + { + "epoch": 0.9075334943992972, + "grad_norm": 2.263646364212036, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7497775554656982, + "num_tokens": 206288905.0, + "step": 8264 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 2.0069284439086914, + "learning_rate": 1e-06, + "loss": 1.1555, + "mean_token_accuracy": 0.6657530069351196, + "num_tokens": 206318720.0, + "step": 8265 + }, + { + "epoch": 0.9077531298045245, + "grad_norm": 2.3773531913757324, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.7018981575965881, + "num_tokens": 206340611.0, + "step": 8266 + }, + { + "epoch": 0.9078629475071381, + "grad_norm": 1.8631014823913574, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6939535737037659, + "num_tokens": 206374796.0, + "step": 8267 + }, + { + "epoch": 0.9079727652097518, + "grad_norm": 2.3428080081939697, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7047706842422485, + "num_tokens": 206396965.0, + "step": 8268 + }, + { + "epoch": 0.9080825829123654, + "grad_norm": 2.3097338676452637, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7305300235748291, + "num_tokens": 206419882.0, + "step": 8269 + }, + { + "epoch": 0.9081924006149792, + "grad_norm": 2.3172523975372314, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.711185097694397, + "num_tokens": 206442471.0, + "step": 8270 + }, + { + "epoch": 0.9083022183175928, + "grad_norm": 2.103070020675659, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.700486958026886, + "num_tokens": 206470711.0, + "step": 8271 + }, + { + "epoch": 0.9084120360202065, + "grad_norm": 2.137490749359131, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6954178214073181, + "num_tokens": 206497520.0, + "step": 8272 + }, + { + "epoch": 0.9085218537228201, + "grad_norm": 2.118636131286621, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7056832313537598, + "num_tokens": 206523425.0, + "step": 8273 + }, + { + "epoch": 0.9086316714254338, + "grad_norm": 2.2549777030944824, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7209784984588623, + "num_tokens": 206546320.0, + "step": 8274 + }, + { + "epoch": 0.9087414891280474, + "grad_norm": 2.0408523082733154, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7120276689529419, + "num_tokens": 206575271.0, + "step": 8275 + }, + { + "epoch": 0.9088513068306611, + "grad_norm": 2.0535571575164795, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7001345157623291, + "num_tokens": 206605836.0, + "step": 8276 + }, + { + "epoch": 0.9089611245332747, + "grad_norm": 2.4036924839019775, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.721177875995636, + "num_tokens": 206628846.0, + "step": 8277 + }, + { + "epoch": 0.9090709422358885, + "grad_norm": 2.0418622493743896, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7068204283714294, + "num_tokens": 206657238.0, + "step": 8278 + }, + { + "epoch": 0.9091807599385021, + "grad_norm": 2.6867403984069824, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7097967267036438, + "num_tokens": 206677055.0, + "step": 8279 + }, + { + "epoch": 0.9092905776411158, + "grad_norm": 2.331188440322876, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7069334983825684, + "num_tokens": 206700480.0, + "step": 8280 + }, + { + "epoch": 0.9094003953437294, + "grad_norm": 2.1058573722839355, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.70354163646698, + "num_tokens": 206728207.0, + "step": 8281 + }, + { + "epoch": 0.909510213046343, + "grad_norm": 1.9449597597122192, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7040866613388062, + "num_tokens": 206756908.0, + "step": 8282 + }, + { + "epoch": 0.9096200307489567, + "grad_norm": 2.5451042652130127, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.716214656829834, + "num_tokens": 206776027.0, + "step": 8283 + }, + { + "epoch": 0.9097298484515703, + "grad_norm": 2.191310405731201, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6949797868728638, + "num_tokens": 206802226.0, + "step": 8284 + }, + { + "epoch": 0.9098396661541841, + "grad_norm": 2.504791498184204, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7240897417068481, + "num_tokens": 206822652.0, + "step": 8285 + }, + { + "epoch": 0.9099494838567977, + "grad_norm": 2.572227716445923, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7541542053222656, + "num_tokens": 206840509.0, + "step": 8286 + }, + { + "epoch": 0.9100593015594114, + "grad_norm": 1.9810315370559692, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7021187543869019, + "num_tokens": 206870249.0, + "step": 8287 + }, + { + "epoch": 0.910169119262025, + "grad_norm": 2.081632137298584, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6936454772949219, + "num_tokens": 206897371.0, + "step": 8288 + }, + { + "epoch": 0.9102789369646387, + "grad_norm": 2.307114601135254, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7222061157226562, + "num_tokens": 206920399.0, + "step": 8289 + }, + { + "epoch": 0.9103887546672523, + "grad_norm": 2.0596892833709717, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7188475728034973, + "num_tokens": 206947592.0, + "step": 8290 + }, + { + "epoch": 0.910498572369866, + "grad_norm": 2.365914821624756, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7137641310691833, + "num_tokens": 206969685.0, + "step": 8291 + }, + { + "epoch": 0.9106083900724797, + "grad_norm": 2.074148416519165, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7154356241226196, + "num_tokens": 206996459.0, + "step": 8292 + }, + { + "epoch": 0.9107182077750934, + "grad_norm": 2.246741771697998, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6743167638778687, + "num_tokens": 207023626.0, + "step": 8293 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 2.3700616359710693, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.712148904800415, + "num_tokens": 207048060.0, + "step": 8294 + }, + { + "epoch": 0.9109378431803207, + "grad_norm": 2.0163936614990234, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6996926069259644, + "num_tokens": 207075755.0, + "step": 8295 + }, + { + "epoch": 0.9110476608829343, + "grad_norm": 2.097015380859375, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7062265872955322, + "num_tokens": 207101378.0, + "step": 8296 + }, + { + "epoch": 0.911157478585548, + "grad_norm": 2.453620433807373, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7207975387573242, + "num_tokens": 207121498.0, + "step": 8297 + }, + { + "epoch": 0.9112672962881616, + "grad_norm": 3.0700747966766357, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7132573127746582, + "num_tokens": 207136421.0, + "step": 8298 + }, + { + "epoch": 0.9113771139907754, + "grad_norm": 2.199800968170166, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7267175912857056, + "num_tokens": 207159554.0, + "step": 8299 + }, + { + "epoch": 0.911486931693389, + "grad_norm": 2.7258923053741455, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7108191251754761, + "num_tokens": 207179064.0, + "step": 8300 + }, + { + "epoch": 0.9115967493960027, + "grad_norm": 2.233511209487915, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7118588089942932, + "num_tokens": 207205354.0, + "step": 8301 + }, + { + "epoch": 0.9117065670986163, + "grad_norm": 2.2014076709747314, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6953268051147461, + "num_tokens": 207230759.0, + "step": 8302 + }, + { + "epoch": 0.91181638480123, + "grad_norm": 2.3015482425689697, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7159677743911743, + "num_tokens": 207252550.0, + "step": 8303 + }, + { + "epoch": 0.9119262025038436, + "grad_norm": 2.010923147201538, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6819684505462646, + "num_tokens": 207283475.0, + "step": 8304 + }, + { + "epoch": 0.9120360202064572, + "grad_norm": 2.1601715087890625, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7088431119918823, + "num_tokens": 207311727.0, + "step": 8305 + }, + { + "epoch": 0.9121458379090709, + "grad_norm": 2.144091844558716, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7216151356697083, + "num_tokens": 207336763.0, + "step": 8306 + }, + { + "epoch": 0.9122556556116846, + "grad_norm": 2.1693172454833984, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7227580547332764, + "num_tokens": 207360391.0, + "step": 8307 + }, + { + "epoch": 0.9123654733142983, + "grad_norm": 2.0750694274902344, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6954842805862427, + "num_tokens": 207388859.0, + "step": 8308 + }, + { + "epoch": 0.9124752910169119, + "grad_norm": 1.917970061302185, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7159382104873657, + "num_tokens": 207420105.0, + "step": 8309 + }, + { + "epoch": 0.9125851087195256, + "grad_norm": 2.731130599975586, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7163807153701782, + "num_tokens": 207437973.0, + "step": 8310 + }, + { + "epoch": 0.9126949264221392, + "grad_norm": 2.0631911754608154, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6964200735092163, + "num_tokens": 207467042.0, + "step": 8311 + }, + { + "epoch": 0.9128047441247529, + "grad_norm": 2.1160550117492676, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6910637617111206, + "num_tokens": 207496667.0, + "step": 8312 + }, + { + "epoch": 0.9129145618273665, + "grad_norm": 2.3389151096343994, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7262070775032043, + "num_tokens": 207518759.0, + "step": 8313 + }, + { + "epoch": 0.9130243795299803, + "grad_norm": 2.2215988636016846, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7130330801010132, + "num_tokens": 207542280.0, + "step": 8314 + }, + { + "epoch": 0.9131341972325939, + "grad_norm": 2.1642680168151855, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7075052857398987, + "num_tokens": 207569025.0, + "step": 8315 + }, + { + "epoch": 0.9132440149352076, + "grad_norm": 2.299807071685791, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6906090378761292, + "num_tokens": 207593775.0, + "step": 8316 + }, + { + "epoch": 0.9133538326378212, + "grad_norm": 2.2261812686920166, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6886448860168457, + "num_tokens": 207619581.0, + "step": 8317 + }, + { + "epoch": 0.9134636503404349, + "grad_norm": 2.034440755844116, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7121866345405579, + "num_tokens": 207647282.0, + "step": 8318 + }, + { + "epoch": 0.9135734680430485, + "grad_norm": 2.1167306900024414, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6804717779159546, + "num_tokens": 207675267.0, + "step": 8319 + }, + { + "epoch": 0.9136832857456622, + "grad_norm": 2.310316801071167, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7095276117324829, + "num_tokens": 207698517.0, + "step": 8320 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 2.1864120960235596, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.70945143699646, + "num_tokens": 207722692.0, + "step": 8321 + }, + { + "epoch": 0.9139029211508896, + "grad_norm": 1.971190333366394, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7213782072067261, + "num_tokens": 207751580.0, + "step": 8322 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 2.255997896194458, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7336562871932983, + "num_tokens": 207775702.0, + "step": 8323 + }, + { + "epoch": 0.9141225565561168, + "grad_norm": 2.2877070903778076, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7346681952476501, + "num_tokens": 207798169.0, + "step": 8324 + }, + { + "epoch": 0.9142323742587305, + "grad_norm": 2.094355344772339, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7196301817893982, + "num_tokens": 207826340.0, + "step": 8325 + }, + { + "epoch": 0.9143421919613441, + "grad_norm": 2.1459929943084717, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6934401988983154, + "num_tokens": 207854257.0, + "step": 8326 + }, + { + "epoch": 0.9144520096639578, + "grad_norm": 2.094712972640991, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6813816428184509, + "num_tokens": 207881754.0, + "step": 8327 + }, + { + "epoch": 0.9145618273665715, + "grad_norm": 2.6757822036743164, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7203452587127686, + "num_tokens": 207900570.0, + "step": 8328 + }, + { + "epoch": 0.9146716450691852, + "grad_norm": 2.0102477073669434, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6912264823913574, + "num_tokens": 207930018.0, + "step": 8329 + }, + { + "epoch": 0.9147814627717988, + "grad_norm": 2.104945659637451, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.703513503074646, + "num_tokens": 207957115.0, + "step": 8330 + }, + { + "epoch": 0.9148912804744125, + "grad_norm": 2.2420008182525635, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7012410163879395, + "num_tokens": 207983201.0, + "step": 8331 + }, + { + "epoch": 0.9150010981770261, + "grad_norm": 1.971451759338379, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6990545988082886, + "num_tokens": 208013042.0, + "step": 8332 + }, + { + "epoch": 0.9151109158796398, + "grad_norm": 1.9237463474273682, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7139182090759277, + "num_tokens": 208042254.0, + "step": 8333 + }, + { + "epoch": 0.9152207335822534, + "grad_norm": 2.2008776664733887, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7290087938308716, + "num_tokens": 208065381.0, + "step": 8334 + }, + { + "epoch": 0.9153305512848671, + "grad_norm": 1.9154635667800903, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6858174204826355, + "num_tokens": 208097815.0, + "step": 8335 + }, + { + "epoch": 0.9154403689874808, + "grad_norm": 2.3225855827331543, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7122469544410706, + "num_tokens": 208119838.0, + "step": 8336 + }, + { + "epoch": 0.9155501866900945, + "grad_norm": 2.561917781829834, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7050876021385193, + "num_tokens": 208139827.0, + "step": 8337 + }, + { + "epoch": 0.9156600043927081, + "grad_norm": 1.9077544212341309, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.703362226486206, + "num_tokens": 208171641.0, + "step": 8338 + }, + { + "epoch": 0.9157698220953218, + "grad_norm": 2.453526496887207, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.6997712254524231, + "num_tokens": 208193675.0, + "step": 8339 + }, + { + "epoch": 0.9158796397979354, + "grad_norm": 1.94698166847229, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7050622701644897, + "num_tokens": 208226068.0, + "step": 8340 + }, + { + "epoch": 0.915989457500549, + "grad_norm": 2.0231289863586426, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.704839289188385, + "num_tokens": 208255885.0, + "step": 8341 + }, + { + "epoch": 0.9160992752031627, + "grad_norm": 2.039750337600708, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6961811780929565, + "num_tokens": 208283902.0, + "step": 8342 + }, + { + "epoch": 0.9162090929057765, + "grad_norm": 2.1280572414398193, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.685859739780426, + "num_tokens": 208312036.0, + "step": 8343 + }, + { + "epoch": 0.9163189106083901, + "grad_norm": 2.0388131141662598, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7043940424919128, + "num_tokens": 208341222.0, + "step": 8344 + }, + { + "epoch": 0.9164287283110037, + "grad_norm": 2.2039120197296143, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7044961452484131, + "num_tokens": 208365132.0, + "step": 8345 + }, + { + "epoch": 0.9165385460136174, + "grad_norm": 2.3807218074798584, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7284541726112366, + "num_tokens": 208386060.0, + "step": 8346 + }, + { + "epoch": 0.916648363716231, + "grad_norm": 2.3131234645843506, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6902085542678833, + "num_tokens": 208411490.0, + "step": 8347 + }, + { + "epoch": 0.9167581814188447, + "grad_norm": 2.11663556098938, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.712925910949707, + "num_tokens": 208436954.0, + "step": 8348 + }, + { + "epoch": 0.9168679991214583, + "grad_norm": 2.132835626602173, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7129548192024231, + "num_tokens": 208464322.0, + "step": 8349 + }, + { + "epoch": 0.9169778168240721, + "grad_norm": 2.363267660140991, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7063044309616089, + "num_tokens": 208486303.0, + "step": 8350 + }, + { + "epoch": 0.9170876345266857, + "grad_norm": 2.042123317718506, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7120535969734192, + "num_tokens": 208517681.0, + "step": 8351 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 2.472599506378174, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6956552267074585, + "num_tokens": 208538805.0, + "step": 8352 + }, + { + "epoch": 0.917307269931913, + "grad_norm": 2.5446553230285645, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7196718454360962, + "num_tokens": 208559262.0, + "step": 8353 + }, + { + "epoch": 0.9174170876345267, + "grad_norm": 2.387801170349121, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6964070796966553, + "num_tokens": 208581706.0, + "step": 8354 + }, + { + "epoch": 0.9175269053371403, + "grad_norm": 2.444338083267212, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7446913719177246, + "num_tokens": 208600863.0, + "step": 8355 + }, + { + "epoch": 0.917636723039754, + "grad_norm": 2.2206003665924072, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7199212312698364, + "num_tokens": 208626948.0, + "step": 8356 + }, + { + "epoch": 0.9177465407423677, + "grad_norm": 1.946928858757019, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.713126540184021, + "num_tokens": 208657497.0, + "step": 8357 + }, + { + "epoch": 0.9178563584449814, + "grad_norm": 2.355010747909546, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7203270792961121, + "num_tokens": 208680069.0, + "step": 8358 + }, + { + "epoch": 0.917966176147595, + "grad_norm": 1.966877818107605, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6990752220153809, + "num_tokens": 208709032.0, + "step": 8359 + }, + { + "epoch": 0.9180759938502087, + "grad_norm": 2.6868648529052734, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7198927402496338, + "num_tokens": 208728531.0, + "step": 8360 + }, + { + "epoch": 0.9181858115528223, + "grad_norm": 2.2385926246643066, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7070918679237366, + "num_tokens": 208754255.0, + "step": 8361 + }, + { + "epoch": 0.918295629255436, + "grad_norm": 2.3264832496643066, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7148808836936951, + "num_tokens": 208776073.0, + "step": 8362 + }, + { + "epoch": 0.9184054469580496, + "grad_norm": 2.4365077018737793, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7340788841247559, + "num_tokens": 208796143.0, + "step": 8363 + }, + { + "epoch": 0.9185152646606632, + "grad_norm": 2.451131820678711, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7131429314613342, + "num_tokens": 208816292.0, + "step": 8364 + }, + { + "epoch": 0.918625082363277, + "grad_norm": 2.2779479026794434, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.709166407585144, + "num_tokens": 208837992.0, + "step": 8365 + }, + { + "epoch": 0.9187349000658906, + "grad_norm": 2.245966911315918, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7103538513183594, + "num_tokens": 208861605.0, + "step": 8366 + }, + { + "epoch": 0.9188447177685043, + "grad_norm": 2.8082027435302734, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7180229425430298, + "num_tokens": 208878207.0, + "step": 8367 + }, + { + "epoch": 0.9189545354711179, + "grad_norm": 2.022965669631958, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6998921632766724, + "num_tokens": 208906634.0, + "step": 8368 + }, + { + "epoch": 0.9190643531737316, + "grad_norm": 2.1883721351623535, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7337031364440918, + "num_tokens": 208930689.0, + "step": 8369 + }, + { + "epoch": 0.9191741708763452, + "grad_norm": 1.9856622219085693, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.689693808555603, + "num_tokens": 208959243.0, + "step": 8370 + }, + { + "epoch": 0.9192839885789589, + "grad_norm": 2.3352854251861572, + "learning_rate": 1e-06, + "loss": 0.7801, + "mean_token_accuracy": 0.747039794921875, + "num_tokens": 208979670.0, + "step": 8371 + }, + { + "epoch": 0.9193938062815726, + "grad_norm": 2.1900410652160645, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7069321870803833, + "num_tokens": 209004232.0, + "step": 8372 + }, + { + "epoch": 0.9195036239841863, + "grad_norm": 2.0817434787750244, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7333625555038452, + "num_tokens": 209030477.0, + "step": 8373 + }, + { + "epoch": 0.9196134416867999, + "grad_norm": 2.467717170715332, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.714040219783783, + "num_tokens": 209050381.0, + "step": 8374 + }, + { + "epoch": 0.9197232593894136, + "grad_norm": 2.401721715927124, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.72746741771698, + "num_tokens": 209071897.0, + "step": 8375 + }, + { + "epoch": 0.9198330770920272, + "grad_norm": 1.919743299484253, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6922225952148438, + "num_tokens": 209104496.0, + "step": 8376 + }, + { + "epoch": 0.9199428947946409, + "grad_norm": 2.416504383087158, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6993809342384338, + "num_tokens": 209126532.0, + "step": 8377 + }, + { + "epoch": 0.9200527124972545, + "grad_norm": 1.9701120853424072, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6889166831970215, + "num_tokens": 209159390.0, + "step": 8378 + }, + { + "epoch": 0.9201625301998683, + "grad_norm": 2.523179531097412, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7278803586959839, + "num_tokens": 209178870.0, + "step": 8379 + }, + { + "epoch": 0.9202723479024819, + "grad_norm": 2.104130983352661, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7012828588485718, + "num_tokens": 209204666.0, + "step": 8380 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 2.089383363723755, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7217074632644653, + "num_tokens": 209230082.0, + "step": 8381 + }, + { + "epoch": 0.9204919833077092, + "grad_norm": 2.3485894203186035, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7237883806228638, + "num_tokens": 209253391.0, + "step": 8382 + }, + { + "epoch": 0.9206018010103229, + "grad_norm": 2.3374791145324707, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7098814249038696, + "num_tokens": 209276833.0, + "step": 8383 + }, + { + "epoch": 0.9207116187129365, + "grad_norm": 2.1781442165374756, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.721735954284668, + "num_tokens": 209301349.0, + "step": 8384 + }, + { + "epoch": 0.9208214364155501, + "grad_norm": 2.0822951793670654, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7334719896316528, + "num_tokens": 209328685.0, + "step": 8385 + }, + { + "epoch": 0.9209312541181639, + "grad_norm": 2.2294859886169434, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7078238725662231, + "num_tokens": 209353199.0, + "step": 8386 + }, + { + "epoch": 0.9210410718207775, + "grad_norm": 2.1253726482391357, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7166228890419006, + "num_tokens": 209380894.0, + "step": 8387 + }, + { + "epoch": 0.9211508895233912, + "grad_norm": 2.4351813793182373, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.7017401456832886, + "num_tokens": 209402365.0, + "step": 8388 + }, + { + "epoch": 0.9212607072260048, + "grad_norm": 2.0308947563171387, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7363424301147461, + "num_tokens": 209428786.0, + "step": 8389 + }, + { + "epoch": 0.9213705249286185, + "grad_norm": 2.2290446758270264, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7265273332595825, + "num_tokens": 209452901.0, + "step": 8390 + }, + { + "epoch": 0.9214803426312321, + "grad_norm": 2.1817562580108643, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7065478563308716, + "num_tokens": 209477742.0, + "step": 8391 + }, + { + "epoch": 0.9215901603338458, + "grad_norm": 2.430229902267456, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7460241317749023, + "num_tokens": 209497373.0, + "step": 8392 + }, + { + "epoch": 0.9216999780364594, + "grad_norm": 2.1744327545166016, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7293424606323242, + "num_tokens": 209521622.0, + "step": 8393 + }, + { + "epoch": 0.9218097957390732, + "grad_norm": 2.120297908782959, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7100266814231873, + "num_tokens": 209549873.0, + "step": 8394 + }, + { + "epoch": 0.9219196134416868, + "grad_norm": 2.250492811203003, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7251336574554443, + "num_tokens": 209572362.0, + "step": 8395 + }, + { + "epoch": 0.9220294311443005, + "grad_norm": 2.154996395111084, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.711555004119873, + "num_tokens": 209600013.0, + "step": 8396 + }, + { + "epoch": 0.9221392488469141, + "grad_norm": 2.082319498062134, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7055613398551941, + "num_tokens": 209628241.0, + "step": 8397 + }, + { + "epoch": 0.9222490665495278, + "grad_norm": 2.302968740463257, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7259763479232788, + "num_tokens": 209649812.0, + "step": 8398 + }, + { + "epoch": 0.9223588842521414, + "grad_norm": 2.2768213748931885, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6954671144485474, + "num_tokens": 209674956.0, + "step": 8399 + }, + { + "epoch": 0.922468701954755, + "grad_norm": 1.9114419221878052, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6932605504989624, + "num_tokens": 209707490.0, + "step": 8400 + }, + { + "epoch": 0.9225785196573688, + "grad_norm": 2.014371156692505, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6906387805938721, + "num_tokens": 209735788.0, + "step": 8401 + }, + { + "epoch": 0.9226883373599825, + "grad_norm": 2.070810556411743, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7000858187675476, + "num_tokens": 209765967.0, + "step": 8402 + }, + { + "epoch": 0.9227981550625961, + "grad_norm": 2.160311222076416, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7167309522628784, + "num_tokens": 209790196.0, + "step": 8403 + }, + { + "epoch": 0.9229079727652097, + "grad_norm": 2.1037659645080566, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7025085687637329, + "num_tokens": 209817993.0, + "step": 8404 + }, + { + "epoch": 0.9230177904678234, + "grad_norm": 2.2730815410614014, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7034015655517578, + "num_tokens": 209843018.0, + "step": 8405 + }, + { + "epoch": 0.923127608170437, + "grad_norm": 2.0985734462738037, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7175199389457703, + "num_tokens": 209869084.0, + "step": 8406 + }, + { + "epoch": 0.9232374258730507, + "grad_norm": 2.101274013519287, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7070392370223999, + "num_tokens": 209894500.0, + "step": 8407 + }, + { + "epoch": 0.9233472435756644, + "grad_norm": 2.245612144470215, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7190191745758057, + "num_tokens": 209918683.0, + "step": 8408 + }, + { + "epoch": 0.9234570612782781, + "grad_norm": 2.3926656246185303, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7355802059173584, + "num_tokens": 209940121.0, + "step": 8409 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 2.0147476196289062, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6997222900390625, + "num_tokens": 209968732.0, + "step": 8410 + }, + { + "epoch": 0.9236766966835054, + "grad_norm": 2.0936801433563232, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6928241848945618, + "num_tokens": 209996908.0, + "step": 8411 + }, + { + "epoch": 0.923786514386119, + "grad_norm": 2.1003365516662598, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6858779788017273, + "num_tokens": 210027874.0, + "step": 8412 + }, + { + "epoch": 0.9238963320887327, + "grad_norm": 2.2663393020629883, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7272889018058777, + "num_tokens": 210050913.0, + "step": 8413 + }, + { + "epoch": 0.9240061497913463, + "grad_norm": 2.003582000732422, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7152608036994934, + "num_tokens": 210079557.0, + "step": 8414 + }, + { + "epoch": 0.9241159674939601, + "grad_norm": 1.9189460277557373, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6788713932037354, + "num_tokens": 210112122.0, + "step": 8415 + }, + { + "epoch": 0.9242257851965737, + "grad_norm": 1.9044016599655151, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7060460448265076, + "num_tokens": 210143794.0, + "step": 8416 + }, + { + "epoch": 0.9243356028991874, + "grad_norm": 2.4002346992492676, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7232767343521118, + "num_tokens": 210165210.0, + "step": 8417 + }, + { + "epoch": 0.924445420601801, + "grad_norm": 2.209602117538452, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6902046203613281, + "num_tokens": 210191529.0, + "step": 8418 + }, + { + "epoch": 0.9245552383044147, + "grad_norm": 2.3475987911224365, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7151856422424316, + "num_tokens": 210213495.0, + "step": 8419 + }, + { + "epoch": 0.9246650560070283, + "grad_norm": 2.369137763977051, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7074445486068726, + "num_tokens": 210235974.0, + "step": 8420 + }, + { + "epoch": 0.924774873709642, + "grad_norm": 2.734431266784668, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7179890871047974, + "num_tokens": 210253835.0, + "step": 8421 + }, + { + "epoch": 0.9248846914122557, + "grad_norm": 2.214390277862549, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7054884433746338, + "num_tokens": 210278972.0, + "step": 8422 + }, + { + "epoch": 0.9249945091148694, + "grad_norm": 2.1240479946136475, + "learning_rate": 1e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.6849406957626343, + "num_tokens": 210308266.0, + "step": 8423 + }, + { + "epoch": 0.925104326817483, + "grad_norm": 2.072558879852295, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6913280487060547, + "num_tokens": 210339580.0, + "step": 8424 + }, + { + "epoch": 0.9252141445200966, + "grad_norm": 2.3530948162078857, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7083523869514465, + "num_tokens": 210362030.0, + "step": 8425 + }, + { + "epoch": 0.9253239622227103, + "grad_norm": 2.2199316024780273, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7083585858345032, + "num_tokens": 210386109.0, + "step": 8426 + }, + { + "epoch": 0.9254337799253239, + "grad_norm": 2.101287364959717, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7057497501373291, + "num_tokens": 210411536.0, + "step": 8427 + }, + { + "epoch": 0.9255435976279376, + "grad_norm": 1.98703932762146, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7097991704940796, + "num_tokens": 210440810.0, + "step": 8428 + }, + { + "epoch": 0.9256534153305512, + "grad_norm": 2.265425205230713, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.711742103099823, + "num_tokens": 210464871.0, + "step": 8429 + }, + { + "epoch": 0.925763233033165, + "grad_norm": 2.0602943897247314, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7089697122573853, + "num_tokens": 210493693.0, + "step": 8430 + }, + { + "epoch": 0.9258730507357786, + "grad_norm": 2.1267073154449463, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.702206015586853, + "num_tokens": 210521317.0, + "step": 8431 + }, + { + "epoch": 0.9259828684383923, + "grad_norm": 2.101111650466919, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6981812715530396, + "num_tokens": 210552305.0, + "step": 8432 + }, + { + "epoch": 0.9260926861410059, + "grad_norm": 2.2694640159606934, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7249945998191833, + "num_tokens": 210575246.0, + "step": 8433 + }, + { + "epoch": 0.9262025038436196, + "grad_norm": 2.0380098819732666, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7228253483772278, + "num_tokens": 210602162.0, + "step": 8434 + }, + { + "epoch": 0.9263123215462332, + "grad_norm": 2.0499460697174072, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7033758759498596, + "num_tokens": 210630325.0, + "step": 8435 + }, + { + "epoch": 0.9264221392488469, + "grad_norm": 2.1410586833953857, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.691124439239502, + "num_tokens": 210655610.0, + "step": 8436 + }, + { + "epoch": 0.9265319569514606, + "grad_norm": 2.2214913368225098, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.710126519203186, + "num_tokens": 210681696.0, + "step": 8437 + }, + { + "epoch": 0.9266417746540743, + "grad_norm": 1.8118572235107422, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.693570077419281, + "num_tokens": 210717751.0, + "step": 8438 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 2.1982011795043945, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6960463523864746, + "num_tokens": 210742029.0, + "step": 8439 + }, + { + "epoch": 0.9268614100593016, + "grad_norm": 2.3752706050872803, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7448389530181885, + "num_tokens": 210763463.0, + "step": 8440 + }, + { + "epoch": 0.9269712277619152, + "grad_norm": 2.057900905609131, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6892961859703064, + "num_tokens": 210791900.0, + "step": 8441 + }, + { + "epoch": 0.9270810454645289, + "grad_norm": 2.386216640472412, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7123250961303711, + "num_tokens": 210813044.0, + "step": 8442 + }, + { + "epoch": 0.9271908631671425, + "grad_norm": 2.0797171592712402, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7283757925033569, + "num_tokens": 210836239.0, + "step": 8443 + }, + { + "epoch": 0.9273006808697563, + "grad_norm": 2.201016426086426, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6935158967971802, + "num_tokens": 210864108.0, + "step": 8444 + }, + { + "epoch": 0.9274104985723699, + "grad_norm": 1.8716834783554077, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6919898986816406, + "num_tokens": 210896216.0, + "step": 8445 + }, + { + "epoch": 0.9275203162749835, + "grad_norm": 2.687457799911499, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7131950259208679, + "num_tokens": 210913604.0, + "step": 8446 + }, + { + "epoch": 0.9276301339775972, + "grad_norm": 2.2233285903930664, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7013261318206787, + "num_tokens": 210941384.0, + "step": 8447 + }, + { + "epoch": 0.9277399516802108, + "grad_norm": 2.5890488624572754, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.733148455619812, + "num_tokens": 210958587.0, + "step": 8448 + }, + { + "epoch": 0.9278497693828245, + "grad_norm": 2.05112624168396, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6888406276702881, + "num_tokens": 210987639.0, + "step": 8449 + }, + { + "epoch": 0.9279595870854381, + "grad_norm": 2.180828094482422, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7303365468978882, + "num_tokens": 211012024.0, + "step": 8450 + }, + { + "epoch": 0.9280694047880519, + "grad_norm": 2.2432398796081543, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.686869740486145, + "num_tokens": 211036575.0, + "step": 8451 + }, + { + "epoch": 0.9281792224906655, + "grad_norm": 2.0813863277435303, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.703039288520813, + "num_tokens": 211064865.0, + "step": 8452 + }, + { + "epoch": 0.9282890401932792, + "grad_norm": 2.169520854949951, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7058655023574829, + "num_tokens": 211089296.0, + "step": 8453 + }, + { + "epoch": 0.9283988578958928, + "grad_norm": 2.311677932739258, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7183478474617004, + "num_tokens": 211112709.0, + "step": 8454 + }, + { + "epoch": 0.9285086755985065, + "grad_norm": 2.0709235668182373, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.69979327917099, + "num_tokens": 211139413.0, + "step": 8455 + }, + { + "epoch": 0.9286184933011201, + "grad_norm": 1.7447396516799927, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6908407211303711, + "num_tokens": 211177348.0, + "step": 8456 + }, + { + "epoch": 0.9287283110037338, + "grad_norm": 2.1294755935668945, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7131927013397217, + "num_tokens": 211203946.0, + "step": 8457 + }, + { + "epoch": 0.9288381287063474, + "grad_norm": 2.455228567123413, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7155086994171143, + "num_tokens": 211225625.0, + "step": 8458 + }, + { + "epoch": 0.9289479464089612, + "grad_norm": 1.9533023834228516, + "learning_rate": 1e-06, + "loss": 1.0613, + "mean_token_accuracy": 0.6770862936973572, + "num_tokens": 211256296.0, + "step": 8459 + }, + { + "epoch": 0.9290577641115748, + "grad_norm": 2.265068769454956, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7194361686706543, + "num_tokens": 211280064.0, + "step": 8460 + }, + { + "epoch": 0.9291675818141885, + "grad_norm": 2.009993553161621, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.721713125705719, + "num_tokens": 211308426.0, + "step": 8461 + }, + { + "epoch": 0.9292773995168021, + "grad_norm": 2.065563201904297, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7260741591453552, + "num_tokens": 211337659.0, + "step": 8462 + }, + { + "epoch": 0.9293872172194158, + "grad_norm": 2.441251754760742, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7263086438179016, + "num_tokens": 211357950.0, + "step": 8463 + }, + { + "epoch": 0.9294970349220294, + "grad_norm": 2.571805477142334, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7015752196311951, + "num_tokens": 211377702.0, + "step": 8464 + }, + { + "epoch": 0.929606852624643, + "grad_norm": 1.8242663145065308, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7283027172088623, + "num_tokens": 211412072.0, + "step": 8465 + }, + { + "epoch": 0.9297166703272568, + "grad_norm": 2.05026912689209, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.708675742149353, + "num_tokens": 211442028.0, + "step": 8466 + }, + { + "epoch": 0.9298264880298704, + "grad_norm": 2.175930976867676, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7154659032821655, + "num_tokens": 211466364.0, + "step": 8467 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 2.3033664226531982, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7017172574996948, + "num_tokens": 211488638.0, + "step": 8468 + }, + { + "epoch": 0.9300461234350977, + "grad_norm": 2.208514928817749, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7232316732406616, + "num_tokens": 211512289.0, + "step": 8469 + }, + { + "epoch": 0.9301559411377114, + "grad_norm": 2.201935291290283, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7046493291854858, + "num_tokens": 211538475.0, + "step": 8470 + }, + { + "epoch": 0.930265758840325, + "grad_norm": 2.1073901653289795, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7143916487693787, + "num_tokens": 211564787.0, + "step": 8471 + }, + { + "epoch": 0.9303755765429387, + "grad_norm": 2.3286709785461426, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7045164108276367, + "num_tokens": 211589267.0, + "step": 8472 + }, + { + "epoch": 0.9304853942455524, + "grad_norm": 2.1158931255340576, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6948713064193726, + "num_tokens": 211616658.0, + "step": 8473 + }, + { + "epoch": 0.9305952119481661, + "grad_norm": 2.5231411457061768, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7281970381736755, + "num_tokens": 211634899.0, + "step": 8474 + }, + { + "epoch": 0.9307050296507797, + "grad_norm": 2.1471128463745117, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7139815092086792, + "num_tokens": 211659944.0, + "step": 8475 + }, + { + "epoch": 0.9308148473533934, + "grad_norm": 2.560940980911255, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7284954190254211, + "num_tokens": 211678663.0, + "step": 8476 + }, + { + "epoch": 0.930924665056007, + "grad_norm": 2.583695411682129, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.712777853012085, + "num_tokens": 211697673.0, + "step": 8477 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 2.859574556350708, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7175565361976624, + "num_tokens": 211713675.0, + "step": 8478 + }, + { + "epoch": 0.9311443004612343, + "grad_norm": 2.310129404067993, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.703134298324585, + "num_tokens": 211738169.0, + "step": 8479 + }, + { + "epoch": 0.9312541181638481, + "grad_norm": 2.6112139225006104, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7235363125801086, + "num_tokens": 211757725.0, + "step": 8480 + }, + { + "epoch": 0.9313639358664617, + "grad_norm": 2.2122292518615723, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6964093446731567, + "num_tokens": 211784408.0, + "step": 8481 + }, + { + "epoch": 0.9314737535690754, + "grad_norm": 2.317061424255371, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7045617699623108, + "num_tokens": 211812142.0, + "step": 8482 + }, + { + "epoch": 0.931583571271689, + "grad_norm": 2.455811023712158, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6982499361038208, + "num_tokens": 211834000.0, + "step": 8483 + }, + { + "epoch": 0.9316933889743026, + "grad_norm": 2.0164921283721924, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.711455225944519, + "num_tokens": 211861750.0, + "step": 8484 + }, + { + "epoch": 0.9318032066769163, + "grad_norm": 2.062385320663452, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.712884247303009, + "num_tokens": 211888807.0, + "step": 8485 + }, + { + "epoch": 0.9319130243795299, + "grad_norm": 2.181704044342041, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6902176141738892, + "num_tokens": 211914763.0, + "step": 8486 + }, + { + "epoch": 0.9320228420821436, + "grad_norm": 2.11810564994812, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.699416995048523, + "num_tokens": 211941885.0, + "step": 8487 + }, + { + "epoch": 0.9321326597847573, + "grad_norm": 2.1319496631622314, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6987165808677673, + "num_tokens": 211967525.0, + "step": 8488 + }, + { + "epoch": 0.932242477487371, + "grad_norm": 2.265469789505005, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7044380307197571, + "num_tokens": 211990813.0, + "step": 8489 + }, + { + "epoch": 0.9323522951899846, + "grad_norm": 2.4988460540771484, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7047805190086365, + "num_tokens": 212010711.0, + "step": 8490 + }, + { + "epoch": 0.9324621128925983, + "grad_norm": 2.0884363651275635, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7097994089126587, + "num_tokens": 212035002.0, + "step": 8491 + }, + { + "epoch": 0.9325719305952119, + "grad_norm": 2.3771326541900635, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7105628252029419, + "num_tokens": 212056516.0, + "step": 8492 + }, + { + "epoch": 0.9326817482978256, + "grad_norm": 2.055720567703247, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7118772268295288, + "num_tokens": 212084002.0, + "step": 8493 + }, + { + "epoch": 0.9327915660004392, + "grad_norm": 2.1153151988983154, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6968623995780945, + "num_tokens": 212111073.0, + "step": 8494 + }, + { + "epoch": 0.932901383703053, + "grad_norm": 1.9946200847625732, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7301489114761353, + "num_tokens": 212138950.0, + "step": 8495 + }, + { + "epoch": 0.9330112014056666, + "grad_norm": 2.3348402976989746, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.700783908367157, + "num_tokens": 212161640.0, + "step": 8496 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 2.1583728790283203, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7190765142440796, + "num_tokens": 212186697.0, + "step": 8497 + }, + { + "epoch": 0.9332308368108939, + "grad_norm": 2.3540401458740234, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7274818420410156, + "num_tokens": 212209051.0, + "step": 8498 + }, + { + "epoch": 0.9333406545135076, + "grad_norm": 2.3287599086761475, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7228101491928101, + "num_tokens": 212231596.0, + "step": 8499 + }, + { + "epoch": 0.9334504722161212, + "grad_norm": 2.2459959983825684, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7059242725372314, + "num_tokens": 212257128.0, + "step": 8500 + }, + { + "epoch": 0.9335602899187349, + "grad_norm": 1.9579991102218628, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6973414421081543, + "num_tokens": 212287530.0, + "step": 8501 + }, + { + "epoch": 0.9336701076213486, + "grad_norm": 2.2477715015411377, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.700545072555542, + "num_tokens": 212310688.0, + "step": 8502 + }, + { + "epoch": 0.9337799253239623, + "grad_norm": 2.0881094932556152, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.718036413192749, + "num_tokens": 212337221.0, + "step": 8503 + }, + { + "epoch": 0.9338897430265759, + "grad_norm": 2.1998136043548584, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7072485685348511, + "num_tokens": 212363951.0, + "step": 8504 + }, + { + "epoch": 0.9339995607291895, + "grad_norm": 2.1668567657470703, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.713219165802002, + "num_tokens": 212388554.0, + "step": 8505 + }, + { + "epoch": 0.9341093784318032, + "grad_norm": 2.356158494949341, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7058559656143188, + "num_tokens": 212410995.0, + "step": 8506 + }, + { + "epoch": 0.9342191961344168, + "grad_norm": 2.28688645362854, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7144530415534973, + "num_tokens": 212433942.0, + "step": 8507 + }, + { + "epoch": 0.9343290138370305, + "grad_norm": 2.544029712677002, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7155115604400635, + "num_tokens": 212453914.0, + "step": 8508 + }, + { + "epoch": 0.9344388315396442, + "grad_norm": 2.39581561088562, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7250484228134155, + "num_tokens": 212476370.0, + "step": 8509 + }, + { + "epoch": 0.9345486492422579, + "grad_norm": 2.635157346725464, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7235932350158691, + "num_tokens": 212494882.0, + "step": 8510 + }, + { + "epoch": 0.9346584669448715, + "grad_norm": 2.3564023971557617, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7221433520317078, + "num_tokens": 212516341.0, + "step": 8511 + }, + { + "epoch": 0.9347682846474852, + "grad_norm": 2.3092565536499023, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7084600329399109, + "num_tokens": 212540000.0, + "step": 8512 + }, + { + "epoch": 0.9348781023500988, + "grad_norm": 2.1828317642211914, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.698173999786377, + "num_tokens": 212563592.0, + "step": 8513 + }, + { + "epoch": 0.9349879200527125, + "grad_norm": 2.0973799228668213, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7034555673599243, + "num_tokens": 212592180.0, + "step": 8514 + }, + { + "epoch": 0.9350977377553261, + "grad_norm": 2.237156867980957, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7245403528213501, + "num_tokens": 212612675.0, + "step": 8515 + }, + { + "epoch": 0.9352075554579398, + "grad_norm": 1.954453468322754, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6881722211837769, + "num_tokens": 212644774.0, + "step": 8516 + }, + { + "epoch": 0.9353173731605535, + "grad_norm": 2.146834135055542, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7032651901245117, + "num_tokens": 212670539.0, + "step": 8517 + }, + { + "epoch": 0.9354271908631672, + "grad_norm": 2.3000328540802, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7110898494720459, + "num_tokens": 212694287.0, + "step": 8518 + }, + { + "epoch": 0.9355370085657808, + "grad_norm": 2.3641762733459473, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7143266201019287, + "num_tokens": 212717555.0, + "step": 8519 + }, + { + "epoch": 0.9356468262683945, + "grad_norm": 2.2037999629974365, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.706025242805481, + "num_tokens": 212742411.0, + "step": 8520 + }, + { + "epoch": 0.9357566439710081, + "grad_norm": 2.3338911533355713, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7276270389556885, + "num_tokens": 212764114.0, + "step": 8521 + }, + { + "epoch": 0.9358664616736218, + "grad_norm": 2.2571327686309814, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7129749655723572, + "num_tokens": 212788074.0, + "step": 8522 + }, + { + "epoch": 0.9359762793762354, + "grad_norm": 2.050771951675415, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7053724527359009, + "num_tokens": 212816694.0, + "step": 8523 + }, + { + "epoch": 0.9360860970788492, + "grad_norm": 2.664937973022461, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7220616936683655, + "num_tokens": 212832745.0, + "step": 8524 + }, + { + "epoch": 0.9361959147814628, + "grad_norm": 2.0047597885131836, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7224988341331482, + "num_tokens": 212860504.0, + "step": 8525 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 2.0966949462890625, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7094118595123291, + "num_tokens": 212886867.0, + "step": 8526 + }, + { + "epoch": 0.9364155501866901, + "grad_norm": 2.112518072128296, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7307331562042236, + "num_tokens": 212909781.0, + "step": 8527 + }, + { + "epoch": 0.9365253678893037, + "grad_norm": 2.4193692207336426, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7371880412101746, + "num_tokens": 212932367.0, + "step": 8528 + }, + { + "epoch": 0.9366351855919174, + "grad_norm": 2.315664768218994, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7408062219619751, + "num_tokens": 212952700.0, + "step": 8529 + }, + { + "epoch": 0.936745003294531, + "grad_norm": 2.4371838569641113, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7201564908027649, + "num_tokens": 212973582.0, + "step": 8530 + }, + { + "epoch": 0.9368548209971448, + "grad_norm": 2.1455087661743164, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6908400058746338, + "num_tokens": 213002169.0, + "step": 8531 + }, + { + "epoch": 0.9369646386997584, + "grad_norm": 2.095879316329956, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7018047571182251, + "num_tokens": 213030218.0, + "step": 8532 + }, + { + "epoch": 0.9370744564023721, + "grad_norm": 2.5947318077087402, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7114198207855225, + "num_tokens": 213048883.0, + "step": 8533 + }, + { + "epoch": 0.9371842741049857, + "grad_norm": 2.317944049835205, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7124761939048767, + "num_tokens": 213071298.0, + "step": 8534 + }, + { + "epoch": 0.9372940918075994, + "grad_norm": 2.345456123352051, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7070209383964539, + "num_tokens": 213094580.0, + "step": 8535 + }, + { + "epoch": 0.937403909510213, + "grad_norm": 2.0264172554016113, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.720199465751648, + "num_tokens": 213122246.0, + "step": 8536 + }, + { + "epoch": 0.9375137272128267, + "grad_norm": 2.0065767765045166, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6939933896064758, + "num_tokens": 213150998.0, + "step": 8537 + }, + { + "epoch": 0.9376235449154404, + "grad_norm": 2.0702075958251953, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7035182118415833, + "num_tokens": 213179634.0, + "step": 8538 + }, + { + "epoch": 0.9377333626180541, + "grad_norm": 2.1937010288238525, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7052502632141113, + "num_tokens": 213204921.0, + "step": 8539 + }, + { + "epoch": 0.9378431803206677, + "grad_norm": 2.0591323375701904, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.718237578868866, + "num_tokens": 213231550.0, + "step": 8540 + }, + { + "epoch": 0.9379529980232814, + "grad_norm": 2.226644515991211, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7135623693466187, + "num_tokens": 213255480.0, + "step": 8541 + }, + { + "epoch": 0.938062815725895, + "grad_norm": 2.479250907897949, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7193775177001953, + "num_tokens": 213276864.0, + "step": 8542 + }, + { + "epoch": 0.9381726334285087, + "grad_norm": 2.4833836555480957, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7226815223693848, + "num_tokens": 213297449.0, + "step": 8543 + }, + { + "epoch": 0.9382824511311223, + "grad_norm": 2.0368614196777344, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6998686790466309, + "num_tokens": 213326724.0, + "step": 8544 + }, + { + "epoch": 0.9383922688337359, + "grad_norm": 2.1012885570526123, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7103835344314575, + "num_tokens": 213352416.0, + "step": 8545 + }, + { + "epoch": 0.9385020865363497, + "grad_norm": 2.1784346103668213, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7142615914344788, + "num_tokens": 213377749.0, + "step": 8546 + }, + { + "epoch": 0.9386119042389633, + "grad_norm": 2.0708556175231934, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.69692063331604, + "num_tokens": 213406316.0, + "step": 8547 + }, + { + "epoch": 0.938721721941577, + "grad_norm": 2.300551176071167, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7041218280792236, + "num_tokens": 213431223.0, + "step": 8548 + }, + { + "epoch": 0.9388315396441906, + "grad_norm": 2.303420305252075, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7116533517837524, + "num_tokens": 213454180.0, + "step": 8549 + }, + { + "epoch": 0.9389413573468043, + "grad_norm": 2.1776907444000244, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.690758466720581, + "num_tokens": 213480570.0, + "step": 8550 + }, + { + "epoch": 0.9390511750494179, + "grad_norm": 2.0870280265808105, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7342360019683838, + "num_tokens": 213506623.0, + "step": 8551 + }, + { + "epoch": 0.9391609927520316, + "grad_norm": 2.4806339740753174, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7368495464324951, + "num_tokens": 213526919.0, + "step": 8552 + }, + { + "epoch": 0.9392708104546453, + "grad_norm": 2.619736909866333, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7002305388450623, + "num_tokens": 213547376.0, + "step": 8553 + }, + { + "epoch": 0.939380628157259, + "grad_norm": 2.721802234649658, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6978658437728882, + "num_tokens": 213565369.0, + "step": 8554 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 2.2055671215057373, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7076478004455566, + "num_tokens": 213591153.0, + "step": 8555 + }, + { + "epoch": 0.9396002635624863, + "grad_norm": 2.171191930770874, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.6920286417007446, + "num_tokens": 213616979.0, + "step": 8556 + }, + { + "epoch": 0.9397100812650999, + "grad_norm": 2.097646474838257, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7091665863990784, + "num_tokens": 213645568.0, + "step": 8557 + }, + { + "epoch": 0.9398198989677136, + "grad_norm": 2.154752016067505, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6904582381248474, + "num_tokens": 213670961.0, + "step": 8558 + }, + { + "epoch": 0.9399297166703272, + "grad_norm": 2.5156490802764893, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7289366722106934, + "num_tokens": 213690715.0, + "step": 8559 + }, + { + "epoch": 0.940039534372941, + "grad_norm": 2.237780809402466, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7014826536178589, + "num_tokens": 213714701.0, + "step": 8560 + }, + { + "epoch": 0.9401493520755546, + "grad_norm": 2.1578617095947266, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7172492146492004, + "num_tokens": 213740334.0, + "step": 8561 + }, + { + "epoch": 0.9402591697781683, + "grad_norm": 2.0459799766540527, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.699089527130127, + "num_tokens": 213770458.0, + "step": 8562 + }, + { + "epoch": 0.9403689874807819, + "grad_norm": 3.0169782638549805, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7243003845214844, + "num_tokens": 213791659.0, + "step": 8563 + }, + { + "epoch": 0.9404788051833955, + "grad_norm": 2.0113987922668457, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7094812393188477, + "num_tokens": 213819582.0, + "step": 8564 + }, + { + "epoch": 0.9405886228860092, + "grad_norm": 2.0199100971221924, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7104833126068115, + "num_tokens": 213849619.0, + "step": 8565 + }, + { + "epoch": 0.9406984405886228, + "grad_norm": 2.293933153152466, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7247997522354126, + "num_tokens": 213871569.0, + "step": 8566 + }, + { + "epoch": 0.9408082582912366, + "grad_norm": 2.2301437854766846, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7245935797691345, + "num_tokens": 213894807.0, + "step": 8567 + }, + { + "epoch": 0.9409180759938502, + "grad_norm": 2.069896936416626, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7058167457580566, + "num_tokens": 213923922.0, + "step": 8568 + }, + { + "epoch": 0.9410278936964639, + "grad_norm": 2.1076765060424805, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7050080299377441, + "num_tokens": 213951834.0, + "step": 8569 + }, + { + "epoch": 0.9411377113990775, + "grad_norm": 2.398529529571533, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7218696475028992, + "num_tokens": 213972987.0, + "step": 8570 + }, + { + "epoch": 0.9412475291016912, + "grad_norm": 2.099147081375122, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7252766489982605, + "num_tokens": 213999771.0, + "step": 8571 + }, + { + "epoch": 0.9413573468043048, + "grad_norm": 2.3171675205230713, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6977769732475281, + "num_tokens": 214024030.0, + "step": 8572 + }, + { + "epoch": 0.9414671645069185, + "grad_norm": 2.193640947341919, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7396814823150635, + "num_tokens": 214046398.0, + "step": 8573 + }, + { + "epoch": 0.9415769822095322, + "grad_norm": 2.2822976112365723, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7050141096115112, + "num_tokens": 214071619.0, + "step": 8574 + }, + { + "epoch": 0.9416867999121459, + "grad_norm": 2.1060173511505127, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.698623538017273, + "num_tokens": 214099517.0, + "step": 8575 + }, + { + "epoch": 0.9417966176147595, + "grad_norm": 2.065790891647339, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7075035572052002, + "num_tokens": 214128542.0, + "step": 8576 + }, + { + "epoch": 0.9419064353173732, + "grad_norm": 2.0808777809143066, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7050703763961792, + "num_tokens": 214155483.0, + "step": 8577 + }, + { + "epoch": 0.9420162530199868, + "grad_norm": 2.4317054748535156, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7248700857162476, + "num_tokens": 214176383.0, + "step": 8578 + }, + { + "epoch": 0.9421260707226005, + "grad_norm": 2.173109531402588, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7250335216522217, + "num_tokens": 214203013.0, + "step": 8579 + }, + { + "epoch": 0.9422358884252141, + "grad_norm": 2.391735553741455, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7128679752349854, + "num_tokens": 214225420.0, + "step": 8580 + }, + { + "epoch": 0.9423457061278278, + "grad_norm": 2.127398729324341, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7119678258895874, + "num_tokens": 214250786.0, + "step": 8581 + }, + { + "epoch": 0.9424555238304415, + "grad_norm": 2.1653318405151367, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6881625652313232, + "num_tokens": 214277270.0, + "step": 8582 + }, + { + "epoch": 0.9425653415330552, + "grad_norm": 2.6302759647369385, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7184292674064636, + "num_tokens": 214295193.0, + "step": 8583 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 2.034644603729248, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6905455589294434, + "num_tokens": 214323959.0, + "step": 8584 + }, + { + "epoch": 0.9427849769382824, + "grad_norm": 2.4199070930480957, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7540738582611084, + "num_tokens": 214343445.0, + "step": 8585 + }, + { + "epoch": 0.9428947946408961, + "grad_norm": 1.9545384645462036, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6997041702270508, + "num_tokens": 214375956.0, + "step": 8586 + }, + { + "epoch": 0.9430046123435097, + "grad_norm": 2.223158359527588, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7078111171722412, + "num_tokens": 214399212.0, + "step": 8587 + }, + { + "epoch": 0.9431144300461234, + "grad_norm": 2.3483142852783203, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7209107875823975, + "num_tokens": 214422021.0, + "step": 8588 + }, + { + "epoch": 0.9432242477487371, + "grad_norm": 2.474931001663208, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7389651536941528, + "num_tokens": 214441016.0, + "step": 8589 + }, + { + "epoch": 0.9433340654513508, + "grad_norm": 2.514423370361328, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.716663122177124, + "num_tokens": 214461243.0, + "step": 8590 + }, + { + "epoch": 0.9434438831539644, + "grad_norm": 2.193743944168091, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7072064876556396, + "num_tokens": 214485475.0, + "step": 8591 + }, + { + "epoch": 0.9435537008565781, + "grad_norm": 2.2161898612976074, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7082681059837341, + "num_tokens": 214508902.0, + "step": 8592 + }, + { + "epoch": 0.9436635185591917, + "grad_norm": 2.2045884132385254, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7299178838729858, + "num_tokens": 214532364.0, + "step": 8593 + }, + { + "epoch": 0.9437733362618054, + "grad_norm": 1.8690537214279175, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7055256962776184, + "num_tokens": 214565013.0, + "step": 8594 + }, + { + "epoch": 0.943883153964419, + "grad_norm": 2.010296583175659, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6964120864868164, + "num_tokens": 214592789.0, + "step": 8595 + }, + { + "epoch": 0.9439929716670328, + "grad_norm": 2.221747875213623, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7159388065338135, + "num_tokens": 214617441.0, + "step": 8596 + }, + { + "epoch": 0.9441027893696464, + "grad_norm": 1.8058606386184692, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.723017156124115, + "num_tokens": 214648764.0, + "step": 8597 + }, + { + "epoch": 0.9442126070722601, + "grad_norm": 2.2343108654022217, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7306344509124756, + "num_tokens": 214674605.0, + "step": 8598 + }, + { + "epoch": 0.9443224247748737, + "grad_norm": 2.2666289806365967, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7015581727027893, + "num_tokens": 214697651.0, + "step": 8599 + }, + { + "epoch": 0.9444322424774874, + "grad_norm": 2.2040464878082275, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6949318647384644, + "num_tokens": 214724593.0, + "step": 8600 + }, + { + "epoch": 0.944542060180101, + "grad_norm": 2.289292812347412, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7025573253631592, + "num_tokens": 214747657.0, + "step": 8601 + }, + { + "epoch": 0.9446518778827147, + "grad_norm": 2.0989367961883545, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.715874433517456, + "num_tokens": 214774755.0, + "step": 8602 + }, + { + "epoch": 0.9447616955853284, + "grad_norm": 2.292139768600464, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7210592031478882, + "num_tokens": 214798246.0, + "step": 8603 + }, + { + "epoch": 0.944871513287942, + "grad_norm": 2.1750495433807373, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.721707284450531, + "num_tokens": 214822236.0, + "step": 8604 + }, + { + "epoch": 0.9449813309905557, + "grad_norm": 2.143167495727539, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.700682520866394, + "num_tokens": 214847038.0, + "step": 8605 + }, + { + "epoch": 0.9450911486931693, + "grad_norm": 2.13848614692688, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6930708289146423, + "num_tokens": 214875261.0, + "step": 8606 + }, + { + "epoch": 0.945200966395783, + "grad_norm": 2.1902737617492676, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6830084323883057, + "num_tokens": 214903143.0, + "step": 8607 + }, + { + "epoch": 0.9453107840983966, + "grad_norm": 1.796958565711975, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7006258964538574, + "num_tokens": 214936470.0, + "step": 8608 + }, + { + "epoch": 0.9454206018010103, + "grad_norm": 2.282365560531616, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6990867853164673, + "num_tokens": 214960513.0, + "step": 8609 + }, + { + "epoch": 0.9455304195036239, + "grad_norm": 2.130356788635254, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6838831901550293, + "num_tokens": 214986650.0, + "step": 8610 + }, + { + "epoch": 0.9456402372062377, + "grad_norm": 1.8818358182907104, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7116202712059021, + "num_tokens": 215017206.0, + "step": 8611 + }, + { + "epoch": 0.9457500549088513, + "grad_norm": 2.2325477600097656, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7268815636634827, + "num_tokens": 215038888.0, + "step": 8612 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 2.2546651363372803, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7039361000061035, + "num_tokens": 215062406.0, + "step": 8613 + }, + { + "epoch": 0.9459696903140786, + "grad_norm": 2.8092799186706543, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7161720395088196, + "num_tokens": 215078981.0, + "step": 8614 + }, + { + "epoch": 0.9460795080166923, + "grad_norm": 2.4026591777801514, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.6995408535003662, + "num_tokens": 215098955.0, + "step": 8615 + }, + { + "epoch": 0.9461893257193059, + "grad_norm": 2.208991289138794, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6942044496536255, + "num_tokens": 215123917.0, + "step": 8616 + }, + { + "epoch": 0.9462991434219196, + "grad_norm": 2.409654140472412, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7252118587493896, + "num_tokens": 215143267.0, + "step": 8617 + }, + { + "epoch": 0.9464089611245333, + "grad_norm": 2.0531187057495117, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6980605125427246, + "num_tokens": 215170264.0, + "step": 8618 + }, + { + "epoch": 0.946518778827147, + "grad_norm": 2.195574998855591, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7215365767478943, + "num_tokens": 215194360.0, + "step": 8619 + }, + { + "epoch": 0.9466285965297606, + "grad_norm": 2.041738510131836, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6825581192970276, + "num_tokens": 215222813.0, + "step": 8620 + }, + { + "epoch": 0.9467384142323743, + "grad_norm": 1.8374290466308594, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6866426467895508, + "num_tokens": 215259031.0, + "step": 8621 + }, + { + "epoch": 0.9468482319349879, + "grad_norm": 2.5803143978118896, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7028440237045288, + "num_tokens": 215280047.0, + "step": 8622 + }, + { + "epoch": 0.9469580496376016, + "grad_norm": 2.1587975025177, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6872085332870483, + "num_tokens": 215308081.0, + "step": 8623 + }, + { + "epoch": 0.9470678673402152, + "grad_norm": 2.2148144245147705, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7196458578109741, + "num_tokens": 215333214.0, + "step": 8624 + }, + { + "epoch": 0.947177685042829, + "grad_norm": 2.1753411293029785, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7187322974205017, + "num_tokens": 215358923.0, + "step": 8625 + }, + { + "epoch": 0.9472875027454426, + "grad_norm": 2.181872606277466, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7054147720336914, + "num_tokens": 215385683.0, + "step": 8626 + }, + { + "epoch": 0.9473973204480562, + "grad_norm": 2.304939031600952, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7314928770065308, + "num_tokens": 215407692.0, + "step": 8627 + }, + { + "epoch": 0.9475071381506699, + "grad_norm": 2.2716686725616455, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7185818552970886, + "num_tokens": 215431773.0, + "step": 8628 + }, + { + "epoch": 0.9476169558532835, + "grad_norm": 2.248516082763672, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7153761386871338, + "num_tokens": 215457503.0, + "step": 8629 + }, + { + "epoch": 0.9477267735558972, + "grad_norm": 2.185539484024048, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7063726186752319, + "num_tokens": 215483764.0, + "step": 8630 + }, + { + "epoch": 0.9478365912585108, + "grad_norm": 2.2130541801452637, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6980215907096863, + "num_tokens": 215509210.0, + "step": 8631 + }, + { + "epoch": 0.9479464089611246, + "grad_norm": 2.590766191482544, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7125310301780701, + "num_tokens": 215528685.0, + "step": 8632 + }, + { + "epoch": 0.9480562266637382, + "grad_norm": 2.6281068325042725, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7270227670669556, + "num_tokens": 215547799.0, + "step": 8633 + }, + { + "epoch": 0.9481660443663519, + "grad_norm": 2.082956075668335, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7238779067993164, + "num_tokens": 215572719.0, + "step": 8634 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 2.436145067214966, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7109951972961426, + "num_tokens": 215593821.0, + "step": 8635 + }, + { + "epoch": 0.9483856797715792, + "grad_norm": 2.3464760780334473, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7190605401992798, + "num_tokens": 215616161.0, + "step": 8636 + }, + { + "epoch": 0.9484954974741928, + "grad_norm": 2.308316230773926, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7342427968978882, + "num_tokens": 215637599.0, + "step": 8637 + }, + { + "epoch": 0.9486053151768065, + "grad_norm": 2.2826106548309326, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7044051289558411, + "num_tokens": 215662134.0, + "step": 8638 + }, + { + "epoch": 0.9487151328794201, + "grad_norm": 2.1325995922088623, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7286627292633057, + "num_tokens": 215685919.0, + "step": 8639 + }, + { + "epoch": 0.9488249505820339, + "grad_norm": 2.6308350563049316, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7453413009643555, + "num_tokens": 215704434.0, + "step": 8640 + }, + { + "epoch": 0.9489347682846475, + "grad_norm": 2.5235350131988525, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7009912729263306, + "num_tokens": 215725543.0, + "step": 8641 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 2.354783296585083, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7129334211349487, + "num_tokens": 215747266.0, + "step": 8642 + }, + { + "epoch": 0.9491544036898748, + "grad_norm": 2.1083221435546875, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7245549559593201, + "num_tokens": 215772346.0, + "step": 8643 + }, + { + "epoch": 0.9492642213924884, + "grad_norm": 2.0926198959350586, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7058870792388916, + "num_tokens": 215802046.0, + "step": 8644 + }, + { + "epoch": 0.9493740390951021, + "grad_norm": 2.241570234298706, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6984599232673645, + "num_tokens": 215826782.0, + "step": 8645 + }, + { + "epoch": 0.9494838567977157, + "grad_norm": 2.2729222774505615, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7245652675628662, + "num_tokens": 215849595.0, + "step": 8646 + }, + { + "epoch": 0.9495936745003295, + "grad_norm": 2.209105968475342, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7054473161697388, + "num_tokens": 215875003.0, + "step": 8647 + }, + { + "epoch": 0.9497034922029431, + "grad_norm": 2.4319117069244385, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7039639949798584, + "num_tokens": 215895723.0, + "step": 8648 + }, + { + "epoch": 0.9498133099055568, + "grad_norm": 2.0001628398895264, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6872880458831787, + "num_tokens": 215923736.0, + "step": 8649 + }, + { + "epoch": 0.9499231276081704, + "grad_norm": 1.992418885231018, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7029238939285278, + "num_tokens": 215953314.0, + "step": 8650 + }, + { + "epoch": 0.9500329453107841, + "grad_norm": 2.307459592819214, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6966959238052368, + "num_tokens": 215978062.0, + "step": 8651 + }, + { + "epoch": 0.9501427630133977, + "grad_norm": 2.3236823081970215, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7152206897735596, + "num_tokens": 215998438.0, + "step": 8652 + }, + { + "epoch": 0.9502525807160114, + "grad_norm": 2.233386993408203, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7117211818695068, + "num_tokens": 216021413.0, + "step": 8653 + }, + { + "epoch": 0.9503623984186251, + "grad_norm": 1.9230287075042725, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6990606784820557, + "num_tokens": 216053974.0, + "step": 8654 + }, + { + "epoch": 0.9504722161212388, + "grad_norm": 2.1194756031036377, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7076661586761475, + "num_tokens": 216080686.0, + "step": 8655 + }, + { + "epoch": 0.9505820338238524, + "grad_norm": 2.2072694301605225, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6992833614349365, + "num_tokens": 216105751.0, + "step": 8656 + }, + { + "epoch": 0.9506918515264661, + "grad_norm": 1.9422358274459839, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6982500553131104, + "num_tokens": 216136848.0, + "step": 8657 + }, + { + "epoch": 0.9508016692290797, + "grad_norm": 2.1530838012695312, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6995598077774048, + "num_tokens": 216162728.0, + "step": 8658 + }, + { + "epoch": 0.9509114869316934, + "grad_norm": 2.2952167987823486, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7300432920455933, + "num_tokens": 216184792.0, + "step": 8659 + }, + { + "epoch": 0.951021304634307, + "grad_norm": 2.3694028854370117, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7271488308906555, + "num_tokens": 216206208.0, + "step": 8660 + }, + { + "epoch": 0.9511311223369208, + "grad_norm": 2.090418577194214, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6951189637184143, + "num_tokens": 216232034.0, + "step": 8661 + }, + { + "epoch": 0.9512409400395344, + "grad_norm": 2.161411762237549, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7119282484054565, + "num_tokens": 216254798.0, + "step": 8662 + }, + { + "epoch": 0.9513507577421481, + "grad_norm": 2.1791813373565674, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7056408524513245, + "num_tokens": 216279689.0, + "step": 8663 + }, + { + "epoch": 0.9514605754447617, + "grad_norm": 2.5246903896331787, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.735345721244812, + "num_tokens": 216298334.0, + "step": 8664 + }, + { + "epoch": 0.9515703931473753, + "grad_norm": 2.1736607551574707, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6882862448692322, + "num_tokens": 216324529.0, + "step": 8665 + }, + { + "epoch": 0.951680210849989, + "grad_norm": 2.116685390472412, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6771470308303833, + "num_tokens": 216352089.0, + "step": 8666 + }, + { + "epoch": 0.9517900285526026, + "grad_norm": 2.4028165340423584, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7096327543258667, + "num_tokens": 216375877.0, + "step": 8667 + }, + { + "epoch": 0.9518998462552163, + "grad_norm": 2.1806955337524414, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7105955481529236, + "num_tokens": 216400910.0, + "step": 8668 + }, + { + "epoch": 0.95200966395783, + "grad_norm": 2.3068346977233887, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7107768654823303, + "num_tokens": 216423917.0, + "step": 8669 + }, + { + "epoch": 0.9521194816604437, + "grad_norm": 2.0960216522216797, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7107920050621033, + "num_tokens": 216452119.0, + "step": 8670 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 2.0383694171905518, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7138669490814209, + "num_tokens": 216481129.0, + "step": 8671 + }, + { + "epoch": 0.952339117065671, + "grad_norm": 2.2521615028381348, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7097809314727783, + "num_tokens": 216505626.0, + "step": 8672 + }, + { + "epoch": 0.9524489347682846, + "grad_norm": 2.198608636856079, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6987143754959106, + "num_tokens": 216533067.0, + "step": 8673 + }, + { + "epoch": 0.9525587524708983, + "grad_norm": 2.2555761337280273, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.715686023235321, + "num_tokens": 216555193.0, + "step": 8674 + }, + { + "epoch": 0.9526685701735119, + "grad_norm": 2.062397003173828, + "learning_rate": 1e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7421277761459351, + "num_tokens": 216581269.0, + "step": 8675 + }, + { + "epoch": 0.9527783878761257, + "grad_norm": 1.907497525215149, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7010767459869385, + "num_tokens": 216612856.0, + "step": 8676 + }, + { + "epoch": 0.9528882055787393, + "grad_norm": 2.4389727115631104, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7267177104949951, + "num_tokens": 216634085.0, + "step": 8677 + }, + { + "epoch": 0.952998023281353, + "grad_norm": 2.4298267364501953, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7269487977027893, + "num_tokens": 216654922.0, + "step": 8678 + }, + { + "epoch": 0.9531078409839666, + "grad_norm": 1.8392798900604248, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6921943426132202, + "num_tokens": 216688542.0, + "step": 8679 + }, + { + "epoch": 0.9532176586865803, + "grad_norm": 2.159024238586426, + "learning_rate": 1e-06, + "loss": 1.0846, + "mean_token_accuracy": 0.6694093346595764, + "num_tokens": 216717608.0, + "step": 8680 + }, + { + "epoch": 0.9533274763891939, + "grad_norm": 1.9087390899658203, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7060481905937195, + "num_tokens": 216749300.0, + "step": 8681 + }, + { + "epoch": 0.9534372940918076, + "grad_norm": 2.2309529781341553, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7393626570701599, + "num_tokens": 216774599.0, + "step": 8682 + }, + { + "epoch": 0.9535471117944213, + "grad_norm": 2.0368621349334717, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6925650835037231, + "num_tokens": 216802347.0, + "step": 8683 + }, + { + "epoch": 0.953656929497035, + "grad_norm": 2.1823079586029053, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7055391073226929, + "num_tokens": 216827688.0, + "step": 8684 + }, + { + "epoch": 0.9537667471996486, + "grad_norm": 2.0327980518341064, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.7048851251602173, + "num_tokens": 216855180.0, + "step": 8685 + }, + { + "epoch": 0.9538765649022622, + "grad_norm": 2.1038076877593994, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.695131242275238, + "num_tokens": 216883703.0, + "step": 8686 + }, + { + "epoch": 0.9539863826048759, + "grad_norm": 2.1437997817993164, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7110936045646667, + "num_tokens": 216909963.0, + "step": 8687 + }, + { + "epoch": 0.9540962003074895, + "grad_norm": 2.1863458156585693, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7004526853561401, + "num_tokens": 216935203.0, + "step": 8688 + }, + { + "epoch": 0.9542060180101032, + "grad_norm": 2.065532922744751, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6932039260864258, + "num_tokens": 216963206.0, + "step": 8689 + }, + { + "epoch": 0.9543158357127169, + "grad_norm": 2.682379722595215, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7107887864112854, + "num_tokens": 216982450.0, + "step": 8690 + }, + { + "epoch": 0.9544256534153306, + "grad_norm": 2.253190279006958, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6955516934394836, + "num_tokens": 217005229.0, + "step": 8691 + }, + { + "epoch": 0.9545354711179442, + "grad_norm": 2.712344169616699, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7233480215072632, + "num_tokens": 217022736.0, + "step": 8692 + }, + { + "epoch": 0.9546452888205579, + "grad_norm": 2.4755001068115234, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7386807799339294, + "num_tokens": 217041224.0, + "step": 8693 + }, + { + "epoch": 0.9547551065231715, + "grad_norm": 2.525925874710083, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7154549360275269, + "num_tokens": 217062492.0, + "step": 8694 + }, + { + "epoch": 0.9548649242257852, + "grad_norm": 2.2535059452056885, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7113395929336548, + "num_tokens": 217087205.0, + "step": 8695 + }, + { + "epoch": 0.9549747419283988, + "grad_norm": 2.31512188911438, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7189369797706604, + "num_tokens": 217109028.0, + "step": 8696 + }, + { + "epoch": 0.9550845596310125, + "grad_norm": 2.668612480163574, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7307138442993164, + "num_tokens": 217127875.0, + "step": 8697 + }, + { + "epoch": 0.9551943773336262, + "grad_norm": 2.353224277496338, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7276521325111389, + "num_tokens": 217150305.0, + "step": 8698 + }, + { + "epoch": 0.9553041950362399, + "grad_norm": 2.161494255065918, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.691340446472168, + "num_tokens": 217177673.0, + "step": 8699 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 2.319911003112793, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7320142388343811, + "num_tokens": 217201205.0, + "step": 8700 + }, + { + "epoch": 0.9555238304414672, + "grad_norm": 2.2528457641601562, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7156266570091248, + "num_tokens": 217223918.0, + "step": 8701 + }, + { + "epoch": 0.9556336481440808, + "grad_norm": 2.3833673000335693, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6947581768035889, + "num_tokens": 217247144.0, + "step": 8702 + }, + { + "epoch": 0.9557434658466945, + "grad_norm": 2.080261468887329, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6845120787620544, + "num_tokens": 217274323.0, + "step": 8703 + }, + { + "epoch": 0.9558532835493081, + "grad_norm": 2.2157485485076904, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6972127556800842, + "num_tokens": 217300479.0, + "step": 8704 + }, + { + "epoch": 0.9559631012519219, + "grad_norm": 2.1012752056121826, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7192627191543579, + "num_tokens": 217328127.0, + "step": 8705 + }, + { + "epoch": 0.9560729189545355, + "grad_norm": 2.057856321334839, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7173770666122437, + "num_tokens": 217354678.0, + "step": 8706 + }, + { + "epoch": 0.9561827366571491, + "grad_norm": 1.985559344291687, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.722274124622345, + "num_tokens": 217383037.0, + "step": 8707 + }, + { + "epoch": 0.9562925543597628, + "grad_norm": 2.2269608974456787, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7189251184463501, + "num_tokens": 217406606.0, + "step": 8708 + }, + { + "epoch": 0.9564023720623764, + "grad_norm": 1.975022554397583, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7149239182472229, + "num_tokens": 217436613.0, + "step": 8709 + }, + { + "epoch": 0.9565121897649901, + "grad_norm": 2.1719279289245605, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7196424007415771, + "num_tokens": 217461338.0, + "step": 8710 + }, + { + "epoch": 0.9566220074676037, + "grad_norm": 1.9831387996673584, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6736265420913696, + "num_tokens": 217493455.0, + "step": 8711 + }, + { + "epoch": 0.9567318251702175, + "grad_norm": 2.197960615158081, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7470900416374207, + "num_tokens": 217515662.0, + "step": 8712 + }, + { + "epoch": 0.9568416428728311, + "grad_norm": 2.3992879390716553, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7140758037567139, + "num_tokens": 217537635.0, + "step": 8713 + }, + { + "epoch": 0.9569514605754448, + "grad_norm": 2.170851230621338, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6989002227783203, + "num_tokens": 217564004.0, + "step": 8714 + }, + { + "epoch": 0.9570612782780584, + "grad_norm": 2.2545297145843506, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7254056930541992, + "num_tokens": 217587063.0, + "step": 8715 + }, + { + "epoch": 0.9571710959806721, + "grad_norm": 2.002134323120117, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7411776185035706, + "num_tokens": 217614387.0, + "step": 8716 + }, + { + "epoch": 0.9572809136832857, + "grad_norm": 2.1788198947906494, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7210115194320679, + "num_tokens": 217639443.0, + "step": 8717 + }, + { + "epoch": 0.9573907313858994, + "grad_norm": 2.4398419857025146, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7131723165512085, + "num_tokens": 217662299.0, + "step": 8718 + }, + { + "epoch": 0.9575005490885131, + "grad_norm": 2.270301342010498, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6930117607116699, + "num_tokens": 217686720.0, + "step": 8719 + }, + { + "epoch": 0.9576103667911268, + "grad_norm": 1.9829044342041016, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7081102132797241, + "num_tokens": 217714153.0, + "step": 8720 + }, + { + "epoch": 0.9577201844937404, + "grad_norm": 1.9686890840530396, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7102460861206055, + "num_tokens": 217741437.0, + "step": 8721 + }, + { + "epoch": 0.9578300021963541, + "grad_norm": 2.1400277614593506, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7253831028938293, + "num_tokens": 217764574.0, + "step": 8722 + }, + { + "epoch": 0.9579398198989677, + "grad_norm": 2.2214548587799072, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.7016593813896179, + "num_tokens": 217791448.0, + "step": 8723 + }, + { + "epoch": 0.9580496376015813, + "grad_norm": 1.914392113685608, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6824049353599548, + "num_tokens": 217825152.0, + "step": 8724 + }, + { + "epoch": 0.958159455304195, + "grad_norm": 2.38362193107605, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7247562408447266, + "num_tokens": 217845841.0, + "step": 8725 + }, + { + "epoch": 0.9582692730068088, + "grad_norm": 2.206369161605835, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6902354955673218, + "num_tokens": 217873416.0, + "step": 8726 + }, + { + "epoch": 0.9583790907094224, + "grad_norm": 2.0673959255218506, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.6862159967422485, + "num_tokens": 217901768.0, + "step": 8727 + }, + { + "epoch": 0.958488908412036, + "grad_norm": 2.1044390201568604, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7258756160736084, + "num_tokens": 217926189.0, + "step": 8728 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 2.083475351333618, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7227023839950562, + "num_tokens": 217953421.0, + "step": 8729 + }, + { + "epoch": 0.9587085438172633, + "grad_norm": 2.174311399459839, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7055861949920654, + "num_tokens": 217981430.0, + "step": 8730 + }, + { + "epoch": 0.958818361519877, + "grad_norm": 2.1191368103027344, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7040557861328125, + "num_tokens": 218007611.0, + "step": 8731 + }, + { + "epoch": 0.9589281792224906, + "grad_norm": 2.307901382446289, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7087770104408264, + "num_tokens": 218032861.0, + "step": 8732 + }, + { + "epoch": 0.9590379969251043, + "grad_norm": 2.0819408893585205, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7137193083763123, + "num_tokens": 218060700.0, + "step": 8733 + }, + { + "epoch": 0.959147814627718, + "grad_norm": 2.2601113319396973, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7120375037193298, + "num_tokens": 218083424.0, + "step": 8734 + }, + { + "epoch": 0.9592576323303317, + "grad_norm": 2.327953338623047, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7002615928649902, + "num_tokens": 218106091.0, + "step": 8735 + }, + { + "epoch": 0.9593674500329453, + "grad_norm": 1.9083201885223389, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6932564377784729, + "num_tokens": 218141532.0, + "step": 8736 + }, + { + "epoch": 0.959477267735559, + "grad_norm": 1.9230214357376099, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6987775564193726, + "num_tokens": 218173419.0, + "step": 8737 + }, + { + "epoch": 0.9595870854381726, + "grad_norm": 1.955682635307312, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7273335456848145, + "num_tokens": 218203188.0, + "step": 8738 + }, + { + "epoch": 0.9596969031407863, + "grad_norm": 1.9136565923690796, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6855499744415283, + "num_tokens": 218233873.0, + "step": 8739 + }, + { + "epoch": 0.9598067208433999, + "grad_norm": 2.31852650642395, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7178942561149597, + "num_tokens": 218257608.0, + "step": 8740 + }, + { + "epoch": 0.9599165385460137, + "grad_norm": 2.4404845237731934, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7222129106521606, + "num_tokens": 218278881.0, + "step": 8741 + }, + { + "epoch": 0.9600263562486273, + "grad_norm": 2.24308443069458, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7049047946929932, + "num_tokens": 218302437.0, + "step": 8742 + }, + { + "epoch": 0.960136173951241, + "grad_norm": 2.278200149536133, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7004477977752686, + "num_tokens": 218329161.0, + "step": 8743 + }, + { + "epoch": 0.9602459916538546, + "grad_norm": 2.273580551147461, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7271213531494141, + "num_tokens": 218351030.0, + "step": 8744 + }, + { + "epoch": 0.9603558093564682, + "grad_norm": 2.2075648307800293, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7320600748062134, + "num_tokens": 218374421.0, + "step": 8745 + }, + { + "epoch": 0.9604656270590819, + "grad_norm": 2.4354050159454346, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7268260717391968, + "num_tokens": 218393187.0, + "step": 8746 + }, + { + "epoch": 0.9605754447616955, + "grad_norm": 2.2651731967926025, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7198390364646912, + "num_tokens": 218416764.0, + "step": 8747 + }, + { + "epoch": 0.9606852624643093, + "grad_norm": 2.3037126064300537, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7187769412994385, + "num_tokens": 218440749.0, + "step": 8748 + }, + { + "epoch": 0.9607950801669229, + "grad_norm": 2.2000205516815186, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.70351243019104, + "num_tokens": 218466967.0, + "step": 8749 + }, + { + "epoch": 0.9609048978695366, + "grad_norm": 2.357379913330078, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7093439698219299, + "num_tokens": 218488375.0, + "step": 8750 + }, + { + "epoch": 0.9610147155721502, + "grad_norm": 2.599057674407959, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7290815114974976, + "num_tokens": 218506478.0, + "step": 8751 + }, + { + "epoch": 0.9611245332747639, + "grad_norm": 2.3348371982574463, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6975690722465515, + "num_tokens": 218529755.0, + "step": 8752 + }, + { + "epoch": 0.9612343509773775, + "grad_norm": 2.3336548805236816, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7287034392356873, + "num_tokens": 218550556.0, + "step": 8753 + }, + { + "epoch": 0.9613441686799912, + "grad_norm": 2.457672357559204, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7159818410873413, + "num_tokens": 218572078.0, + "step": 8754 + }, + { + "epoch": 0.9614539863826049, + "grad_norm": 2.340400457382202, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7189998626708984, + "num_tokens": 218593556.0, + "step": 8755 + }, + { + "epoch": 0.9615638040852186, + "grad_norm": 2.127586841583252, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7564190030097961, + "num_tokens": 218618056.0, + "step": 8756 + }, + { + "epoch": 0.9616736217878322, + "grad_norm": 2.1916165351867676, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.69936603307724, + "num_tokens": 218643805.0, + "step": 8757 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 2.1004719734191895, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6967077255249023, + "num_tokens": 218668242.0, + "step": 8758 + }, + { + "epoch": 0.9618932571930595, + "grad_norm": 2.1915509700775146, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6905544400215149, + "num_tokens": 218694313.0, + "step": 8759 + }, + { + "epoch": 0.9620030748956732, + "grad_norm": 1.9078750610351562, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7236030697822571, + "num_tokens": 218722302.0, + "step": 8760 + }, + { + "epoch": 0.9621128925982868, + "grad_norm": 2.387948513031006, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7171431183815002, + "num_tokens": 218742893.0, + "step": 8761 + }, + { + "epoch": 0.9622227103009005, + "grad_norm": 1.8975274562835693, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7050789594650269, + "num_tokens": 218775592.0, + "step": 8762 + }, + { + "epoch": 0.9623325280035142, + "grad_norm": 2.154876470565796, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6899590492248535, + "num_tokens": 218801348.0, + "step": 8763 + }, + { + "epoch": 0.9624423457061279, + "grad_norm": 2.260955572128296, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7063559889793396, + "num_tokens": 218825568.0, + "step": 8764 + }, + { + "epoch": 0.9625521634087415, + "grad_norm": 2.1629981994628906, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7317550778388977, + "num_tokens": 218849932.0, + "step": 8765 + }, + { + "epoch": 0.9626619811113551, + "grad_norm": 2.344733715057373, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6973609328269958, + "num_tokens": 218873139.0, + "step": 8766 + }, + { + "epoch": 0.9627717988139688, + "grad_norm": 2.3788602352142334, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7264639735221863, + "num_tokens": 218894201.0, + "step": 8767 + }, + { + "epoch": 0.9628816165165824, + "grad_norm": 2.1257362365722656, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7053384780883789, + "num_tokens": 218921358.0, + "step": 8768 + }, + { + "epoch": 0.9629914342191961, + "grad_norm": 1.9205759763717651, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7094087600708008, + "num_tokens": 218953623.0, + "step": 8769 + }, + { + "epoch": 0.9631012519218098, + "grad_norm": 2.0857341289520264, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7160739302635193, + "num_tokens": 218980572.0, + "step": 8770 + }, + { + "epoch": 0.9632110696244235, + "grad_norm": 2.255434989929199, + "learning_rate": 1e-06, + "loss": 1.0636, + "mean_token_accuracy": 0.6850981116294861, + "num_tokens": 219006449.0, + "step": 8771 + }, + { + "epoch": 0.9633208873270371, + "grad_norm": 2.056675910949707, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7110528945922852, + "num_tokens": 219035182.0, + "step": 8772 + }, + { + "epoch": 0.9634307050296508, + "grad_norm": 2.2903425693511963, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.710525631904602, + "num_tokens": 219057820.0, + "step": 8773 + }, + { + "epoch": 0.9635405227322644, + "grad_norm": 2.045395851135254, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7078366875648499, + "num_tokens": 219085508.0, + "step": 8774 + }, + { + "epoch": 0.9636503404348781, + "grad_norm": 2.048598527908325, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7017546892166138, + "num_tokens": 219112934.0, + "step": 8775 + }, + { + "epoch": 0.9637601581374917, + "grad_norm": 2.1768240928649902, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7150906324386597, + "num_tokens": 219135737.0, + "step": 8776 + }, + { + "epoch": 0.9638699758401055, + "grad_norm": 2.4731271266937256, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7079784870147705, + "num_tokens": 219156327.0, + "step": 8777 + }, + { + "epoch": 0.9639797935427191, + "grad_norm": 2.4103474617004395, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7048951387405396, + "num_tokens": 219180144.0, + "step": 8778 + }, + { + "epoch": 0.9640896112453328, + "grad_norm": 2.4615590572357178, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.728456437587738, + "num_tokens": 219200752.0, + "step": 8779 + }, + { + "epoch": 0.9641994289479464, + "grad_norm": 1.9246392250061035, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6874172687530518, + "num_tokens": 219233027.0, + "step": 8780 + }, + { + "epoch": 0.9643092466505601, + "grad_norm": 1.8770371675491333, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7117379903793335, + "num_tokens": 219264019.0, + "step": 8781 + }, + { + "epoch": 0.9644190643531737, + "grad_norm": 2.532799243927002, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7124090194702148, + "num_tokens": 219284963.0, + "step": 8782 + }, + { + "epoch": 0.9645288820557874, + "grad_norm": 2.5715482234954834, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7038578391075134, + "num_tokens": 219306213.0, + "step": 8783 + }, + { + "epoch": 0.9646386997584011, + "grad_norm": 2.4617319107055664, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7167078256607056, + "num_tokens": 219327523.0, + "step": 8784 + }, + { + "epoch": 0.9647485174610148, + "grad_norm": 2.3043787479400635, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.7008647918701172, + "num_tokens": 219352071.0, + "step": 8785 + }, + { + "epoch": 0.9648583351636284, + "grad_norm": 2.2960660457611084, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7156578302383423, + "num_tokens": 219374999.0, + "step": 8786 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 2.501573085784912, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7183541655540466, + "num_tokens": 219395532.0, + "step": 8787 + }, + { + "epoch": 0.9650779705688557, + "grad_norm": 2.286397695541382, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7145034074783325, + "num_tokens": 219417446.0, + "step": 8788 + }, + { + "epoch": 0.9651877882714693, + "grad_norm": 2.209888219833374, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7139495611190796, + "num_tokens": 219441683.0, + "step": 8789 + }, + { + "epoch": 0.965297605974083, + "grad_norm": 2.2803163528442383, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7103452682495117, + "num_tokens": 219465632.0, + "step": 8790 + }, + { + "epoch": 0.9654074236766966, + "grad_norm": 2.400728702545166, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7090757489204407, + "num_tokens": 219486798.0, + "step": 8791 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 2.524390697479248, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7481784820556641, + "num_tokens": 219505943.0, + "step": 8792 + }, + { + "epoch": 0.965627059081924, + "grad_norm": 2.3516488075256348, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7100188732147217, + "num_tokens": 219528758.0, + "step": 8793 + }, + { + "epoch": 0.9657368767845377, + "grad_norm": 2.122727632522583, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7132700681686401, + "num_tokens": 219556844.0, + "step": 8794 + }, + { + "epoch": 0.9658466944871513, + "grad_norm": 2.1351397037506104, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7162693738937378, + "num_tokens": 219583680.0, + "step": 8795 + }, + { + "epoch": 0.965956512189765, + "grad_norm": 2.2410550117492676, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7056023478507996, + "num_tokens": 219609920.0, + "step": 8796 + }, + { + "epoch": 0.9660663298923786, + "grad_norm": 2.330517292022705, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7339390516281128, + "num_tokens": 219631058.0, + "step": 8797 + }, + { + "epoch": 0.9661761475949923, + "grad_norm": 2.5482375621795654, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7286485433578491, + "num_tokens": 219648888.0, + "step": 8798 + }, + { + "epoch": 0.966285965297606, + "grad_norm": 2.1088476181030273, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7072992324829102, + "num_tokens": 219675563.0, + "step": 8799 + }, + { + "epoch": 0.9663957830002197, + "grad_norm": 2.0789635181427, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7321035861968994, + "num_tokens": 219699923.0, + "step": 8800 + }, + { + "epoch": 0.9665056007028333, + "grad_norm": 2.4199700355529785, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7066781520843506, + "num_tokens": 219722284.0, + "step": 8801 + }, + { + "epoch": 0.966615418405447, + "grad_norm": 2.1512744426727295, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7107206583023071, + "num_tokens": 219746283.0, + "step": 8802 + }, + { + "epoch": 0.9667252361080606, + "grad_norm": 2.408212900161743, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7254809141159058, + "num_tokens": 219767762.0, + "step": 8803 + }, + { + "epoch": 0.9668350538106742, + "grad_norm": 2.5195441246032715, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7219294309616089, + "num_tokens": 219787001.0, + "step": 8804 + }, + { + "epoch": 0.9669448715132879, + "grad_norm": 2.266791582107544, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7301830053329468, + "num_tokens": 219808894.0, + "step": 8805 + }, + { + "epoch": 0.9670546892159017, + "grad_norm": 2.1939964294433594, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.733415961265564, + "num_tokens": 219831981.0, + "step": 8806 + }, + { + "epoch": 0.9671645069185153, + "grad_norm": 2.0910961627960205, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7066328525543213, + "num_tokens": 219858920.0, + "step": 8807 + }, + { + "epoch": 0.967274324621129, + "grad_norm": 2.067714214324951, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6899071335792542, + "num_tokens": 219889216.0, + "step": 8808 + }, + { + "epoch": 0.9673841423237426, + "grad_norm": 2.336552858352661, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7105631232261658, + "num_tokens": 219912679.0, + "step": 8809 + }, + { + "epoch": 0.9674939600263562, + "grad_norm": 2.2069790363311768, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7170900106430054, + "num_tokens": 219935536.0, + "step": 8810 + }, + { + "epoch": 0.9676037777289699, + "grad_norm": 1.9859082698822021, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7301225662231445, + "num_tokens": 219962814.0, + "step": 8811 + }, + { + "epoch": 0.9677135954315835, + "grad_norm": 2.580150842666626, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7171179056167603, + "num_tokens": 219982629.0, + "step": 8812 + }, + { + "epoch": 0.9678234131341973, + "grad_norm": 2.132307767868042, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7152619957923889, + "num_tokens": 220010258.0, + "step": 8813 + }, + { + "epoch": 0.9679332308368109, + "grad_norm": 2.1471731662750244, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7300865650177002, + "num_tokens": 220036784.0, + "step": 8814 + }, + { + "epoch": 0.9680430485394246, + "grad_norm": 2.304550886154175, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7080640196800232, + "num_tokens": 220061268.0, + "step": 8815 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 2.384338617324829, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7243632078170776, + "num_tokens": 220081644.0, + "step": 8816 + }, + { + "epoch": 0.9682626839446519, + "grad_norm": 2.3584835529327393, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7032643556594849, + "num_tokens": 220103515.0, + "step": 8817 + }, + { + "epoch": 0.9683725016472655, + "grad_norm": 2.080925703048706, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7279390096664429, + "num_tokens": 220130663.0, + "step": 8818 + }, + { + "epoch": 0.9684823193498792, + "grad_norm": 2.377584457397461, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6901789903640747, + "num_tokens": 220154301.0, + "step": 8819 + }, + { + "epoch": 0.9685921370524928, + "grad_norm": 1.9995697736740112, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7164977788925171, + "num_tokens": 220181728.0, + "step": 8820 + }, + { + "epoch": 0.9687019547551066, + "grad_norm": 2.2239365577697754, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6903048753738403, + "num_tokens": 220205108.0, + "step": 8821 + }, + { + "epoch": 0.9688117724577202, + "grad_norm": 2.143005609512329, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7176067233085632, + "num_tokens": 220229001.0, + "step": 8822 + }, + { + "epoch": 0.9689215901603339, + "grad_norm": 2.243107557296753, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7306885123252869, + "num_tokens": 220251767.0, + "step": 8823 + }, + { + "epoch": 0.9690314078629475, + "grad_norm": 2.650869131088257, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7233108282089233, + "num_tokens": 220269678.0, + "step": 8824 + }, + { + "epoch": 0.9691412255655611, + "grad_norm": 2.2019917964935303, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6948263645172119, + "num_tokens": 220295040.0, + "step": 8825 + }, + { + "epoch": 0.9692510432681748, + "grad_norm": 2.114365816116333, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6838797330856323, + "num_tokens": 220322719.0, + "step": 8826 + }, + { + "epoch": 0.9693608609707884, + "grad_norm": 2.2566840648651123, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7114786505699158, + "num_tokens": 220346527.0, + "step": 8827 + }, + { + "epoch": 0.9694706786734022, + "grad_norm": 2.1020355224609375, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7460277676582336, + "num_tokens": 220371087.0, + "step": 8828 + }, + { + "epoch": 0.9695804963760158, + "grad_norm": 1.997661828994751, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.726974368095398, + "num_tokens": 220399484.0, + "step": 8829 + }, + { + "epoch": 0.9696903140786295, + "grad_norm": 2.1016409397125244, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7099770307540894, + "num_tokens": 220426266.0, + "step": 8830 + }, + { + "epoch": 0.9698001317812431, + "grad_norm": 2.2457494735717773, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6953781843185425, + "num_tokens": 220452529.0, + "step": 8831 + }, + { + "epoch": 0.9699099494838568, + "grad_norm": 2.0472939014434814, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6947587728500366, + "num_tokens": 220483755.0, + "step": 8832 + }, + { + "epoch": 0.9700197671864704, + "grad_norm": 2.486687421798706, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7223131060600281, + "num_tokens": 220504481.0, + "step": 8833 + }, + { + "epoch": 0.9701295848890841, + "grad_norm": 2.1247549057006836, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7196012735366821, + "num_tokens": 220529482.0, + "step": 8834 + }, + { + "epoch": 0.9702394025916978, + "grad_norm": 2.327552318572998, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7335113286972046, + "num_tokens": 220551860.0, + "step": 8835 + }, + { + "epoch": 0.9703492202943115, + "grad_norm": 2.0663771629333496, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7039156556129456, + "num_tokens": 220580589.0, + "step": 8836 + }, + { + "epoch": 0.9704590379969251, + "grad_norm": 1.9056881666183472, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7043015956878662, + "num_tokens": 220611660.0, + "step": 8837 + }, + { + "epoch": 0.9705688556995388, + "grad_norm": 2.274036407470703, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7249140739440918, + "num_tokens": 220637722.0, + "step": 8838 + }, + { + "epoch": 0.9706786734021524, + "grad_norm": 2.311262607574463, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6909669637680054, + "num_tokens": 220661582.0, + "step": 8839 + }, + { + "epoch": 0.9707884911047661, + "grad_norm": 2.056226968765259, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7090519666671753, + "num_tokens": 220688890.0, + "step": 8840 + }, + { + "epoch": 0.9708983088073797, + "grad_norm": 2.192793607711792, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7084109783172607, + "num_tokens": 220713963.0, + "step": 8841 + }, + { + "epoch": 0.9710081265099935, + "grad_norm": 2.1676628589630127, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7021899819374084, + "num_tokens": 220738936.0, + "step": 8842 + }, + { + "epoch": 0.9711179442126071, + "grad_norm": 1.8950270414352417, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6882182359695435, + "num_tokens": 220773167.0, + "step": 8843 + }, + { + "epoch": 0.9712277619152208, + "grad_norm": 1.8775861263275146, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6929225921630859, + "num_tokens": 220806718.0, + "step": 8844 + }, + { + "epoch": 0.9713375796178344, + "grad_norm": 2.286983013153076, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.711068332195282, + "num_tokens": 220828933.0, + "step": 8845 + }, + { + "epoch": 0.971447397320448, + "grad_norm": 2.4321494102478027, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7150704264640808, + "num_tokens": 220850216.0, + "step": 8846 + }, + { + "epoch": 0.9715572150230617, + "grad_norm": 2.119403600692749, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.7022539973258972, + "num_tokens": 220876096.0, + "step": 8847 + }, + { + "epoch": 0.9716670327256753, + "grad_norm": 2.5537097454071045, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7280840873718262, + "num_tokens": 220895572.0, + "step": 8848 + }, + { + "epoch": 0.971776850428289, + "grad_norm": 1.9803634881973267, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7282522916793823, + "num_tokens": 220922937.0, + "step": 8849 + }, + { + "epoch": 0.9718866681309027, + "grad_norm": 2.496337413787842, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7276006937026978, + "num_tokens": 220942132.0, + "step": 8850 + }, + { + "epoch": 0.9719964858335164, + "grad_norm": 2.004002094268799, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7051502466201782, + "num_tokens": 220974472.0, + "step": 8851 + }, + { + "epoch": 0.97210630353613, + "grad_norm": 2.043653964996338, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.6944133639335632, + "num_tokens": 221000028.0, + "step": 8852 + }, + { + "epoch": 0.9722161212387437, + "grad_norm": 1.9663972854614258, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6945511102676392, + "num_tokens": 221030218.0, + "step": 8853 + }, + { + "epoch": 0.9723259389413573, + "grad_norm": 2.383395195007324, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7165230512619019, + "num_tokens": 221051065.0, + "step": 8854 + }, + { + "epoch": 0.972435756643971, + "grad_norm": 2.313136100769043, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7404651641845703, + "num_tokens": 221074266.0, + "step": 8855 + }, + { + "epoch": 0.9725455743465846, + "grad_norm": 2.278622627258301, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7500770092010498, + "num_tokens": 221095100.0, + "step": 8856 + }, + { + "epoch": 0.9726553920491984, + "grad_norm": 1.8284432888031006, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6897305250167847, + "num_tokens": 221129595.0, + "step": 8857 + }, + { + "epoch": 0.972765209751812, + "grad_norm": 2.2170157432556152, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7059941291809082, + "num_tokens": 221155763.0, + "step": 8858 + }, + { + "epoch": 0.9728750274544257, + "grad_norm": 2.2652828693389893, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7009835243225098, + "num_tokens": 221179682.0, + "step": 8859 + }, + { + "epoch": 0.9729848451570393, + "grad_norm": 2.304814577102661, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7185360193252563, + "num_tokens": 221202721.0, + "step": 8860 + }, + { + "epoch": 0.973094662859653, + "grad_norm": 2.203906297683716, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7203770279884338, + "num_tokens": 221225969.0, + "step": 8861 + }, + { + "epoch": 0.9732044805622666, + "grad_norm": 2.1386003494262695, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6789206862449646, + "num_tokens": 221254088.0, + "step": 8862 + }, + { + "epoch": 0.9733142982648803, + "grad_norm": 2.4496378898620605, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7298121452331543, + "num_tokens": 221274089.0, + "step": 8863 + }, + { + "epoch": 0.973424115967494, + "grad_norm": 2.6777472496032715, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7073438763618469, + "num_tokens": 221293498.0, + "step": 8864 + }, + { + "epoch": 0.9735339336701077, + "grad_norm": 2.1834423542022705, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6999295949935913, + "num_tokens": 221318482.0, + "step": 8865 + }, + { + "epoch": 0.9736437513727213, + "grad_norm": 2.102283239364624, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7043354511260986, + "num_tokens": 221344297.0, + "step": 8866 + }, + { + "epoch": 0.973753569075335, + "grad_norm": 2.11568546295166, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6954039335250854, + "num_tokens": 221372240.0, + "step": 8867 + }, + { + "epoch": 0.9738633867779486, + "grad_norm": 2.3662142753601074, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6982809901237488, + "num_tokens": 221395543.0, + "step": 8868 + }, + { + "epoch": 0.9739732044805622, + "grad_norm": 2.3125476837158203, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7136939764022827, + "num_tokens": 221419206.0, + "step": 8869 + }, + { + "epoch": 0.9740830221831759, + "grad_norm": 2.396953821182251, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7130587100982666, + "num_tokens": 221439432.0, + "step": 8870 + }, + { + "epoch": 0.9741928398857896, + "grad_norm": 2.3057730197906494, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7063791751861572, + "num_tokens": 221463293.0, + "step": 8871 + }, + { + "epoch": 0.9743026575884033, + "grad_norm": 2.3143224716186523, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7403225898742676, + "num_tokens": 221487036.0, + "step": 8872 + }, + { + "epoch": 0.9744124752910169, + "grad_norm": 2.1651077270507812, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6933595538139343, + "num_tokens": 221513472.0, + "step": 8873 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 2.256187677383423, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7202252745628357, + "num_tokens": 221536803.0, + "step": 8874 + }, + { + "epoch": 0.9746321106962442, + "grad_norm": 2.4495394229888916, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7345254421234131, + "num_tokens": 221556890.0, + "step": 8875 + }, + { + "epoch": 0.9747419283988579, + "grad_norm": 2.23642897605896, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6997103691101074, + "num_tokens": 221581223.0, + "step": 8876 + }, + { + "epoch": 0.9748517461014715, + "grad_norm": 2.0963449478149414, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6905090808868408, + "num_tokens": 221608342.0, + "step": 8877 + }, + { + "epoch": 0.9749615638040853, + "grad_norm": 2.4707517623901367, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6968839168548584, + "num_tokens": 221630264.0, + "step": 8878 + }, + { + "epoch": 0.9750713815066989, + "grad_norm": 2.493666172027588, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7113502621650696, + "num_tokens": 221650455.0, + "step": 8879 + }, + { + "epoch": 0.9751811992093126, + "grad_norm": 2.3243179321289062, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.717424750328064, + "num_tokens": 221671366.0, + "step": 8880 + }, + { + "epoch": 0.9752910169119262, + "grad_norm": 2.247450351715088, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7118784189224243, + "num_tokens": 221694735.0, + "step": 8881 + }, + { + "epoch": 0.9754008346145399, + "grad_norm": 2.4159929752349854, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7021775245666504, + "num_tokens": 221717481.0, + "step": 8882 + }, + { + "epoch": 0.9755106523171535, + "grad_norm": 2.1105377674102783, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7108372449874878, + "num_tokens": 221744467.0, + "step": 8883 + }, + { + "epoch": 0.9756204700197671, + "grad_norm": 2.533522844314575, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.713065505027771, + "num_tokens": 221763990.0, + "step": 8884 + }, + { + "epoch": 0.9757302877223808, + "grad_norm": 2.303571939468384, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7154037356376648, + "num_tokens": 221788169.0, + "step": 8885 + }, + { + "epoch": 0.9758401054249946, + "grad_norm": 2.176013469696045, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6978122591972351, + "num_tokens": 221812467.0, + "step": 8886 + }, + { + "epoch": 0.9759499231276082, + "grad_norm": 1.9142868518829346, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6841083765029907, + "num_tokens": 221846394.0, + "step": 8887 + }, + { + "epoch": 0.9760597408302218, + "grad_norm": 2.177049398422241, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7188864946365356, + "num_tokens": 221872449.0, + "step": 8888 + }, + { + "epoch": 0.9761695585328355, + "grad_norm": 2.0981407165527344, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.696539044380188, + "num_tokens": 221900447.0, + "step": 8889 + }, + { + "epoch": 0.9762793762354491, + "grad_norm": 2.0394887924194336, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6956987380981445, + "num_tokens": 221930694.0, + "step": 8890 + }, + { + "epoch": 0.9763891939380628, + "grad_norm": 2.2595880031585693, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7278225421905518, + "num_tokens": 221954055.0, + "step": 8891 + }, + { + "epoch": 0.9764990116406764, + "grad_norm": 2.2057082653045654, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7186022996902466, + "num_tokens": 221978970.0, + "step": 8892 + }, + { + "epoch": 0.9766088293432902, + "grad_norm": 2.5132293701171875, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7325042486190796, + "num_tokens": 221999811.0, + "step": 8893 + }, + { + "epoch": 0.9767186470459038, + "grad_norm": 2.11448073387146, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7144497632980347, + "num_tokens": 222026156.0, + "step": 8894 + }, + { + "epoch": 0.9768284647485175, + "grad_norm": 2.2865357398986816, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7300699949264526, + "num_tokens": 222048519.0, + "step": 8895 + }, + { + "epoch": 0.9769382824511311, + "grad_norm": 2.144914388656616, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7242928743362427, + "num_tokens": 222074130.0, + "step": 8896 + }, + { + "epoch": 0.9770481001537448, + "grad_norm": 2.0799190998077393, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7254865169525146, + "num_tokens": 222102564.0, + "step": 8897 + }, + { + "epoch": 0.9771579178563584, + "grad_norm": 1.9931797981262207, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7014740705490112, + "num_tokens": 222130649.0, + "step": 8898 + }, + { + "epoch": 0.9772677355589721, + "grad_norm": 2.1079325675964355, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.701246440410614, + "num_tokens": 222157254.0, + "step": 8899 + }, + { + "epoch": 0.9773775532615858, + "grad_norm": 2.2461771965026855, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7104906439781189, + "num_tokens": 222181267.0, + "step": 8900 + }, + { + "epoch": 0.9774873709641995, + "grad_norm": 2.30295467376709, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7097209095954895, + "num_tokens": 222203789.0, + "step": 8901 + }, + { + "epoch": 0.9775971886668131, + "grad_norm": 2.275513172149658, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7254047393798828, + "num_tokens": 222227435.0, + "step": 8902 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 2.241166114807129, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7020751237869263, + "num_tokens": 222251724.0, + "step": 8903 + }, + { + "epoch": 0.9778168240720404, + "grad_norm": 1.9225950241088867, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7180379629135132, + "num_tokens": 222282125.0, + "step": 8904 + }, + { + "epoch": 0.977926641774654, + "grad_norm": 2.353734254837036, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7183228731155396, + "num_tokens": 222304405.0, + "step": 8905 + }, + { + "epoch": 0.9780364594772677, + "grad_norm": 2.3034703731536865, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.708530068397522, + "num_tokens": 222327573.0, + "step": 8906 + }, + { + "epoch": 0.9781462771798815, + "grad_norm": 1.9829761981964111, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.679311990737915, + "num_tokens": 222357536.0, + "step": 8907 + }, + { + "epoch": 0.9782560948824951, + "grad_norm": 2.8942394256591797, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7135829925537109, + "num_tokens": 222373033.0, + "step": 8908 + }, + { + "epoch": 0.9783659125851087, + "grad_norm": 2.159982681274414, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7343716621398926, + "num_tokens": 222398652.0, + "step": 8909 + }, + { + "epoch": 0.9784757302877224, + "grad_norm": 2.2112159729003906, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7218451499938965, + "num_tokens": 222422257.0, + "step": 8910 + }, + { + "epoch": 0.978585547990336, + "grad_norm": 1.8892630338668823, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6985636949539185, + "num_tokens": 222454484.0, + "step": 8911 + }, + { + "epoch": 0.9786953656929497, + "grad_norm": 2.1007516384124756, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6839847564697266, + "num_tokens": 222482420.0, + "step": 8912 + }, + { + "epoch": 0.9788051833955633, + "grad_norm": 2.149890422821045, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7141899466514587, + "num_tokens": 222507960.0, + "step": 8913 + }, + { + "epoch": 0.978915001098177, + "grad_norm": 2.166278839111328, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7339321374893188, + "num_tokens": 222531396.0, + "step": 8914 + }, + { + "epoch": 0.9790248188007907, + "grad_norm": 2.005967140197754, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7037111520767212, + "num_tokens": 222560502.0, + "step": 8915 + }, + { + "epoch": 0.9791346365034044, + "grad_norm": 2.2436745166778564, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6893891096115112, + "num_tokens": 222585061.0, + "step": 8916 + }, + { + "epoch": 0.979244454206018, + "grad_norm": 2.244811534881592, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7002851366996765, + "num_tokens": 222610634.0, + "step": 8917 + }, + { + "epoch": 0.9793542719086317, + "grad_norm": 1.8479926586151123, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.733439028263092, + "num_tokens": 222643289.0, + "step": 8918 + }, + { + "epoch": 0.9794640896112453, + "grad_norm": 1.9118469953536987, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7018345594406128, + "num_tokens": 222675921.0, + "step": 8919 + }, + { + "epoch": 0.979573907313859, + "grad_norm": 2.07084321975708, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7183674573898315, + "num_tokens": 222702134.0, + "step": 8920 + }, + { + "epoch": 0.9796837250164726, + "grad_norm": 2.1992228031158447, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7026957869529724, + "num_tokens": 222727979.0, + "step": 8921 + }, + { + "epoch": 0.9797935427190864, + "grad_norm": 2.150907516479492, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7014060616493225, + "num_tokens": 222753813.0, + "step": 8922 + }, + { + "epoch": 0.9799033604217, + "grad_norm": 2.0668387413024902, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.700053334236145, + "num_tokens": 222782611.0, + "step": 8923 + }, + { + "epoch": 0.9800131781243137, + "grad_norm": 2.0403785705566406, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7214148640632629, + "num_tokens": 222809321.0, + "step": 8924 + }, + { + "epoch": 0.9801229958269273, + "grad_norm": 2.3227601051330566, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7266327738761902, + "num_tokens": 222831452.0, + "step": 8925 + }, + { + "epoch": 0.980232813529541, + "grad_norm": 1.957796335220337, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7104991674423218, + "num_tokens": 222857904.0, + "step": 8926 + }, + { + "epoch": 0.9803426312321546, + "grad_norm": 2.0897724628448486, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7166151404380798, + "num_tokens": 222883026.0, + "step": 8927 + }, + { + "epoch": 0.9804524489347682, + "grad_norm": 1.979439377784729, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7218831777572632, + "num_tokens": 222912444.0, + "step": 8928 + }, + { + "epoch": 0.980562266637382, + "grad_norm": 2.1150002479553223, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6909245252609253, + "num_tokens": 222939406.0, + "step": 8929 + }, + { + "epoch": 0.9806720843399956, + "grad_norm": 2.311213493347168, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7002249956130981, + "num_tokens": 222961664.0, + "step": 8930 + }, + { + "epoch": 0.9807819020426093, + "grad_norm": 2.555276393890381, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7085345983505249, + "num_tokens": 222981980.0, + "step": 8931 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 2.199281930923462, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7052179574966431, + "num_tokens": 223004713.0, + "step": 8932 + }, + { + "epoch": 0.9810015374478366, + "grad_norm": 2.2437117099761963, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.699844241142273, + "num_tokens": 223029540.0, + "step": 8933 + }, + { + "epoch": 0.9811113551504502, + "grad_norm": 2.323150634765625, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.7050581574440002, + "num_tokens": 223053644.0, + "step": 8934 + }, + { + "epoch": 0.9812211728530639, + "grad_norm": 2.2647347450256348, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7177071571350098, + "num_tokens": 223078112.0, + "step": 8935 + }, + { + "epoch": 0.9813309905556776, + "grad_norm": 2.261547565460205, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6958221197128296, + "num_tokens": 223102763.0, + "step": 8936 + }, + { + "epoch": 0.9814408082582913, + "grad_norm": 2.5048956871032715, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7214038372039795, + "num_tokens": 223122340.0, + "step": 8937 + }, + { + "epoch": 0.9815506259609049, + "grad_norm": 2.526550054550171, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7285730838775635, + "num_tokens": 223141916.0, + "step": 8938 + }, + { + "epoch": 0.9816604436635186, + "grad_norm": 2.2686257362365723, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6949999928474426, + "num_tokens": 223166989.0, + "step": 8939 + }, + { + "epoch": 0.9817702613661322, + "grad_norm": 2.5313432216644287, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7276327013969421, + "num_tokens": 223185659.0, + "step": 8940 + }, + { + "epoch": 0.9818800790687459, + "grad_norm": 2.153317928314209, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7296285033226013, + "num_tokens": 223209996.0, + "step": 8941 + }, + { + "epoch": 0.9819898967713595, + "grad_norm": 2.2612085342407227, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7324123382568359, + "num_tokens": 223233035.0, + "step": 8942 + }, + { + "epoch": 0.9820997144739732, + "grad_norm": 2.0261330604553223, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6883608102798462, + "num_tokens": 223261745.0, + "step": 8943 + }, + { + "epoch": 0.9822095321765869, + "grad_norm": 2.7619903087615967, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7141498327255249, + "num_tokens": 223280038.0, + "step": 8944 + }, + { + "epoch": 0.9823193498792006, + "grad_norm": 2.06780743598938, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6992881894111633, + "num_tokens": 223309664.0, + "step": 8945 + }, + { + "epoch": 0.9824291675818142, + "grad_norm": 2.3569607734680176, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6943386793136597, + "num_tokens": 223333451.0, + "step": 8946 + }, + { + "epoch": 0.9825389852844278, + "grad_norm": 2.439164161682129, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7403285503387451, + "num_tokens": 223352043.0, + "step": 8947 + }, + { + "epoch": 0.9826488029870415, + "grad_norm": 2.0862457752227783, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.723808765411377, + "num_tokens": 223378692.0, + "step": 8948 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 2.297950267791748, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6921903491020203, + "num_tokens": 223402635.0, + "step": 8949 + }, + { + "epoch": 0.9828684383922688, + "grad_norm": 2.3889851570129395, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7014918327331543, + "num_tokens": 223424322.0, + "step": 8950 + }, + { + "epoch": 0.9829782560948825, + "grad_norm": 2.081974744796753, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6955880522727966, + "num_tokens": 223451412.0, + "step": 8951 + }, + { + "epoch": 0.9830880737974962, + "grad_norm": 2.220571279525757, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7233684659004211, + "num_tokens": 223474588.0, + "step": 8952 + }, + { + "epoch": 0.9831978915001098, + "grad_norm": 2.551058530807495, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7090758085250854, + "num_tokens": 223492946.0, + "step": 8953 + }, + { + "epoch": 0.9833077092027235, + "grad_norm": 2.0296854972839355, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.715217113494873, + "num_tokens": 223520224.0, + "step": 8954 + }, + { + "epoch": 0.9834175269053371, + "grad_norm": 2.5275590419769287, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7066620588302612, + "num_tokens": 223540705.0, + "step": 8955 + }, + { + "epoch": 0.9835273446079508, + "grad_norm": 2.1716666221618652, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7173299193382263, + "num_tokens": 223564949.0, + "step": 8956 + }, + { + "epoch": 0.9836371623105644, + "grad_norm": 2.3210363388061523, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.705288290977478, + "num_tokens": 223587179.0, + "step": 8957 + }, + { + "epoch": 0.9837469800131782, + "grad_norm": 2.21230411529541, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7165746688842773, + "num_tokens": 223613242.0, + "step": 8958 + }, + { + "epoch": 0.9838567977157918, + "grad_norm": 2.38244891166687, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6999738812446594, + "num_tokens": 223637720.0, + "step": 8959 + }, + { + "epoch": 0.9839666154184055, + "grad_norm": 2.1231186389923096, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7336729764938354, + "num_tokens": 223662089.0, + "step": 8960 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 2.120863676071167, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7141616344451904, + "num_tokens": 223688302.0, + "step": 8961 + }, + { + "epoch": 0.9841862508236328, + "grad_norm": 2.1341617107391357, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7367924451828003, + "num_tokens": 223714242.0, + "step": 8962 + }, + { + "epoch": 0.9842960685262464, + "grad_norm": 2.6943483352661133, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7166942358016968, + "num_tokens": 223734517.0, + "step": 8963 + }, + { + "epoch": 0.98440588622886, + "grad_norm": 1.9842199087142944, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.6997510194778442, + "num_tokens": 223765338.0, + "step": 8964 + }, + { + "epoch": 0.9845157039314738, + "grad_norm": 2.4094090461730957, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7067070007324219, + "num_tokens": 223786868.0, + "step": 8965 + }, + { + "epoch": 0.9846255216340875, + "grad_norm": 2.2921671867370605, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6950652599334717, + "num_tokens": 223810524.0, + "step": 8966 + }, + { + "epoch": 0.9847353393367011, + "grad_norm": 1.8573874235153198, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7087401151657104, + "num_tokens": 223842315.0, + "step": 8967 + }, + { + "epoch": 0.9848451570393147, + "grad_norm": 2.010497570037842, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7006940245628357, + "num_tokens": 223872056.0, + "step": 8968 + }, + { + "epoch": 0.9849549747419284, + "grad_norm": 1.9902737140655518, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7085731029510498, + "num_tokens": 223903670.0, + "step": 8969 + }, + { + "epoch": 0.985064792444542, + "grad_norm": 2.3474647998809814, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7168306112289429, + "num_tokens": 223926591.0, + "step": 8970 + }, + { + "epoch": 0.9851746101471557, + "grad_norm": 2.13687801361084, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7300475239753723, + "num_tokens": 223953697.0, + "step": 8971 + }, + { + "epoch": 0.9852844278497693, + "grad_norm": 2.3043723106384277, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6950939297676086, + "num_tokens": 223977064.0, + "step": 8972 + }, + { + "epoch": 0.9853942455523831, + "grad_norm": 2.0378527641296387, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7169570922851562, + "num_tokens": 224006708.0, + "step": 8973 + }, + { + "epoch": 0.9855040632549967, + "grad_norm": 2.0615406036376953, + "learning_rate": 1e-06, + "loss": 1.0302, + "mean_token_accuracy": 0.6844446659088135, + "num_tokens": 224035411.0, + "step": 8974 + }, + { + "epoch": 0.9856138809576104, + "grad_norm": 2.1690077781677246, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6961312294006348, + "num_tokens": 224062586.0, + "step": 8975 + }, + { + "epoch": 0.985723698660224, + "grad_norm": 2.2338616847991943, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7310875654220581, + "num_tokens": 224086606.0, + "step": 8976 + }, + { + "epoch": 0.9858335163628377, + "grad_norm": 2.725372314453125, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7266254425048828, + "num_tokens": 224103625.0, + "step": 8977 + }, + { + "epoch": 0.9859433340654513, + "grad_norm": 2.572129249572754, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.6956400871276855, + "num_tokens": 224123341.0, + "step": 8978 + }, + { + "epoch": 0.986053151768065, + "grad_norm": 1.9888687133789062, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6951262950897217, + "num_tokens": 224154937.0, + "step": 8979 + }, + { + "epoch": 0.9861629694706787, + "grad_norm": 2.719299554824829, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7163932919502258, + "num_tokens": 224172341.0, + "step": 8980 + }, + { + "epoch": 0.9862727871732924, + "grad_norm": 2.32080340385437, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7138406038284302, + "num_tokens": 224193539.0, + "step": 8981 + }, + { + "epoch": 0.986382604875906, + "grad_norm": 2.5570132732391357, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7157115936279297, + "num_tokens": 224213409.0, + "step": 8982 + }, + { + "epoch": 0.9864924225785197, + "grad_norm": 1.9714545011520386, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7231351137161255, + "num_tokens": 224243692.0, + "step": 8983 + }, + { + "epoch": 0.9866022402811333, + "grad_norm": 2.227140188217163, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7081806659698486, + "num_tokens": 224268540.0, + "step": 8984 + }, + { + "epoch": 0.986712057983747, + "grad_norm": 2.4237003326416016, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7152804136276245, + "num_tokens": 224290801.0, + "step": 8985 + }, + { + "epoch": 0.9868218756863606, + "grad_norm": 2.372986078262329, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7083829641342163, + "num_tokens": 224313308.0, + "step": 8986 + }, + { + "epoch": 0.9869316933889744, + "grad_norm": 2.4214084148406982, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7114214897155762, + "num_tokens": 224335550.0, + "step": 8987 + }, + { + "epoch": 0.987041511091588, + "grad_norm": 2.217677593231201, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7271060347557068, + "num_tokens": 224360498.0, + "step": 8988 + }, + { + "epoch": 0.9871513287942016, + "grad_norm": 2.5181267261505127, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6933487057685852, + "num_tokens": 224383379.0, + "step": 8989 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 2.780590772628784, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7132567763328552, + "num_tokens": 224401836.0, + "step": 8990 + }, + { + "epoch": 0.9873709641994289, + "grad_norm": 2.719606637954712, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7374489307403564, + "num_tokens": 224419240.0, + "step": 8991 + }, + { + "epoch": 0.9874807819020426, + "grad_norm": 2.3278489112854004, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7301746606826782, + "num_tokens": 224440797.0, + "step": 8992 + }, + { + "epoch": 0.9875905996046562, + "grad_norm": 2.129365921020508, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6816303730010986, + "num_tokens": 224469333.0, + "step": 8993 + }, + { + "epoch": 0.98770041730727, + "grad_norm": 1.930467963218689, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7190970778465271, + "num_tokens": 224498840.0, + "step": 8994 + }, + { + "epoch": 0.9878102350098836, + "grad_norm": 2.3444435596466064, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7483134865760803, + "num_tokens": 224518693.0, + "step": 8995 + }, + { + "epoch": 0.9879200527124973, + "grad_norm": 2.2757601737976074, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7104653120040894, + "num_tokens": 224543935.0, + "step": 8996 + }, + { + "epoch": 0.9880298704151109, + "grad_norm": 2.2132680416107178, + "learning_rate": 1e-06, + "loss": 1.0648, + "mean_token_accuracy": 0.6793808937072754, + "num_tokens": 224572473.0, + "step": 8997 + }, + { + "epoch": 0.9881396881177246, + "grad_norm": 2.274343729019165, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7036159038543701, + "num_tokens": 224595634.0, + "step": 8998 + }, + { + "epoch": 0.9882495058203382, + "grad_norm": 2.5992300510406494, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7434659004211426, + "num_tokens": 224615461.0, + "step": 8999 + }, + { + "epoch": 0.9883593235229519, + "grad_norm": 2.171743392944336, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.701623797416687, + "num_tokens": 224642478.0, + "step": 9000 + }, + { + "epoch": 0.9884691412255655, + "grad_norm": 2.0911672115325928, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7123138308525085, + "num_tokens": 224668415.0, + "step": 9001 + }, + { + "epoch": 0.9885789589281793, + "grad_norm": 2.4908082485198975, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7023205757141113, + "num_tokens": 224690405.0, + "step": 9002 + }, + { + "epoch": 0.9886887766307929, + "grad_norm": 1.9441320896148682, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7024332284927368, + "num_tokens": 224721347.0, + "step": 9003 + }, + { + "epoch": 0.9887985943334066, + "grad_norm": 2.2711524963378906, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7347056865692139, + "num_tokens": 224743062.0, + "step": 9004 + }, + { + "epoch": 0.9889084120360202, + "grad_norm": 2.030479669570923, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7384787797927856, + "num_tokens": 224770400.0, + "step": 9005 + }, + { + "epoch": 0.9890182297386338, + "grad_norm": 2.319061040878296, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.6981757283210754, + "num_tokens": 224791826.0, + "step": 9006 + }, + { + "epoch": 0.9891280474412475, + "grad_norm": 2.385939359664917, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6996820569038391, + "num_tokens": 224815601.0, + "step": 9007 + }, + { + "epoch": 0.9892378651438611, + "grad_norm": 2.405317544937134, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6924525499343872, + "num_tokens": 224839884.0, + "step": 9008 + }, + { + "epoch": 0.9893476828464749, + "grad_norm": 2.1568925380706787, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.703614354133606, + "num_tokens": 224865691.0, + "step": 9009 + }, + { + "epoch": 0.9894575005490885, + "grad_norm": 2.5668606758117676, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7326853275299072, + "num_tokens": 224883743.0, + "step": 9010 + }, + { + "epoch": 0.9895673182517022, + "grad_norm": 2.2769663333892822, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7079852223396301, + "num_tokens": 224907919.0, + "step": 9011 + }, + { + "epoch": 0.9896771359543158, + "grad_norm": 2.224472999572754, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7145252227783203, + "num_tokens": 224932767.0, + "step": 9012 + }, + { + "epoch": 0.9897869536569295, + "grad_norm": 2.185296058654785, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7244570255279541, + "num_tokens": 224959293.0, + "step": 9013 + }, + { + "epoch": 0.9898967713595431, + "grad_norm": 2.2170684337615967, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7046886682510376, + "num_tokens": 224982718.0, + "step": 9014 + }, + { + "epoch": 0.9900065890621568, + "grad_norm": 2.156768321990967, + "learning_rate": 1e-06, + "loss": 0.7969, + "mean_token_accuracy": 0.7501223087310791, + "num_tokens": 225006080.0, + "step": 9015 + }, + { + "epoch": 0.9901164067647705, + "grad_norm": 2.2384021282196045, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7044941186904907, + "num_tokens": 225030396.0, + "step": 9016 + }, + { + "epoch": 0.9902262244673842, + "grad_norm": 1.9080445766448975, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.6727838516235352, + "num_tokens": 225064623.0, + "step": 9017 + }, + { + "epoch": 0.9903360421699978, + "grad_norm": 2.146799087524414, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.729269802570343, + "num_tokens": 225089113.0, + "step": 9018 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 2.548074245452881, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.714110791683197, + "num_tokens": 225108057.0, + "step": 9019 + }, + { + "epoch": 0.9905556775752251, + "grad_norm": 2.2014782428741455, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6823042631149292, + "num_tokens": 225135149.0, + "step": 9020 + }, + { + "epoch": 0.9906654952778388, + "grad_norm": 2.35305118560791, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7141535878181458, + "num_tokens": 225158469.0, + "step": 9021 + }, + { + "epoch": 0.9907753129804524, + "grad_norm": 2.1625301837921143, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7237014174461365, + "num_tokens": 225184197.0, + "step": 9022 + }, + { + "epoch": 0.9908851306830662, + "grad_norm": 2.0871288776397705, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7186294794082642, + "num_tokens": 225209616.0, + "step": 9023 + }, + { + "epoch": 0.9909949483856798, + "grad_norm": 2.3480284214019775, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7107081413269043, + "num_tokens": 225232984.0, + "step": 9024 + }, + { + "epoch": 0.9911047660882935, + "grad_norm": 2.0610971450805664, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7157536745071411, + "num_tokens": 225259530.0, + "step": 9025 + }, + { + "epoch": 0.9912145837909071, + "grad_norm": 2.3518965244293213, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7128281593322754, + "num_tokens": 225280806.0, + "step": 9026 + }, + { + "epoch": 0.9913244014935207, + "grad_norm": 2.5842528343200684, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7251150608062744, + "num_tokens": 225299918.0, + "step": 9027 + }, + { + "epoch": 0.9914342191961344, + "grad_norm": 2.2153942584991455, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7085340023040771, + "num_tokens": 225326015.0, + "step": 9028 + }, + { + "epoch": 0.991544036898748, + "grad_norm": 2.109809637069702, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.7017477750778198, + "num_tokens": 225353505.0, + "step": 9029 + }, + { + "epoch": 0.9916538546013618, + "grad_norm": 2.07653546333313, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6929017901420593, + "num_tokens": 225383112.0, + "step": 9030 + }, + { + "epoch": 0.9917636723039754, + "grad_norm": 2.557880163192749, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7195156812667847, + "num_tokens": 225403100.0, + "step": 9031 + }, + { + "epoch": 0.9918734900065891, + "grad_norm": 2.2679197788238525, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7338489890098572, + "num_tokens": 225424569.0, + "step": 9032 + }, + { + "epoch": 0.9919833077092027, + "grad_norm": 2.065530300140381, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7372949123382568, + "num_tokens": 225449475.0, + "step": 9033 + }, + { + "epoch": 0.9920931254118164, + "grad_norm": 2.066476821899414, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7303024530410767, + "num_tokens": 225478752.0, + "step": 9034 + }, + { + "epoch": 0.99220294311443, + "grad_norm": 2.353182554244995, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7132508754730225, + "num_tokens": 225501288.0, + "step": 9035 + }, + { + "epoch": 0.9923127608170437, + "grad_norm": 2.3355355262756348, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7089173197746277, + "num_tokens": 225523915.0, + "step": 9036 + }, + { + "epoch": 0.9924225785196573, + "grad_norm": 2.3475394248962402, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7307699918746948, + "num_tokens": 225546387.0, + "step": 9037 + }, + { + "epoch": 0.9925323962222711, + "grad_norm": 2.202169418334961, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7260298728942871, + "num_tokens": 225570358.0, + "step": 9038 + }, + { + "epoch": 0.9926422139248847, + "grad_norm": 2.3813767433166504, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7270370721817017, + "num_tokens": 225591144.0, + "step": 9039 + }, + { + "epoch": 0.9927520316274984, + "grad_norm": 2.170428514480591, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7237427830696106, + "num_tokens": 225617016.0, + "step": 9040 + }, + { + "epoch": 0.992861849330112, + "grad_norm": 2.138575315475464, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7062233686447144, + "num_tokens": 225644727.0, + "step": 9041 + }, + { + "epoch": 0.9929716670327257, + "grad_norm": 2.4720802307128906, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7086915373802185, + "num_tokens": 225664942.0, + "step": 9042 + }, + { + "epoch": 0.9930814847353393, + "grad_norm": 1.8001806735992432, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6907631158828735, + "num_tokens": 225701389.0, + "step": 9043 + }, + { + "epoch": 0.993191302437953, + "grad_norm": 2.5185611248016357, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7292689085006714, + "num_tokens": 225721603.0, + "step": 9044 + }, + { + "epoch": 0.9933011201405667, + "grad_norm": 2.031262159347534, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6882473230361938, + "num_tokens": 225749294.0, + "step": 9045 + }, + { + "epoch": 0.9934109378431804, + "grad_norm": 1.9951759576797485, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7251677513122559, + "num_tokens": 225775928.0, + "step": 9046 + }, + { + "epoch": 0.993520755545794, + "grad_norm": 2.7103381156921387, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7378867864608765, + "num_tokens": 225795876.0, + "step": 9047 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 2.2147302627563477, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7148230075836182, + "num_tokens": 225821713.0, + "step": 9048 + }, + { + "epoch": 0.9937403909510213, + "grad_norm": 2.2405998706817627, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7222872972488403, + "num_tokens": 225846138.0, + "step": 9049 + }, + { + "epoch": 0.9938502086536349, + "grad_norm": 2.079171657562256, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7291861176490784, + "num_tokens": 225874988.0, + "step": 9050 + }, + { + "epoch": 0.9939600263562486, + "grad_norm": 2.1197056770324707, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7038750648498535, + "num_tokens": 225899386.0, + "step": 9051 + }, + { + "epoch": 0.9940698440588623, + "grad_norm": 2.2346503734588623, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7268723249435425, + "num_tokens": 225923227.0, + "step": 9052 + }, + { + "epoch": 0.994179661761476, + "grad_norm": 2.158327102661133, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7051568627357483, + "num_tokens": 225952026.0, + "step": 9053 + }, + { + "epoch": 0.9942894794640896, + "grad_norm": 2.328457832336426, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7085138559341431, + "num_tokens": 225973748.0, + "step": 9054 + }, + { + "epoch": 0.9943992971667033, + "grad_norm": 2.361325263977051, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7101716995239258, + "num_tokens": 225996100.0, + "step": 9055 + }, + { + "epoch": 0.9945091148693169, + "grad_norm": 2.2894346714019775, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7081235647201538, + "num_tokens": 226019945.0, + "step": 9056 + }, + { + "epoch": 0.9946189325719306, + "grad_norm": 2.470271348953247, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6969966292381287, + "num_tokens": 226039971.0, + "step": 9057 + }, + { + "epoch": 0.9947287502745442, + "grad_norm": 2.3263583183288574, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6993168592453003, + "num_tokens": 226065771.0, + "step": 9058 + }, + { + "epoch": 0.994838567977158, + "grad_norm": 2.332820177078247, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7066822052001953, + "num_tokens": 226087776.0, + "step": 9059 + }, + { + "epoch": 0.9949483856797716, + "grad_norm": 2.244415283203125, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7089896202087402, + "num_tokens": 226112751.0, + "step": 9060 + }, + { + "epoch": 0.9950582033823853, + "grad_norm": 2.206916570663452, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7357290983200073, + "num_tokens": 226135561.0, + "step": 9061 + }, + { + "epoch": 0.9951680210849989, + "grad_norm": 1.8650398254394531, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7224259376525879, + "num_tokens": 226167599.0, + "step": 9062 + }, + { + "epoch": 0.9952778387876126, + "grad_norm": 2.575162887573242, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7013243436813354, + "num_tokens": 226186945.0, + "step": 9063 + }, + { + "epoch": 0.9953876564902262, + "grad_norm": 2.1175930500030518, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7103841304779053, + "num_tokens": 226213030.0, + "step": 9064 + }, + { + "epoch": 0.9954974741928398, + "grad_norm": 2.2626216411590576, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7148274779319763, + "num_tokens": 226235685.0, + "step": 9065 + }, + { + "epoch": 0.9956072918954535, + "grad_norm": 2.1797280311584473, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.695182204246521, + "num_tokens": 226260594.0, + "step": 9066 + }, + { + "epoch": 0.9957171095980673, + "grad_norm": 2.446577548980713, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7317648530006409, + "num_tokens": 226279679.0, + "step": 9067 + }, + { + "epoch": 0.9958269273006809, + "grad_norm": 2.0194854736328125, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.695112943649292, + "num_tokens": 226308243.0, + "step": 9068 + }, + { + "epoch": 0.9959367450032945, + "grad_norm": 2.250581979751587, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7260340452194214, + "num_tokens": 226331287.0, + "step": 9069 + }, + { + "epoch": 0.9960465627059082, + "grad_norm": 2.017874240875244, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.715648889541626, + "num_tokens": 226359615.0, + "step": 9070 + }, + { + "epoch": 0.9961563804085218, + "grad_norm": 2.251307249069214, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7055485844612122, + "num_tokens": 226384044.0, + "step": 9071 + }, + { + "epoch": 0.9962661981111355, + "grad_norm": 2.2558069229125977, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7171775102615356, + "num_tokens": 226406749.0, + "step": 9072 + }, + { + "epoch": 0.9963760158137491, + "grad_norm": 2.4031801223754883, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7210026979446411, + "num_tokens": 226428515.0, + "step": 9073 + }, + { + "epoch": 0.9964858335163629, + "grad_norm": 2.359431505203247, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7003106474876404, + "num_tokens": 226452664.0, + "step": 9074 + }, + { + "epoch": 0.9965956512189765, + "grad_norm": 2.2970821857452393, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7130618691444397, + "num_tokens": 226477888.0, + "step": 9075 + }, + { + "epoch": 0.9967054689215902, + "grad_norm": 2.6992504596710205, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.715065062046051, + "num_tokens": 226496322.0, + "step": 9076 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 2.3702921867370605, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7186717391014099, + "num_tokens": 226518111.0, + "step": 9077 + }, + { + "epoch": 0.9969251043268175, + "grad_norm": 2.150002956390381, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7103087902069092, + "num_tokens": 226542393.0, + "step": 9078 + }, + { + "epoch": 0.9970349220294311, + "grad_norm": 2.1244170665740967, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6995181441307068, + "num_tokens": 226570116.0, + "step": 9079 + }, + { + "epoch": 0.9971447397320448, + "grad_norm": 2.298032283782959, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7011572122573853, + "num_tokens": 226593995.0, + "step": 9080 + }, + { + "epoch": 0.9972545574346585, + "grad_norm": 2.2438414096832275, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7191309332847595, + "num_tokens": 226620771.0, + "step": 9081 + }, + { + "epoch": 0.9973643751372722, + "grad_norm": 2.11863112449646, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7231220602989197, + "num_tokens": 226647087.0, + "step": 9082 + }, + { + "epoch": 0.9974741928398858, + "grad_norm": 2.2020835876464844, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7105984091758728, + "num_tokens": 226671609.0, + "step": 9083 + }, + { + "epoch": 0.9975840105424995, + "grad_norm": 2.2023208141326904, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7273902297019958, + "num_tokens": 226695195.0, + "step": 9084 + }, + { + "epoch": 0.9976938282451131, + "grad_norm": 2.04874324798584, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7020134925842285, + "num_tokens": 226722892.0, + "step": 9085 + }, + { + "epoch": 0.9978036459477267, + "grad_norm": 2.134605646133423, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7043778896331787, + "num_tokens": 226748464.0, + "step": 9086 + }, + { + "epoch": 0.9979134636503404, + "grad_norm": 2.396388053894043, + "learning_rate": 1e-06, + "loss": 0.7692, + "mean_token_accuracy": 0.7564180493354797, + "num_tokens": 226766624.0, + "step": 9087 + }, + { + "epoch": 0.9980232813529542, + "grad_norm": 2.525977373123169, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7301821708679199, + "num_tokens": 226785967.0, + "step": 9088 + }, + { + "epoch": 0.9981330990555678, + "grad_norm": 2.2127370834350586, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.703173816204071, + "num_tokens": 226809798.0, + "step": 9089 + }, + { + "epoch": 0.9982429167581814, + "grad_norm": 2.2729744911193848, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7221353054046631, + "num_tokens": 226831623.0, + "step": 9090 + }, + { + "epoch": 0.9983527344607951, + "grad_norm": 2.202519655227661, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.708467423915863, + "num_tokens": 226856658.0, + "step": 9091 + }, + { + "epoch": 0.9984625521634087, + "grad_norm": 2.483712911605835, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7100302577018738, + "num_tokens": 226876573.0, + "step": 9092 + }, + { + "epoch": 0.9985723698660224, + "grad_norm": 2.4290614128112793, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7340178489685059, + "num_tokens": 226898507.0, + "step": 9093 + }, + { + "epoch": 0.998682187568636, + "grad_norm": 2.1601574420928955, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6871293783187866, + "num_tokens": 226925469.0, + "step": 9094 + }, + { + "epoch": 0.9987920052712497, + "grad_norm": 2.414477825164795, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7070509195327759, + "num_tokens": 226947116.0, + "step": 9095 + }, + { + "epoch": 0.9989018229738634, + "grad_norm": 2.173739194869995, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7058718800544739, + "num_tokens": 226969504.0, + "step": 9096 + }, + { + "epoch": 0.9990116406764771, + "grad_norm": 1.8012748956680298, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.6964530944824219, + "num_tokens": 227003878.0, + "step": 9097 + }, + { + "epoch": 0.9991214583790907, + "grad_norm": 2.203486680984497, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6985020637512207, + "num_tokens": 227029667.0, + "step": 9098 + }, + { + "epoch": 0.9992312760817044, + "grad_norm": 2.1543805599212646, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6889052391052246, + "num_tokens": 227056107.0, + "step": 9099 + }, + { + "epoch": 0.999341093784318, + "grad_norm": 2.19289493560791, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7255274653434753, + "num_tokens": 227082475.0, + "step": 9100 + }, + { + "epoch": 0.9994509114869317, + "grad_norm": 2.0403997898101807, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6821871399879456, + "num_tokens": 227112049.0, + "step": 9101 + }, + { + "epoch": 0.9995607291895453, + "grad_norm": 2.4221315383911133, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7042822241783142, + "num_tokens": 227133403.0, + "step": 9102 + }, + { + "epoch": 0.9996705468921591, + "grad_norm": 2.249070644378662, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7152309417724609, + "num_tokens": 227157448.0, + "step": 9103 + }, + { + "epoch": 0.9997803645947727, + "grad_norm": 1.9210275411605835, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7071566581726074, + "num_tokens": 227185735.0, + "step": 9104 + }, + { + "epoch": 0.9998901822973864, + "grad_norm": 2.4883780479431152, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7284750938415527, + "num_tokens": 227205437.0, + "step": 9105 + }, + { + "epoch": 1.0, + "grad_norm": 2.1329843997955322, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.707643985748291, + "num_tokens": 227231715.0, + "step": 9106 + }, + { + "epoch": 1.0001098177026138, + "grad_norm": 2.206265687942505, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7087309956550598, + "num_tokens": 227255343.0, + "step": 9107 + }, + { + "epoch": 1.0002196354052273, + "grad_norm": 2.2653331756591797, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7014445662498474, + "num_tokens": 227278200.0, + "step": 9108 + }, + { + "epoch": 1.000329453107841, + "grad_norm": 2.2688944339752197, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7360444068908691, + "num_tokens": 227302201.0, + "step": 9109 + }, + { + "epoch": 1.0004392708104546, + "grad_norm": 2.2082364559173584, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7267978191375732, + "num_tokens": 227325920.0, + "step": 9110 + }, + { + "epoch": 1.0005490885130683, + "grad_norm": 2.4322383403778076, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7207239270210266, + "num_tokens": 227346036.0, + "step": 9111 + }, + { + "epoch": 1.0006589062156819, + "grad_norm": 2.1334025859832764, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7137426137924194, + "num_tokens": 227371685.0, + "step": 9112 + }, + { + "epoch": 1.0007687239182956, + "grad_norm": 2.180366277694702, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7235071659088135, + "num_tokens": 227396450.0, + "step": 9113 + }, + { + "epoch": 1.0008785416209094, + "grad_norm": 2.0027759075164795, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7131985425949097, + "num_tokens": 227424893.0, + "step": 9114 + }, + { + "epoch": 1.000988359323523, + "grad_norm": 2.0961430072784424, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7265672087669373, + "num_tokens": 227451185.0, + "step": 9115 + }, + { + "epoch": 1.0010981770261367, + "grad_norm": 1.7116632461547852, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7083829641342163, + "num_tokens": 227492006.0, + "step": 9116 + }, + { + "epoch": 1.0012079947287502, + "grad_norm": 2.246847629547119, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.705275297164917, + "num_tokens": 227517995.0, + "step": 9117 + }, + { + "epoch": 1.001317812431364, + "grad_norm": 2.1265649795532227, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7311334013938904, + "num_tokens": 227543460.0, + "step": 9118 + }, + { + "epoch": 1.0014276301339775, + "grad_norm": 2.030285358428955, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7225130200386047, + "num_tokens": 227573608.0, + "step": 9119 + }, + { + "epoch": 1.0015374478365913, + "grad_norm": 2.297051429748535, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7288864254951477, + "num_tokens": 227597446.0, + "step": 9120 + }, + { + "epoch": 1.001647265539205, + "grad_norm": 2.035468578338623, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7349139451980591, + "num_tokens": 227626255.0, + "step": 9121 + }, + { + "epoch": 1.0017570832418186, + "grad_norm": 2.261126756668091, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.741204023361206, + "num_tokens": 227649443.0, + "step": 9122 + }, + { + "epoch": 1.0018669009444323, + "grad_norm": 2.4059882164001465, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7034028768539429, + "num_tokens": 227673253.0, + "step": 9123 + }, + { + "epoch": 1.0019767186470458, + "grad_norm": 2.1187403202056885, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7209982872009277, + "num_tokens": 227699864.0, + "step": 9124 + }, + { + "epoch": 1.0020865363496596, + "grad_norm": 1.9577168226242065, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6727735996246338, + "num_tokens": 227730442.0, + "step": 9125 + }, + { + "epoch": 1.0021963540522731, + "grad_norm": 2.510345458984375, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7301664352416992, + "num_tokens": 227751186.0, + "step": 9126 + }, + { + "epoch": 1.002306171754887, + "grad_norm": 2.2364377975463867, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7171970009803772, + "num_tokens": 227774973.0, + "step": 9127 + }, + { + "epoch": 1.0024159894575007, + "grad_norm": 2.331094264984131, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.6914054751396179, + "num_tokens": 227798816.0, + "step": 9128 + }, + { + "epoch": 1.0025258071601142, + "grad_norm": 2.438349485397339, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7305253744125366, + "num_tokens": 227820329.0, + "step": 9129 + }, + { + "epoch": 1.002635624862728, + "grad_norm": 2.228177309036255, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7402688264846802, + "num_tokens": 227843059.0, + "step": 9130 + }, + { + "epoch": 1.0027454425653415, + "grad_norm": 2.1144330501556396, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7127668857574463, + "num_tokens": 227869710.0, + "step": 9131 + }, + { + "epoch": 1.0028552602679552, + "grad_norm": 2.085231065750122, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6776914596557617, + "num_tokens": 227900172.0, + "step": 9132 + }, + { + "epoch": 1.0029650779705688, + "grad_norm": 2.201505661010742, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7310100197792053, + "num_tokens": 227925131.0, + "step": 9133 + }, + { + "epoch": 1.0030748956731825, + "grad_norm": 2.1344833374023438, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7313968539237976, + "num_tokens": 227949032.0, + "step": 9134 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 2.3024041652679443, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7169564962387085, + "num_tokens": 227974566.0, + "step": 9135 + }, + { + "epoch": 1.0032945310784098, + "grad_norm": 2.138127565383911, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7141355276107788, + "num_tokens": 228002131.0, + "step": 9136 + }, + { + "epoch": 1.0034043487810236, + "grad_norm": 2.246210813522339, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7373064756393433, + "num_tokens": 228025318.0, + "step": 9137 + }, + { + "epoch": 1.0035141664836371, + "grad_norm": 2.3733749389648438, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7356382608413696, + "num_tokens": 228047994.0, + "step": 9138 + }, + { + "epoch": 1.0036239841862509, + "grad_norm": 2.197269916534424, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7138022780418396, + "num_tokens": 228074430.0, + "step": 9139 + }, + { + "epoch": 1.0037338018888644, + "grad_norm": 2.4309940338134766, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7202441692352295, + "num_tokens": 228097028.0, + "step": 9140 + }, + { + "epoch": 1.0038436195914782, + "grad_norm": 2.2140698432922363, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7090862393379211, + "num_tokens": 228122251.0, + "step": 9141 + }, + { + "epoch": 1.0039534372940917, + "grad_norm": 2.0875680446624756, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6878281235694885, + "num_tokens": 228152481.0, + "step": 9142 + }, + { + "epoch": 1.0040632549967055, + "grad_norm": 1.9919921159744263, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7103813886642456, + "num_tokens": 228182464.0, + "step": 9143 + }, + { + "epoch": 1.0041730726993192, + "grad_norm": 2.764554738998413, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.736449122428894, + "num_tokens": 228201260.0, + "step": 9144 + }, + { + "epoch": 1.0042828904019327, + "grad_norm": 2.024172067642212, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7148981094360352, + "num_tokens": 228230881.0, + "step": 9145 + }, + { + "epoch": 1.0043927081045465, + "grad_norm": 2.221809148788452, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.715299665927887, + "num_tokens": 228257022.0, + "step": 9146 + }, + { + "epoch": 1.00450252580716, + "grad_norm": 1.8472402095794678, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6916992664337158, + "num_tokens": 228292296.0, + "step": 9147 + }, + { + "epoch": 1.0046123435097738, + "grad_norm": 2.2265191078186035, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7274734973907471, + "num_tokens": 228317812.0, + "step": 9148 + }, + { + "epoch": 1.0047221612123873, + "grad_norm": 2.194241523742676, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7187004089355469, + "num_tokens": 228341775.0, + "step": 9149 + }, + { + "epoch": 1.004831978915001, + "grad_norm": 2.77703857421875, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7329580783843994, + "num_tokens": 228358627.0, + "step": 9150 + }, + { + "epoch": 1.0049417966176148, + "grad_norm": 2.318906784057617, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7168009281158447, + "num_tokens": 228383201.0, + "step": 9151 + }, + { + "epoch": 1.0050516143202284, + "grad_norm": 2.0703036785125732, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7042978405952454, + "num_tokens": 228413847.0, + "step": 9152 + }, + { + "epoch": 1.0051614320228421, + "grad_norm": 2.4006197452545166, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7283886671066284, + "num_tokens": 228435203.0, + "step": 9153 + }, + { + "epoch": 1.0052712497254557, + "grad_norm": 2.043853759765625, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7088685035705566, + "num_tokens": 228462265.0, + "step": 9154 + }, + { + "epoch": 1.0053810674280694, + "grad_norm": 2.472062587738037, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7412248849868774, + "num_tokens": 228484203.0, + "step": 9155 + }, + { + "epoch": 1.005490885130683, + "grad_norm": 2.0447165966033936, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7266571521759033, + "num_tokens": 228513881.0, + "step": 9156 + }, + { + "epoch": 1.0056007028332967, + "grad_norm": 2.5049290657043457, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.743145227432251, + "num_tokens": 228534770.0, + "step": 9157 + }, + { + "epoch": 1.0057105205359105, + "grad_norm": 2.507983446121216, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7020086050033569, + "num_tokens": 228555812.0, + "step": 9158 + }, + { + "epoch": 1.005820338238524, + "grad_norm": 2.08844256401062, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7023999094963074, + "num_tokens": 228586177.0, + "step": 9159 + }, + { + "epoch": 1.0059301559411378, + "grad_norm": 2.5040457248687744, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.734331488609314, + "num_tokens": 228605712.0, + "step": 9160 + }, + { + "epoch": 1.0060399736437513, + "grad_norm": 2.115584373474121, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7212942242622375, + "num_tokens": 228634400.0, + "step": 9161 + }, + { + "epoch": 1.006149791346365, + "grad_norm": 2.6794068813323975, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7126440405845642, + "num_tokens": 228652031.0, + "step": 9162 + }, + { + "epoch": 1.0062596090489786, + "grad_norm": 2.650642156600952, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7168791890144348, + "num_tokens": 228671717.0, + "step": 9163 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 2.34035587310791, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.715430498123169, + "num_tokens": 228693673.0, + "step": 9164 + }, + { + "epoch": 1.0064792444542061, + "grad_norm": 2.3724424839019775, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.709466814994812, + "num_tokens": 228717878.0, + "step": 9165 + }, + { + "epoch": 1.0065890621568196, + "grad_norm": 2.226214647293091, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7165602445602417, + "num_tokens": 228743829.0, + "step": 9166 + }, + { + "epoch": 1.0066988798594334, + "grad_norm": 2.5030407905578613, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7288509607315063, + "num_tokens": 228763819.0, + "step": 9167 + }, + { + "epoch": 1.006808697562047, + "grad_norm": 2.344217300415039, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7316243648529053, + "num_tokens": 228786218.0, + "step": 9168 + }, + { + "epoch": 1.0069185152646607, + "grad_norm": 2.070492744445801, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7178244590759277, + "num_tokens": 228814221.0, + "step": 9169 + }, + { + "epoch": 1.0070283329672742, + "grad_norm": 2.339841842651367, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7336320877075195, + "num_tokens": 228834833.0, + "step": 9170 + }, + { + "epoch": 1.007138150669888, + "grad_norm": 2.341536521911621, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7230640649795532, + "num_tokens": 228859338.0, + "step": 9171 + }, + { + "epoch": 1.0072479683725017, + "grad_norm": 2.348220109939575, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7141221761703491, + "num_tokens": 228881711.0, + "step": 9172 + }, + { + "epoch": 1.0073577860751153, + "grad_norm": 1.9518613815307617, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7050204277038574, + "num_tokens": 228915280.0, + "step": 9173 + }, + { + "epoch": 1.007467603777729, + "grad_norm": 2.1660988330841064, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7339195013046265, + "num_tokens": 228942222.0, + "step": 9174 + }, + { + "epoch": 1.0075774214803426, + "grad_norm": 2.356156826019287, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7151247262954712, + "num_tokens": 228965075.0, + "step": 9175 + }, + { + "epoch": 1.0076872391829563, + "grad_norm": 2.199298143386841, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7213252782821655, + "num_tokens": 228988185.0, + "step": 9176 + }, + { + "epoch": 1.0077970568855699, + "grad_norm": 2.5800576210021973, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7462524175643921, + "num_tokens": 229007930.0, + "step": 9177 + }, + { + "epoch": 1.0079068745881836, + "grad_norm": 2.237645387649536, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7287861704826355, + "num_tokens": 229031211.0, + "step": 9178 + }, + { + "epoch": 1.0080166922907974, + "grad_norm": 1.989377737045288, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7033646106719971, + "num_tokens": 229062512.0, + "step": 9179 + }, + { + "epoch": 1.008126509993411, + "grad_norm": 2.187347412109375, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7379211187362671, + "num_tokens": 229088473.0, + "step": 9180 + }, + { + "epoch": 1.0082363276960247, + "grad_norm": 2.149768352508545, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7374022006988525, + "num_tokens": 229113781.0, + "step": 9181 + }, + { + "epoch": 1.0083461453986382, + "grad_norm": 2.0936105251312256, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7359221577644348, + "num_tokens": 229138569.0, + "step": 9182 + }, + { + "epoch": 1.008455963101252, + "grad_norm": 2.594841957092285, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7415857315063477, + "num_tokens": 229158100.0, + "step": 9183 + }, + { + "epoch": 1.0085657808038655, + "grad_norm": 2.5135834217071533, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7113330364227295, + "num_tokens": 229178708.0, + "step": 9184 + }, + { + "epoch": 1.0086755985064793, + "grad_norm": 2.2395474910736084, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.708747923374176, + "num_tokens": 229205279.0, + "step": 9185 + }, + { + "epoch": 1.008785416209093, + "grad_norm": 2.1770615577697754, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.717743456363678, + "num_tokens": 229230845.0, + "step": 9186 + }, + { + "epoch": 1.0088952339117065, + "grad_norm": 2.28891658782959, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6982555389404297, + "num_tokens": 229258373.0, + "step": 9187 + }, + { + "epoch": 1.0090050516143203, + "grad_norm": 2.3269927501678467, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6948791742324829, + "num_tokens": 229284539.0, + "step": 9188 + }, + { + "epoch": 1.0091148693169338, + "grad_norm": 2.1971285343170166, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7204540371894836, + "num_tokens": 229309465.0, + "step": 9189 + }, + { + "epoch": 1.0092246870195476, + "grad_norm": 2.741640567779541, + "learning_rate": 1e-06, + "loss": 0.7488, + "mean_token_accuracy": 0.755848228931427, + "num_tokens": 229325852.0, + "step": 9190 + }, + { + "epoch": 1.0093345047221611, + "grad_norm": 2.2498466968536377, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7256770133972168, + "num_tokens": 229350872.0, + "step": 9191 + }, + { + "epoch": 1.0094443224247749, + "grad_norm": 2.2857840061187744, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7155035734176636, + "num_tokens": 229375113.0, + "step": 9192 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 2.5707499980926514, + "learning_rate": 1e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7416328191757202, + "num_tokens": 229392980.0, + "step": 9193 + }, + { + "epoch": 1.0096639578300022, + "grad_norm": 2.434562921524048, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7129734754562378, + "num_tokens": 229413717.0, + "step": 9194 + }, + { + "epoch": 1.009773775532616, + "grad_norm": 1.9251540899276733, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7127432823181152, + "num_tokens": 229446262.0, + "step": 9195 + }, + { + "epoch": 1.0098835932352295, + "grad_norm": 2.476806163787842, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7307077646255493, + "num_tokens": 229467767.0, + "step": 9196 + }, + { + "epoch": 1.0099934109378432, + "grad_norm": 2.3741915225982666, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7104969024658203, + "num_tokens": 229490377.0, + "step": 9197 + }, + { + "epoch": 1.0101032286404568, + "grad_norm": 2.1882355213165283, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.719487190246582, + "num_tokens": 229517778.0, + "step": 9198 + }, + { + "epoch": 1.0102130463430705, + "grad_norm": 2.0297746658325195, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.6988976001739502, + "num_tokens": 229547013.0, + "step": 9199 + }, + { + "epoch": 1.010322864045684, + "grad_norm": 2.180922031402588, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7197078466415405, + "num_tokens": 229572760.0, + "step": 9200 + }, + { + "epoch": 1.0104326817482978, + "grad_norm": 2.0901126861572266, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6907455921173096, + "num_tokens": 229600145.0, + "step": 9201 + }, + { + "epoch": 1.0105424994509116, + "grad_norm": 2.1593220233917236, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7312914133071899, + "num_tokens": 229625572.0, + "step": 9202 + }, + { + "epoch": 1.010652317153525, + "grad_norm": 2.018066883087158, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7220373153686523, + "num_tokens": 229657285.0, + "step": 9203 + }, + { + "epoch": 1.0107621348561389, + "grad_norm": 2.210059881210327, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7121597528457642, + "num_tokens": 229684165.0, + "step": 9204 + }, + { + "epoch": 1.0108719525587524, + "grad_norm": 2.0683200359344482, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7427952289581299, + "num_tokens": 229711151.0, + "step": 9205 + }, + { + "epoch": 1.0109817702613662, + "grad_norm": 2.223785161972046, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7269713878631592, + "num_tokens": 229736806.0, + "step": 9206 + }, + { + "epoch": 1.0110915879639797, + "grad_norm": 2.2216413021087646, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7173922657966614, + "num_tokens": 229764650.0, + "step": 9207 + }, + { + "epoch": 1.0112014056665934, + "grad_norm": 2.3869338035583496, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7016094923019409, + "num_tokens": 229787997.0, + "step": 9208 + }, + { + "epoch": 1.0113112233692072, + "grad_norm": 2.1568894386291504, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7200109958648682, + "num_tokens": 229815003.0, + "step": 9209 + }, + { + "epoch": 1.0114210410718207, + "grad_norm": 2.044468402862549, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6903183460235596, + "num_tokens": 229851239.0, + "step": 9210 + }, + { + "epoch": 1.0115308587744345, + "grad_norm": 2.340749740600586, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7353510856628418, + "num_tokens": 229875809.0, + "step": 9211 + }, + { + "epoch": 1.011640676477048, + "grad_norm": 2.319876194000244, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7315186262130737, + "num_tokens": 229898046.0, + "step": 9212 + }, + { + "epoch": 1.0117504941796618, + "grad_norm": 2.410872459411621, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7350955009460449, + "num_tokens": 229920295.0, + "step": 9213 + }, + { + "epoch": 1.0118603118822753, + "grad_norm": 2.0103182792663574, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.723055362701416, + "num_tokens": 229949550.0, + "step": 9214 + }, + { + "epoch": 1.011970129584889, + "grad_norm": 2.3178703784942627, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7182950973510742, + "num_tokens": 229972878.0, + "step": 9215 + }, + { + "epoch": 1.0120799472875028, + "grad_norm": 2.2545034885406494, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7207604646682739, + "num_tokens": 229998200.0, + "step": 9216 + }, + { + "epoch": 1.0121897649901164, + "grad_norm": 2.3714303970336914, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.719089150428772, + "num_tokens": 230021636.0, + "step": 9217 + }, + { + "epoch": 1.0122995826927301, + "grad_norm": 2.531158924102783, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.726717472076416, + "num_tokens": 230040790.0, + "step": 9218 + }, + { + "epoch": 1.0124094003953437, + "grad_norm": 2.0811047554016113, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7174204587936401, + "num_tokens": 230071037.0, + "step": 9219 + }, + { + "epoch": 1.0125192180979574, + "grad_norm": 2.2704319953918457, + "learning_rate": 1e-06, + "loss": 0.7272, + "mean_token_accuracy": 0.7708531022071838, + "num_tokens": 230093422.0, + "step": 9220 + }, + { + "epoch": 1.012629035800571, + "grad_norm": 2.8031184673309326, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7146964073181152, + "num_tokens": 230112662.0, + "step": 9221 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 2.690077304840088, + "learning_rate": 1e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7443371415138245, + "num_tokens": 230131282.0, + "step": 9222 + }, + { + "epoch": 1.0128486712057985, + "grad_norm": 2.56230092048645, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7064534425735474, + "num_tokens": 230154019.0, + "step": 9223 + }, + { + "epoch": 1.012958488908412, + "grad_norm": 2.1742706298828125, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7148077487945557, + "num_tokens": 230182305.0, + "step": 9224 + }, + { + "epoch": 1.0130683066110258, + "grad_norm": 2.4776711463928223, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.742993950843811, + "num_tokens": 230204069.0, + "step": 9225 + }, + { + "epoch": 1.0131781243136393, + "grad_norm": 2.194293260574341, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7244850397109985, + "num_tokens": 230231791.0, + "step": 9226 + }, + { + "epoch": 1.013287942016253, + "grad_norm": 2.442310333251953, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7303358316421509, + "num_tokens": 230252590.0, + "step": 9227 + }, + { + "epoch": 1.0133977597188666, + "grad_norm": 2.607712507247925, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7208388447761536, + "num_tokens": 230272427.0, + "step": 9228 + }, + { + "epoch": 1.0135075774214803, + "grad_norm": 2.283043384552002, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7562062740325928, + "num_tokens": 230295104.0, + "step": 9229 + }, + { + "epoch": 1.013617395124094, + "grad_norm": 2.1126039028167725, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7208705544471741, + "num_tokens": 230322965.0, + "step": 9230 + }, + { + "epoch": 1.0137272128267076, + "grad_norm": 2.3357045650482178, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7372850179672241, + "num_tokens": 230345646.0, + "step": 9231 + }, + { + "epoch": 1.0138370305293214, + "grad_norm": 2.623199939727783, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7286989688873291, + "num_tokens": 230365862.0, + "step": 9232 + }, + { + "epoch": 1.013946848231935, + "grad_norm": 2.0539004802703857, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7176454067230225, + "num_tokens": 230394957.0, + "step": 9233 + }, + { + "epoch": 1.0140566659345487, + "grad_norm": 2.1450371742248535, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7033838629722595, + "num_tokens": 230423522.0, + "step": 9234 + }, + { + "epoch": 1.0141664836371622, + "grad_norm": 2.572669506072998, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7320820689201355, + "num_tokens": 230444072.0, + "step": 9235 + }, + { + "epoch": 1.014276301339776, + "grad_norm": 2.3975863456726074, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7085136771202087, + "num_tokens": 230465644.0, + "step": 9236 + }, + { + "epoch": 1.0143861190423897, + "grad_norm": 2.327272653579712, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7024431824684143, + "num_tokens": 230493154.0, + "step": 9237 + }, + { + "epoch": 1.0144959367450033, + "grad_norm": 2.685723066329956, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7144746780395508, + "num_tokens": 230512136.0, + "step": 9238 + }, + { + "epoch": 1.014605754447617, + "grad_norm": 2.3404479026794434, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7385279536247253, + "num_tokens": 230535201.0, + "step": 9239 + }, + { + "epoch": 1.0147155721502306, + "grad_norm": 2.066373586654663, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7254329919815063, + "num_tokens": 230564171.0, + "step": 9240 + }, + { + "epoch": 1.0148253898528443, + "grad_norm": 2.1809089183807373, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7139170169830322, + "num_tokens": 230590357.0, + "step": 9241 + }, + { + "epoch": 1.0149352075554579, + "grad_norm": 2.4446778297424316, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7313299775123596, + "num_tokens": 230611923.0, + "step": 9242 + }, + { + "epoch": 1.0150450252580716, + "grad_norm": 2.289863348007202, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7161740064620972, + "num_tokens": 230635961.0, + "step": 9243 + }, + { + "epoch": 1.0151548429606854, + "grad_norm": 2.4431607723236084, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7356559038162231, + "num_tokens": 230657417.0, + "step": 9244 + }, + { + "epoch": 1.015264660663299, + "grad_norm": 2.1355948448181152, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7165848016738892, + "num_tokens": 230681806.0, + "step": 9245 + }, + { + "epoch": 1.0153744783659127, + "grad_norm": 2.067401647567749, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7175608277320862, + "num_tokens": 230713194.0, + "step": 9246 + }, + { + "epoch": 1.0154842960685262, + "grad_norm": 2.2631468772888184, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7230212092399597, + "num_tokens": 230739503.0, + "step": 9247 + }, + { + "epoch": 1.01559411377114, + "grad_norm": 2.034451961517334, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7161403298377991, + "num_tokens": 230767569.0, + "step": 9248 + }, + { + "epoch": 1.0157039314737535, + "grad_norm": 2.1263723373413086, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7380428910255432, + "num_tokens": 230793304.0, + "step": 9249 + }, + { + "epoch": 1.0158137491763672, + "grad_norm": 2.4447784423828125, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7445874214172363, + "num_tokens": 230814042.0, + "step": 9250 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 2.5027620792388916, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.710002064704895, + "num_tokens": 230835541.0, + "step": 9251 + }, + { + "epoch": 1.0160333845815945, + "grad_norm": 2.318833827972412, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7098608016967773, + "num_tokens": 230859372.0, + "step": 9252 + }, + { + "epoch": 1.0161432022842083, + "grad_norm": 2.275174856185913, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7474451065063477, + "num_tokens": 230882105.0, + "step": 9253 + }, + { + "epoch": 1.0162530199868218, + "grad_norm": 2.2337660789489746, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.735650897026062, + "num_tokens": 230907314.0, + "step": 9254 + }, + { + "epoch": 1.0163628376894356, + "grad_norm": 2.428053617477417, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7115861177444458, + "num_tokens": 230928834.0, + "step": 9255 + }, + { + "epoch": 1.0164726553920491, + "grad_norm": 2.3173115253448486, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7130641341209412, + "num_tokens": 230952481.0, + "step": 9256 + }, + { + "epoch": 1.0165824730946629, + "grad_norm": 2.415499448776245, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7362890243530273, + "num_tokens": 230972714.0, + "step": 9257 + }, + { + "epoch": 1.0166922907972764, + "grad_norm": 2.3139803409576416, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.70335853099823, + "num_tokens": 230995414.0, + "step": 9258 + }, + { + "epoch": 1.0168021084998902, + "grad_norm": 2.4481217861175537, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7357523441314697, + "num_tokens": 231018028.0, + "step": 9259 + }, + { + "epoch": 1.016911926202504, + "grad_norm": 1.96889066696167, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6951435804367065, + "num_tokens": 231047152.0, + "step": 9260 + }, + { + "epoch": 1.0170217439051175, + "grad_norm": 2.310793876647949, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.708891749382019, + "num_tokens": 231071272.0, + "step": 9261 + }, + { + "epoch": 1.0171315616077312, + "grad_norm": 2.1460046768188477, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7029262781143188, + "num_tokens": 231099281.0, + "step": 9262 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 2.3075180053710938, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7462122440338135, + "num_tokens": 231120752.0, + "step": 9263 + }, + { + "epoch": 1.0173511970129585, + "grad_norm": 2.4674007892608643, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.749446451663971, + "num_tokens": 231140637.0, + "step": 9264 + }, + { + "epoch": 1.017461014715572, + "grad_norm": 2.4107344150543213, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7243621349334717, + "num_tokens": 231162779.0, + "step": 9265 + }, + { + "epoch": 1.0175708324181858, + "grad_norm": 2.241866111755371, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7209187150001526, + "num_tokens": 231188931.0, + "step": 9266 + }, + { + "epoch": 1.0176806501207996, + "grad_norm": 2.141050100326538, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7282789349555969, + "num_tokens": 231217121.0, + "step": 9267 + }, + { + "epoch": 1.017790467823413, + "grad_norm": 2.316077470779419, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6976742148399353, + "num_tokens": 231242456.0, + "step": 9268 + }, + { + "epoch": 1.0179002855260268, + "grad_norm": 2.642711877822876, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7249045968055725, + "num_tokens": 231260288.0, + "step": 9269 + }, + { + "epoch": 1.0180101032286404, + "grad_norm": 2.285290002822876, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7100875973701477, + "num_tokens": 231284647.0, + "step": 9270 + }, + { + "epoch": 1.0181199209312541, + "grad_norm": 2.1792802810668945, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7131056189537048, + "num_tokens": 231311488.0, + "step": 9271 + }, + { + "epoch": 1.0182297386338677, + "grad_norm": 1.9463552236557007, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7170484066009521, + "num_tokens": 231342244.0, + "step": 9272 + }, + { + "epoch": 1.0183395563364814, + "grad_norm": 1.9988682270050049, + "learning_rate": 1e-06, + "loss": 0.7608, + "mean_token_accuracy": 0.762304425239563, + "num_tokens": 231368632.0, + "step": 9273 + }, + { + "epoch": 1.0184493740390952, + "grad_norm": 2.467663526535034, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7253991961479187, + "num_tokens": 231389499.0, + "step": 9274 + }, + { + "epoch": 1.0185591917417087, + "grad_norm": 2.1526875495910645, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6924557089805603, + "num_tokens": 231418034.0, + "step": 9275 + }, + { + "epoch": 1.0186690094443225, + "grad_norm": 2.3516876697540283, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7099076509475708, + "num_tokens": 231442744.0, + "step": 9276 + }, + { + "epoch": 1.018778827146936, + "grad_norm": 2.746896982192993, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7340805530548096, + "num_tokens": 231459860.0, + "step": 9277 + }, + { + "epoch": 1.0188886448495498, + "grad_norm": 2.201079845428467, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7068127989768982, + "num_tokens": 231485972.0, + "step": 9278 + }, + { + "epoch": 1.0189984625521633, + "grad_norm": 2.127073287963867, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7117667198181152, + "num_tokens": 231513578.0, + "step": 9279 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 2.390413522720337, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7247097492218018, + "num_tokens": 231536914.0, + "step": 9280 + }, + { + "epoch": 1.0192180979573908, + "grad_norm": 2.504430055618286, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7329596281051636, + "num_tokens": 231556536.0, + "step": 9281 + }, + { + "epoch": 1.0193279156600044, + "grad_norm": 2.512826919555664, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.724313497543335, + "num_tokens": 231577989.0, + "step": 9282 + }, + { + "epoch": 1.0194377333626181, + "grad_norm": 2.5851151943206787, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7224685549736023, + "num_tokens": 231598693.0, + "step": 9283 + }, + { + "epoch": 1.0195475510652316, + "grad_norm": 2.202371120452881, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.735649585723877, + "num_tokens": 231624086.0, + "step": 9284 + }, + { + "epoch": 1.0196573687678454, + "grad_norm": 2.344912052154541, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7116587162017822, + "num_tokens": 231648819.0, + "step": 9285 + }, + { + "epoch": 1.019767186470459, + "grad_norm": 2.0223796367645264, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7168006896972656, + "num_tokens": 231678144.0, + "step": 9286 + }, + { + "epoch": 1.0198770041730727, + "grad_norm": 2.2540204524993896, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7201492786407471, + "num_tokens": 231702740.0, + "step": 9287 + }, + { + "epoch": 1.0199868218756865, + "grad_norm": 1.8309576511383057, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.6997586488723755, + "num_tokens": 231739722.0, + "step": 9288 + }, + { + "epoch": 1.0200966395783, + "grad_norm": 2.4436466693878174, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7276400327682495, + "num_tokens": 231761236.0, + "step": 9289 + }, + { + "epoch": 1.0202064572809137, + "grad_norm": 2.2392754554748535, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7330163717269897, + "num_tokens": 231785852.0, + "step": 9290 + }, + { + "epoch": 1.0203162749835273, + "grad_norm": 2.4727911949157715, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7144282460212708, + "num_tokens": 231807097.0, + "step": 9291 + }, + { + "epoch": 1.020426092686141, + "grad_norm": 2.5159623622894287, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7390989661216736, + "num_tokens": 231826904.0, + "step": 9292 + }, + { + "epoch": 1.0205359103887546, + "grad_norm": 1.9601272344589233, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7583688497543335, + "num_tokens": 231856414.0, + "step": 9293 + }, + { + "epoch": 1.0206457280913683, + "grad_norm": 2.30373215675354, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7127097249031067, + "num_tokens": 231883131.0, + "step": 9294 + }, + { + "epoch": 1.020755545793982, + "grad_norm": 2.471104145050049, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7223681807518005, + "num_tokens": 231904521.0, + "step": 9295 + }, + { + "epoch": 1.0208653634965956, + "grad_norm": 2.4731762409210205, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7471398115158081, + "num_tokens": 231924421.0, + "step": 9296 + }, + { + "epoch": 1.0209751811992094, + "grad_norm": 2.3051087856292725, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7106202244758606, + "num_tokens": 231949945.0, + "step": 9297 + }, + { + "epoch": 1.021084998901823, + "grad_norm": 2.1527132987976074, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6989982724189758, + "num_tokens": 231978858.0, + "step": 9298 + }, + { + "epoch": 1.0211948166044367, + "grad_norm": 2.169795513153076, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7212681770324707, + "num_tokens": 232005146.0, + "step": 9299 + }, + { + "epoch": 1.0213046343070502, + "grad_norm": 2.489656925201416, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7326416969299316, + "num_tokens": 232027584.0, + "step": 9300 + }, + { + "epoch": 1.021414452009664, + "grad_norm": 2.5627973079681396, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7066934108734131, + "num_tokens": 232048669.0, + "step": 9301 + }, + { + "epoch": 1.0215242697122777, + "grad_norm": 2.0966522693634033, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7263436317443848, + "num_tokens": 232075036.0, + "step": 9302 + }, + { + "epoch": 1.0216340874148913, + "grad_norm": 2.2963244915008545, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7469194531440735, + "num_tokens": 232098024.0, + "step": 9303 + }, + { + "epoch": 1.021743905117505, + "grad_norm": 2.495098829269409, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7131643295288086, + "num_tokens": 232119599.0, + "step": 9304 + }, + { + "epoch": 1.0218537228201185, + "grad_norm": 2.028566837310791, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7081257104873657, + "num_tokens": 232150969.0, + "step": 9305 + }, + { + "epoch": 1.0219635405227323, + "grad_norm": 2.2722697257995605, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7278985977172852, + "num_tokens": 232177948.0, + "step": 9306 + }, + { + "epoch": 1.0220733582253458, + "grad_norm": 2.204955816268921, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7045665979385376, + "num_tokens": 232204125.0, + "step": 9307 + }, + { + "epoch": 1.0221831759279596, + "grad_norm": 2.162001609802246, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7191175222396851, + "num_tokens": 232231521.0, + "step": 9308 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 1.9998090267181396, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7271685600280762, + "num_tokens": 232260219.0, + "step": 9309 + }, + { + "epoch": 1.022402811333187, + "grad_norm": 2.533567428588867, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.70599764585495, + "num_tokens": 232283233.0, + "step": 9310 + }, + { + "epoch": 1.0225126290358006, + "grad_norm": 2.322972297668457, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7200188636779785, + "num_tokens": 232309250.0, + "step": 9311 + }, + { + "epoch": 1.0226224467384142, + "grad_norm": 2.0980496406555176, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7069663405418396, + "num_tokens": 232338448.0, + "step": 9312 + }, + { + "epoch": 1.022732264441028, + "grad_norm": 2.1071507930755615, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7170909643173218, + "num_tokens": 232364227.0, + "step": 9313 + }, + { + "epoch": 1.0228420821436415, + "grad_norm": 2.1576991081237793, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7295553684234619, + "num_tokens": 232390067.0, + "step": 9314 + }, + { + "epoch": 1.0229518998462552, + "grad_norm": 2.3604042530059814, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7251979112625122, + "num_tokens": 232413924.0, + "step": 9315 + }, + { + "epoch": 1.023061717548869, + "grad_norm": 2.4847474098205566, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7356998324394226, + "num_tokens": 232434252.0, + "step": 9316 + }, + { + "epoch": 1.0231715352514825, + "grad_norm": 2.258185863494873, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.714900553226471, + "num_tokens": 232461013.0, + "step": 9317 + }, + { + "epoch": 1.0232813529540963, + "grad_norm": 2.1733410358428955, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7203556299209595, + "num_tokens": 232486855.0, + "step": 9318 + }, + { + "epoch": 1.0233911706567098, + "grad_norm": 2.2128961086273193, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7191401720046997, + "num_tokens": 232513250.0, + "step": 9319 + }, + { + "epoch": 1.0235009883593236, + "grad_norm": 2.153498649597168, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7085174322128296, + "num_tokens": 232545399.0, + "step": 9320 + }, + { + "epoch": 1.023610806061937, + "grad_norm": 2.012455940246582, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7122403383255005, + "num_tokens": 232575426.0, + "step": 9321 + }, + { + "epoch": 1.0237206237645509, + "grad_norm": 2.6087417602539062, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7330690026283264, + "num_tokens": 232595029.0, + "step": 9322 + }, + { + "epoch": 1.0238304414671644, + "grad_norm": 2.2684662342071533, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7226600646972656, + "num_tokens": 232620485.0, + "step": 9323 + }, + { + "epoch": 1.0239402591697782, + "grad_norm": 2.0401883125305176, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7011343836784363, + "num_tokens": 232652410.0, + "step": 9324 + }, + { + "epoch": 1.024050076872392, + "grad_norm": 2.0170211791992188, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7199276685714722, + "num_tokens": 232680876.0, + "step": 9325 + }, + { + "epoch": 1.0241598945750054, + "grad_norm": 2.2344167232513428, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7251769304275513, + "num_tokens": 232705411.0, + "step": 9326 + }, + { + "epoch": 1.0242697122776192, + "grad_norm": 2.2668421268463135, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7061454057693481, + "num_tokens": 232729018.0, + "step": 9327 + }, + { + "epoch": 1.0243795299802327, + "grad_norm": 2.3911068439483643, + "learning_rate": 1e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7487756609916687, + "num_tokens": 232749553.0, + "step": 9328 + }, + { + "epoch": 1.0244893476828465, + "grad_norm": 2.2755908966064453, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7161084413528442, + "num_tokens": 232772989.0, + "step": 9329 + }, + { + "epoch": 1.02459916538546, + "grad_norm": 2.41007924079895, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7262275218963623, + "num_tokens": 232793979.0, + "step": 9330 + }, + { + "epoch": 1.0247089830880738, + "grad_norm": 2.62003755569458, + "learning_rate": 1e-06, + "loss": 0.7286, + "mean_token_accuracy": 0.766476571559906, + "num_tokens": 232812030.0, + "step": 9331 + }, + { + "epoch": 1.0248188007906875, + "grad_norm": 2.1010982990264893, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7140092849731445, + "num_tokens": 232839310.0, + "step": 9332 + }, + { + "epoch": 1.024928618493301, + "grad_norm": 2.2363955974578857, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7466905117034912, + "num_tokens": 232862495.0, + "step": 9333 + }, + { + "epoch": 1.0250384361959148, + "grad_norm": 2.3483011722564697, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7092522382736206, + "num_tokens": 232885625.0, + "step": 9334 + }, + { + "epoch": 1.0251482538985284, + "grad_norm": 2.2359657287597656, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7342144250869751, + "num_tokens": 232910572.0, + "step": 9335 + }, + { + "epoch": 1.0252580716011421, + "grad_norm": 2.472790479660034, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7210859656333923, + "num_tokens": 232931001.0, + "step": 9336 + }, + { + "epoch": 1.0253678893037557, + "grad_norm": 2.0251996517181396, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7225548028945923, + "num_tokens": 232962098.0, + "step": 9337 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 2.349309206008911, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7362315058708191, + "num_tokens": 232985185.0, + "step": 9338 + }, + { + "epoch": 1.0255875247089832, + "grad_norm": 2.432553768157959, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7422415018081665, + "num_tokens": 233006585.0, + "step": 9339 + }, + { + "epoch": 1.0256973424115967, + "grad_norm": 2.0302839279174805, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6979044675827026, + "num_tokens": 233039845.0, + "step": 9340 + }, + { + "epoch": 1.0258071601142105, + "grad_norm": 2.218635320663452, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7246693968772888, + "num_tokens": 233065852.0, + "step": 9341 + }, + { + "epoch": 1.025916977816824, + "grad_norm": 2.117716073989868, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.710300087928772, + "num_tokens": 233095481.0, + "step": 9342 + }, + { + "epoch": 1.0260267955194378, + "grad_norm": 2.225487232208252, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7423180937767029, + "num_tokens": 233118939.0, + "step": 9343 + }, + { + "epoch": 1.0261366132220513, + "grad_norm": 2.1097359657287598, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.723981499671936, + "num_tokens": 233145750.0, + "step": 9344 + }, + { + "epoch": 1.026246430924665, + "grad_norm": 1.9775607585906982, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7011146545410156, + "num_tokens": 233178392.0, + "step": 9345 + }, + { + "epoch": 1.0263562486272788, + "grad_norm": 2.3335282802581787, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7245494723320007, + "num_tokens": 233203146.0, + "step": 9346 + }, + { + "epoch": 1.0264660663298923, + "grad_norm": 2.0281407833099365, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7220896482467651, + "num_tokens": 233231553.0, + "step": 9347 + }, + { + "epoch": 1.026575884032506, + "grad_norm": 2.4934465885162354, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7265622615814209, + "num_tokens": 233252121.0, + "step": 9348 + }, + { + "epoch": 1.0266857017351196, + "grad_norm": 2.12300443649292, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.702069878578186, + "num_tokens": 233280573.0, + "step": 9349 + }, + { + "epoch": 1.0267955194377334, + "grad_norm": 2.357691764831543, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7041594982147217, + "num_tokens": 233305874.0, + "step": 9350 + }, + { + "epoch": 1.026905337140347, + "grad_norm": 2.2994372844696045, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7397092580795288, + "num_tokens": 233329656.0, + "step": 9351 + }, + { + "epoch": 1.0270151548429607, + "grad_norm": 2.1951637268066406, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7114850878715515, + "num_tokens": 233357369.0, + "step": 9352 + }, + { + "epoch": 1.0271249725455744, + "grad_norm": 2.230483293533325, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7410353422164917, + "num_tokens": 233381095.0, + "step": 9353 + }, + { + "epoch": 1.027234790248188, + "grad_norm": 2.028817653656006, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7230936288833618, + "num_tokens": 233408711.0, + "step": 9354 + }, + { + "epoch": 1.0273446079508017, + "grad_norm": 2.246924877166748, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7212207913398743, + "num_tokens": 233434670.0, + "step": 9355 + }, + { + "epoch": 1.0274544256534153, + "grad_norm": 2.46085786819458, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7119868397712708, + "num_tokens": 233456928.0, + "step": 9356 + }, + { + "epoch": 1.027564243356029, + "grad_norm": 2.0322141647338867, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7081570625305176, + "num_tokens": 233486220.0, + "step": 9357 + }, + { + "epoch": 1.0276740610586426, + "grad_norm": 2.649749755859375, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7045812606811523, + "num_tokens": 233506741.0, + "step": 9358 + }, + { + "epoch": 1.0277838787612563, + "grad_norm": 2.2017641067504883, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7078973054885864, + "num_tokens": 233531828.0, + "step": 9359 + }, + { + "epoch": 1.02789369646387, + "grad_norm": 2.1431992053985596, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7160590291023254, + "num_tokens": 233559509.0, + "step": 9360 + }, + { + "epoch": 1.0280035141664836, + "grad_norm": 2.053506374359131, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7017046213150024, + "num_tokens": 233593803.0, + "step": 9361 + }, + { + "epoch": 1.0281133318690974, + "grad_norm": 2.1205480098724365, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7286226749420166, + "num_tokens": 233621454.0, + "step": 9362 + }, + { + "epoch": 1.028223149571711, + "grad_norm": 2.1509761810302734, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6874698400497437, + "num_tokens": 233649752.0, + "step": 9363 + }, + { + "epoch": 1.0283329672743247, + "grad_norm": 2.4034616947174072, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7430829405784607, + "num_tokens": 233671461.0, + "step": 9364 + }, + { + "epoch": 1.0284427849769382, + "grad_norm": 2.1686830520629883, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7040888071060181, + "num_tokens": 233700965.0, + "step": 9365 + }, + { + "epoch": 1.028552602679552, + "grad_norm": 2.8887009620666504, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.728043794631958, + "num_tokens": 233717718.0, + "step": 9366 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 2.539651393890381, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7370842695236206, + "num_tokens": 233738025.0, + "step": 9367 + }, + { + "epoch": 1.0287722380847792, + "grad_norm": 2.3993608951568604, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.737152636051178, + "num_tokens": 233762932.0, + "step": 9368 + }, + { + "epoch": 1.028882055787393, + "grad_norm": 2.2969329357147217, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7481339573860168, + "num_tokens": 233785403.0, + "step": 9369 + }, + { + "epoch": 1.0289918734900065, + "grad_norm": 2.3208751678466797, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7295098900794983, + "num_tokens": 233812317.0, + "step": 9370 + }, + { + "epoch": 1.0291016911926203, + "grad_norm": 2.20829701423645, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.684184193611145, + "num_tokens": 233842178.0, + "step": 9371 + }, + { + "epoch": 1.0292115088952338, + "grad_norm": 2.1669952869415283, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7191585302352905, + "num_tokens": 233871651.0, + "step": 9372 + }, + { + "epoch": 1.0293213265978476, + "grad_norm": 2.6027629375457764, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7334542274475098, + "num_tokens": 233892928.0, + "step": 9373 + }, + { + "epoch": 1.0294311443004613, + "grad_norm": 2.2512223720550537, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.717936635017395, + "num_tokens": 233918936.0, + "step": 9374 + }, + { + "epoch": 1.0295409620030749, + "grad_norm": 2.1038825511932373, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.704258918762207, + "num_tokens": 233948535.0, + "step": 9375 + }, + { + "epoch": 1.0296507797056886, + "grad_norm": 2.060002326965332, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7069101333618164, + "num_tokens": 233976826.0, + "step": 9376 + }, + { + "epoch": 1.0297605974083022, + "grad_norm": 2.4510462284088135, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7148557901382446, + "num_tokens": 234000310.0, + "step": 9377 + }, + { + "epoch": 1.029870415110916, + "grad_norm": 2.2995400428771973, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.712394654750824, + "num_tokens": 234027004.0, + "step": 9378 + }, + { + "epoch": 1.0299802328135295, + "grad_norm": 2.5308117866516113, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7323699593544006, + "num_tokens": 234047355.0, + "step": 9379 + }, + { + "epoch": 1.0300900505161432, + "grad_norm": 2.094575881958008, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6967284679412842, + "num_tokens": 234078597.0, + "step": 9380 + }, + { + "epoch": 1.030199868218757, + "grad_norm": 2.0285391807556152, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7034664750099182, + "num_tokens": 234105980.0, + "step": 9381 + }, + { + "epoch": 1.0303096859213705, + "grad_norm": 2.4244635105133057, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7185758352279663, + "num_tokens": 234128021.0, + "step": 9382 + }, + { + "epoch": 1.0304195036239843, + "grad_norm": 2.0709028244018555, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6935793161392212, + "num_tokens": 234157959.0, + "step": 9383 + }, + { + "epoch": 1.0305293213265978, + "grad_norm": 2.2825753688812256, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7158469557762146, + "num_tokens": 234185070.0, + "step": 9384 + }, + { + "epoch": 1.0306391390292116, + "grad_norm": 2.2708497047424316, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7160842418670654, + "num_tokens": 234210339.0, + "step": 9385 + }, + { + "epoch": 1.030748956731825, + "grad_norm": 2.3220250606536865, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7096059322357178, + "num_tokens": 234234947.0, + "step": 9386 + }, + { + "epoch": 1.0308587744344389, + "grad_norm": 2.326313018798828, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6860220432281494, + "num_tokens": 234260506.0, + "step": 9387 + }, + { + "epoch": 1.0309685921370524, + "grad_norm": 2.3275206089019775, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7110717296600342, + "num_tokens": 234284159.0, + "step": 9388 + }, + { + "epoch": 1.0310784098396661, + "grad_norm": 2.5384528636932373, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7319651246070862, + "num_tokens": 234306084.0, + "step": 9389 + }, + { + "epoch": 1.03118822754228, + "grad_norm": 2.161038637161255, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7102856040000916, + "num_tokens": 234335161.0, + "step": 9390 + }, + { + "epoch": 1.0312980452448934, + "grad_norm": 2.3489251136779785, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7157642841339111, + "num_tokens": 234358300.0, + "step": 9391 + }, + { + "epoch": 1.0314078629475072, + "grad_norm": 1.9051789045333862, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7200746536254883, + "num_tokens": 234391963.0, + "step": 9392 + }, + { + "epoch": 1.0315176806501207, + "grad_norm": 2.337697982788086, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7342565059661865, + "num_tokens": 234413819.0, + "step": 9393 + }, + { + "epoch": 1.0316274983527345, + "grad_norm": 2.409647226333618, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7109026908874512, + "num_tokens": 234436042.0, + "step": 9394 + }, + { + "epoch": 1.031737316055348, + "grad_norm": 2.4917702674865723, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.725777804851532, + "num_tokens": 234458240.0, + "step": 9395 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 2.6686766147613525, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7447208166122437, + "num_tokens": 234477632.0, + "step": 9396 + }, + { + "epoch": 1.0319569514605755, + "grad_norm": 2.221663475036621, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7053846716880798, + "num_tokens": 234505760.0, + "step": 9397 + }, + { + "epoch": 1.032066769163189, + "grad_norm": 2.3972110748291016, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7192239761352539, + "num_tokens": 234528511.0, + "step": 9398 + }, + { + "epoch": 1.0321765868658028, + "grad_norm": 2.420029878616333, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6948394179344177, + "num_tokens": 234553669.0, + "step": 9399 + }, + { + "epoch": 1.0322864045684164, + "grad_norm": 2.3739845752716064, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7352616786956787, + "num_tokens": 234577374.0, + "step": 9400 + }, + { + "epoch": 1.0323962222710301, + "grad_norm": 2.5732500553131104, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7131252288818359, + "num_tokens": 234600213.0, + "step": 9401 + }, + { + "epoch": 1.0325060399736437, + "grad_norm": 2.26316237449646, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7200735807418823, + "num_tokens": 234624471.0, + "step": 9402 + }, + { + "epoch": 1.0326158576762574, + "grad_norm": 2.3584015369415283, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7251017689704895, + "num_tokens": 234649513.0, + "step": 9403 + }, + { + "epoch": 1.0327256753788712, + "grad_norm": 2.0394179821014404, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7067887187004089, + "num_tokens": 234679104.0, + "step": 9404 + }, + { + "epoch": 1.0328354930814847, + "grad_norm": 2.2333004474639893, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7312134504318237, + "num_tokens": 234704660.0, + "step": 9405 + }, + { + "epoch": 1.0329453107840985, + "grad_norm": 2.312579870223999, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7126870155334473, + "num_tokens": 234728442.0, + "step": 9406 + }, + { + "epoch": 1.033055128486712, + "grad_norm": 2.204155921936035, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7098768353462219, + "num_tokens": 234753494.0, + "step": 9407 + }, + { + "epoch": 1.0331649461893258, + "grad_norm": 2.0531527996063232, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7266526222229004, + "num_tokens": 234785376.0, + "step": 9408 + }, + { + "epoch": 1.0332747638919393, + "grad_norm": 2.331786870956421, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7120822668075562, + "num_tokens": 234810084.0, + "step": 9409 + }, + { + "epoch": 1.033384581594553, + "grad_norm": 2.2110044956207275, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7065755724906921, + "num_tokens": 234836466.0, + "step": 9410 + }, + { + "epoch": 1.0334943992971668, + "grad_norm": 2.2992236614227295, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7220000624656677, + "num_tokens": 234860326.0, + "step": 9411 + }, + { + "epoch": 1.0336042169997803, + "grad_norm": 2.015183687210083, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7147801518440247, + "num_tokens": 234888782.0, + "step": 9412 + }, + { + "epoch": 1.033714034702394, + "grad_norm": 2.320990562438965, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7034755945205688, + "num_tokens": 234913889.0, + "step": 9413 + }, + { + "epoch": 1.0338238524050076, + "grad_norm": 2.3011844158172607, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7162430286407471, + "num_tokens": 234937845.0, + "step": 9414 + }, + { + "epoch": 1.0339336701076214, + "grad_norm": 2.583160638809204, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7257570624351501, + "num_tokens": 234956351.0, + "step": 9415 + }, + { + "epoch": 1.034043487810235, + "grad_norm": 2.3342089653015137, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7049601078033447, + "num_tokens": 234979992.0, + "step": 9416 + }, + { + "epoch": 1.0341533055128487, + "grad_norm": 2.928091526031494, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7388730049133301, + "num_tokens": 234998925.0, + "step": 9417 + }, + { + "epoch": 1.0342631232154624, + "grad_norm": 2.3717141151428223, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7163179516792297, + "num_tokens": 235021445.0, + "step": 9418 + }, + { + "epoch": 1.034372940918076, + "grad_norm": 2.5056326389312744, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7257786989212036, + "num_tokens": 235044511.0, + "step": 9419 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 2.212005853652954, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7316673994064331, + "num_tokens": 235068966.0, + "step": 9420 + }, + { + "epoch": 1.0345925763233033, + "grad_norm": 2.026486873626709, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.6997708082199097, + "num_tokens": 235098566.0, + "step": 9421 + }, + { + "epoch": 1.034702394025917, + "grad_norm": 2.510667085647583, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7289273738861084, + "num_tokens": 235119023.0, + "step": 9422 + }, + { + "epoch": 1.0348122117285306, + "grad_norm": 2.4631240367889404, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7119356393814087, + "num_tokens": 235141336.0, + "step": 9423 + }, + { + "epoch": 1.0349220294311443, + "grad_norm": 2.636751651763916, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.74261075258255, + "num_tokens": 235160674.0, + "step": 9424 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 2.145800828933716, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7048015594482422, + "num_tokens": 235189175.0, + "step": 9425 + }, + { + "epoch": 1.0351416648363716, + "grad_norm": 2.061253786087036, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7331027984619141, + "num_tokens": 235216213.0, + "step": 9426 + }, + { + "epoch": 1.0352514825389854, + "grad_norm": 2.0558242797851562, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6960333585739136, + "num_tokens": 235244847.0, + "step": 9427 + }, + { + "epoch": 1.035361300241599, + "grad_norm": 2.4578850269317627, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7181527614593506, + "num_tokens": 235266941.0, + "step": 9428 + }, + { + "epoch": 1.0354711179442126, + "grad_norm": 1.9525657892227173, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.702825665473938, + "num_tokens": 235297349.0, + "step": 9429 + }, + { + "epoch": 1.0355809356468262, + "grad_norm": 2.301403522491455, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.725636899471283, + "num_tokens": 235321338.0, + "step": 9430 + }, + { + "epoch": 1.03569075334944, + "grad_norm": 2.4145140647888184, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7104257941246033, + "num_tokens": 235344064.0, + "step": 9431 + }, + { + "epoch": 1.0358005710520537, + "grad_norm": 2.085435152053833, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.699172854423523, + "num_tokens": 235371614.0, + "step": 9432 + }, + { + "epoch": 1.0359103887546672, + "grad_norm": 2.134608745574951, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6955387592315674, + "num_tokens": 235399623.0, + "step": 9433 + }, + { + "epoch": 1.036020206457281, + "grad_norm": 2.227762222290039, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.706331729888916, + "num_tokens": 235426174.0, + "step": 9434 + }, + { + "epoch": 1.0361300241598945, + "grad_norm": 2.059812307357788, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7082257866859436, + "num_tokens": 235459773.0, + "step": 9435 + }, + { + "epoch": 1.0362398418625083, + "grad_norm": 2.4856061935424805, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7223722338676453, + "num_tokens": 235481880.0, + "step": 9436 + }, + { + "epoch": 1.0363496595651218, + "grad_norm": 2.20043683052063, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7251161336898804, + "num_tokens": 235508870.0, + "step": 9437 + }, + { + "epoch": 1.0364594772677356, + "grad_norm": 2.344162702560425, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7566617727279663, + "num_tokens": 235530522.0, + "step": 9438 + }, + { + "epoch": 1.036569294970349, + "grad_norm": 2.1683568954467773, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7184498310089111, + "num_tokens": 235557617.0, + "step": 9439 + }, + { + "epoch": 1.0366791126729629, + "grad_norm": 2.4207987785339355, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7196130752563477, + "num_tokens": 235578688.0, + "step": 9440 + }, + { + "epoch": 1.0367889303755766, + "grad_norm": 2.1085875034332275, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7162787914276123, + "num_tokens": 235607170.0, + "step": 9441 + }, + { + "epoch": 1.0368987480781902, + "grad_norm": 2.171502113342285, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7389492988586426, + "num_tokens": 235631360.0, + "step": 9442 + }, + { + "epoch": 1.037008565780804, + "grad_norm": 2.137972116470337, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.702296257019043, + "num_tokens": 235662776.0, + "step": 9443 + }, + { + "epoch": 1.0371183834834174, + "grad_norm": 1.9057583808898926, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7032293677330017, + "num_tokens": 235697924.0, + "step": 9444 + }, + { + "epoch": 1.0372282011860312, + "grad_norm": 2.176966905593872, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.722725510597229, + "num_tokens": 235725652.0, + "step": 9445 + }, + { + "epoch": 1.037338018888645, + "grad_norm": 2.464948892593384, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7144003510475159, + "num_tokens": 235748210.0, + "step": 9446 + }, + { + "epoch": 1.0374478365912585, + "grad_norm": 2.4972245693206787, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7316558361053467, + "num_tokens": 235769430.0, + "step": 9447 + }, + { + "epoch": 1.0375576542938723, + "grad_norm": 2.538577079772949, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7214850187301636, + "num_tokens": 235789999.0, + "step": 9448 + }, + { + "epoch": 1.0376674719964858, + "grad_norm": 2.323540449142456, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7027954459190369, + "num_tokens": 235814253.0, + "step": 9449 + }, + { + "epoch": 1.0377772896990995, + "grad_norm": 2.294015884399414, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7312816381454468, + "num_tokens": 235837710.0, + "step": 9450 + }, + { + "epoch": 1.037887107401713, + "grad_norm": 2.4168989658355713, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.71895432472229, + "num_tokens": 235859489.0, + "step": 9451 + }, + { + "epoch": 1.0379969251043268, + "grad_norm": 2.3284730911254883, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7165082097053528, + "num_tokens": 235883199.0, + "step": 9452 + }, + { + "epoch": 1.0381067428069404, + "grad_norm": 2.1939125061035156, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7340961694717407, + "num_tokens": 235908798.0, + "step": 9453 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 2.2749693393707275, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7200545072555542, + "num_tokens": 235934978.0, + "step": 9454 + }, + { + "epoch": 1.0383263782121679, + "grad_norm": 2.5747647285461426, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7269277572631836, + "num_tokens": 235954152.0, + "step": 9455 + }, + { + "epoch": 1.0384361959147814, + "grad_norm": 2.3841652870178223, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7380679845809937, + "num_tokens": 235975189.0, + "step": 9456 + }, + { + "epoch": 1.0385460136173952, + "grad_norm": 2.65175724029541, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7156814336776733, + "num_tokens": 235993866.0, + "step": 9457 + }, + { + "epoch": 1.0386558313200087, + "grad_norm": 2.4398889541625977, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7351955771446228, + "num_tokens": 236015265.0, + "step": 9458 + }, + { + "epoch": 1.0387656490226225, + "grad_norm": 2.079085111618042, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7239019870758057, + "num_tokens": 236044218.0, + "step": 9459 + }, + { + "epoch": 1.038875466725236, + "grad_norm": 2.5725085735321045, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7095119953155518, + "num_tokens": 236064320.0, + "step": 9460 + }, + { + "epoch": 1.0389852844278498, + "grad_norm": 2.6083996295928955, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7439708709716797, + "num_tokens": 236083658.0, + "step": 9461 + }, + { + "epoch": 1.0390951021304635, + "grad_norm": 2.2129504680633545, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7163244485855103, + "num_tokens": 236110609.0, + "step": 9462 + }, + { + "epoch": 1.039204919833077, + "grad_norm": 2.2891345024108887, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7181624174118042, + "num_tokens": 236133844.0, + "step": 9463 + }, + { + "epoch": 1.0393147375356908, + "grad_norm": 2.697333812713623, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7290895581245422, + "num_tokens": 236154508.0, + "step": 9464 + }, + { + "epoch": 1.0394245552383043, + "grad_norm": 2.4592342376708984, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7206932306289673, + "num_tokens": 236178884.0, + "step": 9465 + }, + { + "epoch": 1.039534372940918, + "grad_norm": 2.1213159561157227, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7080358862876892, + "num_tokens": 236207976.0, + "step": 9466 + }, + { + "epoch": 1.0396441906435316, + "grad_norm": 2.321267604827881, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7255040407180786, + "num_tokens": 236232605.0, + "step": 9467 + }, + { + "epoch": 1.0397540083461454, + "grad_norm": 2.230588436126709, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7301010489463806, + "num_tokens": 236259233.0, + "step": 9468 + }, + { + "epoch": 1.0398638260487592, + "grad_norm": 2.116179943084717, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6896802186965942, + "num_tokens": 236286745.0, + "step": 9469 + }, + { + "epoch": 1.0399736437513727, + "grad_norm": 2.0344858169555664, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7104800939559937, + "num_tokens": 236317515.0, + "step": 9470 + }, + { + "epoch": 1.0400834614539864, + "grad_norm": 2.7657251358032227, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7302365899085999, + "num_tokens": 236336472.0, + "step": 9471 + }, + { + "epoch": 1.0401932791566, + "grad_norm": 2.1214051246643066, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7066399455070496, + "num_tokens": 236365113.0, + "step": 9472 + }, + { + "epoch": 1.0403030968592137, + "grad_norm": 2.257143020629883, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7014103531837463, + "num_tokens": 236390747.0, + "step": 9473 + }, + { + "epoch": 1.0404129145618273, + "grad_norm": 2.3677048683166504, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.734358012676239, + "num_tokens": 236413444.0, + "step": 9474 + }, + { + "epoch": 1.040522732264441, + "grad_norm": 2.4038279056549072, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7277977466583252, + "num_tokens": 236435712.0, + "step": 9475 + }, + { + "epoch": 1.0406325499670548, + "grad_norm": 2.7732887268066406, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7222796082496643, + "num_tokens": 236453746.0, + "step": 9476 + }, + { + "epoch": 1.0407423676696683, + "grad_norm": 2.1046741008758545, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7134026885032654, + "num_tokens": 236481312.0, + "step": 9477 + }, + { + "epoch": 1.040852185372282, + "grad_norm": 1.929415225982666, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.718842089176178, + "num_tokens": 236514854.0, + "step": 9478 + }, + { + "epoch": 1.0409620030748956, + "grad_norm": 2.3311586380004883, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.713173508644104, + "num_tokens": 236538604.0, + "step": 9479 + }, + { + "epoch": 1.0410718207775094, + "grad_norm": 2.2467846870422363, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7039504051208496, + "num_tokens": 236564832.0, + "step": 9480 + }, + { + "epoch": 1.041181638480123, + "grad_norm": 2.139735460281372, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7303174734115601, + "num_tokens": 236592257.0, + "step": 9481 + }, + { + "epoch": 1.0412914561827367, + "grad_norm": 2.1075503826141357, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7202301025390625, + "num_tokens": 236620900.0, + "step": 9482 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 2.244626045227051, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7142837047576904, + "num_tokens": 236646393.0, + "step": 9483 + }, + { + "epoch": 1.041511091587964, + "grad_norm": 2.2953128814697266, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.6988253593444824, + "num_tokens": 236670734.0, + "step": 9484 + }, + { + "epoch": 1.0416209092905777, + "grad_norm": 1.9929437637329102, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7149476408958435, + "num_tokens": 236702266.0, + "step": 9485 + }, + { + "epoch": 1.0417307269931912, + "grad_norm": 2.4585909843444824, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7145846486091614, + "num_tokens": 236725224.0, + "step": 9486 + }, + { + "epoch": 1.041840544695805, + "grad_norm": 2.307586431503296, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7426897287368774, + "num_tokens": 236747232.0, + "step": 9487 + }, + { + "epoch": 1.0419503623984185, + "grad_norm": 1.9875730276107788, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.703502357006073, + "num_tokens": 236777527.0, + "step": 9488 + }, + { + "epoch": 1.0420601801010323, + "grad_norm": 2.2468409538269043, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7287000417709351, + "num_tokens": 236802531.0, + "step": 9489 + }, + { + "epoch": 1.042169997803646, + "grad_norm": 2.3946471214294434, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7177026271820068, + "num_tokens": 236825439.0, + "step": 9490 + }, + { + "epoch": 1.0422798155062596, + "grad_norm": 2.4150187969207764, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7197496294975281, + "num_tokens": 236849613.0, + "step": 9491 + }, + { + "epoch": 1.0423896332088733, + "grad_norm": 2.3179750442504883, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7201460599899292, + "num_tokens": 236872403.0, + "step": 9492 + }, + { + "epoch": 1.0424994509114869, + "grad_norm": 2.6185083389282227, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7109877467155457, + "num_tokens": 236893572.0, + "step": 9493 + }, + { + "epoch": 1.0426092686141006, + "grad_norm": 2.421978235244751, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7063091397285461, + "num_tokens": 236914943.0, + "step": 9494 + }, + { + "epoch": 1.0427190863167142, + "grad_norm": 1.9220225811004639, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7195796370506287, + "num_tokens": 236946728.0, + "step": 9495 + }, + { + "epoch": 1.042828904019328, + "grad_norm": 2.126573324203491, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7317847013473511, + "num_tokens": 236973143.0, + "step": 9496 + }, + { + "epoch": 1.0429387217219417, + "grad_norm": 2.4205374717712402, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7343037128448486, + "num_tokens": 236995897.0, + "step": 9497 + }, + { + "epoch": 1.0430485394245552, + "grad_norm": 2.4609596729278564, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7391109466552734, + "num_tokens": 237017076.0, + "step": 9498 + }, + { + "epoch": 1.043158357127169, + "grad_norm": 2.0999484062194824, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7135848999023438, + "num_tokens": 237046733.0, + "step": 9499 + }, + { + "epoch": 1.0432681748297825, + "grad_norm": 2.360366106033325, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7372215390205383, + "num_tokens": 237070424.0, + "step": 9500 + }, + { + "epoch": 1.0433779925323963, + "grad_norm": 2.2561728954315186, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7052686214447021, + "num_tokens": 237094633.0, + "step": 9501 + }, + { + "epoch": 1.0434878102350098, + "grad_norm": 2.307706832885742, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7336832284927368, + "num_tokens": 237116841.0, + "step": 9502 + }, + { + "epoch": 1.0435976279376236, + "grad_norm": 2.4461171627044678, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7157474756240845, + "num_tokens": 237139130.0, + "step": 9503 + }, + { + "epoch": 1.043707445640237, + "grad_norm": 2.175466299057007, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7416456341743469, + "num_tokens": 237163864.0, + "step": 9504 + }, + { + "epoch": 1.0438172633428509, + "grad_norm": 2.419111967086792, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7219898700714111, + "num_tokens": 237185779.0, + "step": 9505 + }, + { + "epoch": 1.0439270810454646, + "grad_norm": 2.476452589035034, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7230778932571411, + "num_tokens": 237208572.0, + "step": 9506 + }, + { + "epoch": 1.0440368987480781, + "grad_norm": 2.2451517581939697, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7191837430000305, + "num_tokens": 237233543.0, + "step": 9507 + }, + { + "epoch": 1.044146716450692, + "grad_norm": 2.474308967590332, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7215986847877502, + "num_tokens": 237256194.0, + "step": 9508 + }, + { + "epoch": 1.0442565341533054, + "grad_norm": 1.9914032220840454, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7175054550170898, + "num_tokens": 237286314.0, + "step": 9509 + }, + { + "epoch": 1.0443663518559192, + "grad_norm": 2.3728866577148438, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7372026443481445, + "num_tokens": 237309102.0, + "step": 9510 + }, + { + "epoch": 1.0444761695585327, + "grad_norm": 2.2599635124206543, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7326050996780396, + "num_tokens": 237332220.0, + "step": 9511 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 2.6617136001586914, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7383578419685364, + "num_tokens": 237350568.0, + "step": 9512 + }, + { + "epoch": 1.0446958049637602, + "grad_norm": 2.2244746685028076, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.727489709854126, + "num_tokens": 237375751.0, + "step": 9513 + }, + { + "epoch": 1.0448056226663738, + "grad_norm": 2.244894027709961, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7277593612670898, + "num_tokens": 237399261.0, + "step": 9514 + }, + { + "epoch": 1.0449154403689875, + "grad_norm": 2.148606300354004, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6923803091049194, + "num_tokens": 237426130.0, + "step": 9515 + }, + { + "epoch": 1.045025258071601, + "grad_norm": 2.213515281677246, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.737258791923523, + "num_tokens": 237450700.0, + "step": 9516 + }, + { + "epoch": 1.0451350757742148, + "grad_norm": 2.4182863235473633, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7254686951637268, + "num_tokens": 237473949.0, + "step": 9517 + }, + { + "epoch": 1.0452448934768284, + "grad_norm": 2.3138320446014404, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6877566576004028, + "num_tokens": 237501219.0, + "step": 9518 + }, + { + "epoch": 1.0453547111794421, + "grad_norm": 2.4357635974884033, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7113640308380127, + "num_tokens": 237525097.0, + "step": 9519 + }, + { + "epoch": 1.0454645288820559, + "grad_norm": 2.0572867393493652, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7049868106842041, + "num_tokens": 237554968.0, + "step": 9520 + }, + { + "epoch": 1.0455743465846694, + "grad_norm": 2.2908263206481934, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7112810611724854, + "num_tokens": 237582703.0, + "step": 9521 + }, + { + "epoch": 1.0456841642872832, + "grad_norm": 2.160627603530884, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7125653028488159, + "num_tokens": 237609264.0, + "step": 9522 + }, + { + "epoch": 1.0457939819898967, + "grad_norm": 2.163360595703125, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6957273483276367, + "num_tokens": 237637606.0, + "step": 9523 + }, + { + "epoch": 1.0459037996925105, + "grad_norm": 1.9393045902252197, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7375377416610718, + "num_tokens": 237668279.0, + "step": 9524 + }, + { + "epoch": 1.046013617395124, + "grad_norm": 2.4149398803710938, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7321382761001587, + "num_tokens": 237689930.0, + "step": 9525 + }, + { + "epoch": 1.0461234350977378, + "grad_norm": 2.13800311088562, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7280548810958862, + "num_tokens": 237716811.0, + "step": 9526 + }, + { + "epoch": 1.0462332528003515, + "grad_norm": 2.363703966140747, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7180366516113281, + "num_tokens": 237741104.0, + "step": 9527 + }, + { + "epoch": 1.046343070502965, + "grad_norm": 2.211665391921997, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.714794397354126, + "num_tokens": 237765960.0, + "step": 9528 + }, + { + "epoch": 1.0464528882055788, + "grad_norm": 2.060647487640381, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7286009788513184, + "num_tokens": 237794486.0, + "step": 9529 + }, + { + "epoch": 1.0465627059081923, + "grad_norm": 2.2121708393096924, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7227656841278076, + "num_tokens": 237820292.0, + "step": 9530 + }, + { + "epoch": 1.046672523610806, + "grad_norm": 2.024747133255005, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7190681099891663, + "num_tokens": 237850080.0, + "step": 9531 + }, + { + "epoch": 1.0467823413134196, + "grad_norm": 2.584533929824829, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7368391156196594, + "num_tokens": 237869018.0, + "step": 9532 + }, + { + "epoch": 1.0468921590160334, + "grad_norm": 2.461986541748047, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7359204888343811, + "num_tokens": 237891579.0, + "step": 9533 + }, + { + "epoch": 1.0470019767186471, + "grad_norm": 2.1970860958099365, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7439427971839905, + "num_tokens": 237916360.0, + "step": 9534 + }, + { + "epoch": 1.0471117944212607, + "grad_norm": 2.5935778617858887, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7414087057113647, + "num_tokens": 237936971.0, + "step": 9535 + }, + { + "epoch": 1.0472216121238744, + "grad_norm": 2.3433785438537598, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7162835001945496, + "num_tokens": 237960576.0, + "step": 9536 + }, + { + "epoch": 1.047331429826488, + "grad_norm": 2.1726112365722656, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7197421789169312, + "num_tokens": 237986188.0, + "step": 9537 + }, + { + "epoch": 1.0474412475291017, + "grad_norm": 2.2400963306427, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7207003831863403, + "num_tokens": 238013933.0, + "step": 9538 + }, + { + "epoch": 1.0475510652317153, + "grad_norm": 2.195664644241333, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7106286287307739, + "num_tokens": 238041411.0, + "step": 9539 + }, + { + "epoch": 1.047660882934329, + "grad_norm": 2.2288923263549805, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.727542519569397, + "num_tokens": 238068115.0, + "step": 9540 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 2.4742677211761475, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7256873250007629, + "num_tokens": 238088164.0, + "step": 9541 + }, + { + "epoch": 1.0478805183395563, + "grad_norm": 2.852084159851074, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7395212650299072, + "num_tokens": 238105055.0, + "step": 9542 + }, + { + "epoch": 1.04799033604217, + "grad_norm": 2.1148509979248047, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6817508935928345, + "num_tokens": 238136545.0, + "step": 9543 + }, + { + "epoch": 1.0481001537447836, + "grad_norm": 2.1998229026794434, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6928421854972839, + "num_tokens": 238164619.0, + "step": 9544 + }, + { + "epoch": 1.0482099714473974, + "grad_norm": 2.403571367263794, + "learning_rate": 1e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7449663877487183, + "num_tokens": 238185736.0, + "step": 9545 + }, + { + "epoch": 1.048319789150011, + "grad_norm": 2.238816976547241, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.700896143913269, + "num_tokens": 238211733.0, + "step": 9546 + }, + { + "epoch": 1.0484296068526247, + "grad_norm": 2.140404224395752, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7039806842803955, + "num_tokens": 238240411.0, + "step": 9547 + }, + { + "epoch": 1.0485394245552384, + "grad_norm": 2.174140453338623, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.703334391117096, + "num_tokens": 238266622.0, + "step": 9548 + }, + { + "epoch": 1.048649242257852, + "grad_norm": 2.1189584732055664, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7383129596710205, + "num_tokens": 238293263.0, + "step": 9549 + }, + { + "epoch": 1.0487590599604657, + "grad_norm": 2.4789669513702393, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7133866548538208, + "num_tokens": 238313361.0, + "step": 9550 + }, + { + "epoch": 1.0488688776630792, + "grad_norm": 2.350090742111206, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7146372199058533, + "num_tokens": 238335872.0, + "step": 9551 + }, + { + "epoch": 1.048978695365693, + "grad_norm": 2.3753697872161865, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7135605812072754, + "num_tokens": 238358032.0, + "step": 9552 + }, + { + "epoch": 1.0490885130683065, + "grad_norm": 2.538337469100952, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7376450300216675, + "num_tokens": 238378381.0, + "step": 9553 + }, + { + "epoch": 1.0491983307709203, + "grad_norm": 2.412416934967041, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7454590797424316, + "num_tokens": 238400178.0, + "step": 9554 + }, + { + "epoch": 1.0493081484735338, + "grad_norm": 2.19154953956604, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7016961574554443, + "num_tokens": 238427941.0, + "step": 9555 + }, + { + "epoch": 1.0494179661761476, + "grad_norm": 2.3280880451202393, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6982958316802979, + "num_tokens": 238451474.0, + "step": 9556 + }, + { + "epoch": 1.0495277838787613, + "grad_norm": 1.9476466178894043, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7153733968734741, + "num_tokens": 238483619.0, + "step": 9557 + }, + { + "epoch": 1.0496376015813749, + "grad_norm": 2.2250800132751465, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.708250880241394, + "num_tokens": 238509199.0, + "step": 9558 + }, + { + "epoch": 1.0497474192839886, + "grad_norm": 2.2357959747314453, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7068685293197632, + "num_tokens": 238535510.0, + "step": 9559 + }, + { + "epoch": 1.0498572369866022, + "grad_norm": 2.492495536804199, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7182047367095947, + "num_tokens": 238556490.0, + "step": 9560 + }, + { + "epoch": 1.049967054689216, + "grad_norm": 2.4238102436065674, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.720079779624939, + "num_tokens": 238579658.0, + "step": 9561 + }, + { + "epoch": 1.0500768723918297, + "grad_norm": 1.847486138343811, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7105231285095215, + "num_tokens": 238614045.0, + "step": 9562 + }, + { + "epoch": 1.0501866900944432, + "grad_norm": 2.3987677097320557, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7285279631614685, + "num_tokens": 238635473.0, + "step": 9563 + }, + { + "epoch": 1.050296507797057, + "grad_norm": 2.537074327468872, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7284454107284546, + "num_tokens": 238656479.0, + "step": 9564 + }, + { + "epoch": 1.0504063254996705, + "grad_norm": 2.023068428039551, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7180907726287842, + "num_tokens": 238688904.0, + "step": 9565 + }, + { + "epoch": 1.0505161432022843, + "grad_norm": 2.007357597351074, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7018964886665344, + "num_tokens": 238721439.0, + "step": 9566 + }, + { + "epoch": 1.0506259609048978, + "grad_norm": 2.417067527770996, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.716704249382019, + "num_tokens": 238743344.0, + "step": 9567 + }, + { + "epoch": 1.0507357786075116, + "grad_norm": 2.571239471435547, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7347218990325928, + "num_tokens": 238764073.0, + "step": 9568 + }, + { + "epoch": 1.050845596310125, + "grad_norm": 2.3823025226593018, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7086359262466431, + "num_tokens": 238787202.0, + "step": 9569 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 2.6000425815582275, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7451763153076172, + "num_tokens": 238806108.0, + "step": 9570 + }, + { + "epoch": 1.0510652317153526, + "grad_norm": 2.400242567062378, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.69417804479599, + "num_tokens": 238831570.0, + "step": 9571 + }, + { + "epoch": 1.0511750494179661, + "grad_norm": 2.5228259563446045, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.726524293422699, + "num_tokens": 238852332.0, + "step": 9572 + }, + { + "epoch": 1.05128486712058, + "grad_norm": 2.1789097785949707, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7577441930770874, + "num_tokens": 238878640.0, + "step": 9573 + }, + { + "epoch": 1.0513946848231934, + "grad_norm": 2.1203455924987793, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7215153574943542, + "num_tokens": 238904924.0, + "step": 9574 + }, + { + "epoch": 1.0515045025258072, + "grad_norm": 2.4425530433654785, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7262061238288879, + "num_tokens": 238927669.0, + "step": 9575 + }, + { + "epoch": 1.0516143202284207, + "grad_norm": 2.150763511657715, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7357836961746216, + "num_tokens": 238953560.0, + "step": 9576 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 2.219229221343994, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7130962610244751, + "num_tokens": 238979398.0, + "step": 9577 + }, + { + "epoch": 1.0518339556336482, + "grad_norm": 2.242649793624878, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.721347451210022, + "num_tokens": 239005034.0, + "step": 9578 + }, + { + "epoch": 1.0519437733362618, + "grad_norm": 2.5286357402801514, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7383535504341125, + "num_tokens": 239023896.0, + "step": 9579 + }, + { + "epoch": 1.0520535910388755, + "grad_norm": 2.272156000137329, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7189172506332397, + "num_tokens": 239048313.0, + "step": 9580 + }, + { + "epoch": 1.052163408741489, + "grad_norm": 2.395794630050659, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7299801111221313, + "num_tokens": 239070260.0, + "step": 9581 + }, + { + "epoch": 1.0522732264441028, + "grad_norm": 2.1882543563842773, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7209784388542175, + "num_tokens": 239096960.0, + "step": 9582 + }, + { + "epoch": 1.0523830441467164, + "grad_norm": 2.1814188957214355, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.692288875579834, + "num_tokens": 239124279.0, + "step": 9583 + }, + { + "epoch": 1.05249286184933, + "grad_norm": 2.082115411758423, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.6969121098518372, + "num_tokens": 239154378.0, + "step": 9584 + }, + { + "epoch": 1.0526026795519439, + "grad_norm": 2.142035722732544, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7235655784606934, + "num_tokens": 239180682.0, + "step": 9585 + }, + { + "epoch": 1.0527124972545574, + "grad_norm": 2.5473153591156006, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7105165719985962, + "num_tokens": 239202166.0, + "step": 9586 + }, + { + "epoch": 1.0528223149571712, + "grad_norm": 2.4700305461883545, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7234203815460205, + "num_tokens": 239224917.0, + "step": 9587 + }, + { + "epoch": 1.0529321326597847, + "grad_norm": 2.055929660797119, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7206987142562866, + "num_tokens": 239253584.0, + "step": 9588 + }, + { + "epoch": 1.0530419503623984, + "grad_norm": 2.763484477996826, + "learning_rate": 1e-06, + "loss": 0.7618, + "mean_token_accuracy": 0.7521213293075562, + "num_tokens": 239272015.0, + "step": 9589 + }, + { + "epoch": 1.053151768065012, + "grad_norm": 1.9777382612228394, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6933079361915588, + "num_tokens": 239305123.0, + "step": 9590 + }, + { + "epoch": 1.0532615857676257, + "grad_norm": 2.456965446472168, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7241250276565552, + "num_tokens": 239328373.0, + "step": 9591 + }, + { + "epoch": 1.0533714034702395, + "grad_norm": 2.3550305366516113, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7073487043380737, + "num_tokens": 239352332.0, + "step": 9592 + }, + { + "epoch": 1.053481221172853, + "grad_norm": 2.254526138305664, + "learning_rate": 1e-06, + "loss": 0.7418, + "mean_token_accuracy": 0.7616108655929565, + "num_tokens": 239376416.0, + "step": 9593 + }, + { + "epoch": 1.0535910388754668, + "grad_norm": 2.1508898735046387, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7012865543365479, + "num_tokens": 239404827.0, + "step": 9594 + }, + { + "epoch": 1.0537008565780803, + "grad_norm": 1.996887445449829, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6842300891876221, + "num_tokens": 239438723.0, + "step": 9595 + }, + { + "epoch": 1.053810674280694, + "grad_norm": 2.0714640617370605, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7314563393592834, + "num_tokens": 239466875.0, + "step": 9596 + }, + { + "epoch": 1.0539204919833076, + "grad_norm": 2.213106155395508, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7289676666259766, + "num_tokens": 239491562.0, + "step": 9597 + }, + { + "epoch": 1.0540303096859214, + "grad_norm": 2.242046594619751, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6954522132873535, + "num_tokens": 239517316.0, + "step": 9598 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 2.065373659133911, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7144733667373657, + "num_tokens": 239546003.0, + "step": 9599 + }, + { + "epoch": 1.0542499450911487, + "grad_norm": 1.7768070697784424, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.686481773853302, + "num_tokens": 239583566.0, + "step": 9600 + }, + { + "epoch": 1.0543597627937624, + "grad_norm": 2.4999783039093018, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7177828550338745, + "num_tokens": 239605107.0, + "step": 9601 + }, + { + "epoch": 1.054469580496376, + "grad_norm": 2.3864967823028564, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7228065133094788, + "num_tokens": 239628614.0, + "step": 9602 + }, + { + "epoch": 1.0545793981989897, + "grad_norm": 2.5417239665985107, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7174790501594543, + "num_tokens": 239651540.0, + "step": 9603 + }, + { + "epoch": 1.0546892159016032, + "grad_norm": 2.2972543239593506, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.724442720413208, + "num_tokens": 239675401.0, + "step": 9604 + }, + { + "epoch": 1.054799033604217, + "grad_norm": 2.5759401321411133, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7177033424377441, + "num_tokens": 239697082.0, + "step": 9605 + }, + { + "epoch": 1.0549088513068308, + "grad_norm": 2.4453933238983154, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7289125919342041, + "num_tokens": 239718859.0, + "step": 9606 + }, + { + "epoch": 1.0550186690094443, + "grad_norm": 2.415280818939209, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7443498969078064, + "num_tokens": 239741910.0, + "step": 9607 + }, + { + "epoch": 1.055128486712058, + "grad_norm": 2.273667335510254, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.706619143486023, + "num_tokens": 239768749.0, + "step": 9608 + }, + { + "epoch": 1.0552383044146716, + "grad_norm": 2.091813087463379, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7389962077140808, + "num_tokens": 239797706.0, + "step": 9609 + }, + { + "epoch": 1.0553481221172853, + "grad_norm": 2.404330015182495, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7335677742958069, + "num_tokens": 239818709.0, + "step": 9610 + }, + { + "epoch": 1.0554579398198989, + "grad_norm": 2.2710154056549072, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7175155282020569, + "num_tokens": 239844692.0, + "step": 9611 + }, + { + "epoch": 1.0555677575225126, + "grad_norm": 2.368948459625244, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7136482000350952, + "num_tokens": 239867672.0, + "step": 9612 + }, + { + "epoch": 1.0556775752251264, + "grad_norm": 2.216546058654785, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.743553876876831, + "num_tokens": 239892905.0, + "step": 9613 + }, + { + "epoch": 1.05578739292774, + "grad_norm": 2.1634907722473145, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6948432326316833, + "num_tokens": 239920697.0, + "step": 9614 + }, + { + "epoch": 1.0558972106303537, + "grad_norm": 2.637990951538086, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7131918668746948, + "num_tokens": 239939173.0, + "step": 9615 + }, + { + "epoch": 1.0560070283329672, + "grad_norm": 2.130251169204712, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7318271398544312, + "num_tokens": 239968348.0, + "step": 9616 + }, + { + "epoch": 1.056116846035581, + "grad_norm": 2.17669677734375, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7127734422683716, + "num_tokens": 239996639.0, + "step": 9617 + }, + { + "epoch": 1.0562266637381945, + "grad_norm": 2.1160178184509277, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7074394822120667, + "num_tokens": 240026077.0, + "step": 9618 + }, + { + "epoch": 1.0563364814408083, + "grad_norm": 2.291019916534424, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7205449342727661, + "num_tokens": 240049048.0, + "step": 9619 + }, + { + "epoch": 1.0564462991434218, + "grad_norm": 2.0498507022857666, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6948051452636719, + "num_tokens": 240078417.0, + "step": 9620 + }, + { + "epoch": 1.0565561168460356, + "grad_norm": 2.2598373889923096, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7115898132324219, + "num_tokens": 240103617.0, + "step": 9621 + }, + { + "epoch": 1.0566659345486493, + "grad_norm": 2.1821129322052, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7099215984344482, + "num_tokens": 240131143.0, + "step": 9622 + }, + { + "epoch": 1.0567757522512629, + "grad_norm": 2.4932596683502197, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7400414943695068, + "num_tokens": 240150935.0, + "step": 9623 + }, + { + "epoch": 1.0568855699538766, + "grad_norm": 2.4987399578094482, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7261923551559448, + "num_tokens": 240173492.0, + "step": 9624 + }, + { + "epoch": 1.0569953876564901, + "grad_norm": 2.0529987812042236, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7117851972579956, + "num_tokens": 240203626.0, + "step": 9625 + }, + { + "epoch": 1.057105205359104, + "grad_norm": 2.3084726333618164, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7232348322868347, + "num_tokens": 240227323.0, + "step": 9626 + }, + { + "epoch": 1.0572150230617177, + "grad_norm": 2.2340340614318848, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7129523754119873, + "num_tokens": 240253587.0, + "step": 9627 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 2.411397695541382, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7387781143188477, + "num_tokens": 240276152.0, + "step": 9628 + }, + { + "epoch": 1.057434658466945, + "grad_norm": 2.479193687438965, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7485625147819519, + "num_tokens": 240297883.0, + "step": 9629 + }, + { + "epoch": 1.0575444761695585, + "grad_norm": 2.092341899871826, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.742880642414093, + "num_tokens": 240324810.0, + "step": 9630 + }, + { + "epoch": 1.0576542938721722, + "grad_norm": 2.6161179542541504, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7353605031967163, + "num_tokens": 240343361.0, + "step": 9631 + }, + { + "epoch": 1.0577641115747858, + "grad_norm": 2.2858777046203613, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7101358771324158, + "num_tokens": 240367919.0, + "step": 9632 + }, + { + "epoch": 1.0578739292773995, + "grad_norm": 2.5631322860717773, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7231337428092957, + "num_tokens": 240388857.0, + "step": 9633 + }, + { + "epoch": 1.057983746980013, + "grad_norm": 2.2390995025634766, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.6998738050460815, + "num_tokens": 240416009.0, + "step": 9634 + }, + { + "epoch": 1.0580935646826268, + "grad_norm": 2.3305389881134033, + "learning_rate": 1e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7494100332260132, + "num_tokens": 240437907.0, + "step": 9635 + }, + { + "epoch": 1.0582033823852406, + "grad_norm": 2.3317058086395264, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6987079977989197, + "num_tokens": 240462348.0, + "step": 9636 + }, + { + "epoch": 1.0583132000878541, + "grad_norm": 2.1636431217193604, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7115874290466309, + "num_tokens": 240488922.0, + "step": 9637 + }, + { + "epoch": 1.0584230177904679, + "grad_norm": 2.4837327003479004, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7259274125099182, + "num_tokens": 240509006.0, + "step": 9638 + }, + { + "epoch": 1.0585328354930814, + "grad_norm": 2.305772066116333, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7125743627548218, + "num_tokens": 240534494.0, + "step": 9639 + }, + { + "epoch": 1.0586426531956952, + "grad_norm": 2.022834062576294, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7067031860351562, + "num_tokens": 240565714.0, + "step": 9640 + }, + { + "epoch": 1.0587524708983087, + "grad_norm": 2.554788589477539, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7392417192459106, + "num_tokens": 240585889.0, + "step": 9641 + }, + { + "epoch": 1.0588622886009225, + "grad_norm": 2.338642120361328, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7255333662033081, + "num_tokens": 240609925.0, + "step": 9642 + }, + { + "epoch": 1.0589721063035362, + "grad_norm": 2.510676383972168, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7397972345352173, + "num_tokens": 240632195.0, + "step": 9643 + }, + { + "epoch": 1.0590819240061498, + "grad_norm": 2.6089835166931152, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7370890378952026, + "num_tokens": 240652610.0, + "step": 9644 + }, + { + "epoch": 1.0591917417087635, + "grad_norm": 2.1835713386535645, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7314364910125732, + "num_tokens": 240679122.0, + "step": 9645 + }, + { + "epoch": 1.059301559411377, + "grad_norm": 2.2935845851898193, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7026941776275635, + "num_tokens": 240705825.0, + "step": 9646 + }, + { + "epoch": 1.0594113771139908, + "grad_norm": 2.341850996017456, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7218621373176575, + "num_tokens": 240730304.0, + "step": 9647 + }, + { + "epoch": 1.0595211948166043, + "grad_norm": 2.19234561920166, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7221554517745972, + "num_tokens": 240755486.0, + "step": 9648 + }, + { + "epoch": 1.059631012519218, + "grad_norm": 2.1737074851989746, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6960658431053162, + "num_tokens": 240784074.0, + "step": 9649 + }, + { + "epoch": 1.0597408302218319, + "grad_norm": 2.5311055183410645, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7122565507888794, + "num_tokens": 240804694.0, + "step": 9650 + }, + { + "epoch": 1.0598506479244454, + "grad_norm": 2.2412309646606445, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7009862661361694, + "num_tokens": 240831963.0, + "step": 9651 + }, + { + "epoch": 1.0599604656270591, + "grad_norm": 2.274808406829834, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7206723093986511, + "num_tokens": 240857874.0, + "step": 9652 + }, + { + "epoch": 1.0600702833296727, + "grad_norm": 1.917597770690918, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.6985965371131897, + "num_tokens": 240890608.0, + "step": 9653 + }, + { + "epoch": 1.0601801010322864, + "grad_norm": 2.407538652420044, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7148107290267944, + "num_tokens": 240914023.0, + "step": 9654 + }, + { + "epoch": 1.0602899187349, + "grad_norm": 2.457897663116455, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7164411544799805, + "num_tokens": 240936067.0, + "step": 9655 + }, + { + "epoch": 1.0603997364375137, + "grad_norm": 2.4304518699645996, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7124087810516357, + "num_tokens": 240959893.0, + "step": 9656 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 2.1307315826416016, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7194241285324097, + "num_tokens": 240987345.0, + "step": 9657 + }, + { + "epoch": 1.060619371842741, + "grad_norm": 2.1091272830963135, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7101720571517944, + "num_tokens": 241014348.0, + "step": 9658 + }, + { + "epoch": 1.0607291895453548, + "grad_norm": 2.737110137939453, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7246578931808472, + "num_tokens": 241033591.0, + "step": 9659 + }, + { + "epoch": 1.0608390072479683, + "grad_norm": 2.194366693496704, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7037063837051392, + "num_tokens": 241062163.0, + "step": 9660 + }, + { + "epoch": 1.060948824950582, + "grad_norm": 2.4558238983154297, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7296257615089417, + "num_tokens": 241083034.0, + "step": 9661 + }, + { + "epoch": 1.0610586426531956, + "grad_norm": 2.034507989883423, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6995795965194702, + "num_tokens": 241112941.0, + "step": 9662 + }, + { + "epoch": 1.0611684603558094, + "grad_norm": 2.1888976097106934, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7356128692626953, + "num_tokens": 241136366.0, + "step": 9663 + }, + { + "epoch": 1.0612782780584231, + "grad_norm": 2.259307861328125, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7270699143409729, + "num_tokens": 241160759.0, + "step": 9664 + }, + { + "epoch": 1.0613880957610367, + "grad_norm": 1.9747483730316162, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7102230191230774, + "num_tokens": 241192631.0, + "step": 9665 + }, + { + "epoch": 1.0614979134636504, + "grad_norm": 2.4820752143859863, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7140365242958069, + "num_tokens": 241215983.0, + "step": 9666 + }, + { + "epoch": 1.061607731166264, + "grad_norm": 2.117931365966797, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7330272197723389, + "num_tokens": 241243261.0, + "step": 9667 + }, + { + "epoch": 1.0617175488688777, + "grad_norm": 2.22127628326416, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7105892896652222, + "num_tokens": 241270145.0, + "step": 9668 + }, + { + "epoch": 1.0618273665714912, + "grad_norm": 2.3217809200286865, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7251664400100708, + "num_tokens": 241292807.0, + "step": 9669 + }, + { + "epoch": 1.061937184274105, + "grad_norm": 2.3571736812591553, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7212890386581421, + "num_tokens": 241315790.0, + "step": 9670 + }, + { + "epoch": 1.0620470019767188, + "grad_norm": 2.272228479385376, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7399177551269531, + "num_tokens": 241338697.0, + "step": 9671 + }, + { + "epoch": 1.0621568196793323, + "grad_norm": 2.206594228744507, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7298336625099182, + "num_tokens": 241365097.0, + "step": 9672 + }, + { + "epoch": 1.062266637381946, + "grad_norm": 2.067669630050659, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7169902324676514, + "num_tokens": 241393273.0, + "step": 9673 + }, + { + "epoch": 1.0623764550845596, + "grad_norm": 2.187002182006836, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6933951377868652, + "num_tokens": 241424072.0, + "step": 9674 + }, + { + "epoch": 1.0624862727871733, + "grad_norm": 2.654477596282959, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7191312313079834, + "num_tokens": 241445370.0, + "step": 9675 + }, + { + "epoch": 1.0625960904897869, + "grad_norm": 2.5200345516204834, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7326964735984802, + "num_tokens": 241466869.0, + "step": 9676 + }, + { + "epoch": 1.0627059081924006, + "grad_norm": 2.3277153968811035, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7131808996200562, + "num_tokens": 241490735.0, + "step": 9677 + }, + { + "epoch": 1.0628157258950144, + "grad_norm": 2.230226755142212, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7095568180084229, + "num_tokens": 241515930.0, + "step": 9678 + }, + { + "epoch": 1.062925543597628, + "grad_norm": 1.9135719537734985, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7148404717445374, + "num_tokens": 241549747.0, + "step": 9679 + }, + { + "epoch": 1.0630353613002417, + "grad_norm": 2.607006072998047, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7392455339431763, + "num_tokens": 241570443.0, + "step": 9680 + }, + { + "epoch": 1.0631451790028552, + "grad_norm": 2.507606029510498, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7176043391227722, + "num_tokens": 241592678.0, + "step": 9681 + }, + { + "epoch": 1.063254996705469, + "grad_norm": 2.3290212154388428, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6874865889549255, + "num_tokens": 241620091.0, + "step": 9682 + }, + { + "epoch": 1.0633648144080825, + "grad_norm": 2.128386974334717, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.709182620048523, + "num_tokens": 241649600.0, + "step": 9683 + }, + { + "epoch": 1.0634746321106963, + "grad_norm": 2.131809949874878, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6995252966880798, + "num_tokens": 241677581.0, + "step": 9684 + }, + { + "epoch": 1.0635844498133098, + "grad_norm": 2.3395004272460938, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7184910178184509, + "num_tokens": 241701769.0, + "step": 9685 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 2.7001261711120605, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.725758969783783, + "num_tokens": 241721861.0, + "step": 9686 + }, + { + "epoch": 1.0638040852185373, + "grad_norm": 2.161766290664673, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6837791204452515, + "num_tokens": 241750118.0, + "step": 9687 + }, + { + "epoch": 1.0639139029211508, + "grad_norm": 2.361420154571533, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7214765548706055, + "num_tokens": 241774754.0, + "step": 9688 + }, + { + "epoch": 1.0640237206237646, + "grad_norm": 2.0645782947540283, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7358533143997192, + "num_tokens": 241803639.0, + "step": 9689 + }, + { + "epoch": 1.0641335383263781, + "grad_norm": 2.6584932804107666, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7482385039329529, + "num_tokens": 241822291.0, + "step": 9690 + }, + { + "epoch": 1.064243356028992, + "grad_norm": 2.3426764011383057, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7319666743278503, + "num_tokens": 241843888.0, + "step": 9691 + }, + { + "epoch": 1.0643531737316057, + "grad_norm": 2.6555051803588867, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7352524995803833, + "num_tokens": 241862026.0, + "step": 9692 + }, + { + "epoch": 1.0644629914342192, + "grad_norm": 2.4746532440185547, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7045197486877441, + "num_tokens": 241885088.0, + "step": 9693 + }, + { + "epoch": 1.064572809136833, + "grad_norm": 2.1576223373413086, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7207827568054199, + "num_tokens": 241914243.0, + "step": 9694 + }, + { + "epoch": 1.0646826268394465, + "grad_norm": 2.675774574279785, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7442256808280945, + "num_tokens": 241932358.0, + "step": 9695 + }, + { + "epoch": 1.0647924445420602, + "grad_norm": 2.511133909225464, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7337101697921753, + "num_tokens": 241952175.0, + "step": 9696 + }, + { + "epoch": 1.0649022622446738, + "grad_norm": 2.208296298980713, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7247633934020996, + "num_tokens": 241977012.0, + "step": 9697 + }, + { + "epoch": 1.0650120799472875, + "grad_norm": 2.6319215297698975, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7177332639694214, + "num_tokens": 241996489.0, + "step": 9698 + }, + { + "epoch": 1.065121897649901, + "grad_norm": 2.623067855834961, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7192710638046265, + "num_tokens": 242016243.0, + "step": 9699 + }, + { + "epoch": 1.0652317153525148, + "grad_norm": 2.2544009685516357, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.744014322757721, + "num_tokens": 242040514.0, + "step": 9700 + }, + { + "epoch": 1.0653415330551286, + "grad_norm": 2.0822553634643555, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.701346755027771, + "num_tokens": 242069531.0, + "step": 9701 + }, + { + "epoch": 1.065451350757742, + "grad_norm": 1.993868112564087, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.73484206199646, + "num_tokens": 242099462.0, + "step": 9702 + }, + { + "epoch": 1.0655611684603559, + "grad_norm": 2.2353131771087646, + "learning_rate": 1e-06, + "loss": 0.7401, + "mean_token_accuracy": 0.7592288255691528, + "num_tokens": 242121473.0, + "step": 9703 + }, + { + "epoch": 1.0656709861629694, + "grad_norm": 1.9564846754074097, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7346671223640442, + "num_tokens": 242152007.0, + "step": 9704 + }, + { + "epoch": 1.0657808038655832, + "grad_norm": 2.5211963653564453, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7334052324295044, + "num_tokens": 242171976.0, + "step": 9705 + }, + { + "epoch": 1.0658906215681967, + "grad_norm": 2.1732277870178223, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.740594744682312, + "num_tokens": 242198926.0, + "step": 9706 + }, + { + "epoch": 1.0660004392708105, + "grad_norm": 2.739997386932373, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7357322573661804, + "num_tokens": 242216931.0, + "step": 9707 + }, + { + "epoch": 1.0661102569734242, + "grad_norm": 1.913151741027832, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.689238429069519, + "num_tokens": 242254619.0, + "step": 9708 + }, + { + "epoch": 1.0662200746760377, + "grad_norm": 2.0072665214538574, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7197944521903992, + "num_tokens": 242285778.0, + "step": 9709 + }, + { + "epoch": 1.0663298923786515, + "grad_norm": 2.4195032119750977, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.717188835144043, + "num_tokens": 242308383.0, + "step": 9710 + }, + { + "epoch": 1.066439710081265, + "grad_norm": 2.5877580642700195, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7149529457092285, + "num_tokens": 242330219.0, + "step": 9711 + }, + { + "epoch": 1.0665495277838788, + "grad_norm": 2.0580623149871826, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7299137711524963, + "num_tokens": 242356945.0, + "step": 9712 + }, + { + "epoch": 1.0666593454864923, + "grad_norm": 2.2594618797302246, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7272192239761353, + "num_tokens": 242381527.0, + "step": 9713 + }, + { + "epoch": 1.066769163189106, + "grad_norm": 2.5087482929229736, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7189599275588989, + "num_tokens": 242402916.0, + "step": 9714 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 2.4863791465759277, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7014684677124023, + "num_tokens": 242426575.0, + "step": 9715 + }, + { + "epoch": 1.0669887985943334, + "grad_norm": 2.0429389476776123, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7132631540298462, + "num_tokens": 242456457.0, + "step": 9716 + }, + { + "epoch": 1.0670986162969471, + "grad_norm": 2.3371453285217285, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7140398621559143, + "num_tokens": 242481342.0, + "step": 9717 + }, + { + "epoch": 1.0672084339995607, + "grad_norm": 1.9847368001937866, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7075130939483643, + "num_tokens": 242511371.0, + "step": 9718 + }, + { + "epoch": 1.0673182517021744, + "grad_norm": 2.36926531791687, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7260857820510864, + "num_tokens": 242533201.0, + "step": 9719 + }, + { + "epoch": 1.067428069404788, + "grad_norm": 2.359747886657715, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7050917148590088, + "num_tokens": 242558385.0, + "step": 9720 + }, + { + "epoch": 1.0675378871074017, + "grad_norm": 2.3392977714538574, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7184432744979858, + "num_tokens": 242582186.0, + "step": 9721 + }, + { + "epoch": 1.0676477048100155, + "grad_norm": 2.537691593170166, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7140839695930481, + "num_tokens": 242604817.0, + "step": 9722 + }, + { + "epoch": 1.067757522512629, + "grad_norm": 2.5187690258026123, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7065867185592651, + "num_tokens": 242627661.0, + "step": 9723 + }, + { + "epoch": 1.0678673402152428, + "grad_norm": 2.3494787216186523, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7126721739768982, + "num_tokens": 242651633.0, + "step": 9724 + }, + { + "epoch": 1.0679771579178563, + "grad_norm": 1.992936611175537, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7032079696655273, + "num_tokens": 242680553.0, + "step": 9725 + }, + { + "epoch": 1.06808697562047, + "grad_norm": 2.291266918182373, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7290205955505371, + "num_tokens": 242703910.0, + "step": 9726 + }, + { + "epoch": 1.0681967933230836, + "grad_norm": 2.2688164710998535, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7174928188323975, + "num_tokens": 242730710.0, + "step": 9727 + }, + { + "epoch": 1.0683066110256974, + "grad_norm": 2.1860177516937256, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.722913384437561, + "num_tokens": 242757302.0, + "step": 9728 + }, + { + "epoch": 1.068416428728311, + "grad_norm": 2.1284842491149902, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7045117020606995, + "num_tokens": 242785107.0, + "step": 9729 + }, + { + "epoch": 1.0685262464309246, + "grad_norm": 2.377087116241455, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7302530407905579, + "num_tokens": 242807249.0, + "step": 9730 + }, + { + "epoch": 1.0686360641335384, + "grad_norm": 2.3103559017181396, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7272968292236328, + "num_tokens": 242830091.0, + "step": 9731 + }, + { + "epoch": 1.068745881836152, + "grad_norm": 2.474565267562866, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7195801734924316, + "num_tokens": 242852795.0, + "step": 9732 + }, + { + "epoch": 1.0688556995387657, + "grad_norm": 2.3534810543060303, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7019124627113342, + "num_tokens": 242878696.0, + "step": 9733 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 2.3150832653045654, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7280073165893555, + "num_tokens": 242902337.0, + "step": 9734 + }, + { + "epoch": 1.069075334943993, + "grad_norm": 2.3635094165802, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7240405082702637, + "num_tokens": 242926820.0, + "step": 9735 + }, + { + "epoch": 1.0691851526466065, + "grad_norm": 2.3576629161834717, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7023671865463257, + "num_tokens": 242950681.0, + "step": 9736 + }, + { + "epoch": 1.0692949703492203, + "grad_norm": 2.274104356765747, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7122319936752319, + "num_tokens": 242976067.0, + "step": 9737 + }, + { + "epoch": 1.069404788051834, + "grad_norm": 2.6639440059661865, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7245602607727051, + "num_tokens": 242994924.0, + "step": 9738 + }, + { + "epoch": 1.0695146057544476, + "grad_norm": 2.5239715576171875, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7397138476371765, + "num_tokens": 243015820.0, + "step": 9739 + }, + { + "epoch": 1.0696244234570613, + "grad_norm": 2.4700913429260254, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7292373180389404, + "num_tokens": 243040288.0, + "step": 9740 + }, + { + "epoch": 1.0697342411596749, + "grad_norm": 2.3275063037872314, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7194650173187256, + "num_tokens": 243066580.0, + "step": 9741 + }, + { + "epoch": 1.0698440588622886, + "grad_norm": 2.1269192695617676, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7178968787193298, + "num_tokens": 243094861.0, + "step": 9742 + }, + { + "epoch": 1.0699538765649024, + "grad_norm": 2.075920343399048, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7222510576248169, + "num_tokens": 243123169.0, + "step": 9743 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 2.0655274391174316, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7115887999534607, + "num_tokens": 243150841.0, + "step": 9744 + }, + { + "epoch": 1.0701735119701297, + "grad_norm": 2.372040271759033, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7127698063850403, + "num_tokens": 243172486.0, + "step": 9745 + }, + { + "epoch": 1.0702833296727432, + "grad_norm": 2.2888922691345215, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6973049640655518, + "num_tokens": 243197844.0, + "step": 9746 + }, + { + "epoch": 1.070393147375357, + "grad_norm": 2.2905361652374268, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7198312878608704, + "num_tokens": 243221404.0, + "step": 9747 + }, + { + "epoch": 1.0705029650779705, + "grad_norm": 2.1768417358398438, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7332761287689209, + "num_tokens": 243246352.0, + "step": 9748 + }, + { + "epoch": 1.0706127827805842, + "grad_norm": 2.375703811645508, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7356326580047607, + "num_tokens": 243269201.0, + "step": 9749 + }, + { + "epoch": 1.0707226004831978, + "grad_norm": 2.532822847366333, + "learning_rate": 1e-06, + "loss": 0.7049, + "mean_token_accuracy": 0.7716794013977051, + "num_tokens": 243286663.0, + "step": 9750 + }, + { + "epoch": 1.0708324181858115, + "grad_norm": 2.3390090465545654, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7244030237197876, + "num_tokens": 243314526.0, + "step": 9751 + }, + { + "epoch": 1.0709422358884253, + "grad_norm": 2.7332427501678467, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7159596681594849, + "num_tokens": 243332182.0, + "step": 9752 + }, + { + "epoch": 1.0710520535910388, + "grad_norm": 1.9081038236618042, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7075819969177246, + "num_tokens": 243368647.0, + "step": 9753 + }, + { + "epoch": 1.0711618712936526, + "grad_norm": 2.352141857147217, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7105540037155151, + "num_tokens": 243391548.0, + "step": 9754 + }, + { + "epoch": 1.0712716889962661, + "grad_norm": 2.290405035018921, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7047378420829773, + "num_tokens": 243417168.0, + "step": 9755 + }, + { + "epoch": 1.0713815066988799, + "grad_norm": 2.4688122272491455, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7392855882644653, + "num_tokens": 243437020.0, + "step": 9756 + }, + { + "epoch": 1.0714913244014934, + "grad_norm": 2.1189842224121094, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7061525583267212, + "num_tokens": 243467083.0, + "step": 9757 + }, + { + "epoch": 1.0716011421041072, + "grad_norm": 2.5133399963378906, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7298385500907898, + "num_tokens": 243487236.0, + "step": 9758 + }, + { + "epoch": 1.071710959806721, + "grad_norm": 2.3003735542297363, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7247521281242371, + "num_tokens": 243510075.0, + "step": 9759 + }, + { + "epoch": 1.0718207775093345, + "grad_norm": 1.9378057718276978, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7155519127845764, + "num_tokens": 243539853.0, + "step": 9760 + }, + { + "epoch": 1.0719305952119482, + "grad_norm": 2.302273988723755, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7455828189849854, + "num_tokens": 243561898.0, + "step": 9761 + }, + { + "epoch": 1.0720404129145618, + "grad_norm": 2.4523730278015137, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7222539186477661, + "num_tokens": 243585432.0, + "step": 9762 + }, + { + "epoch": 1.0721502306171755, + "grad_norm": 2.2249722480773926, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.702449381351471, + "num_tokens": 243610827.0, + "step": 9763 + }, + { + "epoch": 1.072260048319789, + "grad_norm": 2.451361656188965, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7383425235748291, + "num_tokens": 243630910.0, + "step": 9764 + }, + { + "epoch": 1.0723698660224028, + "grad_norm": 2.621267318725586, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7232877016067505, + "num_tokens": 243650568.0, + "step": 9765 + }, + { + "epoch": 1.0724796837250166, + "grad_norm": 2.2335057258605957, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7158706188201904, + "num_tokens": 243676311.0, + "step": 9766 + }, + { + "epoch": 1.07258950142763, + "grad_norm": 2.195157527923584, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7063472270965576, + "num_tokens": 243702766.0, + "step": 9767 + }, + { + "epoch": 1.0726993191302439, + "grad_norm": 2.13777232170105, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7028886079788208, + "num_tokens": 243730809.0, + "step": 9768 + }, + { + "epoch": 1.0728091368328574, + "grad_norm": 2.375732183456421, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6973735094070435, + "num_tokens": 243755042.0, + "step": 9769 + }, + { + "epoch": 1.0729189545354711, + "grad_norm": 2.4336464405059814, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7432038187980652, + "num_tokens": 243775137.0, + "step": 9770 + }, + { + "epoch": 1.0730287722380847, + "grad_norm": 2.0723419189453125, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7134792804718018, + "num_tokens": 243804018.0, + "step": 9771 + }, + { + "epoch": 1.0731385899406984, + "grad_norm": 2.5153355598449707, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7016780376434326, + "num_tokens": 243826174.0, + "step": 9772 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 2.337554693222046, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7303412556648254, + "num_tokens": 243849500.0, + "step": 9773 + }, + { + "epoch": 1.0733582253459257, + "grad_norm": 2.300206184387207, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7199386358261108, + "num_tokens": 243873804.0, + "step": 9774 + }, + { + "epoch": 1.0734680430485395, + "grad_norm": 2.221970796585083, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7302150726318359, + "num_tokens": 243898019.0, + "step": 9775 + }, + { + "epoch": 1.073577860751153, + "grad_norm": 2.2971270084381104, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7184956669807434, + "num_tokens": 243922658.0, + "step": 9776 + }, + { + "epoch": 1.0736876784537668, + "grad_norm": 2.298556089401245, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7378579378128052, + "num_tokens": 243946027.0, + "step": 9777 + }, + { + "epoch": 1.0737974961563803, + "grad_norm": 2.2207722663879395, + "learning_rate": 1e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7472929954528809, + "num_tokens": 243971043.0, + "step": 9778 + }, + { + "epoch": 1.073907313858994, + "grad_norm": 2.7190959453582764, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7516412734985352, + "num_tokens": 243989741.0, + "step": 9779 + }, + { + "epoch": 1.0740171315616078, + "grad_norm": 1.9705396890640259, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7177460789680481, + "num_tokens": 244021448.0, + "step": 9780 + }, + { + "epoch": 1.0741269492642214, + "grad_norm": 2.049947738647461, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.719552218914032, + "num_tokens": 244050449.0, + "step": 9781 + }, + { + "epoch": 1.0742367669668351, + "grad_norm": 2.111774444580078, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7277681231498718, + "num_tokens": 244077882.0, + "step": 9782 + }, + { + "epoch": 1.0743465846694487, + "grad_norm": 2.614499092102051, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7230684757232666, + "num_tokens": 244097472.0, + "step": 9783 + }, + { + "epoch": 1.0744564023720624, + "grad_norm": 2.2226665019989014, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6870949268341064, + "num_tokens": 244126960.0, + "step": 9784 + }, + { + "epoch": 1.074566220074676, + "grad_norm": 2.1247761249542236, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7197921276092529, + "num_tokens": 244153744.0, + "step": 9785 + }, + { + "epoch": 1.0746760377772897, + "grad_norm": 2.6308212280273438, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7054879069328308, + "num_tokens": 244174098.0, + "step": 9786 + }, + { + "epoch": 1.0747858554799035, + "grad_norm": 2.2560691833496094, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7160866856575012, + "num_tokens": 244200310.0, + "step": 9787 + }, + { + "epoch": 1.074895673182517, + "grad_norm": 2.3439230918884277, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7069771885871887, + "num_tokens": 244226840.0, + "step": 9788 + }, + { + "epoch": 1.0750054908851308, + "grad_norm": 2.180974006652832, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7083454132080078, + "num_tokens": 244254700.0, + "step": 9789 + }, + { + "epoch": 1.0751153085877443, + "grad_norm": 2.398167371749878, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7214897871017456, + "num_tokens": 244277249.0, + "step": 9790 + }, + { + "epoch": 1.075225126290358, + "grad_norm": 2.024559497833252, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7146786451339722, + "num_tokens": 244309194.0, + "step": 9791 + }, + { + "epoch": 1.0753349439929716, + "grad_norm": 2.1386168003082275, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7090020179748535, + "num_tokens": 244336472.0, + "step": 9792 + }, + { + "epoch": 1.0754447616955853, + "grad_norm": 2.0380361080169678, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7132744789123535, + "num_tokens": 244365979.0, + "step": 9793 + }, + { + "epoch": 1.075554579398199, + "grad_norm": 2.1227710247039795, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7106039524078369, + "num_tokens": 244392021.0, + "step": 9794 + }, + { + "epoch": 1.0756643971008126, + "grad_norm": 2.3471696376800537, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7097064256668091, + "num_tokens": 244415758.0, + "step": 9795 + }, + { + "epoch": 1.0757742148034264, + "grad_norm": 2.3521838188171387, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7071213722229004, + "num_tokens": 244441727.0, + "step": 9796 + }, + { + "epoch": 1.07588403250604, + "grad_norm": 2.481362819671631, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7177741527557373, + "num_tokens": 244463410.0, + "step": 9797 + }, + { + "epoch": 1.0759938502086537, + "grad_norm": 2.1632871627807617, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7310105562210083, + "num_tokens": 244491501.0, + "step": 9798 + }, + { + "epoch": 1.0761036679112672, + "grad_norm": 2.4370577335357666, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.6956576108932495, + "num_tokens": 244516676.0, + "step": 9799 + }, + { + "epoch": 1.076213485613881, + "grad_norm": 2.614292860031128, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7368038892745972, + "num_tokens": 244535170.0, + "step": 9800 + }, + { + "epoch": 1.0763233033164945, + "grad_norm": 2.445967435836792, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7095160484313965, + "num_tokens": 244557855.0, + "step": 9801 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 2.2522084712982178, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7215052247047424, + "num_tokens": 244583282.0, + "step": 9802 + }, + { + "epoch": 1.076542938721722, + "grad_norm": 2.525479316711426, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7385072708129883, + "num_tokens": 244603857.0, + "step": 9803 + }, + { + "epoch": 1.0766527564243356, + "grad_norm": 2.3258204460144043, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7176799774169922, + "num_tokens": 244628985.0, + "step": 9804 + }, + { + "epoch": 1.0767625741269493, + "grad_norm": 2.568690776824951, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7384376525878906, + "num_tokens": 244648158.0, + "step": 9805 + }, + { + "epoch": 1.0768723918295628, + "grad_norm": 2.68550705909729, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7209320664405823, + "num_tokens": 244668099.0, + "step": 9806 + }, + { + "epoch": 1.0769822095321766, + "grad_norm": 2.6416265964508057, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7231286764144897, + "num_tokens": 244688266.0, + "step": 9807 + }, + { + "epoch": 1.0770920272347904, + "grad_norm": 2.051997423171997, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7447198629379272, + "num_tokens": 244715385.0, + "step": 9808 + }, + { + "epoch": 1.077201844937404, + "grad_norm": 2.008166551589966, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.724877655506134, + "num_tokens": 244746253.0, + "step": 9809 + }, + { + "epoch": 1.0773116626400177, + "grad_norm": 1.9636955261230469, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.739523708820343, + "num_tokens": 244774799.0, + "step": 9810 + }, + { + "epoch": 1.0774214803426312, + "grad_norm": 2.0105929374694824, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7213780879974365, + "num_tokens": 244808869.0, + "step": 9811 + }, + { + "epoch": 1.077531298045245, + "grad_norm": 2.1225173473358154, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7305172085762024, + "num_tokens": 244834948.0, + "step": 9812 + }, + { + "epoch": 1.0776411157478585, + "grad_norm": 2.2722437381744385, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7337515354156494, + "num_tokens": 244860790.0, + "step": 9813 + }, + { + "epoch": 1.0777509334504722, + "grad_norm": 2.183763027191162, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.713108241558075, + "num_tokens": 244886779.0, + "step": 9814 + }, + { + "epoch": 1.0778607511530858, + "grad_norm": 2.312255859375, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7465462684631348, + "num_tokens": 244908033.0, + "step": 9815 + }, + { + "epoch": 1.0779705688556995, + "grad_norm": 2.2683098316192627, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7160955667495728, + "num_tokens": 244934434.0, + "step": 9816 + }, + { + "epoch": 1.0780803865583133, + "grad_norm": 2.4494972229003906, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7184029817581177, + "num_tokens": 244956968.0, + "step": 9817 + }, + { + "epoch": 1.0781902042609268, + "grad_norm": 2.1952943801879883, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7127970457077026, + "num_tokens": 244983208.0, + "step": 9818 + }, + { + "epoch": 1.0783000219635406, + "grad_norm": 2.3473637104034424, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7341434359550476, + "num_tokens": 245005913.0, + "step": 9819 + }, + { + "epoch": 1.0784098396661541, + "grad_norm": 2.127028703689575, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7144291400909424, + "num_tokens": 245032304.0, + "step": 9820 + }, + { + "epoch": 1.0785196573687679, + "grad_norm": 2.3795511722564697, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7233039736747742, + "num_tokens": 245057041.0, + "step": 9821 + }, + { + "epoch": 1.0786294750713814, + "grad_norm": 2.1860084533691406, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.6988626718521118, + "num_tokens": 245084701.0, + "step": 9822 + }, + { + "epoch": 1.0787392927739952, + "grad_norm": 2.2193918228149414, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6980547308921814, + "num_tokens": 245113686.0, + "step": 9823 + }, + { + "epoch": 1.078849110476609, + "grad_norm": 1.8667774200439453, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6900016069412231, + "num_tokens": 245148199.0, + "step": 9824 + }, + { + "epoch": 1.0789589281792225, + "grad_norm": 2.275991678237915, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.71565842628479, + "num_tokens": 245171410.0, + "step": 9825 + }, + { + "epoch": 1.0790687458818362, + "grad_norm": 2.4090075492858887, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7294960021972656, + "num_tokens": 245193488.0, + "step": 9826 + }, + { + "epoch": 1.0791785635844497, + "grad_norm": 2.6529598236083984, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7366770505905151, + "num_tokens": 245213341.0, + "step": 9827 + }, + { + "epoch": 1.0792883812870635, + "grad_norm": 1.9867113828659058, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7109202742576599, + "num_tokens": 245245501.0, + "step": 9828 + }, + { + "epoch": 1.079398198989677, + "grad_norm": 2.324955940246582, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7375032901763916, + "num_tokens": 245271247.0, + "step": 9829 + }, + { + "epoch": 1.0795080166922908, + "grad_norm": 2.0760338306427, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.701011061668396, + "num_tokens": 245300895.0, + "step": 9830 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 2.688154935836792, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7288000583648682, + "num_tokens": 245321465.0, + "step": 9831 + }, + { + "epoch": 1.079727652097518, + "grad_norm": 2.4806666374206543, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7323757410049438, + "num_tokens": 245342601.0, + "step": 9832 + }, + { + "epoch": 1.0798374698001318, + "grad_norm": 2.5152251720428467, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7242999076843262, + "num_tokens": 245367005.0, + "step": 9833 + }, + { + "epoch": 1.0799472875027454, + "grad_norm": 2.1283438205718994, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7179872989654541, + "num_tokens": 245395119.0, + "step": 9834 + }, + { + "epoch": 1.0800571052053591, + "grad_norm": 2.2214975357055664, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7246235609054565, + "num_tokens": 245421612.0, + "step": 9835 + }, + { + "epoch": 1.0801669229079727, + "grad_norm": 2.148991346359253, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7102599740028381, + "num_tokens": 245450484.0, + "step": 9836 + }, + { + "epoch": 1.0802767406105864, + "grad_norm": 2.396596670150757, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.735956072807312, + "num_tokens": 245470711.0, + "step": 9837 + }, + { + "epoch": 1.0803865583132002, + "grad_norm": 2.485682249069214, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7341034412384033, + "num_tokens": 245491528.0, + "step": 9838 + }, + { + "epoch": 1.0804963760158137, + "grad_norm": 2.1975905895233154, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6984086036682129, + "num_tokens": 245521713.0, + "step": 9839 + }, + { + "epoch": 1.0806061937184275, + "grad_norm": 2.1474130153656006, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7118284702301025, + "num_tokens": 245550695.0, + "step": 9840 + }, + { + "epoch": 1.080716011421041, + "grad_norm": 2.166834592819214, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7430821061134338, + "num_tokens": 245575162.0, + "step": 9841 + }, + { + "epoch": 1.0808258291236548, + "grad_norm": 2.3146369457244873, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7289661765098572, + "num_tokens": 245600590.0, + "step": 9842 + }, + { + "epoch": 1.0809356468262683, + "grad_norm": 2.1279125213623047, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7287121415138245, + "num_tokens": 245629213.0, + "step": 9843 + }, + { + "epoch": 1.081045464528882, + "grad_norm": 2.3703384399414062, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7132827639579773, + "num_tokens": 245652659.0, + "step": 9844 + }, + { + "epoch": 1.0811552822314958, + "grad_norm": 2.292396068572998, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7242532968521118, + "num_tokens": 245678672.0, + "step": 9845 + }, + { + "epoch": 1.0812650999341094, + "grad_norm": 2.5339112281799316, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7417944669723511, + "num_tokens": 245699367.0, + "step": 9846 + }, + { + "epoch": 1.081374917636723, + "grad_norm": 2.373838424682617, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7347561120986938, + "num_tokens": 245722063.0, + "step": 9847 + }, + { + "epoch": 1.0814847353393366, + "grad_norm": 2.293114423751831, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7173516750335693, + "num_tokens": 245747668.0, + "step": 9848 + }, + { + "epoch": 1.0815945530419504, + "grad_norm": 2.368896961212158, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7127444744110107, + "num_tokens": 245771166.0, + "step": 9849 + }, + { + "epoch": 1.081704370744564, + "grad_norm": 2.1012392044067383, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7043170928955078, + "num_tokens": 245799546.0, + "step": 9850 + }, + { + "epoch": 1.0818141884471777, + "grad_norm": 2.5116398334503174, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7363262176513672, + "num_tokens": 245820106.0, + "step": 9851 + }, + { + "epoch": 1.0819240061497912, + "grad_norm": 2.0493197441101074, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7047272324562073, + "num_tokens": 245850286.0, + "step": 9852 + }, + { + "epoch": 1.082033823852405, + "grad_norm": 2.2840576171875, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.7462719678878784, + "num_tokens": 245877165.0, + "step": 9853 + }, + { + "epoch": 1.0821436415550187, + "grad_norm": 2.8249423503875732, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7236691117286682, + "num_tokens": 245895154.0, + "step": 9854 + }, + { + "epoch": 1.0822534592576323, + "grad_norm": 2.192014217376709, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.716681182384491, + "num_tokens": 245922333.0, + "step": 9855 + }, + { + "epoch": 1.082363276960246, + "grad_norm": 1.9191827774047852, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7119390368461609, + "num_tokens": 245954531.0, + "step": 9856 + }, + { + "epoch": 1.0824730946628596, + "grad_norm": 2.2081360816955566, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7197140455245972, + "num_tokens": 245980904.0, + "step": 9857 + }, + { + "epoch": 1.0825829123654733, + "grad_norm": 2.059809684753418, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6891560554504395, + "num_tokens": 246013626.0, + "step": 9858 + }, + { + "epoch": 1.082692730068087, + "grad_norm": 2.491328001022339, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7138956785202026, + "num_tokens": 246035099.0, + "step": 9859 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 1.9951348304748535, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7386595606803894, + "num_tokens": 246065484.0, + "step": 9860 + }, + { + "epoch": 1.0829123654733144, + "grad_norm": 2.314277172088623, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7046191692352295, + "num_tokens": 246090920.0, + "step": 9861 + }, + { + "epoch": 1.083022183175928, + "grad_norm": 2.107820749282837, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7196027040481567, + "num_tokens": 246118038.0, + "step": 9862 + }, + { + "epoch": 1.0831320008785417, + "grad_norm": 2.4547154903411865, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7347121238708496, + "num_tokens": 246141702.0, + "step": 9863 + }, + { + "epoch": 1.0832418185811552, + "grad_norm": 2.317131519317627, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.736780047416687, + "num_tokens": 246165829.0, + "step": 9864 + }, + { + "epoch": 1.083351636283769, + "grad_norm": 2.511981248855591, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7513947486877441, + "num_tokens": 246185051.0, + "step": 9865 + }, + { + "epoch": 1.0834614539863825, + "grad_norm": 2.7453057765960693, + "learning_rate": 1e-06, + "loss": 0.6882, + "mean_token_accuracy": 0.7759256958961487, + "num_tokens": 246201757.0, + "step": 9866 + }, + { + "epoch": 1.0835712716889963, + "grad_norm": 2.237396001815796, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7343279123306274, + "num_tokens": 246225631.0, + "step": 9867 + }, + { + "epoch": 1.08368108939161, + "grad_norm": 2.534217596054077, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7264574766159058, + "num_tokens": 246245869.0, + "step": 9868 + }, + { + "epoch": 1.0837909070942235, + "grad_norm": 2.1438522338867188, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7289300560951233, + "num_tokens": 246274138.0, + "step": 9869 + }, + { + "epoch": 1.0839007247968373, + "grad_norm": 2.22524356842041, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7055364847183228, + "num_tokens": 246299431.0, + "step": 9870 + }, + { + "epoch": 1.0840105424994508, + "grad_norm": 2.0522007942199707, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7388889789581299, + "num_tokens": 246324926.0, + "step": 9871 + }, + { + "epoch": 1.0841203602020646, + "grad_norm": 2.8775715827941895, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7197243571281433, + "num_tokens": 246341624.0, + "step": 9872 + }, + { + "epoch": 1.0842301779046783, + "grad_norm": 2.4856295585632324, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.6974080204963684, + "num_tokens": 246364841.0, + "step": 9873 + }, + { + "epoch": 1.0843399956072919, + "grad_norm": 2.284289598464966, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7151176333427429, + "num_tokens": 246391709.0, + "step": 9874 + }, + { + "epoch": 1.0844498133099056, + "grad_norm": 2.3089261054992676, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.7021411657333374, + "num_tokens": 246416146.0, + "step": 9875 + }, + { + "epoch": 1.0845596310125192, + "grad_norm": 2.284121036529541, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7045587301254272, + "num_tokens": 246442562.0, + "step": 9876 + }, + { + "epoch": 1.084669448715133, + "grad_norm": 2.030517578125, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7002968788146973, + "num_tokens": 246474373.0, + "step": 9877 + }, + { + "epoch": 1.0847792664177465, + "grad_norm": 2.2118027210235596, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7003841400146484, + "num_tokens": 246502099.0, + "step": 9878 + }, + { + "epoch": 1.0848890841203602, + "grad_norm": 2.3157756328582764, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7444818019866943, + "num_tokens": 246526214.0, + "step": 9879 + }, + { + "epoch": 1.0849989018229738, + "grad_norm": 2.1109702587127686, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7431086301803589, + "num_tokens": 246553499.0, + "step": 9880 + }, + { + "epoch": 1.0851087195255875, + "grad_norm": 2.4680309295654297, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7165552377700806, + "num_tokens": 246575560.0, + "step": 9881 + }, + { + "epoch": 1.0852185372282013, + "grad_norm": 1.9422008991241455, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7089031934738159, + "num_tokens": 246610422.0, + "step": 9882 + }, + { + "epoch": 1.0853283549308148, + "grad_norm": 2.5292305946350098, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7198389768600464, + "num_tokens": 246632473.0, + "step": 9883 + }, + { + "epoch": 1.0854381726334286, + "grad_norm": 2.413590908050537, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7186551094055176, + "num_tokens": 246656489.0, + "step": 9884 + }, + { + "epoch": 1.085547990336042, + "grad_norm": 2.151585340499878, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7018967270851135, + "num_tokens": 246684991.0, + "step": 9885 + }, + { + "epoch": 1.0856578080386559, + "grad_norm": 2.304643154144287, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7210580110549927, + "num_tokens": 246709859.0, + "step": 9886 + }, + { + "epoch": 1.0857676257412694, + "grad_norm": 2.343290090560913, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7423404455184937, + "num_tokens": 246732654.0, + "step": 9887 + }, + { + "epoch": 1.0858774434438832, + "grad_norm": 2.2724666595458984, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7100226283073425, + "num_tokens": 246757494.0, + "step": 9888 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 2.494692802429199, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7510266900062561, + "num_tokens": 246778596.0, + "step": 9889 + }, + { + "epoch": 1.0860970788491104, + "grad_norm": 2.2231943607330322, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7280348539352417, + "num_tokens": 246803695.0, + "step": 9890 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 2.339031934738159, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7021274566650391, + "num_tokens": 246827129.0, + "step": 9891 + }, + { + "epoch": 1.0863167142543377, + "grad_norm": 2.076975107192993, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6988903880119324, + "num_tokens": 246857560.0, + "step": 9892 + }, + { + "epoch": 1.0864265319569515, + "grad_norm": 2.17683482170105, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7067649364471436, + "num_tokens": 246884928.0, + "step": 9893 + }, + { + "epoch": 1.086536349659565, + "grad_norm": 2.3825032711029053, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7287328243255615, + "num_tokens": 246907226.0, + "step": 9894 + }, + { + "epoch": 1.0866461673621788, + "grad_norm": 3.062483549118042, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.740484893321991, + "num_tokens": 246923522.0, + "step": 9895 + }, + { + "epoch": 1.0867559850647925, + "grad_norm": 2.183363676071167, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7268720865249634, + "num_tokens": 246950152.0, + "step": 9896 + }, + { + "epoch": 1.086865802767406, + "grad_norm": 2.3856048583984375, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7089223861694336, + "num_tokens": 246971900.0, + "step": 9897 + }, + { + "epoch": 1.0869756204700198, + "grad_norm": 2.1760218143463135, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7272691130638123, + "num_tokens": 246997737.0, + "step": 9898 + }, + { + "epoch": 1.0870854381726334, + "grad_norm": 2.0454800128936768, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7062402367591858, + "num_tokens": 247029325.0, + "step": 9899 + }, + { + "epoch": 1.0871952558752471, + "grad_norm": 2.4631102085113525, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7351109981536865, + "num_tokens": 247051172.0, + "step": 9900 + }, + { + "epoch": 1.0873050735778607, + "grad_norm": 2.162329912185669, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7115477323532104, + "num_tokens": 247080337.0, + "step": 9901 + }, + { + "epoch": 1.0874148912804744, + "grad_norm": 2.233524799346924, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7253050804138184, + "num_tokens": 247106518.0, + "step": 9902 + }, + { + "epoch": 1.0875247089830882, + "grad_norm": 2.237856388092041, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7280070781707764, + "num_tokens": 247132348.0, + "step": 9903 + }, + { + "epoch": 1.0876345266857017, + "grad_norm": 2.2660515308380127, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7251465916633606, + "num_tokens": 247158806.0, + "step": 9904 + }, + { + "epoch": 1.0877443443883155, + "grad_norm": 2.155259609222412, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7161675095558167, + "num_tokens": 247186032.0, + "step": 9905 + }, + { + "epoch": 1.087854162090929, + "grad_norm": 2.2241132259368896, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.726898193359375, + "num_tokens": 247211873.0, + "step": 9906 + }, + { + "epoch": 1.0879639797935428, + "grad_norm": 2.3775339126586914, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7283337116241455, + "num_tokens": 247234488.0, + "step": 9907 + }, + { + "epoch": 1.0880737974961563, + "grad_norm": 1.9910526275634766, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7315211296081543, + "num_tokens": 247265405.0, + "step": 9908 + }, + { + "epoch": 1.08818361519877, + "grad_norm": 2.0959556102752686, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7434251308441162, + "num_tokens": 247293571.0, + "step": 9909 + }, + { + "epoch": 1.0882934329013838, + "grad_norm": 2.2351157665252686, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7053405046463013, + "num_tokens": 247320152.0, + "step": 9910 + }, + { + "epoch": 1.0884032506039973, + "grad_norm": 2.153794288635254, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.728856086730957, + "num_tokens": 247346952.0, + "step": 9911 + }, + { + "epoch": 1.088513068306611, + "grad_norm": 2.6684677600860596, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7409566640853882, + "num_tokens": 247365371.0, + "step": 9912 + }, + { + "epoch": 1.0886228860092246, + "grad_norm": 2.403176784515381, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7360203266143799, + "num_tokens": 247387173.0, + "step": 9913 + }, + { + "epoch": 1.0887327037118384, + "grad_norm": 2.092517137527466, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7166087627410889, + "num_tokens": 247416440.0, + "step": 9914 + }, + { + "epoch": 1.088842521414452, + "grad_norm": 2.2085206508636475, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7152531147003174, + "num_tokens": 247443048.0, + "step": 9915 + }, + { + "epoch": 1.0889523391170657, + "grad_norm": 2.12960147857666, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7120212912559509, + "num_tokens": 247474410.0, + "step": 9916 + }, + { + "epoch": 1.0890621568196792, + "grad_norm": 2.205651044845581, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7227266430854797, + "num_tokens": 247500179.0, + "step": 9917 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 2.624058961868286, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7221393585205078, + "num_tokens": 247519634.0, + "step": 9918 + }, + { + "epoch": 1.0892817922249067, + "grad_norm": 2.3268685340881348, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7211235165596008, + "num_tokens": 247544611.0, + "step": 9919 + }, + { + "epoch": 1.0893916099275203, + "grad_norm": 2.3572804927825928, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7054166793823242, + "num_tokens": 247569385.0, + "step": 9920 + }, + { + "epoch": 1.089501427630134, + "grad_norm": 2.2513766288757324, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7095171213150024, + "num_tokens": 247594771.0, + "step": 9921 + }, + { + "epoch": 1.0896112453327476, + "grad_norm": 2.0547561645507812, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.714827835559845, + "num_tokens": 247624860.0, + "step": 9922 + }, + { + "epoch": 1.0897210630353613, + "grad_norm": 2.2937464714050293, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7421931028366089, + "num_tokens": 247648342.0, + "step": 9923 + }, + { + "epoch": 1.089830880737975, + "grad_norm": 2.240896463394165, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7315424680709839, + "num_tokens": 247673063.0, + "step": 9924 + }, + { + "epoch": 1.0899406984405886, + "grad_norm": 2.461869478225708, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7117038369178772, + "num_tokens": 247696828.0, + "step": 9925 + }, + { + "epoch": 1.0900505161432024, + "grad_norm": 2.0832509994506836, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7126015424728394, + "num_tokens": 247725572.0, + "step": 9926 + }, + { + "epoch": 1.090160333845816, + "grad_norm": 2.1176960468292236, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6791852116584778, + "num_tokens": 247757031.0, + "step": 9927 + }, + { + "epoch": 1.0902701515484297, + "grad_norm": 2.1812241077423096, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6975927352905273, + "num_tokens": 247784102.0, + "step": 9928 + }, + { + "epoch": 1.0903799692510432, + "grad_norm": 2.5451900959014893, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7401713132858276, + "num_tokens": 247804365.0, + "step": 9929 + }, + { + "epoch": 1.090489786953657, + "grad_norm": 2.247105121612549, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7151686549186707, + "num_tokens": 247832523.0, + "step": 9930 + }, + { + "epoch": 1.0905996046562705, + "grad_norm": 2.200424909591675, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7247530221939087, + "num_tokens": 247856237.0, + "step": 9931 + }, + { + "epoch": 1.0907094223588842, + "grad_norm": 2.311123847961426, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.704644501209259, + "num_tokens": 247883201.0, + "step": 9932 + }, + { + "epoch": 1.090819240061498, + "grad_norm": 2.1806750297546387, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7094192504882812, + "num_tokens": 247911405.0, + "step": 9933 + }, + { + "epoch": 1.0909290577641115, + "grad_norm": 2.1132290363311768, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.741194486618042, + "num_tokens": 247936570.0, + "step": 9934 + }, + { + "epoch": 1.0910388754667253, + "grad_norm": 2.0763120651245117, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6965482234954834, + "num_tokens": 247965751.0, + "step": 9935 + }, + { + "epoch": 1.0911486931693388, + "grad_norm": 2.3556153774261475, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7053529024124146, + "num_tokens": 247989284.0, + "step": 9936 + }, + { + "epoch": 1.0912585108719526, + "grad_norm": 2.346287250518799, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7299913167953491, + "num_tokens": 248014768.0, + "step": 9937 + }, + { + "epoch": 1.0913683285745663, + "grad_norm": 2.3021652698516846, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7211868762969971, + "num_tokens": 248039218.0, + "step": 9938 + }, + { + "epoch": 1.0914781462771799, + "grad_norm": 2.466691017150879, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7174041271209717, + "num_tokens": 248061943.0, + "step": 9939 + }, + { + "epoch": 1.0915879639797936, + "grad_norm": 2.340240240097046, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6934810280799866, + "num_tokens": 248087089.0, + "step": 9940 + }, + { + "epoch": 1.0916977816824072, + "grad_norm": 2.5422675609588623, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7135146260261536, + "num_tokens": 248108828.0, + "step": 9941 + }, + { + "epoch": 1.091807599385021, + "grad_norm": 2.0849244594573975, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7086858153343201, + "num_tokens": 248136331.0, + "step": 9942 + }, + { + "epoch": 1.0919174170876345, + "grad_norm": 2.2880587577819824, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7210295796394348, + "num_tokens": 248161340.0, + "step": 9943 + }, + { + "epoch": 1.0920272347902482, + "grad_norm": 2.2381820678710938, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6955950856208801, + "num_tokens": 248189136.0, + "step": 9944 + }, + { + "epoch": 1.0921370524928617, + "grad_norm": 2.3469889163970947, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7275043725967407, + "num_tokens": 248211828.0, + "step": 9945 + }, + { + "epoch": 1.0922468701954755, + "grad_norm": 2.2076830863952637, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7231559753417969, + "num_tokens": 248238377.0, + "step": 9946 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 2.0602474212646484, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7191208004951477, + "num_tokens": 248268335.0, + "step": 9947 + }, + { + "epoch": 1.0924665056007028, + "grad_norm": 2.1975715160369873, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7147647142410278, + "num_tokens": 248295720.0, + "step": 9948 + }, + { + "epoch": 1.0925763233033166, + "grad_norm": 2.2002053260803223, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7326956987380981, + "num_tokens": 248321278.0, + "step": 9949 + }, + { + "epoch": 1.09268614100593, + "grad_norm": 2.519070625305176, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7287237048149109, + "num_tokens": 248342040.0, + "step": 9950 + }, + { + "epoch": 1.0927959587085438, + "grad_norm": 2.4734113216400146, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7251500487327576, + "num_tokens": 248363119.0, + "step": 9951 + }, + { + "epoch": 1.0929057764111574, + "grad_norm": 2.324671983718872, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7267394065856934, + "num_tokens": 248385917.0, + "step": 9952 + }, + { + "epoch": 1.0930155941137711, + "grad_norm": 2.476487398147583, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7202157974243164, + "num_tokens": 248407776.0, + "step": 9953 + }, + { + "epoch": 1.093125411816385, + "grad_norm": 2.427041530609131, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7275264263153076, + "num_tokens": 248429404.0, + "step": 9954 + }, + { + "epoch": 1.0932352295189984, + "grad_norm": 2.7378711700439453, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7439980506896973, + "num_tokens": 248446847.0, + "step": 9955 + }, + { + "epoch": 1.0933450472216122, + "grad_norm": 2.474815845489502, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7147127389907837, + "num_tokens": 248469004.0, + "step": 9956 + }, + { + "epoch": 1.0934548649242257, + "grad_norm": 2.16345477104187, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7076807022094727, + "num_tokens": 248497639.0, + "step": 9957 + }, + { + "epoch": 1.0935646826268395, + "grad_norm": 2.1896722316741943, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.721871554851532, + "num_tokens": 248524753.0, + "step": 9958 + }, + { + "epoch": 1.093674500329453, + "grad_norm": 2.039567708969116, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7210359573364258, + "num_tokens": 248556249.0, + "step": 9959 + }, + { + "epoch": 1.0937843180320668, + "grad_norm": 2.438105344772339, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.728115439414978, + "num_tokens": 248578693.0, + "step": 9960 + }, + { + "epoch": 1.0938941357346805, + "grad_norm": 2.1591575145721436, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7141238451004028, + "num_tokens": 248607151.0, + "step": 9961 + }, + { + "epoch": 1.094003953437294, + "grad_norm": 2.3530821800231934, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.73317551612854, + "num_tokens": 248630629.0, + "step": 9962 + }, + { + "epoch": 1.0941137711399078, + "grad_norm": 2.3538248538970947, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.730777382850647, + "num_tokens": 248653797.0, + "step": 9963 + }, + { + "epoch": 1.0942235888425214, + "grad_norm": 2.500196933746338, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7238361835479736, + "num_tokens": 248676863.0, + "step": 9964 + }, + { + "epoch": 1.0943334065451351, + "grad_norm": 2.208902597427368, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7182332277297974, + "num_tokens": 248703392.0, + "step": 9965 + }, + { + "epoch": 1.0944432242477486, + "grad_norm": 2.5293707847595215, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7172998785972595, + "num_tokens": 248725053.0, + "step": 9966 + }, + { + "epoch": 1.0945530419503624, + "grad_norm": 2.2954065799713135, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7093579769134521, + "num_tokens": 248749549.0, + "step": 9967 + }, + { + "epoch": 1.0946628596529762, + "grad_norm": 2.1623852252960205, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7209299802780151, + "num_tokens": 248776870.0, + "step": 9968 + }, + { + "epoch": 1.0947726773555897, + "grad_norm": 2.3285300731658936, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7254249453544617, + "num_tokens": 248801332.0, + "step": 9969 + }, + { + "epoch": 1.0948824950582035, + "grad_norm": 2.467575788497925, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7341797351837158, + "num_tokens": 248821120.0, + "step": 9970 + }, + { + "epoch": 1.094992312760817, + "grad_norm": 2.3275578022003174, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7241260409355164, + "num_tokens": 248847014.0, + "step": 9971 + }, + { + "epoch": 1.0951021304634307, + "grad_norm": 2.5978832244873047, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7302955985069275, + "num_tokens": 248867034.0, + "step": 9972 + }, + { + "epoch": 1.0952119481660443, + "grad_norm": 2.3542160987854004, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7223501205444336, + "num_tokens": 248890992.0, + "step": 9973 + }, + { + "epoch": 1.095321765868658, + "grad_norm": 2.2636921405792236, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7150808572769165, + "num_tokens": 248916210.0, + "step": 9974 + }, + { + "epoch": 1.0954315835712718, + "grad_norm": 2.3736841678619385, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7042714357376099, + "num_tokens": 248942426.0, + "step": 9975 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 2.2739369869232178, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7159271240234375, + "num_tokens": 248968813.0, + "step": 9976 + }, + { + "epoch": 1.095651218976499, + "grad_norm": 2.3006815910339355, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7167501449584961, + "num_tokens": 248995479.0, + "step": 9977 + }, + { + "epoch": 1.0957610366791126, + "grad_norm": 2.5326411724090576, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7199684381484985, + "num_tokens": 249017140.0, + "step": 9978 + }, + { + "epoch": 1.0958708543817264, + "grad_norm": 2.3309195041656494, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.748253345489502, + "num_tokens": 249041104.0, + "step": 9979 + }, + { + "epoch": 1.09598067208434, + "grad_norm": 2.141244888305664, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7027518153190613, + "num_tokens": 249070128.0, + "step": 9980 + }, + { + "epoch": 1.0960904897869537, + "grad_norm": 2.0814783573150635, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7047267556190491, + "num_tokens": 249098770.0, + "step": 9981 + }, + { + "epoch": 1.0962003074895672, + "grad_norm": 2.323254346847534, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7311292886734009, + "num_tokens": 249122395.0, + "step": 9982 + }, + { + "epoch": 1.096310125192181, + "grad_norm": 2.324918031692505, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7279747128486633, + "num_tokens": 249145424.0, + "step": 9983 + }, + { + "epoch": 1.0964199428947947, + "grad_norm": 2.0166807174682617, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.712053656578064, + "num_tokens": 249177451.0, + "step": 9984 + }, + { + "epoch": 1.0965297605974083, + "grad_norm": 2.0812222957611084, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7066315412521362, + "num_tokens": 249208991.0, + "step": 9985 + }, + { + "epoch": 1.096639578300022, + "grad_norm": 2.0948121547698975, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7224003076553345, + "num_tokens": 249238203.0, + "step": 9986 + }, + { + "epoch": 1.0967493960026355, + "grad_norm": 2.4161980152130127, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7084228992462158, + "num_tokens": 249261994.0, + "step": 9987 + }, + { + "epoch": 1.0968592137052493, + "grad_norm": 1.9184606075286865, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7201443314552307, + "num_tokens": 249292714.0, + "step": 9988 + }, + { + "epoch": 1.096969031407863, + "grad_norm": 2.4998273849487305, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7174016237258911, + "num_tokens": 249315019.0, + "step": 9989 + }, + { + "epoch": 1.0970788491104766, + "grad_norm": 2.4356584548950195, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.707361102104187, + "num_tokens": 249339375.0, + "step": 9990 + }, + { + "epoch": 1.0971886668130904, + "grad_norm": 2.409165143966675, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7081670761108398, + "num_tokens": 249364888.0, + "step": 9991 + }, + { + "epoch": 1.0972984845157039, + "grad_norm": 2.354883909225464, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7563813924789429, + "num_tokens": 249388934.0, + "step": 9992 + }, + { + "epoch": 1.0974083022183176, + "grad_norm": 2.660270929336548, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7222734689712524, + "num_tokens": 249409663.0, + "step": 9993 + }, + { + "epoch": 1.0975181199209312, + "grad_norm": 2.0753889083862305, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7195636034011841, + "num_tokens": 249436861.0, + "step": 9994 + }, + { + "epoch": 1.097627937623545, + "grad_norm": 2.033522129058838, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7382766604423523, + "num_tokens": 249465941.0, + "step": 9995 + }, + { + "epoch": 1.0977377553261585, + "grad_norm": 2.661043643951416, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7451378107070923, + "num_tokens": 249483896.0, + "step": 9996 + }, + { + "epoch": 1.0978475730287722, + "grad_norm": 2.695969581604004, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7222422361373901, + "num_tokens": 249504968.0, + "step": 9997 + }, + { + "epoch": 1.097957390731386, + "grad_norm": 2.279639720916748, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7110474109649658, + "num_tokens": 249530034.0, + "step": 9998 + }, + { + "epoch": 1.0980672084339995, + "grad_norm": 2.082832098007202, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7170571088790894, + "num_tokens": 249559665.0, + "step": 9999 + }, + { + "epoch": 1.0981770261366133, + "grad_norm": 2.194692373275757, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7520853877067566, + "num_tokens": 249583629.0, + "step": 10000 + }, + { + "epoch": 1.0982868438392268, + "grad_norm": 2.439915657043457, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7220436334609985, + "num_tokens": 249605812.0, + "step": 10001 + }, + { + "epoch": 1.0983966615418406, + "grad_norm": 2.419938564300537, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6973045468330383, + "num_tokens": 249632153.0, + "step": 10002 + }, + { + "epoch": 1.098506479244454, + "grad_norm": 1.8248405456542969, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7227560877799988, + "num_tokens": 249671446.0, + "step": 10003 + }, + { + "epoch": 1.0986162969470679, + "grad_norm": 2.589205026626587, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7219109535217285, + "num_tokens": 249692129.0, + "step": 10004 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 2.1690163612365723, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7395209074020386, + "num_tokens": 249717187.0, + "step": 10005 + }, + { + "epoch": 1.0988359323522952, + "grad_norm": 2.547934055328369, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7405208349227905, + "num_tokens": 249736092.0, + "step": 10006 + }, + { + "epoch": 1.098945750054909, + "grad_norm": 2.265075206756592, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.716620683670044, + "num_tokens": 249761779.0, + "step": 10007 + }, + { + "epoch": 1.0990555677575224, + "grad_norm": 2.3368568420410156, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7356547117233276, + "num_tokens": 249784710.0, + "step": 10008 + }, + { + "epoch": 1.0991653854601362, + "grad_norm": 2.5612473487854004, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.718023955821991, + "num_tokens": 249805422.0, + "step": 10009 + }, + { + "epoch": 1.0992752031627497, + "grad_norm": 2.413423538208008, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7111339569091797, + "num_tokens": 249831286.0, + "step": 10010 + }, + { + "epoch": 1.0993850208653635, + "grad_norm": 2.139331579208374, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6904850602149963, + "num_tokens": 249863753.0, + "step": 10011 + }, + { + "epoch": 1.0994948385679773, + "grad_norm": 2.4570014476776123, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7093225717544556, + "num_tokens": 249886248.0, + "step": 10012 + }, + { + "epoch": 1.0996046562705908, + "grad_norm": 2.1076483726501465, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7243665456771851, + "num_tokens": 249912618.0, + "step": 10013 + }, + { + "epoch": 1.0997144739732045, + "grad_norm": 2.2445733547210693, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7134711146354675, + "num_tokens": 249938300.0, + "step": 10014 + }, + { + "epoch": 1.099824291675818, + "grad_norm": 2.6431281566619873, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7586017847061157, + "num_tokens": 249955902.0, + "step": 10015 + }, + { + "epoch": 1.0999341093784318, + "grad_norm": 2.253894805908203, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7013908624649048, + "num_tokens": 249983332.0, + "step": 10016 + }, + { + "epoch": 1.1000439270810454, + "grad_norm": 2.409996271133423, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7046190500259399, + "num_tokens": 250008483.0, + "step": 10017 + }, + { + "epoch": 1.1001537447836591, + "grad_norm": 2.1908202171325684, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7213062644004822, + "num_tokens": 250034892.0, + "step": 10018 + }, + { + "epoch": 1.1002635624862729, + "grad_norm": 2.532719373703003, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7307518720626831, + "num_tokens": 250055640.0, + "step": 10019 + }, + { + "epoch": 1.1003733801888864, + "grad_norm": 2.2533111572265625, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7216545939445496, + "num_tokens": 250081609.0, + "step": 10020 + }, + { + "epoch": 1.1004831978915002, + "grad_norm": 2.4566190242767334, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7172737121582031, + "num_tokens": 250106699.0, + "step": 10021 + }, + { + "epoch": 1.1005930155941137, + "grad_norm": 2.0642125606536865, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7264049053192139, + "num_tokens": 250134011.0, + "step": 10022 + }, + { + "epoch": 1.1007028332967275, + "grad_norm": 2.1784560680389404, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7028968930244446, + "num_tokens": 250160049.0, + "step": 10023 + }, + { + "epoch": 1.100812650999341, + "grad_norm": 2.363809823989868, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7230653166770935, + "num_tokens": 250182545.0, + "step": 10024 + }, + { + "epoch": 1.1009224687019548, + "grad_norm": 2.7799320220947266, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7281012535095215, + "num_tokens": 250199832.0, + "step": 10025 + }, + { + "epoch": 1.1010322864045685, + "grad_norm": 2.567335844039917, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7162186503410339, + "num_tokens": 250220944.0, + "step": 10026 + }, + { + "epoch": 1.101142104107182, + "grad_norm": 2.3458118438720703, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7221239805221558, + "num_tokens": 250244724.0, + "step": 10027 + }, + { + "epoch": 1.1012519218097958, + "grad_norm": 2.705454111099243, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7172068357467651, + "num_tokens": 250263993.0, + "step": 10028 + }, + { + "epoch": 1.1013617395124093, + "grad_norm": 2.240873336791992, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7230615615844727, + "num_tokens": 250288603.0, + "step": 10029 + }, + { + "epoch": 1.101471557215023, + "grad_norm": 1.9440691471099854, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7019760012626648, + "num_tokens": 250321248.0, + "step": 10030 + }, + { + "epoch": 1.1015813749176366, + "grad_norm": 1.868639349937439, + "learning_rate": 1e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.6634594202041626, + "num_tokens": 250356628.0, + "step": 10031 + }, + { + "epoch": 1.1016911926202504, + "grad_norm": 2.1400492191314697, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.717994213104248, + "num_tokens": 250382784.0, + "step": 10032 + }, + { + "epoch": 1.1018010103228641, + "grad_norm": 2.4301154613494873, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7167977690696716, + "num_tokens": 250404760.0, + "step": 10033 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 2.2538862228393555, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7064275145530701, + "num_tokens": 250430922.0, + "step": 10034 + }, + { + "epoch": 1.1020206457280914, + "grad_norm": 1.987205147743225, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7079246640205383, + "num_tokens": 250462547.0, + "step": 10035 + }, + { + "epoch": 1.102130463430705, + "grad_norm": 2.75557279586792, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7202399969100952, + "num_tokens": 250481374.0, + "step": 10036 + }, + { + "epoch": 1.1022402811333187, + "grad_norm": 2.5722084045410156, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7332847118377686, + "num_tokens": 250501281.0, + "step": 10037 + }, + { + "epoch": 1.1023500988359323, + "grad_norm": 2.4984660148620605, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7406023740768433, + "num_tokens": 250522927.0, + "step": 10038 + }, + { + "epoch": 1.102459916538546, + "grad_norm": 2.1008975505828857, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7316496968269348, + "num_tokens": 250550066.0, + "step": 10039 + }, + { + "epoch": 1.1025697342411598, + "grad_norm": 2.258801221847534, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7153632044792175, + "num_tokens": 250576132.0, + "step": 10040 + }, + { + "epoch": 1.1026795519437733, + "grad_norm": 2.1947715282440186, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7157081961631775, + "num_tokens": 250602863.0, + "step": 10041 + }, + { + "epoch": 1.102789369646387, + "grad_norm": 2.2317793369293213, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7213679552078247, + "num_tokens": 250626087.0, + "step": 10042 + }, + { + "epoch": 1.1028991873490006, + "grad_norm": 2.599226713180542, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7306435108184814, + "num_tokens": 250645878.0, + "step": 10043 + }, + { + "epoch": 1.1030090050516144, + "grad_norm": 2.263329267501831, + "learning_rate": 1e-06, + "loss": 0.7287, + "mean_token_accuracy": 0.7663112878799438, + "num_tokens": 250668041.0, + "step": 10044 + }, + { + "epoch": 1.103118822754228, + "grad_norm": 2.3589484691619873, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7302883863449097, + "num_tokens": 250690806.0, + "step": 10045 + }, + { + "epoch": 1.1032286404568417, + "grad_norm": 2.385329008102417, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7344084978103638, + "num_tokens": 250714829.0, + "step": 10046 + }, + { + "epoch": 1.1033384581594552, + "grad_norm": 2.3725807666778564, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7162925004959106, + "num_tokens": 250739425.0, + "step": 10047 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 2.273900032043457, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7035455107688904, + "num_tokens": 250766564.0, + "step": 10048 + }, + { + "epoch": 1.1035580935646827, + "grad_norm": 2.1897666454315186, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7070352435112, + "num_tokens": 250793989.0, + "step": 10049 + }, + { + "epoch": 1.1036679112672962, + "grad_norm": 2.539726495742798, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7515893578529358, + "num_tokens": 250814621.0, + "step": 10050 + }, + { + "epoch": 1.10377772896991, + "grad_norm": 2.302330732345581, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7319022417068481, + "num_tokens": 250837957.0, + "step": 10051 + }, + { + "epoch": 1.1038875466725235, + "grad_norm": 2.3652522563934326, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7175334692001343, + "num_tokens": 250860893.0, + "step": 10052 + }, + { + "epoch": 1.1039973643751373, + "grad_norm": 2.5754621028900146, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7123026251792908, + "num_tokens": 250880867.0, + "step": 10053 + }, + { + "epoch": 1.104107182077751, + "grad_norm": 2.0166399478912354, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7227520942687988, + "num_tokens": 250911110.0, + "step": 10054 + }, + { + "epoch": 1.1042169997803646, + "grad_norm": 2.474358081817627, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7326184511184692, + "num_tokens": 250933529.0, + "step": 10055 + }, + { + "epoch": 1.1043268174829783, + "grad_norm": 2.1463592052459717, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7183453440666199, + "num_tokens": 250960145.0, + "step": 10056 + }, + { + "epoch": 1.1044366351855919, + "grad_norm": 2.162973403930664, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.701679527759552, + "num_tokens": 250988219.0, + "step": 10057 + }, + { + "epoch": 1.1045464528882056, + "grad_norm": 2.246615409851074, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7104721665382385, + "num_tokens": 251014585.0, + "step": 10058 + }, + { + "epoch": 1.1046562705908192, + "grad_norm": 2.023193120956421, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7083464860916138, + "num_tokens": 251045675.0, + "step": 10059 + }, + { + "epoch": 1.104766088293433, + "grad_norm": 2.329831123352051, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7285565137863159, + "num_tokens": 251067225.0, + "step": 10060 + }, + { + "epoch": 1.1048759059960465, + "grad_norm": 2.4419608116149902, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7263822555541992, + "num_tokens": 251088540.0, + "step": 10061 + }, + { + "epoch": 1.1049857236986602, + "grad_norm": 2.2602267265319824, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.720005452632904, + "num_tokens": 251111978.0, + "step": 10062 + }, + { + "epoch": 1.105095541401274, + "grad_norm": 2.0018930435180664, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7068901062011719, + "num_tokens": 251142605.0, + "step": 10063 + }, + { + "epoch": 1.1052053591038875, + "grad_norm": 2.270078182220459, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.725467324256897, + "num_tokens": 251168300.0, + "step": 10064 + }, + { + "epoch": 1.1053151768065013, + "grad_norm": 2.428049325942993, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7469807863235474, + "num_tokens": 251189216.0, + "step": 10065 + }, + { + "epoch": 1.1054249945091148, + "grad_norm": 2.4032251834869385, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7149790525436401, + "num_tokens": 251211360.0, + "step": 10066 + }, + { + "epoch": 1.1055348122117286, + "grad_norm": 2.343388557434082, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7360783815383911, + "num_tokens": 251232267.0, + "step": 10067 + }, + { + "epoch": 1.105644629914342, + "grad_norm": 1.9855008125305176, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7252674102783203, + "num_tokens": 251264688.0, + "step": 10068 + }, + { + "epoch": 1.1057544476169558, + "grad_norm": 2.1312437057495117, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7057320475578308, + "num_tokens": 251292224.0, + "step": 10069 + }, + { + "epoch": 1.1058642653195696, + "grad_norm": 2.172396183013916, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.724285364151001, + "num_tokens": 251319665.0, + "step": 10070 + }, + { + "epoch": 1.1059740830221831, + "grad_norm": 2.1042134761810303, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7171021103858948, + "num_tokens": 251347961.0, + "step": 10071 + }, + { + "epoch": 1.106083900724797, + "grad_norm": 2.217020034790039, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7174868583679199, + "num_tokens": 251372762.0, + "step": 10072 + }, + { + "epoch": 1.1061937184274104, + "grad_norm": 1.98564875125885, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7299662828445435, + "num_tokens": 251404103.0, + "step": 10073 + }, + { + "epoch": 1.1063035361300242, + "grad_norm": 2.062406063079834, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7268341779708862, + "num_tokens": 251433318.0, + "step": 10074 + }, + { + "epoch": 1.1064133538326377, + "grad_norm": 2.0921571254730225, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7110815048217773, + "num_tokens": 251462180.0, + "step": 10075 + }, + { + "epoch": 1.1065231715352515, + "grad_norm": 2.4770734310150146, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7221879959106445, + "num_tokens": 251482777.0, + "step": 10076 + }, + { + "epoch": 1.1066329892378652, + "grad_norm": 2.2735486030578613, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7150952816009521, + "num_tokens": 251509231.0, + "step": 10077 + }, + { + "epoch": 1.1067428069404788, + "grad_norm": 2.5179386138916016, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7359534502029419, + "num_tokens": 251528874.0, + "step": 10078 + }, + { + "epoch": 1.1068526246430925, + "grad_norm": 2.287912130355835, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7250784635543823, + "num_tokens": 251553820.0, + "step": 10079 + }, + { + "epoch": 1.106962442345706, + "grad_norm": 2.6034462451934814, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7115424275398254, + "num_tokens": 251573679.0, + "step": 10080 + }, + { + "epoch": 1.1070722600483198, + "grad_norm": 2.5022974014282227, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7302088737487793, + "num_tokens": 251594445.0, + "step": 10081 + }, + { + "epoch": 1.1071820777509334, + "grad_norm": 2.567378520965576, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7384189367294312, + "num_tokens": 251615281.0, + "step": 10082 + }, + { + "epoch": 1.1072918954535471, + "grad_norm": 2.2516701221466064, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7222973108291626, + "num_tokens": 251639090.0, + "step": 10083 + }, + { + "epoch": 1.1074017131561609, + "grad_norm": 2.279167890548706, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7384907007217407, + "num_tokens": 251663179.0, + "step": 10084 + }, + { + "epoch": 1.1075115308587744, + "grad_norm": 2.381669044494629, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.744745135307312, + "num_tokens": 251685319.0, + "step": 10085 + }, + { + "epoch": 1.1076213485613882, + "grad_norm": 2.2356605529785156, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7022470235824585, + "num_tokens": 251712397.0, + "step": 10086 + }, + { + "epoch": 1.1077311662640017, + "grad_norm": 2.286876916885376, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7216498255729675, + "num_tokens": 251737115.0, + "step": 10087 + }, + { + "epoch": 1.1078409839666155, + "grad_norm": 2.1512603759765625, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7394957542419434, + "num_tokens": 251762664.0, + "step": 10088 + }, + { + "epoch": 1.107950801669229, + "grad_norm": 2.1788198947906494, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.718776524066925, + "num_tokens": 251787249.0, + "step": 10089 + }, + { + "epoch": 1.1080606193718427, + "grad_norm": 2.5413079261779785, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7236394286155701, + "num_tokens": 251808539.0, + "step": 10090 + }, + { + "epoch": 1.1081704370744565, + "grad_norm": 2.649014949798584, + "learning_rate": 1e-06, + "loss": 0.774, + "mean_token_accuracy": 0.7520302534103394, + "num_tokens": 251826842.0, + "step": 10091 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 2.342714309692383, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7383304238319397, + "num_tokens": 251848618.0, + "step": 10092 + }, + { + "epoch": 1.1083900724796838, + "grad_norm": 2.041825532913208, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.720565140247345, + "num_tokens": 251875766.0, + "step": 10093 + }, + { + "epoch": 1.1084998901822973, + "grad_norm": 2.224947452545166, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.6998710036277771, + "num_tokens": 251902177.0, + "step": 10094 + }, + { + "epoch": 1.108609707884911, + "grad_norm": 2.447760581970215, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7080634832382202, + "num_tokens": 251926579.0, + "step": 10095 + }, + { + "epoch": 1.1087195255875246, + "grad_norm": 2.2881433963775635, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7180871963500977, + "num_tokens": 251951018.0, + "step": 10096 + }, + { + "epoch": 1.1088293432901384, + "grad_norm": 2.4042258262634277, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7350964546203613, + "num_tokens": 251973453.0, + "step": 10097 + }, + { + "epoch": 1.108939160992752, + "grad_norm": 2.2260842323303223, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.7047374248504639, + "num_tokens": 252001128.0, + "step": 10098 + }, + { + "epoch": 1.1090489786953657, + "grad_norm": 2.303565263748169, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7297107577323914, + "num_tokens": 252026029.0, + "step": 10099 + }, + { + "epoch": 1.1091587963979794, + "grad_norm": 2.2634456157684326, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7204747796058655, + "num_tokens": 252050494.0, + "step": 10100 + }, + { + "epoch": 1.109268614100593, + "grad_norm": 2.2278847694396973, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7078350782394409, + "num_tokens": 252075246.0, + "step": 10101 + }, + { + "epoch": 1.1093784318032067, + "grad_norm": 2.0333199501037598, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6869205236434937, + "num_tokens": 252105910.0, + "step": 10102 + }, + { + "epoch": 1.1094882495058203, + "grad_norm": 2.321197032928467, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.714199960231781, + "num_tokens": 252130217.0, + "step": 10103 + }, + { + "epoch": 1.109598067208434, + "grad_norm": 2.3495960235595703, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7465190887451172, + "num_tokens": 252153180.0, + "step": 10104 + }, + { + "epoch": 1.1097078849110478, + "grad_norm": 2.6738076210021973, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7070950865745544, + "num_tokens": 252174481.0, + "step": 10105 + }, + { + "epoch": 1.1098177026136613, + "grad_norm": 2.1890108585357666, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6967233419418335, + "num_tokens": 252200946.0, + "step": 10106 + }, + { + "epoch": 1.109927520316275, + "grad_norm": 2.2518322467803955, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7274330854415894, + "num_tokens": 252224976.0, + "step": 10107 + }, + { + "epoch": 1.1100373380188886, + "grad_norm": 2.3137340545654297, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7226580381393433, + "num_tokens": 252249284.0, + "step": 10108 + }, + { + "epoch": 1.1101471557215024, + "grad_norm": 2.487022638320923, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7083197832107544, + "num_tokens": 252276895.0, + "step": 10109 + }, + { + "epoch": 1.110256973424116, + "grad_norm": 2.206296682357788, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7104049921035767, + "num_tokens": 252305931.0, + "step": 10110 + }, + { + "epoch": 1.1103667911267296, + "grad_norm": 2.690976619720459, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.740936279296875, + "num_tokens": 252323688.0, + "step": 10111 + }, + { + "epoch": 1.1104766088293432, + "grad_norm": 2.345829486846924, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7246742248535156, + "num_tokens": 252348787.0, + "step": 10112 + }, + { + "epoch": 1.110586426531957, + "grad_norm": 2.440812110900879, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7245290279388428, + "num_tokens": 252371336.0, + "step": 10113 + }, + { + "epoch": 1.1106962442345707, + "grad_norm": 2.2615480422973633, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7158429026603699, + "num_tokens": 252397696.0, + "step": 10114 + }, + { + "epoch": 1.1108060619371842, + "grad_norm": 2.448906183242798, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7274165153503418, + "num_tokens": 252420995.0, + "step": 10115 + }, + { + "epoch": 1.110915879639798, + "grad_norm": 2.432159423828125, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7193357944488525, + "num_tokens": 252442899.0, + "step": 10116 + }, + { + "epoch": 1.1110256973424115, + "grad_norm": 2.0807669162750244, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7260539531707764, + "num_tokens": 252472753.0, + "step": 10117 + }, + { + "epoch": 1.1111355150450253, + "grad_norm": 2.2865374088287354, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7383087873458862, + "num_tokens": 252498194.0, + "step": 10118 + }, + { + "epoch": 1.111245332747639, + "grad_norm": 2.1459500789642334, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7290294766426086, + "num_tokens": 252524657.0, + "step": 10119 + }, + { + "epoch": 1.1113551504502526, + "grad_norm": 2.116063356399536, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7398555278778076, + "num_tokens": 252553337.0, + "step": 10120 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 2.0694735050201416, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7269927263259888, + "num_tokens": 252580876.0, + "step": 10121 + }, + { + "epoch": 1.1115747858554799, + "grad_norm": 2.394982099533081, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7019915580749512, + "num_tokens": 252604960.0, + "step": 10122 + }, + { + "epoch": 1.1116846035580936, + "grad_norm": 2.489091634750366, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7427679896354675, + "num_tokens": 252625200.0, + "step": 10123 + }, + { + "epoch": 1.1117944212607072, + "grad_norm": 2.2041773796081543, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7003073692321777, + "num_tokens": 252652837.0, + "step": 10124 + }, + { + "epoch": 1.111904238963321, + "grad_norm": 1.8934777975082397, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7180517315864563, + "num_tokens": 252686117.0, + "step": 10125 + }, + { + "epoch": 1.1120140566659344, + "grad_norm": 2.022951602935791, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7228648066520691, + "num_tokens": 252714592.0, + "step": 10126 + }, + { + "epoch": 1.1121238743685482, + "grad_norm": 2.22475266456604, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7322880029678345, + "num_tokens": 252739487.0, + "step": 10127 + }, + { + "epoch": 1.112233692071162, + "grad_norm": 2.3719284534454346, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7253268957138062, + "num_tokens": 252763760.0, + "step": 10128 + }, + { + "epoch": 1.1123435097737755, + "grad_norm": 2.264096736907959, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6988356113433838, + "num_tokens": 252793350.0, + "step": 10129 + }, + { + "epoch": 1.1124533274763893, + "grad_norm": 2.22884464263916, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.73759925365448, + "num_tokens": 252817318.0, + "step": 10130 + }, + { + "epoch": 1.1125631451790028, + "grad_norm": 2.3336122035980225, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.717801570892334, + "num_tokens": 252841383.0, + "step": 10131 + }, + { + "epoch": 1.1126729628816165, + "grad_norm": 2.489105224609375, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7275301218032837, + "num_tokens": 252862491.0, + "step": 10132 + }, + { + "epoch": 1.11278278058423, + "grad_norm": 2.638700246810913, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.723264217376709, + "num_tokens": 252880810.0, + "step": 10133 + }, + { + "epoch": 1.1128925982868438, + "grad_norm": 2.421553611755371, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7131714820861816, + "num_tokens": 252903689.0, + "step": 10134 + }, + { + "epoch": 1.1130024159894576, + "grad_norm": 2.364697217941284, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7345056533813477, + "num_tokens": 252927167.0, + "step": 10135 + }, + { + "epoch": 1.1131122336920711, + "grad_norm": 2.5126702785491943, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7169439792633057, + "num_tokens": 252948991.0, + "step": 10136 + }, + { + "epoch": 1.1132220513946849, + "grad_norm": 2.514662504196167, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7244142889976501, + "num_tokens": 252970397.0, + "step": 10137 + }, + { + "epoch": 1.1133318690972984, + "grad_norm": 2.2364115715026855, + "learning_rate": 1e-06, + "loss": 1.0431, + "mean_token_accuracy": 0.6949831247329712, + "num_tokens": 252995026.0, + "step": 10138 + }, + { + "epoch": 1.1134416867999122, + "grad_norm": 2.5354769229888916, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7254062294960022, + "num_tokens": 253017720.0, + "step": 10139 + }, + { + "epoch": 1.1135515045025257, + "grad_norm": 2.178682804107666, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.738753080368042, + "num_tokens": 253044541.0, + "step": 10140 + }, + { + "epoch": 1.1136613222051395, + "grad_norm": 2.7158517837524414, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7306059002876282, + "num_tokens": 253065355.0, + "step": 10141 + }, + { + "epoch": 1.1137711399077532, + "grad_norm": 2.5374929904937744, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.734549880027771, + "num_tokens": 253086682.0, + "step": 10142 + }, + { + "epoch": 1.1138809576103668, + "grad_norm": 2.6112146377563477, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7194270491600037, + "num_tokens": 253108345.0, + "step": 10143 + }, + { + "epoch": 1.1139907753129805, + "grad_norm": 2.3884990215301514, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7322866320610046, + "num_tokens": 253131178.0, + "step": 10144 + }, + { + "epoch": 1.114100593015594, + "grad_norm": 2.7268292903900146, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7480316162109375, + "num_tokens": 253148263.0, + "step": 10145 + }, + { + "epoch": 1.1142104107182078, + "grad_norm": 2.167170763015747, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7105490565299988, + "num_tokens": 253176916.0, + "step": 10146 + }, + { + "epoch": 1.1143202284208213, + "grad_norm": 2.1807901859283447, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7154284119606018, + "num_tokens": 253200944.0, + "step": 10147 + }, + { + "epoch": 1.114430046123435, + "grad_norm": 2.2273471355438232, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7264808416366577, + "num_tokens": 253226315.0, + "step": 10148 + }, + { + "epoch": 1.1145398638260489, + "grad_norm": 2.2440693378448486, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7189604640007019, + "num_tokens": 253251476.0, + "step": 10149 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 2.2856669425964355, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7375246286392212, + "num_tokens": 253274650.0, + "step": 10150 + }, + { + "epoch": 1.1147594992312762, + "grad_norm": 2.3853678703308105, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7053865194320679, + "num_tokens": 253298576.0, + "step": 10151 + }, + { + "epoch": 1.1148693169338897, + "grad_norm": 2.264035701751709, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7180421948432922, + "num_tokens": 253326094.0, + "step": 10152 + }, + { + "epoch": 1.1149791346365034, + "grad_norm": 2.4738717079162598, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7223732471466064, + "num_tokens": 253349100.0, + "step": 10153 + }, + { + "epoch": 1.115088952339117, + "grad_norm": 2.179145097732544, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.689838171005249, + "num_tokens": 253378038.0, + "step": 10154 + }, + { + "epoch": 1.1151987700417307, + "grad_norm": 2.3196022510528564, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7190297842025757, + "num_tokens": 253402579.0, + "step": 10155 + }, + { + "epoch": 1.1153085877443445, + "grad_norm": 2.293473243713379, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7076524496078491, + "num_tokens": 253428316.0, + "step": 10156 + }, + { + "epoch": 1.115418405446958, + "grad_norm": 2.471766710281372, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7267564535140991, + "num_tokens": 253449675.0, + "step": 10157 + }, + { + "epoch": 1.1155282231495718, + "grad_norm": 2.3773128986358643, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7347893118858337, + "num_tokens": 253472002.0, + "step": 10158 + }, + { + "epoch": 1.1156380408521853, + "grad_norm": 2.350191593170166, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7186381816864014, + "num_tokens": 253494875.0, + "step": 10159 + }, + { + "epoch": 1.115747858554799, + "grad_norm": 2.0485243797302246, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7110214829444885, + "num_tokens": 253522953.0, + "step": 10160 + }, + { + "epoch": 1.1158576762574126, + "grad_norm": 2.5905020236968994, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7150276899337769, + "num_tokens": 253543994.0, + "step": 10161 + }, + { + "epoch": 1.1159674939600264, + "grad_norm": 2.5105555057525635, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7223772406578064, + "num_tokens": 253565809.0, + "step": 10162 + }, + { + "epoch": 1.11607731166264, + "grad_norm": 2.2070107460021973, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7019537091255188, + "num_tokens": 253593151.0, + "step": 10163 + }, + { + "epoch": 1.1161871293652537, + "grad_norm": 2.0036346912384033, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7091764807701111, + "num_tokens": 253623492.0, + "step": 10164 + }, + { + "epoch": 1.1162969470678674, + "grad_norm": 2.3151047229766846, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7190198302268982, + "num_tokens": 253647637.0, + "step": 10165 + }, + { + "epoch": 1.116406764770481, + "grad_norm": 2.18705153465271, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.713442862033844, + "num_tokens": 253675122.0, + "step": 10166 + }, + { + "epoch": 1.1165165824730947, + "grad_norm": 2.378448009490967, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.70793616771698, + "num_tokens": 253696265.0, + "step": 10167 + }, + { + "epoch": 1.1166264001757082, + "grad_norm": 2.4084312915802, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7178634405136108, + "num_tokens": 253719962.0, + "step": 10168 + }, + { + "epoch": 1.116736217878322, + "grad_norm": 2.303374767303467, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6865040063858032, + "num_tokens": 253747492.0, + "step": 10169 + }, + { + "epoch": 1.1168460355809358, + "grad_norm": 2.2529680728912354, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7295942902565002, + "num_tokens": 253774752.0, + "step": 10170 + }, + { + "epoch": 1.1169558532835493, + "grad_norm": 2.20481014251709, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.708433985710144, + "num_tokens": 253802278.0, + "step": 10171 + }, + { + "epoch": 1.117065670986163, + "grad_norm": 2.6441664695739746, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7402970194816589, + "num_tokens": 253822181.0, + "step": 10172 + }, + { + "epoch": 1.1171754886887766, + "grad_norm": 2.557063341140747, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7178351283073425, + "num_tokens": 253844653.0, + "step": 10173 + }, + { + "epoch": 1.1172853063913903, + "grad_norm": 2.4962971210479736, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6961839199066162, + "num_tokens": 253868150.0, + "step": 10174 + }, + { + "epoch": 1.1173951240940039, + "grad_norm": 2.2921667098999023, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7129035592079163, + "num_tokens": 253894936.0, + "step": 10175 + }, + { + "epoch": 1.1175049417966176, + "grad_norm": 2.1284143924713135, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7315003275871277, + "num_tokens": 253921979.0, + "step": 10176 + }, + { + "epoch": 1.1176147594992312, + "grad_norm": 2.540574789047241, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7460083961486816, + "num_tokens": 253943404.0, + "step": 10177 + }, + { + "epoch": 1.117724577201845, + "grad_norm": 2.4776370525360107, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7317413091659546, + "num_tokens": 253965800.0, + "step": 10178 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 2.3781495094299316, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7397351264953613, + "num_tokens": 253990177.0, + "step": 10179 + }, + { + "epoch": 1.1179442126070722, + "grad_norm": 2.182049036026001, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7083556056022644, + "num_tokens": 254018199.0, + "step": 10180 + }, + { + "epoch": 1.118054030309686, + "grad_norm": 2.3909285068511963, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7102886438369751, + "num_tokens": 254044205.0, + "step": 10181 + }, + { + "epoch": 1.1181638480122995, + "grad_norm": 2.447622537612915, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7131352424621582, + "num_tokens": 254066625.0, + "step": 10182 + }, + { + "epoch": 1.1182736657149133, + "grad_norm": 2.084465265274048, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7298108339309692, + "num_tokens": 254095504.0, + "step": 10183 + }, + { + "epoch": 1.1183834834175268, + "grad_norm": 2.6543853282928467, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7204492092132568, + "num_tokens": 254115072.0, + "step": 10184 + }, + { + "epoch": 1.1184933011201406, + "grad_norm": 2.3080618381500244, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.719738245010376, + "num_tokens": 254138294.0, + "step": 10185 + }, + { + "epoch": 1.1186031188227543, + "grad_norm": 2.391378402709961, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7162370681762695, + "num_tokens": 254162449.0, + "step": 10186 + }, + { + "epoch": 1.1187129365253679, + "grad_norm": 2.3965988159179688, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6900729537010193, + "num_tokens": 254187824.0, + "step": 10187 + }, + { + "epoch": 1.1188227542279816, + "grad_norm": 2.587773323059082, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7325685620307922, + "num_tokens": 254208223.0, + "step": 10188 + }, + { + "epoch": 1.1189325719305951, + "grad_norm": 2.1137757301330566, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7025419473648071, + "num_tokens": 254235441.0, + "step": 10189 + }, + { + "epoch": 1.119042389633209, + "grad_norm": 2.3378515243530273, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7131702899932861, + "num_tokens": 254258883.0, + "step": 10190 + }, + { + "epoch": 1.1191522073358224, + "grad_norm": 2.204298734664917, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7233595252037048, + "num_tokens": 254283395.0, + "step": 10191 + }, + { + "epoch": 1.1192620250384362, + "grad_norm": 2.5249226093292236, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.724994421005249, + "num_tokens": 254303127.0, + "step": 10192 + }, + { + "epoch": 1.11937184274105, + "grad_norm": 2.443834066390991, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7395523190498352, + "num_tokens": 254325331.0, + "step": 10193 + }, + { + "epoch": 1.1194816604436635, + "grad_norm": 2.4036548137664795, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7200942039489746, + "num_tokens": 254347421.0, + "step": 10194 + }, + { + "epoch": 1.1195914781462772, + "grad_norm": 2.1653378009796143, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.6985562443733215, + "num_tokens": 254376116.0, + "step": 10195 + }, + { + "epoch": 1.1197012958488908, + "grad_norm": 2.409497022628784, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7170897126197815, + "num_tokens": 254398411.0, + "step": 10196 + }, + { + "epoch": 1.1198111135515045, + "grad_norm": 2.463390827178955, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7320913672447205, + "num_tokens": 254422155.0, + "step": 10197 + }, + { + "epoch": 1.119920931254118, + "grad_norm": 2.216818332672119, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.726858377456665, + "num_tokens": 254448531.0, + "step": 10198 + }, + { + "epoch": 1.1200307489567318, + "grad_norm": 2.173426628112793, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.703982949256897, + "num_tokens": 254477153.0, + "step": 10199 + }, + { + "epoch": 1.1201405666593456, + "grad_norm": 2.4876413345336914, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7412316799163818, + "num_tokens": 254497749.0, + "step": 10200 + }, + { + "epoch": 1.1202503843619591, + "grad_norm": 2.007129669189453, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6828334331512451, + "num_tokens": 254530954.0, + "step": 10201 + }, + { + "epoch": 1.1203602020645729, + "grad_norm": 2.232396125793457, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7079781889915466, + "num_tokens": 254558281.0, + "step": 10202 + }, + { + "epoch": 1.1204700197671864, + "grad_norm": 2.371084690093994, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7097064256668091, + "num_tokens": 254585786.0, + "step": 10203 + }, + { + "epoch": 1.1205798374698002, + "grad_norm": 2.3375415802001953, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7247378826141357, + "num_tokens": 254609635.0, + "step": 10204 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 2.139702558517456, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6994094252586365, + "num_tokens": 254638887.0, + "step": 10205 + }, + { + "epoch": 1.1207994728750275, + "grad_norm": 2.1210501194000244, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7086232304573059, + "num_tokens": 254667982.0, + "step": 10206 + }, + { + "epoch": 1.1209092905776412, + "grad_norm": 2.6426944732666016, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7467353343963623, + "num_tokens": 254687603.0, + "step": 10207 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 2.379262924194336, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7026044130325317, + "num_tokens": 254711545.0, + "step": 10208 + }, + { + "epoch": 1.1211289259828685, + "grad_norm": 2.2147750854492188, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7102499008178711, + "num_tokens": 254739407.0, + "step": 10209 + }, + { + "epoch": 1.121238743685482, + "grad_norm": 2.4785714149475098, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.718205451965332, + "num_tokens": 254761192.0, + "step": 10210 + }, + { + "epoch": 1.1213485613880958, + "grad_norm": 2.228402614593506, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7221411466598511, + "num_tokens": 254786684.0, + "step": 10211 + }, + { + "epoch": 1.1214583790907093, + "grad_norm": 2.1724019050598145, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7145904302597046, + "num_tokens": 254813746.0, + "step": 10212 + }, + { + "epoch": 1.121568196793323, + "grad_norm": 2.3145289421081543, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.730507493019104, + "num_tokens": 254836021.0, + "step": 10213 + }, + { + "epoch": 1.1216780144959368, + "grad_norm": 2.044528007507324, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7103169560432434, + "num_tokens": 254865688.0, + "step": 10214 + }, + { + "epoch": 1.1217878321985504, + "grad_norm": 2.4590883255004883, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7200151085853577, + "num_tokens": 254887483.0, + "step": 10215 + }, + { + "epoch": 1.1218976499011641, + "grad_norm": 2.2216925621032715, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7011682987213135, + "num_tokens": 254912643.0, + "step": 10216 + }, + { + "epoch": 1.1220074676037777, + "grad_norm": 2.5787506103515625, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7466735243797302, + "num_tokens": 254931120.0, + "step": 10217 + }, + { + "epoch": 1.1221172853063914, + "grad_norm": 2.1926424503326416, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6929070353507996, + "num_tokens": 254961860.0, + "step": 10218 + }, + { + "epoch": 1.122227103009005, + "grad_norm": 2.362041711807251, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7397005558013916, + "num_tokens": 254984434.0, + "step": 10219 + }, + { + "epoch": 1.1223369207116187, + "grad_norm": 2.14255952835083, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.70536869764328, + "num_tokens": 255010409.0, + "step": 10220 + }, + { + "epoch": 1.1224467384142325, + "grad_norm": 2.137535572052002, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7142016887664795, + "num_tokens": 255036632.0, + "step": 10221 + }, + { + "epoch": 1.122556556116846, + "grad_norm": 2.1053073406219482, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6799334287643433, + "num_tokens": 255067868.0, + "step": 10222 + }, + { + "epoch": 1.1226663738194598, + "grad_norm": 1.80448317527771, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7171622514724731, + "num_tokens": 255101825.0, + "step": 10223 + }, + { + "epoch": 1.1227761915220733, + "grad_norm": 2.2224204540252686, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7014489769935608, + "num_tokens": 255126573.0, + "step": 10224 + }, + { + "epoch": 1.122886009224687, + "grad_norm": 2.283858060836792, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.706257700920105, + "num_tokens": 255153233.0, + "step": 10225 + }, + { + "epoch": 1.1229958269273006, + "grad_norm": 2.1792237758636475, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.6984602808952332, + "num_tokens": 255184098.0, + "step": 10226 + }, + { + "epoch": 1.1231056446299144, + "grad_norm": 2.662705659866333, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7341800928115845, + "num_tokens": 255202248.0, + "step": 10227 + }, + { + "epoch": 1.123215462332528, + "grad_norm": 2.192171812057495, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.726831316947937, + "num_tokens": 255229266.0, + "step": 10228 + }, + { + "epoch": 1.1233252800351416, + "grad_norm": 2.152787446975708, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7182055115699768, + "num_tokens": 255256600.0, + "step": 10229 + }, + { + "epoch": 1.1234350977377554, + "grad_norm": 2.266355514526367, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7239445447921753, + "num_tokens": 255281878.0, + "step": 10230 + }, + { + "epoch": 1.123544915440369, + "grad_norm": 2.170729875564575, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6971737146377563, + "num_tokens": 255311275.0, + "step": 10231 + }, + { + "epoch": 1.1236547331429827, + "grad_norm": 2.6947455406188965, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7219451665878296, + "num_tokens": 255331564.0, + "step": 10232 + }, + { + "epoch": 1.1237645508455962, + "grad_norm": 2.1931066513061523, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7156580090522766, + "num_tokens": 255357091.0, + "step": 10233 + }, + { + "epoch": 1.12387436854821, + "grad_norm": 2.267589807510376, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7198113799095154, + "num_tokens": 255381469.0, + "step": 10234 + }, + { + "epoch": 1.1239841862508237, + "grad_norm": 2.5334150791168213, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7308133840560913, + "num_tokens": 255402219.0, + "step": 10235 + }, + { + "epoch": 1.1240940039534373, + "grad_norm": 2.4622890949249268, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7376478910446167, + "num_tokens": 255422851.0, + "step": 10236 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 2.332383632659912, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7335025668144226, + "num_tokens": 255445996.0, + "step": 10237 + }, + { + "epoch": 1.1243136393586646, + "grad_norm": 2.370198965072632, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7156480550765991, + "num_tokens": 255469118.0, + "step": 10238 + }, + { + "epoch": 1.1244234570612783, + "grad_norm": 2.325317144393921, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6938230395317078, + "num_tokens": 255493768.0, + "step": 10239 + }, + { + "epoch": 1.1245332747638919, + "grad_norm": 2.1245028972625732, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7038059830665588, + "num_tokens": 255523613.0, + "step": 10240 + }, + { + "epoch": 1.1246430924665056, + "grad_norm": 2.0933477878570557, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7153868675231934, + "num_tokens": 255552137.0, + "step": 10241 + }, + { + "epoch": 1.1247529101691192, + "grad_norm": 2.223701000213623, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7231296300888062, + "num_tokens": 255580228.0, + "step": 10242 + }, + { + "epoch": 1.124862727871733, + "grad_norm": 2.263460636138916, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7290130853652954, + "num_tokens": 255605967.0, + "step": 10243 + }, + { + "epoch": 1.1249725455743467, + "grad_norm": 2.231855869293213, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7240386009216309, + "num_tokens": 255630848.0, + "step": 10244 + }, + { + "epoch": 1.1250823632769602, + "grad_norm": 2.4247703552246094, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7108262777328491, + "num_tokens": 255654174.0, + "step": 10245 + }, + { + "epoch": 1.125192180979574, + "grad_norm": 2.2140822410583496, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6860313415527344, + "num_tokens": 255684213.0, + "step": 10246 + }, + { + "epoch": 1.1253019986821875, + "grad_norm": 2.4740655422210693, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7263993620872498, + "num_tokens": 255706076.0, + "step": 10247 + }, + { + "epoch": 1.1254118163848013, + "grad_norm": 2.184021234512329, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7172864079475403, + "num_tokens": 255732259.0, + "step": 10248 + }, + { + "epoch": 1.125521634087415, + "grad_norm": 2.345433473587036, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7188414335250854, + "num_tokens": 255758107.0, + "step": 10249 + }, + { + "epoch": 1.1256314517900285, + "grad_norm": 2.685075044631958, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7440795302391052, + "num_tokens": 255776260.0, + "step": 10250 + }, + { + "epoch": 1.1257412694926423, + "grad_norm": 2.2633931636810303, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7008641958236694, + "num_tokens": 255802609.0, + "step": 10251 + }, + { + "epoch": 1.1258510871952558, + "grad_norm": 2.2200281620025635, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7152715921401978, + "num_tokens": 255829620.0, + "step": 10252 + }, + { + "epoch": 1.1259609048978696, + "grad_norm": 2.1378467082977295, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7300838232040405, + "num_tokens": 255856466.0, + "step": 10253 + }, + { + "epoch": 1.1260707226004831, + "grad_norm": 2.4160499572753906, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7305806875228882, + "num_tokens": 255878740.0, + "step": 10254 + }, + { + "epoch": 1.1261805403030969, + "grad_norm": 2.1496667861938477, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7098103165626526, + "num_tokens": 255908063.0, + "step": 10255 + }, + { + "epoch": 1.1262903580057104, + "grad_norm": 2.521256685256958, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7397174835205078, + "num_tokens": 255929233.0, + "step": 10256 + }, + { + "epoch": 1.1264001757083242, + "grad_norm": 2.20902156829834, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7127367258071899, + "num_tokens": 255955762.0, + "step": 10257 + }, + { + "epoch": 1.126509993410938, + "grad_norm": 2.220790147781372, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7138423323631287, + "num_tokens": 255982077.0, + "step": 10258 + }, + { + "epoch": 1.1266198111135515, + "grad_norm": 2.2466421127319336, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6928737163543701, + "num_tokens": 256010309.0, + "step": 10259 + }, + { + "epoch": 1.1267296288161652, + "grad_norm": 2.2593648433685303, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7105517983436584, + "num_tokens": 256035832.0, + "step": 10260 + }, + { + "epoch": 1.1268394465187788, + "grad_norm": 2.183732271194458, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.714266300201416, + "num_tokens": 256063228.0, + "step": 10261 + }, + { + "epoch": 1.1269492642213925, + "grad_norm": 2.4110000133514404, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7422311305999756, + "num_tokens": 256085244.0, + "step": 10262 + }, + { + "epoch": 1.127059081924006, + "grad_norm": 2.60935640335083, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7274344563484192, + "num_tokens": 256104933.0, + "step": 10263 + }, + { + "epoch": 1.1271688996266198, + "grad_norm": 2.481437921524048, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7234724760055542, + "num_tokens": 256127096.0, + "step": 10264 + }, + { + "epoch": 1.1272787173292333, + "grad_norm": 2.26172137260437, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7085457444190979, + "num_tokens": 256152977.0, + "step": 10265 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 2.1074094772338867, + "learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7507023215293884, + "num_tokens": 256177750.0, + "step": 10266 + }, + { + "epoch": 1.1274983527344609, + "grad_norm": 2.2684309482574463, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.711475133895874, + "num_tokens": 256204293.0, + "step": 10267 + }, + { + "epoch": 1.1276081704370744, + "grad_norm": 2.537644386291504, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7338513135910034, + "num_tokens": 256224658.0, + "step": 10268 + }, + { + "epoch": 1.1277179881396882, + "grad_norm": 2.638387680053711, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.710369884967804, + "num_tokens": 256243571.0, + "step": 10269 + }, + { + "epoch": 1.1278278058423017, + "grad_norm": 2.428821086883545, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.722780704498291, + "num_tokens": 256266750.0, + "step": 10270 + }, + { + "epoch": 1.1279376235449154, + "grad_norm": 2.215895175933838, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7058225870132446, + "num_tokens": 256292243.0, + "step": 10271 + }, + { + "epoch": 1.1280474412475292, + "grad_norm": 2.1506505012512207, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7158704400062561, + "num_tokens": 256322513.0, + "step": 10272 + }, + { + "epoch": 1.1281572589501427, + "grad_norm": 2.2689969539642334, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.716465950012207, + "num_tokens": 256346859.0, + "step": 10273 + }, + { + "epoch": 1.1282670766527565, + "grad_norm": 2.5239410400390625, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7127258777618408, + "num_tokens": 256369233.0, + "step": 10274 + }, + { + "epoch": 1.12837689435537, + "grad_norm": 2.247588872909546, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7237640023231506, + "num_tokens": 256394162.0, + "step": 10275 + }, + { + "epoch": 1.1284867120579838, + "grad_norm": 2.1522631645202637, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7255092263221741, + "num_tokens": 256419856.0, + "step": 10276 + }, + { + "epoch": 1.1285965297605973, + "grad_norm": 2.6892261505126953, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.730457067489624, + "num_tokens": 256438053.0, + "step": 10277 + }, + { + "epoch": 1.128706347463211, + "grad_norm": 2.565247058868408, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7064008712768555, + "num_tokens": 256459555.0, + "step": 10278 + }, + { + "epoch": 1.1288161651658246, + "grad_norm": 2.1354196071624756, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6973189115524292, + "num_tokens": 256488361.0, + "step": 10279 + }, + { + "epoch": 1.1289259828684384, + "grad_norm": 2.4433443546295166, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7200635671615601, + "num_tokens": 256512161.0, + "step": 10280 + }, + { + "epoch": 1.1290358005710521, + "grad_norm": 2.07908034324646, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7161138653755188, + "num_tokens": 256540297.0, + "step": 10281 + }, + { + "epoch": 1.1291456182736657, + "grad_norm": 2.5196948051452637, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7207332849502563, + "num_tokens": 256562132.0, + "step": 10282 + }, + { + "epoch": 1.1292554359762794, + "grad_norm": 2.069577693939209, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7161626219749451, + "num_tokens": 256592463.0, + "step": 10283 + }, + { + "epoch": 1.129365253678893, + "grad_norm": 2.2011022567749023, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7000600099563599, + "num_tokens": 256621174.0, + "step": 10284 + }, + { + "epoch": 1.1294750713815067, + "grad_norm": 2.024045467376709, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7262090444564819, + "num_tokens": 256652818.0, + "step": 10285 + }, + { + "epoch": 1.1295848890841205, + "grad_norm": 2.374715805053711, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7183109521865845, + "num_tokens": 256677224.0, + "step": 10286 + }, + { + "epoch": 1.129694706786734, + "grad_norm": 2.4628665447235107, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7381901741027832, + "num_tokens": 256698126.0, + "step": 10287 + }, + { + "epoch": 1.1298045244893478, + "grad_norm": 2.575308084487915, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7523974180221558, + "num_tokens": 256717250.0, + "step": 10288 + }, + { + "epoch": 1.1299143421919613, + "grad_norm": 2.3168468475341797, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7301839590072632, + "num_tokens": 256741403.0, + "step": 10289 + }, + { + "epoch": 1.130024159894575, + "grad_norm": 2.0538527965545654, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.6853796243667603, + "num_tokens": 256770927.0, + "step": 10290 + }, + { + "epoch": 1.1301339775971886, + "grad_norm": 2.3747284412384033, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7187818288803101, + "num_tokens": 256793354.0, + "step": 10291 + }, + { + "epoch": 1.1302437952998023, + "grad_norm": 2.0884130001068115, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7204413414001465, + "num_tokens": 256822245.0, + "step": 10292 + }, + { + "epoch": 1.1303536130024159, + "grad_norm": 2.117371082305908, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7159667015075684, + "num_tokens": 256849630.0, + "step": 10293 + }, + { + "epoch": 1.1304634307050296, + "grad_norm": 2.6751818656921387, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7076711654663086, + "num_tokens": 256870078.0, + "step": 10294 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 2.5021440982818604, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7276752591133118, + "num_tokens": 256891718.0, + "step": 10295 + }, + { + "epoch": 1.130683066110257, + "grad_norm": 2.0549590587615967, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7105519771575928, + "num_tokens": 256922031.0, + "step": 10296 + }, + { + "epoch": 1.1307928838128707, + "grad_norm": 2.3930108547210693, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7399132251739502, + "num_tokens": 256943780.0, + "step": 10297 + }, + { + "epoch": 1.1309027015154842, + "grad_norm": 2.4258408546447754, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7082733511924744, + "num_tokens": 256965243.0, + "step": 10298 + }, + { + "epoch": 1.131012519218098, + "grad_norm": 2.3005430698394775, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7138264179229736, + "num_tokens": 256990385.0, + "step": 10299 + }, + { + "epoch": 1.1311223369207117, + "grad_norm": 2.359182834625244, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7023597955703735, + "num_tokens": 257013985.0, + "step": 10300 + }, + { + "epoch": 1.1312321546233253, + "grad_norm": 2.3218936920166016, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7044737935066223, + "num_tokens": 257039302.0, + "step": 10301 + }, + { + "epoch": 1.131341972325939, + "grad_norm": 2.250087261199951, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7187991142272949, + "num_tokens": 257063924.0, + "step": 10302 + }, + { + "epoch": 1.1314517900285526, + "grad_norm": 2.318619966506958, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7042433023452759, + "num_tokens": 257090632.0, + "step": 10303 + }, + { + "epoch": 1.1315616077311663, + "grad_norm": 2.6324660778045654, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7237464785575867, + "num_tokens": 257111668.0, + "step": 10304 + }, + { + "epoch": 1.1316714254337799, + "grad_norm": 2.3784165382385254, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.72953861951828, + "num_tokens": 257134573.0, + "step": 10305 + }, + { + "epoch": 1.1317812431363936, + "grad_norm": 1.9995776414871216, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7243543267250061, + "num_tokens": 257166093.0, + "step": 10306 + }, + { + "epoch": 1.1318910608390071, + "grad_norm": 2.151278018951416, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7093065977096558, + "num_tokens": 257194232.0, + "step": 10307 + }, + { + "epoch": 1.132000878541621, + "grad_norm": 2.621065378189087, + "learning_rate": 1e-06, + "loss": 0.7877, + "mean_token_accuracy": 0.7439234852790833, + "num_tokens": 257213123.0, + "step": 10308 + }, + { + "epoch": 1.1321106962442347, + "grad_norm": 2.15364933013916, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7067067623138428, + "num_tokens": 257240448.0, + "step": 10309 + }, + { + "epoch": 1.1322205139468482, + "grad_norm": 2.1253771781921387, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7389461994171143, + "num_tokens": 257265808.0, + "step": 10310 + }, + { + "epoch": 1.132330331649462, + "grad_norm": 2.1679790019989014, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7117562890052795, + "num_tokens": 257294479.0, + "step": 10311 + }, + { + "epoch": 1.1324401493520755, + "grad_norm": 2.5046305656433105, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7257512807846069, + "num_tokens": 257316912.0, + "step": 10312 + }, + { + "epoch": 1.1325499670546892, + "grad_norm": 2.114880084991455, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.704862117767334, + "num_tokens": 257346107.0, + "step": 10313 + }, + { + "epoch": 1.132659784757303, + "grad_norm": 2.772814989089966, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7199296951293945, + "num_tokens": 257364766.0, + "step": 10314 + }, + { + "epoch": 1.1327696024599165, + "grad_norm": 2.762908458709717, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7097740173339844, + "num_tokens": 257384385.0, + "step": 10315 + }, + { + "epoch": 1.1328794201625303, + "grad_norm": 2.2137460708618164, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7202185392379761, + "num_tokens": 257411026.0, + "step": 10316 + }, + { + "epoch": 1.1329892378651438, + "grad_norm": 2.293691635131836, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7128148078918457, + "num_tokens": 257435432.0, + "step": 10317 + }, + { + "epoch": 1.1330990555677576, + "grad_norm": 2.028527021408081, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7074160575866699, + "num_tokens": 257464821.0, + "step": 10318 + }, + { + "epoch": 1.1332088732703711, + "grad_norm": 2.2420578002929688, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7126609086990356, + "num_tokens": 257491297.0, + "step": 10319 + }, + { + "epoch": 1.1333186909729849, + "grad_norm": 2.182706594467163, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7367759943008423, + "num_tokens": 257516804.0, + "step": 10320 + }, + { + "epoch": 1.1334285086755984, + "grad_norm": 2.358241319656372, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7088828086853027, + "num_tokens": 257543794.0, + "step": 10321 + }, + { + "epoch": 1.1335383263782122, + "grad_norm": 2.7227213382720947, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.728357195854187, + "num_tokens": 257561386.0, + "step": 10322 + }, + { + "epoch": 1.133648144080826, + "grad_norm": 2.006474256515503, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7124581336975098, + "num_tokens": 257591892.0, + "step": 10323 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 2.545072555541992, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.710787832736969, + "num_tokens": 257614359.0, + "step": 10324 + }, + { + "epoch": 1.1338677794860532, + "grad_norm": 2.185091495513916, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6991473436355591, + "num_tokens": 257646276.0, + "step": 10325 + }, + { + "epoch": 1.1339775971886668, + "grad_norm": 2.357776165008545, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7007271647453308, + "num_tokens": 257672446.0, + "step": 10326 + }, + { + "epoch": 1.1340874148912805, + "grad_norm": 2.429208755493164, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7105039954185486, + "num_tokens": 257696796.0, + "step": 10327 + }, + { + "epoch": 1.134197232593894, + "grad_norm": 2.4135892391204834, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7228013277053833, + "num_tokens": 257719253.0, + "step": 10328 + }, + { + "epoch": 1.1343070502965078, + "grad_norm": 2.1848666667938232, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.700763463973999, + "num_tokens": 257748194.0, + "step": 10329 + }, + { + "epoch": 1.1344168679991213, + "grad_norm": 2.550440788269043, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7263702154159546, + "num_tokens": 257768901.0, + "step": 10330 + }, + { + "epoch": 1.134526685701735, + "grad_norm": 2.2731571197509766, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7244259119033813, + "num_tokens": 257793766.0, + "step": 10331 + }, + { + "epoch": 1.1346365034043489, + "grad_norm": 2.3795549869537354, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7169649600982666, + "num_tokens": 257817074.0, + "step": 10332 + }, + { + "epoch": 1.1347463211069624, + "grad_norm": 2.3787710666656494, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7355760335922241, + "num_tokens": 257838734.0, + "step": 10333 + }, + { + "epoch": 1.1348561388095761, + "grad_norm": 2.323286533355713, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7029053568840027, + "num_tokens": 257865432.0, + "step": 10334 + }, + { + "epoch": 1.1349659565121897, + "grad_norm": 2.1813602447509766, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.6988643407821655, + "num_tokens": 257896408.0, + "step": 10335 + }, + { + "epoch": 1.1350757742148034, + "grad_norm": 2.244202136993408, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7117896676063538, + "num_tokens": 257922557.0, + "step": 10336 + }, + { + "epoch": 1.1351855919174172, + "grad_norm": 2.101623296737671, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7352582216262817, + "num_tokens": 257950047.0, + "step": 10337 + }, + { + "epoch": 1.1352954096200307, + "grad_norm": 2.1563806533813477, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7180185317993164, + "num_tokens": 257977546.0, + "step": 10338 + }, + { + "epoch": 1.1354052273226445, + "grad_norm": 2.6106650829315186, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7359788417816162, + "num_tokens": 257996913.0, + "step": 10339 + }, + { + "epoch": 1.135515045025258, + "grad_norm": 2.2664337158203125, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7028633952140808, + "num_tokens": 258022575.0, + "step": 10340 + }, + { + "epoch": 1.1356248627278718, + "grad_norm": 2.0961813926696777, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6947855949401855, + "num_tokens": 258052096.0, + "step": 10341 + }, + { + "epoch": 1.1357346804304853, + "grad_norm": 2.713122844696045, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7315152883529663, + "num_tokens": 258072240.0, + "step": 10342 + }, + { + "epoch": 1.135844498133099, + "grad_norm": 2.0356764793395996, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7162995338439941, + "num_tokens": 258102752.0, + "step": 10343 + }, + { + "epoch": 1.1359543158357126, + "grad_norm": 2.6152586936950684, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7044550180435181, + "num_tokens": 258127422.0, + "step": 10344 + }, + { + "epoch": 1.1360641335383264, + "grad_norm": 2.222682237625122, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7190554738044739, + "num_tokens": 258153222.0, + "step": 10345 + }, + { + "epoch": 1.1361739512409401, + "grad_norm": 2.177699089050293, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7207440137863159, + "num_tokens": 258178600.0, + "step": 10346 + }, + { + "epoch": 1.1362837689435537, + "grad_norm": 2.4623820781707764, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.740429162979126, + "num_tokens": 258198799.0, + "step": 10347 + }, + { + "epoch": 1.1363935866461674, + "grad_norm": 2.1579084396362305, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6904615163803101, + "num_tokens": 258228669.0, + "step": 10348 + }, + { + "epoch": 1.136503404348781, + "grad_norm": 2.3926031589508057, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7530463933944702, + "num_tokens": 258250297.0, + "step": 10349 + }, + { + "epoch": 1.1366132220513947, + "grad_norm": 2.4319348335266113, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7151426076889038, + "num_tokens": 258273370.0, + "step": 10350 + }, + { + "epoch": 1.1367230397540085, + "grad_norm": 2.5034873485565186, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7260758876800537, + "num_tokens": 258293790.0, + "step": 10351 + }, + { + "epoch": 1.136832857456622, + "grad_norm": 2.399502992630005, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7186094522476196, + "num_tokens": 258317650.0, + "step": 10352 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 2.2921249866485596, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7162157297134399, + "num_tokens": 258341578.0, + "step": 10353 + }, + { + "epoch": 1.1370524928618493, + "grad_norm": 2.567251682281494, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.6984631419181824, + "num_tokens": 258363744.0, + "step": 10354 + }, + { + "epoch": 1.137162310564463, + "grad_norm": 2.169088125228882, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7379962205886841, + "num_tokens": 258387299.0, + "step": 10355 + }, + { + "epoch": 1.1372721282670766, + "grad_norm": 2.3813374042510986, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7365803122520447, + "num_tokens": 258409755.0, + "step": 10356 + }, + { + "epoch": 1.1373819459696903, + "grad_norm": 1.9615252017974854, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7311194539070129, + "num_tokens": 258439977.0, + "step": 10357 + }, + { + "epoch": 1.1374917636723039, + "grad_norm": 2.472342014312744, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7270386219024658, + "num_tokens": 258460593.0, + "step": 10358 + }, + { + "epoch": 1.1376015813749176, + "grad_norm": 2.1624433994293213, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7146180868148804, + "num_tokens": 258487364.0, + "step": 10359 + }, + { + "epoch": 1.1377113990775314, + "grad_norm": 2.151648998260498, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.716990053653717, + "num_tokens": 258515365.0, + "step": 10360 + }, + { + "epoch": 1.137821216780145, + "grad_norm": 2.2366907596588135, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7004380226135254, + "num_tokens": 258542125.0, + "step": 10361 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 2.3904941082000732, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7375955581665039, + "num_tokens": 258565402.0, + "step": 10362 + }, + { + "epoch": 1.1380408521853722, + "grad_norm": 2.683398485183716, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7284265756607056, + "num_tokens": 258583083.0, + "step": 10363 + }, + { + "epoch": 1.138150669887986, + "grad_norm": 2.2100448608398438, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7199674844741821, + "num_tokens": 258607974.0, + "step": 10364 + }, + { + "epoch": 1.1382604875905997, + "grad_norm": 2.112300157546997, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.709624171257019, + "num_tokens": 258637721.0, + "step": 10365 + }, + { + "epoch": 1.1383703052932133, + "grad_norm": 2.263498306274414, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7259644269943237, + "num_tokens": 258662181.0, + "step": 10366 + }, + { + "epoch": 1.138480122995827, + "grad_norm": 1.9563758373260498, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7172716856002808, + "num_tokens": 258695045.0, + "step": 10367 + }, + { + "epoch": 1.1385899406984406, + "grad_norm": 2.188415765762329, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7118319272994995, + "num_tokens": 258721886.0, + "step": 10368 + }, + { + "epoch": 1.1386997584010543, + "grad_norm": 2.297567844390869, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7089850902557373, + "num_tokens": 258746156.0, + "step": 10369 + }, + { + "epoch": 1.1388095761036678, + "grad_norm": 2.2769482135772705, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.731741726398468, + "num_tokens": 258770247.0, + "step": 10370 + }, + { + "epoch": 1.1389193938062816, + "grad_norm": 2.3102588653564453, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7120627760887146, + "num_tokens": 258794773.0, + "step": 10371 + }, + { + "epoch": 1.1390292115088951, + "grad_norm": 2.2045419216156006, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7339465022087097, + "num_tokens": 258820466.0, + "step": 10372 + }, + { + "epoch": 1.139139029211509, + "grad_norm": 2.091736316680908, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6974013447761536, + "num_tokens": 258849130.0, + "step": 10373 + }, + { + "epoch": 1.1392488469141226, + "grad_norm": 2.126694679260254, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6922827959060669, + "num_tokens": 258878135.0, + "step": 10374 + }, + { + "epoch": 1.1393586646167362, + "grad_norm": 2.2718403339385986, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7173492312431335, + "num_tokens": 258902715.0, + "step": 10375 + }, + { + "epoch": 1.13946848231935, + "grad_norm": 2.336947202682495, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7217400670051575, + "num_tokens": 258926653.0, + "step": 10376 + }, + { + "epoch": 1.1395783000219635, + "grad_norm": 2.325343608856201, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7238207459449768, + "num_tokens": 258949123.0, + "step": 10377 + }, + { + "epoch": 1.1396881177245772, + "grad_norm": 2.1754069328308105, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7064934372901917, + "num_tokens": 258976868.0, + "step": 10378 + }, + { + "epoch": 1.1397979354271908, + "grad_norm": 2.2069242000579834, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7620930671691895, + "num_tokens": 259001341.0, + "step": 10379 + }, + { + "epoch": 1.1399077531298045, + "grad_norm": 2.7062432765960693, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7364186644554138, + "num_tokens": 259020654.0, + "step": 10380 + }, + { + "epoch": 1.1400175708324183, + "grad_norm": 2.140495777130127, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7134107947349548, + "num_tokens": 259048083.0, + "step": 10381 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 2.255535125732422, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7191807627677917, + "num_tokens": 259072534.0, + "step": 10382 + }, + { + "epoch": 1.1402372062376456, + "grad_norm": 2.390974283218384, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7136760950088501, + "num_tokens": 259097751.0, + "step": 10383 + }, + { + "epoch": 1.140347023940259, + "grad_norm": 2.0135438442230225, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7048762440681458, + "num_tokens": 259129587.0, + "step": 10384 + }, + { + "epoch": 1.1404568416428729, + "grad_norm": 2.5219779014587402, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7249411940574646, + "num_tokens": 259150239.0, + "step": 10385 + }, + { + "epoch": 1.1405666593454864, + "grad_norm": 2.622375965118408, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.742889404296875, + "num_tokens": 259169197.0, + "step": 10386 + }, + { + "epoch": 1.1406764770481002, + "grad_norm": 2.189483880996704, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7227011919021606, + "num_tokens": 259196008.0, + "step": 10387 + }, + { + "epoch": 1.140786294750714, + "grad_norm": 2.190476655960083, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7134072780609131, + "num_tokens": 259222103.0, + "step": 10388 + }, + { + "epoch": 1.1408961124533274, + "grad_norm": 2.395447015762329, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7376497983932495, + "num_tokens": 259242199.0, + "step": 10389 + }, + { + "epoch": 1.1410059301559412, + "grad_norm": 2.4708778858184814, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7468029260635376, + "num_tokens": 259263274.0, + "step": 10390 + }, + { + "epoch": 1.1411157478585547, + "grad_norm": 2.0861411094665527, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7122430801391602, + "num_tokens": 259295339.0, + "step": 10391 + }, + { + "epoch": 1.1412255655611685, + "grad_norm": 2.158299446105957, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6943544745445251, + "num_tokens": 259323173.0, + "step": 10392 + }, + { + "epoch": 1.141335383263782, + "grad_norm": 2.4014902114868164, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7199763059616089, + "num_tokens": 259347549.0, + "step": 10393 + }, + { + "epoch": 1.1414452009663958, + "grad_norm": 2.3909473419189453, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7355138063430786, + "num_tokens": 259370094.0, + "step": 10394 + }, + { + "epoch": 1.1415550186690093, + "grad_norm": 2.440308094024658, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7178014516830444, + "num_tokens": 259392615.0, + "step": 10395 + }, + { + "epoch": 1.141664836371623, + "grad_norm": 2.1082816123962402, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7123275995254517, + "num_tokens": 259421903.0, + "step": 10396 + }, + { + "epoch": 1.1417746540742368, + "grad_norm": 2.4221653938293457, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7623343467712402, + "num_tokens": 259442996.0, + "step": 10397 + }, + { + "epoch": 1.1418844717768504, + "grad_norm": 2.1916024684906006, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7273639440536499, + "num_tokens": 259469784.0, + "step": 10398 + }, + { + "epoch": 1.1419942894794641, + "grad_norm": 2.2324838638305664, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7186877727508545, + "num_tokens": 259495079.0, + "step": 10399 + }, + { + "epoch": 1.1421041071820777, + "grad_norm": 2.3885653018951416, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7168477773666382, + "num_tokens": 259521141.0, + "step": 10400 + }, + { + "epoch": 1.1422139248846914, + "grad_norm": 2.541152238845825, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7156389951705933, + "num_tokens": 259546679.0, + "step": 10401 + }, + { + "epoch": 1.1423237425873052, + "grad_norm": 2.3537371158599854, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6930676698684692, + "num_tokens": 259574243.0, + "step": 10402 + }, + { + "epoch": 1.1424335602899187, + "grad_norm": 2.3895633220672607, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7019029855728149, + "num_tokens": 259597637.0, + "step": 10403 + }, + { + "epoch": 1.1425433779925325, + "grad_norm": 2.2894623279571533, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7067358493804932, + "num_tokens": 259623420.0, + "step": 10404 + }, + { + "epoch": 1.142653195695146, + "grad_norm": 2.4149703979492188, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7181500196456909, + "num_tokens": 259645491.0, + "step": 10405 + }, + { + "epoch": 1.1427630133977598, + "grad_norm": 2.1664609909057617, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7258514165878296, + "num_tokens": 259672283.0, + "step": 10406 + }, + { + "epoch": 1.1428728311003733, + "grad_norm": 2.228224039077759, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7118971347808838, + "num_tokens": 259700176.0, + "step": 10407 + }, + { + "epoch": 1.142982648802987, + "grad_norm": 2.586625099182129, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7167104482650757, + "num_tokens": 259722883.0, + "step": 10408 + }, + { + "epoch": 1.1430924665056006, + "grad_norm": 2.191412925720215, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7118661403656006, + "num_tokens": 259749827.0, + "step": 10409 + }, + { + "epoch": 1.1432022842082143, + "grad_norm": 2.295175075531006, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7227333188056946, + "num_tokens": 259773588.0, + "step": 10410 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 2.5282998085021973, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7185337543487549, + "num_tokens": 259795593.0, + "step": 10411 + }, + { + "epoch": 1.1434219196134416, + "grad_norm": 2.694915294647217, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7223182320594788, + "num_tokens": 259816215.0, + "step": 10412 + }, + { + "epoch": 1.1435317373160554, + "grad_norm": 2.59533953666687, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7395515441894531, + "num_tokens": 259836104.0, + "step": 10413 + }, + { + "epoch": 1.143641555018669, + "grad_norm": 2.2872233390808105, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7336848974227905, + "num_tokens": 259862702.0, + "step": 10414 + }, + { + "epoch": 1.1437513727212827, + "grad_norm": 2.317915916442871, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7246012091636658, + "num_tokens": 259886747.0, + "step": 10415 + }, + { + "epoch": 1.1438611904238964, + "grad_norm": 2.2852022647857666, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7013378739356995, + "num_tokens": 259914831.0, + "step": 10416 + }, + { + "epoch": 1.14397100812651, + "grad_norm": 2.1441361904144287, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7319188117980957, + "num_tokens": 259941912.0, + "step": 10417 + }, + { + "epoch": 1.1440808258291237, + "grad_norm": 2.2959935665130615, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7025488018989563, + "num_tokens": 259965184.0, + "step": 10418 + }, + { + "epoch": 1.1441906435317373, + "grad_norm": 2.3782777786254883, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7363730669021606, + "num_tokens": 259987733.0, + "step": 10419 + }, + { + "epoch": 1.144300461234351, + "grad_norm": 2.040736436843872, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7008979320526123, + "num_tokens": 260019053.0, + "step": 10420 + }, + { + "epoch": 1.1444102789369646, + "grad_norm": 2.3743176460266113, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7235196232795715, + "num_tokens": 260042289.0, + "step": 10421 + }, + { + "epoch": 1.1445200966395783, + "grad_norm": 2.0871031284332275, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.717282235622406, + "num_tokens": 260072073.0, + "step": 10422 + }, + { + "epoch": 1.1446299143421919, + "grad_norm": 2.351583242416382, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7234681844711304, + "num_tokens": 260095658.0, + "step": 10423 + }, + { + "epoch": 1.1447397320448056, + "grad_norm": 2.2534801959991455, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7259005308151245, + "num_tokens": 260122169.0, + "step": 10424 + }, + { + "epoch": 1.1448495497474194, + "grad_norm": 2.511470079421997, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7210944890975952, + "num_tokens": 260144076.0, + "step": 10425 + }, + { + "epoch": 1.144959367450033, + "grad_norm": 2.585418224334717, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7309204339981079, + "num_tokens": 260165853.0, + "step": 10426 + }, + { + "epoch": 1.1450691851526467, + "grad_norm": 2.3344321250915527, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7146304845809937, + "num_tokens": 260190445.0, + "step": 10427 + }, + { + "epoch": 1.1451790028552602, + "grad_norm": 2.629972219467163, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7053772211074829, + "num_tokens": 260209845.0, + "step": 10428 + }, + { + "epoch": 1.145288820557874, + "grad_norm": 2.4151694774627686, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7484010457992554, + "num_tokens": 260232877.0, + "step": 10429 + }, + { + "epoch": 1.1453986382604877, + "grad_norm": 2.2966744899749756, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7119427919387817, + "num_tokens": 260258951.0, + "step": 10430 + }, + { + "epoch": 1.1455084559631012, + "grad_norm": 2.214339017868042, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.719511866569519, + "num_tokens": 260284897.0, + "step": 10431 + }, + { + "epoch": 1.145618273665715, + "grad_norm": 2.26540207862854, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.71257483959198, + "num_tokens": 260310768.0, + "step": 10432 + }, + { + "epoch": 1.1457280913683285, + "grad_norm": 2.4639945030212402, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7269710302352905, + "num_tokens": 260331565.0, + "step": 10433 + }, + { + "epoch": 1.1458379090709423, + "grad_norm": 2.5524377822875977, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.732567310333252, + "num_tokens": 260351876.0, + "step": 10434 + }, + { + "epoch": 1.1459477267735558, + "grad_norm": 2.269216299057007, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7430593371391296, + "num_tokens": 260376793.0, + "step": 10435 + }, + { + "epoch": 1.1460575444761696, + "grad_norm": 2.1566359996795654, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.728174090385437, + "num_tokens": 260404279.0, + "step": 10436 + }, + { + "epoch": 1.1461673621787831, + "grad_norm": 2.374556541442871, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7089952826499939, + "num_tokens": 260428417.0, + "step": 10437 + }, + { + "epoch": 1.1462771798813969, + "grad_norm": 2.659424066543579, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7271147966384888, + "num_tokens": 260448348.0, + "step": 10438 + }, + { + "epoch": 1.1463869975840106, + "grad_norm": 2.5390281677246094, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7199350595474243, + "num_tokens": 260469457.0, + "step": 10439 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 2.4308278560638428, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.728070080280304, + "num_tokens": 260491948.0, + "step": 10440 + }, + { + "epoch": 1.146606632989238, + "grad_norm": 2.354750156402588, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7126477360725403, + "num_tokens": 260517141.0, + "step": 10441 + }, + { + "epoch": 1.1467164506918515, + "grad_norm": 2.524182081222534, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.726686954498291, + "num_tokens": 260539490.0, + "step": 10442 + }, + { + "epoch": 1.1468262683944652, + "grad_norm": 2.37886643409729, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7052547335624695, + "num_tokens": 260562877.0, + "step": 10443 + }, + { + "epoch": 1.1469360860970788, + "grad_norm": 2.260371446609497, + "learning_rate": 1e-06, + "loss": 0.7829, + "mean_token_accuracy": 0.7523279190063477, + "num_tokens": 260587537.0, + "step": 10444 + }, + { + "epoch": 1.1470459037996925, + "grad_norm": 2.1415064334869385, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7437534332275391, + "num_tokens": 260611994.0, + "step": 10445 + }, + { + "epoch": 1.147155721502306, + "grad_norm": 2.237053155899048, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7157370448112488, + "num_tokens": 260638664.0, + "step": 10446 + }, + { + "epoch": 1.1472655392049198, + "grad_norm": 2.0925791263580322, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7321817874908447, + "num_tokens": 260666444.0, + "step": 10447 + }, + { + "epoch": 1.1473753569075336, + "grad_norm": 2.725778579711914, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.696940541267395, + "num_tokens": 260685772.0, + "step": 10448 + }, + { + "epoch": 1.147485174610147, + "grad_norm": 2.3428657054901123, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7281770706176758, + "num_tokens": 260708320.0, + "step": 10449 + }, + { + "epoch": 1.1475949923127609, + "grad_norm": 2.291839838027954, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7107572555541992, + "num_tokens": 260732959.0, + "step": 10450 + }, + { + "epoch": 1.1477048100153744, + "grad_norm": 2.2266674041748047, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7203798890113831, + "num_tokens": 260759074.0, + "step": 10451 + }, + { + "epoch": 1.1478146277179881, + "grad_norm": 2.1019959449768066, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.704357385635376, + "num_tokens": 260787776.0, + "step": 10452 + }, + { + "epoch": 1.147924445420602, + "grad_norm": 2.6546785831451416, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7189961075782776, + "num_tokens": 260808075.0, + "step": 10453 + }, + { + "epoch": 1.1480342631232154, + "grad_norm": 2.5324974060058594, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7370961904525757, + "num_tokens": 260829933.0, + "step": 10454 + }, + { + "epoch": 1.1481440808258292, + "grad_norm": 2.380462646484375, + "learning_rate": 1e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.73930823802948, + "num_tokens": 260852982.0, + "step": 10455 + }, + { + "epoch": 1.1482538985284427, + "grad_norm": 2.202854633331299, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7322201728820801, + "num_tokens": 260878452.0, + "step": 10456 + }, + { + "epoch": 1.1483637162310565, + "grad_norm": 2.1953821182250977, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7066279649734497, + "num_tokens": 260908803.0, + "step": 10457 + }, + { + "epoch": 1.14847353393367, + "grad_norm": 2.3703691959381104, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6959095001220703, + "num_tokens": 260933484.0, + "step": 10458 + }, + { + "epoch": 1.1485833516362838, + "grad_norm": 2.171945095062256, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6958334445953369, + "num_tokens": 260962437.0, + "step": 10459 + }, + { + "epoch": 1.1486931693388973, + "grad_norm": 2.5785725116729736, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7279493808746338, + "num_tokens": 260984071.0, + "step": 10460 + }, + { + "epoch": 1.148802987041511, + "grad_norm": 2.83482027053833, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7281439304351807, + "num_tokens": 261002473.0, + "step": 10461 + }, + { + "epoch": 1.1489128047441248, + "grad_norm": 2.5206711292266846, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7494227886199951, + "num_tokens": 261021292.0, + "step": 10462 + }, + { + "epoch": 1.1490226224467384, + "grad_norm": 2.2526400089263916, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7415496110916138, + "num_tokens": 261045315.0, + "step": 10463 + }, + { + "epoch": 1.1491324401493521, + "grad_norm": 2.391221523284912, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7150958776473999, + "num_tokens": 261068220.0, + "step": 10464 + }, + { + "epoch": 1.1492422578519657, + "grad_norm": 2.4388833045959473, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7420573234558105, + "num_tokens": 261090205.0, + "step": 10465 + }, + { + "epoch": 1.1493520755545794, + "grad_norm": 2.105275869369507, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7157752513885498, + "num_tokens": 261118200.0, + "step": 10466 + }, + { + "epoch": 1.1494618932571932, + "grad_norm": 2.397908926010132, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7324751615524292, + "num_tokens": 261142977.0, + "step": 10467 + }, + { + "epoch": 1.1495717109598067, + "grad_norm": 2.297795295715332, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7365687489509583, + "num_tokens": 261165977.0, + "step": 10468 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 2.360555648803711, + "learning_rate": 1e-06, + "loss": 0.7701, + "mean_token_accuracy": 0.7566148042678833, + "num_tokens": 261188624.0, + "step": 10469 + }, + { + "epoch": 1.149791346365034, + "grad_norm": 2.263868808746338, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7353113293647766, + "num_tokens": 261212349.0, + "step": 10470 + }, + { + "epoch": 1.1499011640676478, + "grad_norm": 2.369407892227173, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7159175872802734, + "num_tokens": 261236026.0, + "step": 10471 + }, + { + "epoch": 1.1500109817702613, + "grad_norm": 2.309102773666382, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.726392924785614, + "num_tokens": 261259291.0, + "step": 10472 + }, + { + "epoch": 1.150120799472875, + "grad_norm": 2.4312937259674072, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7379008531570435, + "num_tokens": 261281808.0, + "step": 10473 + }, + { + "epoch": 1.1502306171754886, + "grad_norm": 2.047971487045288, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7174546122550964, + "num_tokens": 261312559.0, + "step": 10474 + }, + { + "epoch": 1.1503404348781023, + "grad_norm": 2.2467660903930664, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7325088381767273, + "num_tokens": 261338947.0, + "step": 10475 + }, + { + "epoch": 1.150450252580716, + "grad_norm": 2.1665990352630615, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7171432971954346, + "num_tokens": 261365441.0, + "step": 10476 + }, + { + "epoch": 1.1505600702833296, + "grad_norm": 2.1886184215545654, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.686914324760437, + "num_tokens": 261392958.0, + "step": 10477 + }, + { + "epoch": 1.1506698879859434, + "grad_norm": 2.2444963455200195, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7325992584228516, + "num_tokens": 261417400.0, + "step": 10478 + }, + { + "epoch": 1.150779705688557, + "grad_norm": 2.379607915878296, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7187689542770386, + "num_tokens": 261441104.0, + "step": 10479 + }, + { + "epoch": 1.1508895233911707, + "grad_norm": 2.31284236907959, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7083629965782166, + "num_tokens": 261465416.0, + "step": 10480 + }, + { + "epoch": 1.1509993410937844, + "grad_norm": 2.2548024654388428, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7101380825042725, + "num_tokens": 261495990.0, + "step": 10481 + }, + { + "epoch": 1.151109158796398, + "grad_norm": 2.2674903869628906, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7189654111862183, + "num_tokens": 261519922.0, + "step": 10482 + }, + { + "epoch": 1.1512189764990117, + "grad_norm": 2.432410955429077, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7351562976837158, + "num_tokens": 261542436.0, + "step": 10483 + }, + { + "epoch": 1.1513287942016253, + "grad_norm": 2.425685405731201, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7264978885650635, + "num_tokens": 261565264.0, + "step": 10484 + }, + { + "epoch": 1.151438611904239, + "grad_norm": 2.1212916374206543, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7434448003768921, + "num_tokens": 261591240.0, + "step": 10485 + }, + { + "epoch": 1.1515484296068526, + "grad_norm": 2.014186143875122, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7051256895065308, + "num_tokens": 261623560.0, + "step": 10486 + }, + { + "epoch": 1.1516582473094663, + "grad_norm": 2.6958060264587402, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7267104387283325, + "num_tokens": 261642880.0, + "step": 10487 + }, + { + "epoch": 1.1517680650120798, + "grad_norm": 2.3662807941436768, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.7003378868103027, + "num_tokens": 261667363.0, + "step": 10488 + }, + { + "epoch": 1.1518778827146936, + "grad_norm": 2.1043782234191895, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7241551876068115, + "num_tokens": 261694350.0, + "step": 10489 + }, + { + "epoch": 1.1519877004173074, + "grad_norm": 2.305811643600464, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.692223072052002, + "num_tokens": 261720256.0, + "step": 10490 + }, + { + "epoch": 1.152097518119921, + "grad_norm": 2.7945854663848877, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7500690221786499, + "num_tokens": 261737555.0, + "step": 10491 + }, + { + "epoch": 1.1522073358225347, + "grad_norm": 2.166224956512451, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.712091326713562, + "num_tokens": 261765182.0, + "step": 10492 + }, + { + "epoch": 1.1523171535251482, + "grad_norm": 2.3873517513275146, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7152632474899292, + "num_tokens": 261788428.0, + "step": 10493 + }, + { + "epoch": 1.152426971227762, + "grad_norm": 2.271732807159424, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.700150191783905, + "num_tokens": 261813556.0, + "step": 10494 + }, + { + "epoch": 1.1525367889303757, + "grad_norm": 2.498567581176758, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7149173617362976, + "num_tokens": 261834698.0, + "step": 10495 + }, + { + "epoch": 1.1526466066329892, + "grad_norm": 2.1163482666015625, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7093505859375, + "num_tokens": 261865647.0, + "step": 10496 + }, + { + "epoch": 1.152756424335603, + "grad_norm": 2.2966690063476562, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7111451625823975, + "num_tokens": 261888625.0, + "step": 10497 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 2.2655954360961914, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7121355533599854, + "num_tokens": 261913371.0, + "step": 10498 + }, + { + "epoch": 1.1529760597408303, + "grad_norm": 2.4720137119293213, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7218813300132751, + "num_tokens": 261935578.0, + "step": 10499 + }, + { + "epoch": 1.1530858774434438, + "grad_norm": 2.318862199783325, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7318601608276367, + "num_tokens": 261958816.0, + "step": 10500 + }, + { + "epoch": 1.1531956951460576, + "grad_norm": 2.4156229496002197, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7053154706954956, + "num_tokens": 261982386.0, + "step": 10501 + }, + { + "epoch": 1.153305512848671, + "grad_norm": 2.1783502101898193, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7131751775741577, + "num_tokens": 262009108.0, + "step": 10502 + }, + { + "epoch": 1.1534153305512849, + "grad_norm": 2.3463823795318604, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.712067723274231, + "num_tokens": 262032685.0, + "step": 10503 + }, + { + "epoch": 1.1535251482538986, + "grad_norm": 2.4493227005004883, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7176636457443237, + "num_tokens": 262057333.0, + "step": 10504 + }, + { + "epoch": 1.1536349659565122, + "grad_norm": 2.232248067855835, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7199226021766663, + "num_tokens": 262082247.0, + "step": 10505 + }, + { + "epoch": 1.153744783659126, + "grad_norm": 2.359229564666748, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7403883337974548, + "num_tokens": 262104152.0, + "step": 10506 + }, + { + "epoch": 1.1538546013617395, + "grad_norm": 2.4114274978637695, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7170185446739197, + "num_tokens": 262125976.0, + "step": 10507 + }, + { + "epoch": 1.1539644190643532, + "grad_norm": 2.4684853553771973, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.733588695526123, + "num_tokens": 262146865.0, + "step": 10508 + }, + { + "epoch": 1.1540742367669667, + "grad_norm": 2.1470296382904053, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7198548913002014, + "num_tokens": 262173383.0, + "step": 10509 + }, + { + "epoch": 1.1541840544695805, + "grad_norm": 2.5475528240203857, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7211759090423584, + "num_tokens": 262193698.0, + "step": 10510 + }, + { + "epoch": 1.154293872172194, + "grad_norm": 2.1164331436157227, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.732140302658081, + "num_tokens": 262221639.0, + "step": 10511 + }, + { + "epoch": 1.1544036898748078, + "grad_norm": 2.304006576538086, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7154200673103333, + "num_tokens": 262246298.0, + "step": 10512 + }, + { + "epoch": 1.1545135075774215, + "grad_norm": 2.019237518310547, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6912003755569458, + "num_tokens": 262278299.0, + "step": 10513 + }, + { + "epoch": 1.154623325280035, + "grad_norm": 2.607178211212158, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7480413913726807, + "num_tokens": 262298795.0, + "step": 10514 + }, + { + "epoch": 1.1547331429826488, + "grad_norm": 2.3427505493164062, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7059130668640137, + "num_tokens": 262323074.0, + "step": 10515 + }, + { + "epoch": 1.1548429606852624, + "grad_norm": 2.1780123710632324, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7109760642051697, + "num_tokens": 262350565.0, + "step": 10516 + }, + { + "epoch": 1.1549527783878761, + "grad_norm": 2.202319860458374, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7263829708099365, + "num_tokens": 262377338.0, + "step": 10517 + }, + { + "epoch": 1.15506259609049, + "grad_norm": 2.334095001220703, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7367868423461914, + "num_tokens": 262399051.0, + "step": 10518 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 2.1435093879699707, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.7013640403747559, + "num_tokens": 262430778.0, + "step": 10519 + }, + { + "epoch": 1.1552822314957172, + "grad_norm": 2.3286092281341553, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7502247095108032, + "num_tokens": 262454325.0, + "step": 10520 + }, + { + "epoch": 1.1553920491983307, + "grad_norm": 2.7509870529174805, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7346498966217041, + "num_tokens": 262472346.0, + "step": 10521 + }, + { + "epoch": 1.1555018669009445, + "grad_norm": 2.195404291152954, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7377389073371887, + "num_tokens": 262497146.0, + "step": 10522 + }, + { + "epoch": 1.155611684603558, + "grad_norm": 2.1420705318450928, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7230278253555298, + "num_tokens": 262525378.0, + "step": 10523 + }, + { + "epoch": 1.1557215023061718, + "grad_norm": 2.2581889629364014, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7118557095527649, + "num_tokens": 262550640.0, + "step": 10524 + }, + { + "epoch": 1.1558313200087853, + "grad_norm": 2.3158209323883057, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7177030444145203, + "num_tokens": 262574826.0, + "step": 10525 + }, + { + "epoch": 1.155941137711399, + "grad_norm": 2.1067869663238525, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7142627239227295, + "num_tokens": 262604444.0, + "step": 10526 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 2.3846707344055176, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6995338797569275, + "num_tokens": 262627268.0, + "step": 10527 + }, + { + "epoch": 1.1561607731166264, + "grad_norm": 2.7507519721984863, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7360670566558838, + "num_tokens": 262646481.0, + "step": 10528 + }, + { + "epoch": 1.15627059081924, + "grad_norm": 2.3258090019226074, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7031680345535278, + "num_tokens": 262670297.0, + "step": 10529 + }, + { + "epoch": 1.1563804085218536, + "grad_norm": 2.3707687854766846, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7261644005775452, + "num_tokens": 262693049.0, + "step": 10530 + }, + { + "epoch": 1.1564902262244674, + "grad_norm": 2.3999829292297363, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7190566062927246, + "num_tokens": 262716155.0, + "step": 10531 + }, + { + "epoch": 1.1566000439270812, + "grad_norm": 2.317932367324829, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7030788660049438, + "num_tokens": 262740747.0, + "step": 10532 + }, + { + "epoch": 1.1567098616296947, + "grad_norm": 2.414854049682617, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7399930357933044, + "num_tokens": 262762968.0, + "step": 10533 + }, + { + "epoch": 1.1568196793323084, + "grad_norm": 2.1130056381225586, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7048752307891846, + "num_tokens": 262791466.0, + "step": 10534 + }, + { + "epoch": 1.156929497034922, + "grad_norm": 2.3727893829345703, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7457210421562195, + "num_tokens": 262813278.0, + "step": 10535 + }, + { + "epoch": 1.1570393147375357, + "grad_norm": 2.1370911598205566, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7139302492141724, + "num_tokens": 262842420.0, + "step": 10536 + }, + { + "epoch": 1.1571491324401493, + "grad_norm": 2.3901889324188232, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.718203067779541, + "num_tokens": 262866700.0, + "step": 10537 + }, + { + "epoch": 1.157258950142763, + "grad_norm": 2.382849931716919, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7559652924537659, + "num_tokens": 262888477.0, + "step": 10538 + }, + { + "epoch": 1.1573687678453766, + "grad_norm": 2.402984619140625, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7281903028488159, + "num_tokens": 262911974.0, + "step": 10539 + }, + { + "epoch": 1.1574785855479903, + "grad_norm": 2.4827721118927, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7393706440925598, + "num_tokens": 262933516.0, + "step": 10540 + }, + { + "epoch": 1.157588403250604, + "grad_norm": 2.563117265701294, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7426232695579529, + "num_tokens": 262953157.0, + "step": 10541 + }, + { + "epoch": 1.1576982209532176, + "grad_norm": 2.613412857055664, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7179100513458252, + "num_tokens": 262973037.0, + "step": 10542 + }, + { + "epoch": 1.1578080386558314, + "grad_norm": 2.5749194622039795, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7286605834960938, + "num_tokens": 262994972.0, + "step": 10543 + }, + { + "epoch": 1.157917856358445, + "grad_norm": 2.2106680870056152, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7087860703468323, + "num_tokens": 263022381.0, + "step": 10544 + }, + { + "epoch": 1.1580276740610587, + "grad_norm": 2.1881942749023438, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7014909982681274, + "num_tokens": 263048510.0, + "step": 10545 + }, + { + "epoch": 1.1581374917636724, + "grad_norm": 2.1019222736358643, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7470760345458984, + "num_tokens": 263075130.0, + "step": 10546 + }, + { + "epoch": 1.158247309466286, + "grad_norm": 2.227267265319824, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7218953371047974, + "num_tokens": 263102608.0, + "step": 10547 + }, + { + "epoch": 1.1583571271688997, + "grad_norm": 2.512399196624756, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7180129289627075, + "num_tokens": 263123037.0, + "step": 10548 + }, + { + "epoch": 1.1584669448715132, + "grad_norm": 2.230093002319336, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7211169600486755, + "num_tokens": 263149696.0, + "step": 10549 + }, + { + "epoch": 1.158576762574127, + "grad_norm": 2.451519727706909, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7133240699768066, + "num_tokens": 263172657.0, + "step": 10550 + }, + { + "epoch": 1.1586865802767405, + "grad_norm": 2.393778085708618, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7301551103591919, + "num_tokens": 263196922.0, + "step": 10551 + }, + { + "epoch": 1.1587963979793543, + "grad_norm": 2.3223893642425537, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7454133033752441, + "num_tokens": 263220329.0, + "step": 10552 + }, + { + "epoch": 1.1589062156819678, + "grad_norm": 2.259683847427368, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7144508361816406, + "num_tokens": 263247644.0, + "step": 10553 + }, + { + "epoch": 1.1590160333845816, + "grad_norm": 2.5690360069274902, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7378869652748108, + "num_tokens": 263266670.0, + "step": 10554 + }, + { + "epoch": 1.1591258510871953, + "grad_norm": 2.516465425491333, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7422424554824829, + "num_tokens": 263287084.0, + "step": 10555 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 2.1420016288757324, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7303372025489807, + "num_tokens": 263315094.0, + "step": 10556 + }, + { + "epoch": 1.1593454864924226, + "grad_norm": 2.3409950733184814, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7173761129379272, + "num_tokens": 263338980.0, + "step": 10557 + }, + { + "epoch": 1.1594553041950362, + "grad_norm": 2.439807891845703, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7081723213195801, + "num_tokens": 263362569.0, + "step": 10558 + }, + { + "epoch": 1.15956512189765, + "grad_norm": 2.4095044136047363, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7451679706573486, + "num_tokens": 263385499.0, + "step": 10559 + }, + { + "epoch": 1.1596749396002635, + "grad_norm": 2.4982683658599854, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7309045791625977, + "num_tokens": 263408405.0, + "step": 10560 + }, + { + "epoch": 1.1597847573028772, + "grad_norm": 2.442383289337158, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.744183361530304, + "num_tokens": 263430913.0, + "step": 10561 + }, + { + "epoch": 1.159894575005491, + "grad_norm": 2.401658773422241, + "learning_rate": 1e-06, + "loss": 0.8038, + "mean_token_accuracy": 0.7428861856460571, + "num_tokens": 263454106.0, + "step": 10562 + }, + { + "epoch": 1.1600043927081045, + "grad_norm": 2.424750804901123, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7224903106689453, + "num_tokens": 263476944.0, + "step": 10563 + }, + { + "epoch": 1.1601142104107183, + "grad_norm": 2.238725185394287, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7073110342025757, + "num_tokens": 263502863.0, + "step": 10564 + }, + { + "epoch": 1.1602240281133318, + "grad_norm": 2.2182157039642334, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7204489707946777, + "num_tokens": 263530092.0, + "step": 10565 + }, + { + "epoch": 1.1603338458159456, + "grad_norm": 2.1794848442077637, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6920427680015564, + "num_tokens": 263558007.0, + "step": 10566 + }, + { + "epoch": 1.160443663518559, + "grad_norm": 2.1953017711639404, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7206825017929077, + "num_tokens": 263586879.0, + "step": 10567 + }, + { + "epoch": 1.1605534812211729, + "grad_norm": 2.6439201831817627, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7210875749588013, + "num_tokens": 263607311.0, + "step": 10568 + }, + { + "epoch": 1.1606632989237866, + "grad_norm": 2.167076349258423, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.719725489616394, + "num_tokens": 263634321.0, + "step": 10569 + }, + { + "epoch": 1.1607731166264001, + "grad_norm": 2.6124579906463623, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.71656733751297, + "num_tokens": 263653366.0, + "step": 10570 + }, + { + "epoch": 1.160882934329014, + "grad_norm": 2.112393617630005, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7468445301055908, + "num_tokens": 263680286.0, + "step": 10571 + }, + { + "epoch": 1.1609927520316274, + "grad_norm": 2.2822487354278564, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6985853910446167, + "num_tokens": 263705202.0, + "step": 10572 + }, + { + "epoch": 1.1611025697342412, + "grad_norm": 2.4767448902130127, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7219980359077454, + "num_tokens": 263726887.0, + "step": 10573 + }, + { + "epoch": 1.1612123874368547, + "grad_norm": 2.3578760623931885, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7428274750709534, + "num_tokens": 263749076.0, + "step": 10574 + }, + { + "epoch": 1.1613222051394685, + "grad_norm": 2.2125535011291504, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.730574369430542, + "num_tokens": 263774811.0, + "step": 10575 + }, + { + "epoch": 1.161432022842082, + "grad_norm": 1.9601359367370605, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7119877338409424, + "num_tokens": 263808506.0, + "step": 10576 + }, + { + "epoch": 1.1615418405446958, + "grad_norm": 2.2884762287139893, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7410246729850769, + "num_tokens": 263831496.0, + "step": 10577 + }, + { + "epoch": 1.1616516582473095, + "grad_norm": 2.465125322341919, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7490448951721191, + "num_tokens": 263853315.0, + "step": 10578 + }, + { + "epoch": 1.161761475949923, + "grad_norm": 2.0702803134918213, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7047199010848999, + "num_tokens": 263883730.0, + "step": 10579 + }, + { + "epoch": 1.1618712936525368, + "grad_norm": 2.2446751594543457, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7137540578842163, + "num_tokens": 263911028.0, + "step": 10580 + }, + { + "epoch": 1.1619811113551504, + "grad_norm": 2.077721118927002, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7085718512535095, + "num_tokens": 263940958.0, + "step": 10581 + }, + { + "epoch": 1.1620909290577641, + "grad_norm": 2.297149896621704, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7205259799957275, + "num_tokens": 263963786.0, + "step": 10582 + }, + { + "epoch": 1.1622007467603779, + "grad_norm": 2.7951300144195557, + "learning_rate": 1e-06, + "loss": 0.734, + "mean_token_accuracy": 0.7577804327011108, + "num_tokens": 263980294.0, + "step": 10583 + }, + { + "epoch": 1.1623105644629914, + "grad_norm": 2.500115394592285, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7271699905395508, + "num_tokens": 264002328.0, + "step": 10584 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 2.487494468688965, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7399466037750244, + "num_tokens": 264023688.0, + "step": 10585 + }, + { + "epoch": 1.1625301998682187, + "grad_norm": 2.2940192222595215, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.689586341381073, + "num_tokens": 264051226.0, + "step": 10586 + }, + { + "epoch": 1.1626400175708325, + "grad_norm": 2.2462728023529053, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7468602657318115, + "num_tokens": 264074197.0, + "step": 10587 + }, + { + "epoch": 1.162749835273446, + "grad_norm": 2.2350146770477295, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7105100154876709, + "num_tokens": 264101279.0, + "step": 10588 + }, + { + "epoch": 1.1628596529760598, + "grad_norm": 2.166166067123413, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7324615120887756, + "num_tokens": 264128422.0, + "step": 10589 + }, + { + "epoch": 1.1629694706786733, + "grad_norm": 2.3501136302948, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.70014888048172, + "num_tokens": 264152646.0, + "step": 10590 + }, + { + "epoch": 1.163079288381287, + "grad_norm": 2.285097360610962, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7075026035308838, + "num_tokens": 264177136.0, + "step": 10591 + }, + { + "epoch": 1.1631891060839008, + "grad_norm": 2.3600213527679443, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7220669388771057, + "num_tokens": 264200290.0, + "step": 10592 + }, + { + "epoch": 1.1632989237865143, + "grad_norm": 2.159991979598999, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7200382351875305, + "num_tokens": 264227331.0, + "step": 10593 + }, + { + "epoch": 1.163408741489128, + "grad_norm": 2.2756121158599854, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7385331392288208, + "num_tokens": 264250299.0, + "step": 10594 + }, + { + "epoch": 1.1635185591917416, + "grad_norm": 2.4859395027160645, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7457343339920044, + "num_tokens": 264269737.0, + "step": 10595 + }, + { + "epoch": 1.1636283768943554, + "grad_norm": 2.259758472442627, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7050567865371704, + "num_tokens": 264294279.0, + "step": 10596 + }, + { + "epoch": 1.1637381945969691, + "grad_norm": 2.6150894165039062, + "learning_rate": 1e-06, + "loss": 0.7925, + "mean_token_accuracy": 0.7462433576583862, + "num_tokens": 264313376.0, + "step": 10597 + }, + { + "epoch": 1.1638480122995827, + "grad_norm": 2.4341864585876465, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.712867796421051, + "num_tokens": 264334816.0, + "step": 10598 + }, + { + "epoch": 1.1639578300021964, + "grad_norm": 2.4123878479003906, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7164007425308228, + "num_tokens": 264358214.0, + "step": 10599 + }, + { + "epoch": 1.16406764770481, + "grad_norm": 2.4897372722625732, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.734142541885376, + "num_tokens": 264380178.0, + "step": 10600 + }, + { + "epoch": 1.1641774654074237, + "grad_norm": 2.5815017223358154, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7056798934936523, + "num_tokens": 264401350.0, + "step": 10601 + }, + { + "epoch": 1.1642872831100373, + "grad_norm": 2.2008252143859863, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7440316677093506, + "num_tokens": 264427024.0, + "step": 10602 + }, + { + "epoch": 1.164397100812651, + "grad_norm": 2.1218459606170654, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7071831822395325, + "num_tokens": 264456303.0, + "step": 10603 + }, + { + "epoch": 1.1645069185152646, + "grad_norm": 2.1556055545806885, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7210204601287842, + "num_tokens": 264484489.0, + "step": 10604 + }, + { + "epoch": 1.1646167362178783, + "grad_norm": 2.2623775005340576, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7287125587463379, + "num_tokens": 264508271.0, + "step": 10605 + }, + { + "epoch": 1.164726553920492, + "grad_norm": 1.9760743379592896, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7509716749191284, + "num_tokens": 264535046.0, + "step": 10606 + }, + { + "epoch": 1.1648363716231056, + "grad_norm": 2.638519525527954, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7390535473823547, + "num_tokens": 264555577.0, + "step": 10607 + }, + { + "epoch": 1.1649461893257194, + "grad_norm": 2.290767192840576, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.7053346633911133, + "num_tokens": 264579980.0, + "step": 10608 + }, + { + "epoch": 1.165056007028333, + "grad_norm": 2.2532715797424316, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7045440673828125, + "num_tokens": 264604987.0, + "step": 10609 + }, + { + "epoch": 1.1651658247309467, + "grad_norm": 2.4393064975738525, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7298662662506104, + "num_tokens": 264625419.0, + "step": 10610 + }, + { + "epoch": 1.1652756424335604, + "grad_norm": 2.758121967315674, + "learning_rate": 1e-06, + "loss": 0.7844, + "mean_token_accuracy": 0.7482500076293945, + "num_tokens": 264643529.0, + "step": 10611 + }, + { + "epoch": 1.165385460136174, + "grad_norm": 2.3445119857788086, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7194857597351074, + "num_tokens": 264668354.0, + "step": 10612 + }, + { + "epoch": 1.1654952778387877, + "grad_norm": 2.6135811805725098, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7406190633773804, + "num_tokens": 264687449.0, + "step": 10613 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 2.1511800289154053, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7346888780593872, + "num_tokens": 264713800.0, + "step": 10614 + }, + { + "epoch": 1.165714913244015, + "grad_norm": 2.4261012077331543, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7298483848571777, + "num_tokens": 264736084.0, + "step": 10615 + }, + { + "epoch": 1.1658247309466285, + "grad_norm": 2.3320817947387695, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7388434410095215, + "num_tokens": 264759553.0, + "step": 10616 + }, + { + "epoch": 1.1659345486492423, + "grad_norm": 2.4893651008605957, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7342174053192139, + "num_tokens": 264780493.0, + "step": 10617 + }, + { + "epoch": 1.1660443663518558, + "grad_norm": 2.4900760650634766, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7349177002906799, + "num_tokens": 264801544.0, + "step": 10618 + }, + { + "epoch": 1.1661541840544696, + "grad_norm": 2.2585394382476807, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7409176826477051, + "num_tokens": 264828536.0, + "step": 10619 + }, + { + "epoch": 1.1662640017570833, + "grad_norm": 2.594658136367798, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7106391787528992, + "num_tokens": 264851458.0, + "step": 10620 + }, + { + "epoch": 1.1663738194596969, + "grad_norm": 2.2445545196533203, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7198821306228638, + "num_tokens": 264877095.0, + "step": 10621 + }, + { + "epoch": 1.1664836371623106, + "grad_norm": 2.070223808288574, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7290431261062622, + "num_tokens": 264905451.0, + "step": 10622 + }, + { + "epoch": 1.1665934548649242, + "grad_norm": 2.307365655899048, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.736760139465332, + "num_tokens": 264928899.0, + "step": 10623 + }, + { + "epoch": 1.166703272567538, + "grad_norm": 2.1610050201416016, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7191274762153625, + "num_tokens": 264954245.0, + "step": 10624 + }, + { + "epoch": 1.1668130902701515, + "grad_norm": 2.3250746726989746, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7298941612243652, + "num_tokens": 264977800.0, + "step": 10625 + }, + { + "epoch": 1.1669229079727652, + "grad_norm": 2.295612335205078, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6981664299964905, + "num_tokens": 265002875.0, + "step": 10626 + }, + { + "epoch": 1.167032725675379, + "grad_norm": 2.1878974437713623, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7084383964538574, + "num_tokens": 265031162.0, + "step": 10627 + }, + { + "epoch": 1.1671425433779925, + "grad_norm": 2.122035026550293, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7321286797523499, + "num_tokens": 265058225.0, + "step": 10628 + }, + { + "epoch": 1.1672523610806063, + "grad_norm": 2.084191083908081, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7158046960830688, + "num_tokens": 265087885.0, + "step": 10629 + }, + { + "epoch": 1.1673621787832198, + "grad_norm": 2.6787309646606445, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7177179455757141, + "num_tokens": 265107892.0, + "step": 10630 + }, + { + "epoch": 1.1674719964858336, + "grad_norm": 2.20530366897583, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7070516347885132, + "num_tokens": 265134609.0, + "step": 10631 + }, + { + "epoch": 1.167581814188447, + "grad_norm": 2.1703944206237793, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7262548208236694, + "num_tokens": 265160889.0, + "step": 10632 + }, + { + "epoch": 1.1676916318910608, + "grad_norm": 2.2226402759552, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7287850379943848, + "num_tokens": 265185437.0, + "step": 10633 + }, + { + "epoch": 1.1678014495936746, + "grad_norm": 2.497426986694336, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7463096380233765, + "num_tokens": 265206021.0, + "step": 10634 + }, + { + "epoch": 1.1679112672962881, + "grad_norm": 2.4277379512786865, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7224090695381165, + "num_tokens": 265227753.0, + "step": 10635 + }, + { + "epoch": 1.168021084998902, + "grad_norm": 2.263375997543335, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7032700181007385, + "num_tokens": 265254985.0, + "step": 10636 + }, + { + "epoch": 1.1681309027015154, + "grad_norm": 2.1187949180603027, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7257775068283081, + "num_tokens": 265282500.0, + "step": 10637 + }, + { + "epoch": 1.1682407204041292, + "grad_norm": 2.064347743988037, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7144343852996826, + "num_tokens": 265310383.0, + "step": 10638 + }, + { + "epoch": 1.1683505381067427, + "grad_norm": 2.6241564750671387, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7525154948234558, + "num_tokens": 265329115.0, + "step": 10639 + }, + { + "epoch": 1.1684603558093565, + "grad_norm": 2.3450992107391357, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7024527192115784, + "num_tokens": 265353969.0, + "step": 10640 + }, + { + "epoch": 1.16857017351197, + "grad_norm": 2.4118244647979736, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7248814105987549, + "num_tokens": 265377159.0, + "step": 10641 + }, + { + "epoch": 1.1686799912145838, + "grad_norm": 2.6643004417419434, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.716259241104126, + "num_tokens": 265396135.0, + "step": 10642 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 2.0751144886016846, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7090649604797363, + "num_tokens": 265426994.0, + "step": 10643 + }, + { + "epoch": 1.168899626619811, + "grad_norm": 2.453167200088501, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.746292233467102, + "num_tokens": 265447616.0, + "step": 10644 + }, + { + "epoch": 1.1690094443224248, + "grad_norm": 2.2165615558624268, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6825696229934692, + "num_tokens": 265477552.0, + "step": 10645 + }, + { + "epoch": 1.1691192620250384, + "grad_norm": 2.5027661323547363, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7221702337265015, + "num_tokens": 265498425.0, + "step": 10646 + }, + { + "epoch": 1.169229079727652, + "grad_norm": 2.0664215087890625, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7072672843933105, + "num_tokens": 265528620.0, + "step": 10647 + }, + { + "epoch": 1.1693388974302659, + "grad_norm": 2.255175828933716, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7226783037185669, + "num_tokens": 265554122.0, + "step": 10648 + }, + { + "epoch": 1.1694487151328794, + "grad_norm": 2.4582407474517822, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.713294506072998, + "num_tokens": 265576496.0, + "step": 10649 + }, + { + "epoch": 1.1695585328354932, + "grad_norm": 2.6549227237701416, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7560469508171082, + "num_tokens": 265595637.0, + "step": 10650 + }, + { + "epoch": 1.1696683505381067, + "grad_norm": 2.0809381008148193, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7120785713195801, + "num_tokens": 265625881.0, + "step": 10651 + }, + { + "epoch": 1.1697781682407205, + "grad_norm": 2.025299310684204, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7291520833969116, + "num_tokens": 265656295.0, + "step": 10652 + }, + { + "epoch": 1.169887985943334, + "grad_norm": 1.8677741289138794, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7313197255134583, + "num_tokens": 265690193.0, + "step": 10653 + }, + { + "epoch": 1.1699978036459477, + "grad_norm": 2.2557945251464844, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7177019715309143, + "num_tokens": 265717017.0, + "step": 10654 + }, + { + "epoch": 1.1701076213485613, + "grad_norm": 2.4247193336486816, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7403688430786133, + "num_tokens": 265739014.0, + "step": 10655 + }, + { + "epoch": 1.170217439051175, + "grad_norm": 2.190779685974121, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7183558940887451, + "num_tokens": 265766279.0, + "step": 10656 + }, + { + "epoch": 1.1703272567537888, + "grad_norm": 2.2668957710266113, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7257238626480103, + "num_tokens": 265789561.0, + "step": 10657 + }, + { + "epoch": 1.1704370744564023, + "grad_norm": 2.3487935066223145, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7366815209388733, + "num_tokens": 265811893.0, + "step": 10658 + }, + { + "epoch": 1.170546892159016, + "grad_norm": 2.8076581954956055, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7067728042602539, + "num_tokens": 265832249.0, + "step": 10659 + }, + { + "epoch": 1.1706567098616296, + "grad_norm": 2.5557515621185303, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7302993535995483, + "num_tokens": 265851981.0, + "step": 10660 + }, + { + "epoch": 1.1707665275642434, + "grad_norm": 2.2850987911224365, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.730449914932251, + "num_tokens": 265875610.0, + "step": 10661 + }, + { + "epoch": 1.1708763452668571, + "grad_norm": 2.1812267303466797, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7291141748428345, + "num_tokens": 265901320.0, + "step": 10662 + }, + { + "epoch": 1.1709861629694707, + "grad_norm": 2.2178738117218018, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.712669849395752, + "num_tokens": 265926550.0, + "step": 10663 + }, + { + "epoch": 1.1710959806720844, + "grad_norm": 2.8535349369049072, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7362397909164429, + "num_tokens": 265945091.0, + "step": 10664 + }, + { + "epoch": 1.171205798374698, + "grad_norm": 2.4275875091552734, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7285636067390442, + "num_tokens": 265966265.0, + "step": 10665 + }, + { + "epoch": 1.1713156160773117, + "grad_norm": 2.181581974029541, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6888905763626099, + "num_tokens": 265994183.0, + "step": 10666 + }, + { + "epoch": 1.1714254337799253, + "grad_norm": 2.2135608196258545, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7124136686325073, + "num_tokens": 266023839.0, + "step": 10667 + }, + { + "epoch": 1.171535251482539, + "grad_norm": 2.342116117477417, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7122902870178223, + "num_tokens": 266048869.0, + "step": 10668 + }, + { + "epoch": 1.1716450691851525, + "grad_norm": 2.200777053833008, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7372171878814697, + "num_tokens": 266074938.0, + "step": 10669 + }, + { + "epoch": 1.1717548868877663, + "grad_norm": 2.3354954719543457, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7339046001434326, + "num_tokens": 266096639.0, + "step": 10670 + }, + { + "epoch": 1.17186470459038, + "grad_norm": 2.509599447250366, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.742021918296814, + "num_tokens": 266117210.0, + "step": 10671 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 2.7782771587371826, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7228655815124512, + "num_tokens": 266136361.0, + "step": 10672 + }, + { + "epoch": 1.1720843399956073, + "grad_norm": 2.2505078315734863, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7095205783843994, + "num_tokens": 266161607.0, + "step": 10673 + }, + { + "epoch": 1.1721941576982209, + "grad_norm": 1.9637809991836548, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.728394627571106, + "num_tokens": 266191829.0, + "step": 10674 + }, + { + "epoch": 1.1723039754008346, + "grad_norm": 2.286461591720581, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7082409262657166, + "num_tokens": 266215741.0, + "step": 10675 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 2.5372753143310547, + "learning_rate": 1e-06, + "loss": 0.7546, + "mean_token_accuracy": 0.7693424224853516, + "num_tokens": 266235014.0, + "step": 10676 + }, + { + "epoch": 1.172523610806062, + "grad_norm": 2.1458301544189453, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7202857732772827, + "num_tokens": 266264515.0, + "step": 10677 + }, + { + "epoch": 1.1726334285086757, + "grad_norm": 2.4115898609161377, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7155610918998718, + "num_tokens": 266288366.0, + "step": 10678 + }, + { + "epoch": 1.1727432462112892, + "grad_norm": 2.3884365558624268, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7196247577667236, + "num_tokens": 266312230.0, + "step": 10679 + }, + { + "epoch": 1.172853063913903, + "grad_norm": 2.2825546264648438, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7107173204421997, + "num_tokens": 266339082.0, + "step": 10680 + }, + { + "epoch": 1.1729628816165165, + "grad_norm": 2.4220356941223145, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7330691814422607, + "num_tokens": 266363931.0, + "step": 10681 + }, + { + "epoch": 1.1730726993191303, + "grad_norm": 2.595839500427246, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.7441990375518799, + "num_tokens": 266382026.0, + "step": 10682 + }, + { + "epoch": 1.1731825170217438, + "grad_norm": 2.1455202102661133, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7190902233123779, + "num_tokens": 266408571.0, + "step": 10683 + }, + { + "epoch": 1.1732923347243576, + "grad_norm": 2.326685905456543, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7108162045478821, + "num_tokens": 266433970.0, + "step": 10684 + }, + { + "epoch": 1.1734021524269713, + "grad_norm": 2.3238685131073, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7244499921798706, + "num_tokens": 266459415.0, + "step": 10685 + }, + { + "epoch": 1.1735119701295849, + "grad_norm": 2.3828284740448, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7056377530097961, + "num_tokens": 266483599.0, + "step": 10686 + }, + { + "epoch": 1.1736217878321986, + "grad_norm": 2.309129238128662, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7299029231071472, + "num_tokens": 266505731.0, + "step": 10687 + }, + { + "epoch": 1.1737316055348122, + "grad_norm": 2.1639273166656494, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6959275603294373, + "num_tokens": 266532533.0, + "step": 10688 + }, + { + "epoch": 1.173841423237426, + "grad_norm": 2.1221776008605957, + "learning_rate": 1e-06, + "loss": 0.7782, + "mean_token_accuracy": 0.7507719993591309, + "num_tokens": 266557116.0, + "step": 10689 + }, + { + "epoch": 1.1739512409400394, + "grad_norm": 2.345446825027466, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7310107946395874, + "num_tokens": 266580265.0, + "step": 10690 + }, + { + "epoch": 1.1740610586426532, + "grad_norm": 2.0695011615753174, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7063344717025757, + "num_tokens": 266608107.0, + "step": 10691 + }, + { + "epoch": 1.1741708763452667, + "grad_norm": 2.351428985595703, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7357428073883057, + "num_tokens": 266632169.0, + "step": 10692 + }, + { + "epoch": 1.1742806940478805, + "grad_norm": 2.5270779132843018, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7446514964103699, + "num_tokens": 266653680.0, + "step": 10693 + }, + { + "epoch": 1.1743905117504942, + "grad_norm": 2.2785727977752686, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7192131280899048, + "num_tokens": 266678596.0, + "step": 10694 + }, + { + "epoch": 1.1745003294531078, + "grad_norm": 2.260524272918701, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7033693790435791, + "num_tokens": 266706149.0, + "step": 10695 + }, + { + "epoch": 1.1746101471557215, + "grad_norm": 2.4277851581573486, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7064443826675415, + "num_tokens": 266730198.0, + "step": 10696 + }, + { + "epoch": 1.174719964858335, + "grad_norm": 1.9691603183746338, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6926605701446533, + "num_tokens": 266761789.0, + "step": 10697 + }, + { + "epoch": 1.1748297825609488, + "grad_norm": 1.8952422142028809, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7174782752990723, + "num_tokens": 266796976.0, + "step": 10698 + }, + { + "epoch": 1.1749396002635626, + "grad_norm": 2.0935583114624023, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7159006595611572, + "num_tokens": 266827627.0, + "step": 10699 + }, + { + "epoch": 1.1750494179661761, + "grad_norm": 2.093599796295166, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7077780365943909, + "num_tokens": 266854714.0, + "step": 10700 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 2.36277174949646, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7300857305526733, + "num_tokens": 266877532.0, + "step": 10701 + }, + { + "epoch": 1.1752690533714034, + "grad_norm": 2.3655788898468018, + "learning_rate": 1e-06, + "loss": 0.8031, + "mean_token_accuracy": 0.7457834482192993, + "num_tokens": 266898925.0, + "step": 10702 + }, + { + "epoch": 1.1753788710740172, + "grad_norm": 2.1154282093048096, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7281369566917419, + "num_tokens": 266925581.0, + "step": 10703 + }, + { + "epoch": 1.1754886887766307, + "grad_norm": 2.198488473892212, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.6959536075592041, + "num_tokens": 266952662.0, + "step": 10704 + }, + { + "epoch": 1.1755985064792445, + "grad_norm": 2.4508745670318604, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7255831360816956, + "num_tokens": 266975814.0, + "step": 10705 + }, + { + "epoch": 1.175708324181858, + "grad_norm": 2.189450979232788, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7424236536026001, + "num_tokens": 267002390.0, + "step": 10706 + }, + { + "epoch": 1.1758181418844718, + "grad_norm": 2.136528968811035, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7214831113815308, + "num_tokens": 267030033.0, + "step": 10707 + }, + { + "epoch": 1.1759279595870855, + "grad_norm": 2.4144678115844727, + "learning_rate": 1e-06, + "loss": 0.7772, + "mean_token_accuracy": 0.744962751865387, + "num_tokens": 267050512.0, + "step": 10708 + }, + { + "epoch": 1.176037777289699, + "grad_norm": 2.6372194290161133, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7230391502380371, + "num_tokens": 267070486.0, + "step": 10709 + }, + { + "epoch": 1.1761475949923128, + "grad_norm": 2.311814546585083, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7199419736862183, + "num_tokens": 267094107.0, + "step": 10710 + }, + { + "epoch": 1.1762574126949263, + "grad_norm": 1.9509464502334595, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7237588167190552, + "num_tokens": 267124680.0, + "step": 10711 + }, + { + "epoch": 1.17636723039754, + "grad_norm": 2.752761125564575, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7196535468101501, + "num_tokens": 267142762.0, + "step": 10712 + }, + { + "epoch": 1.1764770481001539, + "grad_norm": 2.2344181537628174, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7312233448028564, + "num_tokens": 267168009.0, + "step": 10713 + }, + { + "epoch": 1.1765868658027674, + "grad_norm": 2.8433878421783447, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7253468632698059, + "num_tokens": 267185972.0, + "step": 10714 + }, + { + "epoch": 1.1766966835053811, + "grad_norm": 2.0432729721069336, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7085216641426086, + "num_tokens": 267215163.0, + "step": 10715 + }, + { + "epoch": 1.1768065012079947, + "grad_norm": 2.340003728866577, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7189059257507324, + "num_tokens": 267240052.0, + "step": 10716 + }, + { + "epoch": 1.1769163189106084, + "grad_norm": 2.294264793395996, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7230556011199951, + "num_tokens": 267265004.0, + "step": 10717 + }, + { + "epoch": 1.177026136613222, + "grad_norm": 2.7888851165771484, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.725262463092804, + "num_tokens": 267284170.0, + "step": 10718 + }, + { + "epoch": 1.1771359543158357, + "grad_norm": 2.4795851707458496, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7206671237945557, + "num_tokens": 267306510.0, + "step": 10719 + }, + { + "epoch": 1.1772457720184493, + "grad_norm": 2.2612290382385254, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7060675621032715, + "num_tokens": 267332140.0, + "step": 10720 + }, + { + "epoch": 1.177355589721063, + "grad_norm": 2.3567001819610596, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.722548246383667, + "num_tokens": 267354137.0, + "step": 10721 + }, + { + "epoch": 1.1774654074236768, + "grad_norm": 2.325585126876831, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6977847218513489, + "num_tokens": 267380213.0, + "step": 10722 + }, + { + "epoch": 1.1775752251262903, + "grad_norm": 2.278684377670288, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7414838075637817, + "num_tokens": 267406991.0, + "step": 10723 + }, + { + "epoch": 1.177685042828904, + "grad_norm": 2.246828556060791, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7299984693527222, + "num_tokens": 267432377.0, + "step": 10724 + }, + { + "epoch": 1.1777948605315176, + "grad_norm": 2.407883882522583, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.727584958076477, + "num_tokens": 267455256.0, + "step": 10725 + }, + { + "epoch": 1.1779046782341314, + "grad_norm": 2.658752202987671, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.736441969871521, + "num_tokens": 267474674.0, + "step": 10726 + }, + { + "epoch": 1.1780144959367451, + "grad_norm": 2.3587558269500732, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.703781247138977, + "num_tokens": 267501489.0, + "step": 10727 + }, + { + "epoch": 1.1781243136393587, + "grad_norm": 2.1120057106018066, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7116954326629639, + "num_tokens": 267529815.0, + "step": 10728 + }, + { + "epoch": 1.1782341313419724, + "grad_norm": 2.2620582580566406, + "learning_rate": 1e-06, + "loss": 0.7555, + "mean_token_accuracy": 0.7655138969421387, + "num_tokens": 267554033.0, + "step": 10729 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 2.1875243186950684, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7516714334487915, + "num_tokens": 267577837.0, + "step": 10730 + }, + { + "epoch": 1.1784537667471997, + "grad_norm": 2.1914737224578857, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7270433902740479, + "num_tokens": 267604432.0, + "step": 10731 + }, + { + "epoch": 1.1785635844498132, + "grad_norm": 2.301602840423584, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7125105261802673, + "num_tokens": 267629619.0, + "step": 10732 + }, + { + "epoch": 1.178673402152427, + "grad_norm": 2.2114408016204834, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7052173614501953, + "num_tokens": 267657991.0, + "step": 10733 + }, + { + "epoch": 1.1787832198550405, + "grad_norm": 2.2825071811676025, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7162953615188599, + "num_tokens": 267683270.0, + "step": 10734 + }, + { + "epoch": 1.1788930375576543, + "grad_norm": 2.318969249725342, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7090444564819336, + "num_tokens": 267708298.0, + "step": 10735 + }, + { + "epoch": 1.179002855260268, + "grad_norm": 2.4959425926208496, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7335615158081055, + "num_tokens": 267730901.0, + "step": 10736 + }, + { + "epoch": 1.1791126729628816, + "grad_norm": 2.294983386993408, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.724856972694397, + "num_tokens": 267755253.0, + "step": 10737 + }, + { + "epoch": 1.1792224906654953, + "grad_norm": 2.756737470626831, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7199955582618713, + "num_tokens": 267774206.0, + "step": 10738 + }, + { + "epoch": 1.1793323083681089, + "grad_norm": 2.1260597705841064, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7156658172607422, + "num_tokens": 267800480.0, + "step": 10739 + }, + { + "epoch": 1.1794421260707226, + "grad_norm": 1.9519522190093994, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7181781530380249, + "num_tokens": 267830621.0, + "step": 10740 + }, + { + "epoch": 1.1795519437733364, + "grad_norm": 2.105858325958252, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7051297426223755, + "num_tokens": 267860188.0, + "step": 10741 + }, + { + "epoch": 1.17966176147595, + "grad_norm": 2.3648829460144043, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7084569931030273, + "num_tokens": 267886056.0, + "step": 10742 + }, + { + "epoch": 1.1797715791785637, + "grad_norm": 2.375577211380005, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7442158460617065, + "num_tokens": 267907852.0, + "step": 10743 + }, + { + "epoch": 1.1798813968811772, + "grad_norm": 2.4307186603546143, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7117501497268677, + "num_tokens": 267931489.0, + "step": 10744 + }, + { + "epoch": 1.179991214583791, + "grad_norm": 2.580784559249878, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7273150682449341, + "num_tokens": 267951034.0, + "step": 10745 + }, + { + "epoch": 1.1801010322864045, + "grad_norm": 2.425015687942505, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7344200611114502, + "num_tokens": 267972928.0, + "step": 10746 + }, + { + "epoch": 1.1802108499890183, + "grad_norm": 2.037142038345337, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7058408856391907, + "num_tokens": 268004358.0, + "step": 10747 + }, + { + "epoch": 1.1803206676916318, + "grad_norm": 2.143582344055176, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7296692728996277, + "num_tokens": 268030618.0, + "step": 10748 + }, + { + "epoch": 1.1804304853942456, + "grad_norm": 2.6231353282928467, + "learning_rate": 1e-06, + "loss": 0.7596, + "mean_token_accuracy": 0.7646214365959167, + "num_tokens": 268048043.0, + "step": 10749 + }, + { + "epoch": 1.1805403030968593, + "grad_norm": 2.331000804901123, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7153133153915405, + "num_tokens": 268071801.0, + "step": 10750 + }, + { + "epoch": 1.1806501207994728, + "grad_norm": 1.9509625434875488, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.6983833312988281, + "num_tokens": 268104377.0, + "step": 10751 + }, + { + "epoch": 1.1807599385020866, + "grad_norm": 2.458214521408081, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7378005385398865, + "num_tokens": 268126447.0, + "step": 10752 + }, + { + "epoch": 1.1808697562047001, + "grad_norm": 2.2944798469543457, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7092453241348267, + "num_tokens": 268150495.0, + "step": 10753 + }, + { + "epoch": 1.180979573907314, + "grad_norm": 2.4287500381469727, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7155926823616028, + "num_tokens": 268174166.0, + "step": 10754 + }, + { + "epoch": 1.1810893916099274, + "grad_norm": 2.1180930137634277, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7069401144981384, + "num_tokens": 268203118.0, + "step": 10755 + }, + { + "epoch": 1.1811992093125412, + "grad_norm": 2.3378102779388428, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7309433817863464, + "num_tokens": 268228457.0, + "step": 10756 + }, + { + "epoch": 1.1813090270151547, + "grad_norm": 2.137998342514038, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7238513827323914, + "num_tokens": 268256349.0, + "step": 10757 + }, + { + "epoch": 1.1814188447177685, + "grad_norm": 2.31392240524292, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7382821440696716, + "num_tokens": 268281244.0, + "step": 10758 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 2.0899994373321533, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7360391616821289, + "num_tokens": 268307612.0, + "step": 10759 + }, + { + "epoch": 1.1816384801229958, + "grad_norm": 2.241093397140503, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6933855414390564, + "num_tokens": 268334155.0, + "step": 10760 + }, + { + "epoch": 1.1817482978256095, + "grad_norm": 2.25577974319458, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7045468091964722, + "num_tokens": 268361509.0, + "step": 10761 + }, + { + "epoch": 1.181858115528223, + "grad_norm": 2.0231475830078125, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7240767478942871, + "num_tokens": 268391956.0, + "step": 10762 + }, + { + "epoch": 1.1819679332308368, + "grad_norm": 2.5186927318573, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7212824821472168, + "num_tokens": 268414875.0, + "step": 10763 + }, + { + "epoch": 1.1820777509334506, + "grad_norm": 2.2816858291625977, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.725607693195343, + "num_tokens": 268438704.0, + "step": 10764 + }, + { + "epoch": 1.1821875686360641, + "grad_norm": 2.3576090335845947, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7113931775093079, + "num_tokens": 268462826.0, + "step": 10765 + }, + { + "epoch": 1.1822973863386779, + "grad_norm": 2.388157844543457, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7300710082054138, + "num_tokens": 268485009.0, + "step": 10766 + }, + { + "epoch": 1.1824072040412914, + "grad_norm": 2.551447629928589, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7328675985336304, + "num_tokens": 268505448.0, + "step": 10767 + }, + { + "epoch": 1.1825170217439052, + "grad_norm": 2.4307212829589844, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7187749147415161, + "num_tokens": 268527208.0, + "step": 10768 + }, + { + "epoch": 1.1826268394465187, + "grad_norm": 2.275690793991089, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7284539341926575, + "num_tokens": 268549990.0, + "step": 10769 + }, + { + "epoch": 1.1827366571491325, + "grad_norm": 2.185445547103882, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7225675582885742, + "num_tokens": 268576354.0, + "step": 10770 + }, + { + "epoch": 1.182846474851746, + "grad_norm": 2.261420965194702, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7333035469055176, + "num_tokens": 268601553.0, + "step": 10771 + }, + { + "epoch": 1.1829562925543597, + "grad_norm": 2.0820937156677246, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7278295755386353, + "num_tokens": 268629633.0, + "step": 10772 + }, + { + "epoch": 1.1830661102569735, + "grad_norm": 2.537335157394409, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7082253694534302, + "num_tokens": 268649353.0, + "step": 10773 + }, + { + "epoch": 1.183175927959587, + "grad_norm": 2.317159414291382, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7147271037101746, + "num_tokens": 268678199.0, + "step": 10774 + }, + { + "epoch": 1.1832857456622008, + "grad_norm": 2.412264585494995, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7051498889923096, + "num_tokens": 268700879.0, + "step": 10775 + }, + { + "epoch": 1.1833955633648143, + "grad_norm": 1.9100162982940674, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6942172050476074, + "num_tokens": 268736085.0, + "step": 10776 + }, + { + "epoch": 1.183505381067428, + "grad_norm": 2.3527817726135254, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.711186945438385, + "num_tokens": 268759856.0, + "step": 10777 + }, + { + "epoch": 1.1836151987700418, + "grad_norm": 2.414407730102539, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7024991512298584, + "num_tokens": 268783317.0, + "step": 10778 + }, + { + "epoch": 1.1837250164726554, + "grad_norm": 2.2494354248046875, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7350499629974365, + "num_tokens": 268808737.0, + "step": 10779 + }, + { + "epoch": 1.1838348341752691, + "grad_norm": 2.045311212539673, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7256677746772766, + "num_tokens": 268839344.0, + "step": 10780 + }, + { + "epoch": 1.1839446518778827, + "grad_norm": 2.3544857501983643, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7053909301757812, + "num_tokens": 268863837.0, + "step": 10781 + }, + { + "epoch": 1.1840544695804964, + "grad_norm": 2.006122350692749, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6898548603057861, + "num_tokens": 268897500.0, + "step": 10782 + }, + { + "epoch": 1.18416428728311, + "grad_norm": 2.2773377895355225, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6960785388946533, + "num_tokens": 268924120.0, + "step": 10783 + }, + { + "epoch": 1.1842741049857237, + "grad_norm": 2.573122262954712, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7080124616622925, + "num_tokens": 268944767.0, + "step": 10784 + }, + { + "epoch": 1.1843839226883373, + "grad_norm": 2.1442248821258545, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7248296737670898, + "num_tokens": 268971170.0, + "step": 10785 + }, + { + "epoch": 1.184493740390951, + "grad_norm": 2.4567627906799316, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.720860481262207, + "num_tokens": 268992410.0, + "step": 10786 + }, + { + "epoch": 1.1846035580935648, + "grad_norm": 1.9594011306762695, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7280222177505493, + "num_tokens": 269025645.0, + "step": 10787 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 2.3140697479248047, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7120177745819092, + "num_tokens": 269050334.0, + "step": 10788 + }, + { + "epoch": 1.184823193498792, + "grad_norm": 2.3380684852600098, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7044044137001038, + "num_tokens": 269075209.0, + "step": 10789 + }, + { + "epoch": 1.1849330112014056, + "grad_norm": 2.4234259128570557, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7330577969551086, + "num_tokens": 269097716.0, + "step": 10790 + }, + { + "epoch": 1.1850428289040194, + "grad_norm": 2.1283113956451416, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7221618294715881, + "num_tokens": 269124805.0, + "step": 10791 + }, + { + "epoch": 1.185152646606633, + "grad_norm": 2.3812897205352783, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7316378355026245, + "num_tokens": 269147025.0, + "step": 10792 + }, + { + "epoch": 1.1852624643092466, + "grad_norm": 2.2393901348114014, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7075602412223816, + "num_tokens": 269172227.0, + "step": 10793 + }, + { + "epoch": 1.1853722820118604, + "grad_norm": 2.398256778717041, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7090594172477722, + "num_tokens": 269194568.0, + "step": 10794 + }, + { + "epoch": 1.185482099714474, + "grad_norm": 2.348177909851074, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7266278266906738, + "num_tokens": 269220011.0, + "step": 10795 + }, + { + "epoch": 1.1855919174170877, + "grad_norm": 2.320819616317749, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7217093706130981, + "num_tokens": 269243048.0, + "step": 10796 + }, + { + "epoch": 1.1857017351197012, + "grad_norm": 2.3783955574035645, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7291092872619629, + "num_tokens": 269267542.0, + "step": 10797 + }, + { + "epoch": 1.185811552822315, + "grad_norm": 2.1636383533477783, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.727840781211853, + "num_tokens": 269293951.0, + "step": 10798 + }, + { + "epoch": 1.1859213705249285, + "grad_norm": 2.3276617527008057, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7075703740119934, + "num_tokens": 269319461.0, + "step": 10799 + }, + { + "epoch": 1.1860311882275423, + "grad_norm": 2.374915599822998, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7222000360488892, + "num_tokens": 269344674.0, + "step": 10800 + }, + { + "epoch": 1.186141005930156, + "grad_norm": 2.1844091415405273, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7319929599761963, + "num_tokens": 269371562.0, + "step": 10801 + }, + { + "epoch": 1.1862508236327696, + "grad_norm": 2.183776378631592, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6974622011184692, + "num_tokens": 269401607.0, + "step": 10802 + }, + { + "epoch": 1.1863606413353833, + "grad_norm": 2.420621871948242, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7219177484512329, + "num_tokens": 269424516.0, + "step": 10803 + }, + { + "epoch": 1.1864704590379969, + "grad_norm": 2.076744318008423, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6969286203384399, + "num_tokens": 269454940.0, + "step": 10804 + }, + { + "epoch": 1.1865802767406106, + "grad_norm": 2.3097474575042725, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.736911952495575, + "num_tokens": 269479966.0, + "step": 10805 + }, + { + "epoch": 1.1866900944432242, + "grad_norm": 2.4138898849487305, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7352373600006104, + "num_tokens": 269501597.0, + "step": 10806 + }, + { + "epoch": 1.186799912145838, + "grad_norm": 2.069232702255249, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7370559573173523, + "num_tokens": 269531254.0, + "step": 10807 + }, + { + "epoch": 1.1869097298484517, + "grad_norm": 2.260237216949463, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7135642766952515, + "num_tokens": 269556682.0, + "step": 10808 + }, + { + "epoch": 1.1870195475510652, + "grad_norm": 2.4916417598724365, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7525244355201721, + "num_tokens": 269578174.0, + "step": 10809 + }, + { + "epoch": 1.187129365253679, + "grad_norm": 2.3267836570739746, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7284359931945801, + "num_tokens": 269603405.0, + "step": 10810 + }, + { + "epoch": 1.1872391829562925, + "grad_norm": 2.097560405731201, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7217503190040588, + "num_tokens": 269631107.0, + "step": 10811 + }, + { + "epoch": 1.1873490006589063, + "grad_norm": 2.2063870429992676, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7050182819366455, + "num_tokens": 269658785.0, + "step": 10812 + }, + { + "epoch": 1.1874588183615198, + "grad_norm": 2.3573176860809326, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7345770597457886, + "num_tokens": 269682539.0, + "step": 10813 + }, + { + "epoch": 1.1875686360641335, + "grad_norm": 2.2807247638702393, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.735876202583313, + "num_tokens": 269706453.0, + "step": 10814 + }, + { + "epoch": 1.1876784537667473, + "grad_norm": 2.5494132041931152, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7242854833602905, + "num_tokens": 269727239.0, + "step": 10815 + }, + { + "epoch": 1.1877882714693608, + "grad_norm": 2.506627082824707, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7475522756576538, + "num_tokens": 269747955.0, + "step": 10816 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 2.5110981464385986, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7128403186798096, + "num_tokens": 269768978.0, + "step": 10817 + }, + { + "epoch": 1.1880079068745881, + "grad_norm": 2.329892635345459, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7242117524147034, + "num_tokens": 269793913.0, + "step": 10818 + }, + { + "epoch": 1.1881177245772019, + "grad_norm": 2.5346145629882812, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.723006546497345, + "num_tokens": 269817831.0, + "step": 10819 + }, + { + "epoch": 1.1882275422798154, + "grad_norm": 2.298482656478882, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7420439124107361, + "num_tokens": 269840519.0, + "step": 10820 + }, + { + "epoch": 1.1883373599824292, + "grad_norm": 1.884477138519287, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7089219093322754, + "num_tokens": 269875212.0, + "step": 10821 + }, + { + "epoch": 1.1884471776850427, + "grad_norm": 2.130497694015503, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7276396155357361, + "num_tokens": 269900613.0, + "step": 10822 + }, + { + "epoch": 1.1885569953876565, + "grad_norm": 2.4925198554992676, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7412885427474976, + "num_tokens": 269922115.0, + "step": 10823 + }, + { + "epoch": 1.1886668130902702, + "grad_norm": 2.0786325931549072, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7106027603149414, + "num_tokens": 269950815.0, + "step": 10824 + }, + { + "epoch": 1.1887766307928838, + "grad_norm": 2.2320780754089355, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7433233261108398, + "num_tokens": 269974551.0, + "step": 10825 + }, + { + "epoch": 1.1888864484954975, + "grad_norm": 2.5371217727661133, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.745262861251831, + "num_tokens": 269993764.0, + "step": 10826 + }, + { + "epoch": 1.188996266198111, + "grad_norm": 2.4331374168395996, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7007535696029663, + "num_tokens": 270015855.0, + "step": 10827 + }, + { + "epoch": 1.1891060839007248, + "grad_norm": 2.704653739929199, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7406333684921265, + "num_tokens": 270034648.0, + "step": 10828 + }, + { + "epoch": 1.1892159016033386, + "grad_norm": 2.3545353412628174, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.742210328578949, + "num_tokens": 270056939.0, + "step": 10829 + }, + { + "epoch": 1.189325719305952, + "grad_norm": 1.9500274658203125, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.70442795753479, + "num_tokens": 270089330.0, + "step": 10830 + }, + { + "epoch": 1.1894355370085659, + "grad_norm": 2.045745849609375, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7145392298698425, + "num_tokens": 270120022.0, + "step": 10831 + }, + { + "epoch": 1.1895453547111794, + "grad_norm": 2.3625223636627197, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6918289065361023, + "num_tokens": 270144266.0, + "step": 10832 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 2.0020835399627686, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.716206431388855, + "num_tokens": 270177850.0, + "step": 10833 + }, + { + "epoch": 1.1897649901164067, + "grad_norm": 2.317491292953491, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7255455255508423, + "num_tokens": 270200996.0, + "step": 10834 + }, + { + "epoch": 1.1898748078190204, + "grad_norm": 2.3256123065948486, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7188107967376709, + "num_tokens": 270224855.0, + "step": 10835 + }, + { + "epoch": 1.189984625521634, + "grad_norm": 2.084486484527588, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6890699863433838, + "num_tokens": 270257587.0, + "step": 10836 + }, + { + "epoch": 1.1900944432242477, + "grad_norm": 2.306169271469116, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7228187322616577, + "num_tokens": 270284521.0, + "step": 10837 + }, + { + "epoch": 1.1902042609268615, + "grad_norm": 2.5270767211914062, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7265812158584595, + "num_tokens": 270305501.0, + "step": 10838 + }, + { + "epoch": 1.190314078629475, + "grad_norm": 2.1723926067352295, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7121933102607727, + "num_tokens": 270331957.0, + "step": 10839 + }, + { + "epoch": 1.1904238963320888, + "grad_norm": 2.172114849090576, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7065021991729736, + "num_tokens": 270360686.0, + "step": 10840 + }, + { + "epoch": 1.1905337140347023, + "grad_norm": 2.1468019485473633, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7147873640060425, + "num_tokens": 270387934.0, + "step": 10841 + }, + { + "epoch": 1.190643531737316, + "grad_norm": 2.288219690322876, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7231084108352661, + "num_tokens": 270412228.0, + "step": 10842 + }, + { + "epoch": 1.1907533494399298, + "grad_norm": 2.287993907928467, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7085713148117065, + "num_tokens": 270436599.0, + "step": 10843 + }, + { + "epoch": 1.1908631671425434, + "grad_norm": 2.140873670578003, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7005247473716736, + "num_tokens": 270465532.0, + "step": 10844 + }, + { + "epoch": 1.1909729848451571, + "grad_norm": 2.128666639328003, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7136701941490173, + "num_tokens": 270491965.0, + "step": 10845 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 2.1168298721313477, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.745205819606781, + "num_tokens": 270517731.0, + "step": 10846 + }, + { + "epoch": 1.1911926202503844, + "grad_norm": 2.087078094482422, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7257122993469238, + "num_tokens": 270547324.0, + "step": 10847 + }, + { + "epoch": 1.191302437952998, + "grad_norm": 2.19557785987854, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7348587512969971, + "num_tokens": 270573936.0, + "step": 10848 + }, + { + "epoch": 1.1914122556556117, + "grad_norm": 2.334341526031494, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.714317798614502, + "num_tokens": 270597698.0, + "step": 10849 + }, + { + "epoch": 1.1915220733582252, + "grad_norm": 2.1916627883911133, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7460924386978149, + "num_tokens": 270622393.0, + "step": 10850 + }, + { + "epoch": 1.191631891060839, + "grad_norm": 2.1822543144226074, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7065384387969971, + "num_tokens": 270651378.0, + "step": 10851 + }, + { + "epoch": 1.1917417087634528, + "grad_norm": 2.330037832260132, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7076734304428101, + "num_tokens": 270676125.0, + "step": 10852 + }, + { + "epoch": 1.1918515264660663, + "grad_norm": 2.46510910987854, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7373403310775757, + "num_tokens": 270697828.0, + "step": 10853 + }, + { + "epoch": 1.19196134416868, + "grad_norm": 2.338995933532715, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7239769101142883, + "num_tokens": 270722304.0, + "step": 10854 + }, + { + "epoch": 1.1920711618712936, + "grad_norm": 1.9582375288009644, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7274767756462097, + "num_tokens": 270750821.0, + "step": 10855 + }, + { + "epoch": 1.1921809795739073, + "grad_norm": 2.323302745819092, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7258087396621704, + "num_tokens": 270775261.0, + "step": 10856 + }, + { + "epoch": 1.192290797276521, + "grad_norm": 2.0447022914886475, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6894617080688477, + "num_tokens": 270805130.0, + "step": 10857 + }, + { + "epoch": 1.1924006149791346, + "grad_norm": 2.3678996562957764, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7144217491149902, + "num_tokens": 270828235.0, + "step": 10858 + }, + { + "epoch": 1.1925104326817484, + "grad_norm": 2.441988229751587, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7525839805603027, + "num_tokens": 270847909.0, + "step": 10859 + }, + { + "epoch": 1.192620250384362, + "grad_norm": 2.3155784606933594, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6865981221199036, + "num_tokens": 270875160.0, + "step": 10860 + }, + { + "epoch": 1.1927300680869757, + "grad_norm": 2.2864484786987305, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7166768312454224, + "num_tokens": 270899611.0, + "step": 10861 + }, + { + "epoch": 1.1928398857895892, + "grad_norm": 2.1076107025146484, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6990712881088257, + "num_tokens": 270926802.0, + "step": 10862 + }, + { + "epoch": 1.192949703492203, + "grad_norm": 2.398622989654541, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7213091254234314, + "num_tokens": 270949573.0, + "step": 10863 + }, + { + "epoch": 1.1930595211948165, + "grad_norm": 2.3438963890075684, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7381067872047424, + "num_tokens": 270971770.0, + "step": 10864 + }, + { + "epoch": 1.1931693388974303, + "grad_norm": 2.1408958435058594, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7330683469772339, + "num_tokens": 270997258.0, + "step": 10865 + }, + { + "epoch": 1.193279156600044, + "grad_norm": 2.054300546646118, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7109589576721191, + "num_tokens": 271027922.0, + "step": 10866 + }, + { + "epoch": 1.1933889743026576, + "grad_norm": 2.0340449810028076, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7009007930755615, + "num_tokens": 271059055.0, + "step": 10867 + }, + { + "epoch": 1.1934987920052713, + "grad_norm": 2.288241386413574, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7147396206855774, + "num_tokens": 271086697.0, + "step": 10868 + }, + { + "epoch": 1.1936086097078848, + "grad_norm": 2.426748037338257, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7217848300933838, + "num_tokens": 271107448.0, + "step": 10869 + }, + { + "epoch": 1.1937184274104986, + "grad_norm": 2.2459471225738525, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7223193645477295, + "num_tokens": 271133972.0, + "step": 10870 + }, + { + "epoch": 1.1938282451131121, + "grad_norm": 2.23587703704834, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6901630759239197, + "num_tokens": 271159339.0, + "step": 10871 + }, + { + "epoch": 1.193938062815726, + "grad_norm": 2.296621561050415, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7187169790267944, + "num_tokens": 271181754.0, + "step": 10872 + }, + { + "epoch": 1.1940478805183394, + "grad_norm": 2.160614490509033, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.718016505241394, + "num_tokens": 271208835.0, + "step": 10873 + }, + { + "epoch": 1.1941576982209532, + "grad_norm": 2.008040428161621, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7077980637550354, + "num_tokens": 271239845.0, + "step": 10874 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 2.3821349143981934, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7041274309158325, + "num_tokens": 271264499.0, + "step": 10875 + }, + { + "epoch": 1.1943773336261805, + "grad_norm": 2.586306095123291, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7080529928207397, + "num_tokens": 271284926.0, + "step": 10876 + }, + { + "epoch": 1.1944871513287942, + "grad_norm": 2.246957778930664, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7313960790634155, + "num_tokens": 271308829.0, + "step": 10877 + }, + { + "epoch": 1.1945969690314078, + "grad_norm": 2.3701236248016357, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7224165201187134, + "num_tokens": 271332135.0, + "step": 10878 + }, + { + "epoch": 1.1947067867340215, + "grad_norm": 2.141230821609497, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7103660106658936, + "num_tokens": 271360807.0, + "step": 10879 + }, + { + "epoch": 1.1948166044366353, + "grad_norm": 2.4353854656219482, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7174963355064392, + "num_tokens": 271384929.0, + "step": 10880 + }, + { + "epoch": 1.1949264221392488, + "grad_norm": 2.198704481124878, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6990756988525391, + "num_tokens": 271413555.0, + "step": 10881 + }, + { + "epoch": 1.1950362398418626, + "grad_norm": 2.240149736404419, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.733474850654602, + "num_tokens": 271436601.0, + "step": 10882 + }, + { + "epoch": 1.1951460575444761, + "grad_norm": 2.542027711868286, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7232034206390381, + "num_tokens": 271457535.0, + "step": 10883 + }, + { + "epoch": 1.1952558752470899, + "grad_norm": 2.3685190677642822, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7216537594795227, + "num_tokens": 271479366.0, + "step": 10884 + }, + { + "epoch": 1.1953656929497034, + "grad_norm": 2.5046072006225586, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7128710746765137, + "num_tokens": 271501409.0, + "step": 10885 + }, + { + "epoch": 1.1954755106523172, + "grad_norm": 2.1567142009735107, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7260010242462158, + "num_tokens": 271527510.0, + "step": 10886 + }, + { + "epoch": 1.1955853283549307, + "grad_norm": 2.844775915145874, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7344627976417542, + "num_tokens": 271544707.0, + "step": 10887 + }, + { + "epoch": 1.1956951460575445, + "grad_norm": 2.274554491043091, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.719754695892334, + "num_tokens": 271570351.0, + "step": 10888 + }, + { + "epoch": 1.1958049637601582, + "grad_norm": 2.22479510307312, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7326415777206421, + "num_tokens": 271593382.0, + "step": 10889 + }, + { + "epoch": 1.1959147814627717, + "grad_norm": 2.089789628982544, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7218663096427917, + "num_tokens": 271620462.0, + "step": 10890 + }, + { + "epoch": 1.1960245991653855, + "grad_norm": 2.182030439376831, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7517526745796204, + "num_tokens": 271648164.0, + "step": 10891 + }, + { + "epoch": 1.196134416867999, + "grad_norm": 2.2816100120544434, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7490803003311157, + "num_tokens": 271670601.0, + "step": 10892 + }, + { + "epoch": 1.1962442345706128, + "grad_norm": 2.403139352798462, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7302059531211853, + "num_tokens": 271692181.0, + "step": 10893 + }, + { + "epoch": 1.1963540522732266, + "grad_norm": 2.088207483291626, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7090682983398438, + "num_tokens": 271720499.0, + "step": 10894 + }, + { + "epoch": 1.19646386997584, + "grad_norm": 2.370389223098755, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7140940427780151, + "num_tokens": 271744718.0, + "step": 10895 + }, + { + "epoch": 1.1965736876784538, + "grad_norm": 2.2581419944763184, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7170495986938477, + "num_tokens": 271768747.0, + "step": 10896 + }, + { + "epoch": 1.1966835053810674, + "grad_norm": 2.213689088821411, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7177389860153198, + "num_tokens": 271793778.0, + "step": 10897 + }, + { + "epoch": 1.1967933230836811, + "grad_norm": 2.2077572345733643, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.727015495300293, + "num_tokens": 271817091.0, + "step": 10898 + }, + { + "epoch": 1.1969031407862947, + "grad_norm": 1.9719465970993042, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7033104300498962, + "num_tokens": 271849397.0, + "step": 10899 + }, + { + "epoch": 1.1970129584889084, + "grad_norm": 2.164149045944214, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7274186611175537, + "num_tokens": 271875962.0, + "step": 10900 + }, + { + "epoch": 1.197122776191522, + "grad_norm": 2.1394004821777344, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7018095254898071, + "num_tokens": 271901470.0, + "step": 10901 + }, + { + "epoch": 1.1972325938941357, + "grad_norm": 2.3097147941589355, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7554682493209839, + "num_tokens": 271923134.0, + "step": 10902 + }, + { + "epoch": 1.1973424115967495, + "grad_norm": 2.3584682941436768, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7411981821060181, + "num_tokens": 271946160.0, + "step": 10903 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 2.494354248046875, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7115303874015808, + "num_tokens": 271968455.0, + "step": 10904 + }, + { + "epoch": 1.1975620470019768, + "grad_norm": 2.416625499725342, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.723691463470459, + "num_tokens": 271990366.0, + "step": 10905 + }, + { + "epoch": 1.1976718647045903, + "grad_norm": 2.8495726585388184, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7205644845962524, + "num_tokens": 272008660.0, + "step": 10906 + }, + { + "epoch": 1.197781682407204, + "grad_norm": 2.0486814975738525, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7315324544906616, + "num_tokens": 272036633.0, + "step": 10907 + }, + { + "epoch": 1.1978915001098178, + "grad_norm": 2.544019937515259, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7372934222221375, + "num_tokens": 272057304.0, + "step": 10908 + }, + { + "epoch": 1.1980013178124314, + "grad_norm": 2.310084581375122, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7219511270523071, + "num_tokens": 272083905.0, + "step": 10909 + }, + { + "epoch": 1.1981111355150451, + "grad_norm": 2.5031497478485107, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7093082070350647, + "num_tokens": 272105817.0, + "step": 10910 + }, + { + "epoch": 1.1982209532176586, + "grad_norm": 2.297358989715576, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7362111210823059, + "num_tokens": 272130374.0, + "step": 10911 + }, + { + "epoch": 1.1983307709202724, + "grad_norm": 2.0318856239318848, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7303234934806824, + "num_tokens": 272158719.0, + "step": 10912 + }, + { + "epoch": 1.198440588622886, + "grad_norm": 2.4148683547973633, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7266789078712463, + "num_tokens": 272181759.0, + "step": 10913 + }, + { + "epoch": 1.1985504063254997, + "grad_norm": 1.964023232460022, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6996657848358154, + "num_tokens": 272213243.0, + "step": 10914 + }, + { + "epoch": 1.1986602240281132, + "grad_norm": 1.9434552192687988, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.704360842704773, + "num_tokens": 272247635.0, + "step": 10915 + }, + { + "epoch": 1.198770041730727, + "grad_norm": 2.247828483581543, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7310685515403748, + "num_tokens": 272272281.0, + "step": 10916 + }, + { + "epoch": 1.1988798594333407, + "grad_norm": 2.2301313877105713, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7245051264762878, + "num_tokens": 272297396.0, + "step": 10917 + }, + { + "epoch": 1.1989896771359543, + "grad_norm": 2.1573679447174072, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7130576968193054, + "num_tokens": 272325691.0, + "step": 10918 + }, + { + "epoch": 1.199099494838568, + "grad_norm": 2.8790998458862305, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7322278022766113, + "num_tokens": 272342796.0, + "step": 10919 + }, + { + "epoch": 1.1992093125411816, + "grad_norm": 2.1855216026306152, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.703335702419281, + "num_tokens": 272370459.0, + "step": 10920 + }, + { + "epoch": 1.1993191302437953, + "grad_norm": 2.048870086669922, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7150235176086426, + "num_tokens": 272398712.0, + "step": 10921 + }, + { + "epoch": 1.199428947946409, + "grad_norm": 2.30196213722229, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7277405858039856, + "num_tokens": 272422811.0, + "step": 10922 + }, + { + "epoch": 1.1995387656490226, + "grad_norm": 2.1871449947357178, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7186826467514038, + "num_tokens": 272449313.0, + "step": 10923 + }, + { + "epoch": 1.1996485833516364, + "grad_norm": 2.2448484897613525, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7360150814056396, + "num_tokens": 272472669.0, + "step": 10924 + }, + { + "epoch": 1.19975840105425, + "grad_norm": 2.406740427017212, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7317456007003784, + "num_tokens": 272495922.0, + "step": 10925 + }, + { + "epoch": 1.1998682187568637, + "grad_norm": 2.0624115467071533, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7356033325195312, + "num_tokens": 272524692.0, + "step": 10926 + }, + { + "epoch": 1.1999780364594772, + "grad_norm": 2.1214993000030518, + "learning_rate": 1e-06, + "loss": 0.8082, + "mean_token_accuracy": 0.7474664449691772, + "num_tokens": 272552324.0, + "step": 10927 + }, + { + "epoch": 1.200087854162091, + "grad_norm": 2.5819101333618164, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7161365151405334, + "num_tokens": 272573584.0, + "step": 10928 + }, + { + "epoch": 1.2001976718647045, + "grad_norm": 2.0611777305603027, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7086026668548584, + "num_tokens": 272603713.0, + "step": 10929 + }, + { + "epoch": 1.2003074895673183, + "grad_norm": 2.211272954940796, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.6994487047195435, + "num_tokens": 272629898.0, + "step": 10930 + }, + { + "epoch": 1.200417307269932, + "grad_norm": 2.6620640754699707, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7381766438484192, + "num_tokens": 272648991.0, + "step": 10931 + }, + { + "epoch": 1.2005271249725455, + "grad_norm": 2.448431968688965, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7283586263656616, + "num_tokens": 272670983.0, + "step": 10932 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 2.3913772106170654, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.738987922668457, + "num_tokens": 272695341.0, + "step": 10933 + }, + { + "epoch": 1.2007467603777728, + "grad_norm": 2.188938856124878, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7130412459373474, + "num_tokens": 272722608.0, + "step": 10934 + }, + { + "epoch": 1.2008565780803866, + "grad_norm": 2.8828377723693848, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6951265335083008, + "num_tokens": 272743901.0, + "step": 10935 + }, + { + "epoch": 1.2009663957830001, + "grad_norm": 2.1343252658843994, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7038226127624512, + "num_tokens": 272773154.0, + "step": 10936 + }, + { + "epoch": 1.2010762134856139, + "grad_norm": 2.536531686782837, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7361955046653748, + "num_tokens": 272792963.0, + "step": 10937 + }, + { + "epoch": 1.2011860311882274, + "grad_norm": 2.3765387535095215, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7168900370597839, + "num_tokens": 272816466.0, + "step": 10938 + }, + { + "epoch": 1.2012958488908412, + "grad_norm": 2.105994701385498, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7318564653396606, + "num_tokens": 272843078.0, + "step": 10939 + }, + { + "epoch": 1.201405666593455, + "grad_norm": 2.0659902095794678, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.6988709568977356, + "num_tokens": 272874685.0, + "step": 10940 + }, + { + "epoch": 1.2015154842960685, + "grad_norm": 2.532513380050659, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7196265459060669, + "num_tokens": 272897079.0, + "step": 10941 + }, + { + "epoch": 1.2016253019986822, + "grad_norm": 2.0385828018188477, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7103589773178101, + "num_tokens": 272929418.0, + "step": 10942 + }, + { + "epoch": 1.2017351197012958, + "grad_norm": 2.511115789413452, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7365884780883789, + "num_tokens": 272949458.0, + "step": 10943 + }, + { + "epoch": 1.2018449374039095, + "grad_norm": 2.2592225074768066, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7308666110038757, + "num_tokens": 272973948.0, + "step": 10944 + }, + { + "epoch": 1.2019547551065233, + "grad_norm": 2.754631757736206, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7150918245315552, + "num_tokens": 272993688.0, + "step": 10945 + }, + { + "epoch": 1.2020645728091368, + "grad_norm": 2.3056185245513916, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7285141348838806, + "num_tokens": 273016876.0, + "step": 10946 + }, + { + "epoch": 1.2021743905117506, + "grad_norm": 2.276406764984131, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7406413555145264, + "num_tokens": 273040753.0, + "step": 10947 + }, + { + "epoch": 1.202284208214364, + "grad_norm": 2.5619900226593018, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7237658500671387, + "num_tokens": 273063139.0, + "step": 10948 + }, + { + "epoch": 1.2023940259169779, + "grad_norm": 2.267408609390259, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7096143364906311, + "num_tokens": 273088737.0, + "step": 10949 + }, + { + "epoch": 1.2025038436195914, + "grad_norm": 2.2038257122039795, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7273770570755005, + "num_tokens": 273114393.0, + "step": 10950 + }, + { + "epoch": 1.2026136613222052, + "grad_norm": 2.146430492401123, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7345479726791382, + "num_tokens": 273141921.0, + "step": 10951 + }, + { + "epoch": 1.2027234790248187, + "grad_norm": 2.0321452617645264, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6820260286331177, + "num_tokens": 273173574.0, + "step": 10952 + }, + { + "epoch": 1.2028332967274324, + "grad_norm": 2.348095417022705, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7198176383972168, + "num_tokens": 273198609.0, + "step": 10953 + }, + { + "epoch": 1.2029431144300462, + "grad_norm": 2.302588939666748, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7439724802970886, + "num_tokens": 273222575.0, + "step": 10954 + }, + { + "epoch": 1.2030529321326597, + "grad_norm": 2.2415287494659424, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7339431643486023, + "num_tokens": 273248682.0, + "step": 10955 + }, + { + "epoch": 1.2031627498352735, + "grad_norm": 2.263354778289795, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7136696577072144, + "num_tokens": 273274790.0, + "step": 10956 + }, + { + "epoch": 1.203272567537887, + "grad_norm": 2.1583378314971924, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7101608514785767, + "num_tokens": 273302770.0, + "step": 10957 + }, + { + "epoch": 1.2033823852405008, + "grad_norm": 2.1329615116119385, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7133545279502869, + "num_tokens": 273331117.0, + "step": 10958 + }, + { + "epoch": 1.2034922029431145, + "grad_norm": 2.3505451679229736, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7320441007614136, + "num_tokens": 273354672.0, + "step": 10959 + }, + { + "epoch": 1.203602020645728, + "grad_norm": 2.2562344074249268, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7104467153549194, + "num_tokens": 273379274.0, + "step": 10960 + }, + { + "epoch": 1.2037118383483418, + "grad_norm": 2.2974236011505127, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7328336238861084, + "num_tokens": 273404133.0, + "step": 10961 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 2.2173256874084473, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7114131450653076, + "num_tokens": 273431431.0, + "step": 10962 + }, + { + "epoch": 1.2039314737535691, + "grad_norm": 2.413787364959717, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7185574769973755, + "num_tokens": 273453247.0, + "step": 10963 + }, + { + "epoch": 1.2040412914561827, + "grad_norm": 2.409844398498535, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7173766493797302, + "num_tokens": 273475319.0, + "step": 10964 + }, + { + "epoch": 1.2041511091587964, + "grad_norm": 2.387878894805908, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7148024439811707, + "num_tokens": 273498189.0, + "step": 10965 + }, + { + "epoch": 1.20426092686141, + "grad_norm": 2.7652366161346436, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7306441068649292, + "num_tokens": 273517348.0, + "step": 10966 + }, + { + "epoch": 1.2043707445640237, + "grad_norm": 2.506800889968872, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7218418717384338, + "num_tokens": 273538941.0, + "step": 10967 + }, + { + "epoch": 1.2044805622666375, + "grad_norm": 2.46687650680542, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7290942668914795, + "num_tokens": 273560987.0, + "step": 10968 + }, + { + "epoch": 1.204590379969251, + "grad_norm": 2.1867148876190186, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7192064523696899, + "num_tokens": 273587926.0, + "step": 10969 + }, + { + "epoch": 1.2047001976718648, + "grad_norm": 2.283750057220459, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7088214159011841, + "num_tokens": 273613385.0, + "step": 10970 + }, + { + "epoch": 1.2048100153744783, + "grad_norm": 2.3603789806365967, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7332620024681091, + "num_tokens": 273637162.0, + "step": 10971 + }, + { + "epoch": 1.204919833077092, + "grad_norm": 2.3471293449401855, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.724115252494812, + "num_tokens": 273661502.0, + "step": 10972 + }, + { + "epoch": 1.2050296507797058, + "grad_norm": 2.1511590480804443, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7324848175048828, + "num_tokens": 273689018.0, + "step": 10973 + }, + { + "epoch": 1.2051394684823193, + "grad_norm": 2.670822858810425, + "learning_rate": 1e-06, + "loss": 0.7107, + "mean_token_accuracy": 0.7698543071746826, + "num_tokens": 273707200.0, + "step": 10974 + }, + { + "epoch": 1.205249286184933, + "grad_norm": 2.5895893573760986, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7369152307510376, + "num_tokens": 273724991.0, + "step": 10975 + }, + { + "epoch": 1.2053591038875466, + "grad_norm": 2.175295352935791, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7258703708648682, + "num_tokens": 273750424.0, + "step": 10976 + }, + { + "epoch": 1.2054689215901604, + "grad_norm": 2.287015914916992, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.708824872970581, + "num_tokens": 273777719.0, + "step": 10977 + }, + { + "epoch": 1.205578739292774, + "grad_norm": 2.147942066192627, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7001973986625671, + "num_tokens": 273804693.0, + "step": 10978 + }, + { + "epoch": 1.2056885569953877, + "grad_norm": 2.070708751678467, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7064995765686035, + "num_tokens": 273835558.0, + "step": 10979 + }, + { + "epoch": 1.2057983746980012, + "grad_norm": 2.239804267883301, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7230803966522217, + "num_tokens": 273861404.0, + "step": 10980 + }, + { + "epoch": 1.205908192400615, + "grad_norm": 1.9788800477981567, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7309883236885071, + "num_tokens": 273890561.0, + "step": 10981 + }, + { + "epoch": 1.2060180101032287, + "grad_norm": 2.2022781372070312, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.712274968624115, + "num_tokens": 273917248.0, + "step": 10982 + }, + { + "epoch": 1.2061278278058423, + "grad_norm": 2.2378089427948, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7172418832778931, + "num_tokens": 273943198.0, + "step": 10983 + }, + { + "epoch": 1.206237645508456, + "grad_norm": 2.26625657081604, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7145308256149292, + "num_tokens": 273967456.0, + "step": 10984 + }, + { + "epoch": 1.2063474632110696, + "grad_norm": 2.365471363067627, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7229660153388977, + "num_tokens": 273990718.0, + "step": 10985 + }, + { + "epoch": 1.2064572809136833, + "grad_norm": 2.3135502338409424, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.712888240814209, + "num_tokens": 274014623.0, + "step": 10986 + }, + { + "epoch": 1.2065670986162969, + "grad_norm": 2.3259642124176025, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.72493577003479, + "num_tokens": 274038922.0, + "step": 10987 + }, + { + "epoch": 1.2066769163189106, + "grad_norm": 2.111030340194702, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7340142726898193, + "num_tokens": 274065770.0, + "step": 10988 + }, + { + "epoch": 1.2067867340215244, + "grad_norm": 2.4146294593811035, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7334638237953186, + "num_tokens": 274086521.0, + "step": 10989 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 2.492053270339966, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7187209129333496, + "num_tokens": 274108779.0, + "step": 10990 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 2.4126410484313965, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7253023386001587, + "num_tokens": 274131335.0, + "step": 10991 + }, + { + "epoch": 1.2071161871293652, + "grad_norm": 2.307750940322876, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7065474390983582, + "num_tokens": 274155210.0, + "step": 10992 + }, + { + "epoch": 1.207226004831979, + "grad_norm": 2.408212184906006, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7293472290039062, + "num_tokens": 274177377.0, + "step": 10993 + }, + { + "epoch": 1.2073358225345925, + "grad_norm": 2.274238109588623, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7182769179344177, + "num_tokens": 274202009.0, + "step": 10994 + }, + { + "epoch": 1.2074456402372062, + "grad_norm": 2.474644422531128, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7303929328918457, + "num_tokens": 274225855.0, + "step": 10995 + }, + { + "epoch": 1.20755545793982, + "grad_norm": 2.592304229736328, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7420672178268433, + "num_tokens": 274247196.0, + "step": 10996 + }, + { + "epoch": 1.2076652756424335, + "grad_norm": 2.3047711849212646, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.7084531188011169, + "num_tokens": 274272434.0, + "step": 10997 + }, + { + "epoch": 1.2077750933450473, + "grad_norm": 2.3327648639678955, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7281925678253174, + "num_tokens": 274296568.0, + "step": 10998 + }, + { + "epoch": 1.2078849110476608, + "grad_norm": 2.6379165649414062, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7346347570419312, + "num_tokens": 274316863.0, + "step": 10999 + }, + { + "epoch": 1.2079947287502746, + "grad_norm": 2.0742008686065674, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7506682872772217, + "num_tokens": 274342253.0, + "step": 11000 + }, + { + "epoch": 1.2081045464528881, + "grad_norm": 2.2762978076934814, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7229505777359009, + "num_tokens": 274366887.0, + "step": 11001 + }, + { + "epoch": 1.2082143641555019, + "grad_norm": 2.3648464679718018, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7240965366363525, + "num_tokens": 274389716.0, + "step": 11002 + }, + { + "epoch": 1.2083241818581154, + "grad_norm": 2.196826934814453, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7199435234069824, + "num_tokens": 274417110.0, + "step": 11003 + }, + { + "epoch": 1.2084339995607292, + "grad_norm": 2.3084428310394287, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.69412761926651, + "num_tokens": 274442444.0, + "step": 11004 + }, + { + "epoch": 1.208543817263343, + "grad_norm": 2.3959319591522217, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7184821367263794, + "num_tokens": 274467710.0, + "step": 11005 + }, + { + "epoch": 1.2086536349659565, + "grad_norm": 2.215369701385498, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7297676801681519, + "num_tokens": 274492777.0, + "step": 11006 + }, + { + "epoch": 1.2087634526685702, + "grad_norm": 1.978194236755371, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7083830237388611, + "num_tokens": 274522535.0, + "step": 11007 + }, + { + "epoch": 1.2088732703711838, + "grad_norm": 2.2484166622161865, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7406977415084839, + "num_tokens": 274546423.0, + "step": 11008 + }, + { + "epoch": 1.2089830880737975, + "grad_norm": 2.1929261684417725, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7149154543876648, + "num_tokens": 274574420.0, + "step": 11009 + }, + { + "epoch": 1.2090929057764113, + "grad_norm": 2.4214770793914795, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7297495603561401, + "num_tokens": 274598883.0, + "step": 11010 + }, + { + "epoch": 1.2092027234790248, + "grad_norm": 2.1037940979003906, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7370386719703674, + "num_tokens": 274625155.0, + "step": 11011 + }, + { + "epoch": 1.2093125411816386, + "grad_norm": 2.2187342643737793, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.724425196647644, + "num_tokens": 274649843.0, + "step": 11012 + }, + { + "epoch": 1.209422358884252, + "grad_norm": 2.2084896564483643, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7119969129562378, + "num_tokens": 274675674.0, + "step": 11013 + }, + { + "epoch": 1.2095321765868658, + "grad_norm": 2.175405740737915, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7118321061134338, + "num_tokens": 274703060.0, + "step": 11014 + }, + { + "epoch": 1.2096419942894794, + "grad_norm": 2.2235474586486816, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7061364650726318, + "num_tokens": 274728652.0, + "step": 11015 + }, + { + "epoch": 1.2097518119920931, + "grad_norm": 2.195117950439453, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7304602265357971, + "num_tokens": 274753721.0, + "step": 11016 + }, + { + "epoch": 1.2098616296947067, + "grad_norm": 2.4334146976470947, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.758019208908081, + "num_tokens": 274775705.0, + "step": 11017 + }, + { + "epoch": 1.2099714473973204, + "grad_norm": 2.4065794944763184, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.713125467300415, + "num_tokens": 274797642.0, + "step": 11018 + }, + { + "epoch": 1.2100812650999342, + "grad_norm": 1.9625911712646484, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7131854295730591, + "num_tokens": 274829466.0, + "step": 11019 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 2.333385944366455, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7205860614776611, + "num_tokens": 274855011.0, + "step": 11020 + }, + { + "epoch": 1.2103009005051615, + "grad_norm": 2.274345636367798, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6904785633087158, + "num_tokens": 274882187.0, + "step": 11021 + }, + { + "epoch": 1.210410718207775, + "grad_norm": 2.1477434635162354, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7433618903160095, + "num_tokens": 274908071.0, + "step": 11022 + }, + { + "epoch": 1.2105205359103888, + "grad_norm": 2.084308624267578, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6962075233459473, + "num_tokens": 274936458.0, + "step": 11023 + }, + { + "epoch": 1.2106303536130025, + "grad_norm": 2.4235386848449707, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.721490740776062, + "num_tokens": 274957940.0, + "step": 11024 + }, + { + "epoch": 1.210740171315616, + "grad_norm": 2.304811477661133, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7277049422264099, + "num_tokens": 274981612.0, + "step": 11025 + }, + { + "epoch": 1.2108499890182298, + "grad_norm": 2.2841107845306396, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.719163715839386, + "num_tokens": 275006720.0, + "step": 11026 + }, + { + "epoch": 1.2109598067208434, + "grad_norm": 2.1407713890075684, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7086368799209595, + "num_tokens": 275035057.0, + "step": 11027 + }, + { + "epoch": 1.2110696244234571, + "grad_norm": 2.5312671661376953, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7421016693115234, + "num_tokens": 275057016.0, + "step": 11028 + }, + { + "epoch": 1.2111794421260706, + "grad_norm": 2.4799208641052246, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7327519059181213, + "num_tokens": 275076865.0, + "step": 11029 + }, + { + "epoch": 1.2112892598286844, + "grad_norm": 2.4484236240386963, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7224560379981995, + "num_tokens": 275100403.0, + "step": 11030 + }, + { + "epoch": 1.211399077531298, + "grad_norm": 1.9955581426620483, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6969186067581177, + "num_tokens": 275132240.0, + "step": 11031 + }, + { + "epoch": 1.2115088952339117, + "grad_norm": 2.4501194953918457, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7133592963218689, + "num_tokens": 275155833.0, + "step": 11032 + }, + { + "epoch": 1.2116187129365255, + "grad_norm": 2.3177874088287354, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7437076568603516, + "num_tokens": 275179309.0, + "step": 11033 + }, + { + "epoch": 1.211728530639139, + "grad_norm": 2.097649097442627, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7044247388839722, + "num_tokens": 275210290.0, + "step": 11034 + }, + { + "epoch": 1.2118383483417527, + "grad_norm": 2.2245097160339355, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7096211314201355, + "num_tokens": 275236161.0, + "step": 11035 + }, + { + "epoch": 1.2119481660443663, + "grad_norm": 2.5898542404174805, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7133902311325073, + "num_tokens": 275257346.0, + "step": 11036 + }, + { + "epoch": 1.21205798374698, + "grad_norm": 2.169463872909546, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7149118781089783, + "num_tokens": 275284762.0, + "step": 11037 + }, + { + "epoch": 1.2121678014495938, + "grad_norm": 2.4870262145996094, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7171300649642944, + "num_tokens": 275307105.0, + "step": 11038 + }, + { + "epoch": 1.2122776191522073, + "grad_norm": 2.681729793548584, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7300820350646973, + "num_tokens": 275326525.0, + "step": 11039 + }, + { + "epoch": 1.212387436854821, + "grad_norm": 2.2269253730773926, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.728635311126709, + "num_tokens": 275352322.0, + "step": 11040 + }, + { + "epoch": 1.2124972545574346, + "grad_norm": 2.4446349143981934, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.728125274181366, + "num_tokens": 275374343.0, + "step": 11041 + }, + { + "epoch": 1.2126070722600484, + "grad_norm": 2.2135961055755615, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7070724368095398, + "num_tokens": 275401050.0, + "step": 11042 + }, + { + "epoch": 1.212716889962662, + "grad_norm": 1.949053168296814, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7300571203231812, + "num_tokens": 275432569.0, + "step": 11043 + }, + { + "epoch": 1.2128267076652757, + "grad_norm": 2.610097885131836, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7183635234832764, + "num_tokens": 275453815.0, + "step": 11044 + }, + { + "epoch": 1.2129365253678892, + "grad_norm": 2.4218125343322754, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7382304668426514, + "num_tokens": 275474208.0, + "step": 11045 + }, + { + "epoch": 1.213046343070503, + "grad_norm": 2.36419939994812, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7262789011001587, + "num_tokens": 275495604.0, + "step": 11046 + }, + { + "epoch": 1.2131561607731167, + "grad_norm": 2.0154998302459717, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7092635035514832, + "num_tokens": 275525049.0, + "step": 11047 + }, + { + "epoch": 1.2132659784757303, + "grad_norm": 2.294796943664551, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7058359980583191, + "num_tokens": 275549508.0, + "step": 11048 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 2.294856309890747, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7289498448371887, + "num_tokens": 275573952.0, + "step": 11049 + }, + { + "epoch": 1.2134856138809575, + "grad_norm": 3.058168649673462, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.730475664138794, + "num_tokens": 275589168.0, + "step": 11050 + }, + { + "epoch": 1.2135954315835713, + "grad_norm": 2.4217379093170166, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6962012052536011, + "num_tokens": 275612939.0, + "step": 11051 + }, + { + "epoch": 1.2137052492861848, + "grad_norm": 2.292734146118164, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7162076830863953, + "num_tokens": 275638178.0, + "step": 11052 + }, + { + "epoch": 1.2138150669887986, + "grad_norm": 2.1880178451538086, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6874617338180542, + "num_tokens": 275666606.0, + "step": 11053 + }, + { + "epoch": 1.2139248846914121, + "grad_norm": 1.9993430376052856, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7264761924743652, + "num_tokens": 275696647.0, + "step": 11054 + }, + { + "epoch": 1.2140347023940259, + "grad_norm": 2.070183753967285, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7115642428398132, + "num_tokens": 275726114.0, + "step": 11055 + }, + { + "epoch": 1.2141445200966396, + "grad_norm": 2.3703393936157227, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7228927612304688, + "num_tokens": 275748881.0, + "step": 11056 + }, + { + "epoch": 1.2142543377992532, + "grad_norm": 2.328249931335449, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.728030800819397, + "num_tokens": 275773399.0, + "step": 11057 + }, + { + "epoch": 1.214364155501867, + "grad_norm": 2.5794010162353516, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7150232791900635, + "num_tokens": 275794127.0, + "step": 11058 + }, + { + "epoch": 1.2144739732044805, + "grad_norm": 2.3873114585876465, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7091033458709717, + "num_tokens": 275817766.0, + "step": 11059 + }, + { + "epoch": 1.2145837909070942, + "grad_norm": 2.1726207733154297, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6908512115478516, + "num_tokens": 275845679.0, + "step": 11060 + }, + { + "epoch": 1.214693608609708, + "grad_norm": 2.4228408336639404, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7280383110046387, + "num_tokens": 275868604.0, + "step": 11061 + }, + { + "epoch": 1.2148034263123215, + "grad_norm": 2.3402395248413086, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7159799337387085, + "num_tokens": 275892176.0, + "step": 11062 + }, + { + "epoch": 1.2149132440149353, + "grad_norm": 2.2979815006256104, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7280515432357788, + "num_tokens": 275915840.0, + "step": 11063 + }, + { + "epoch": 1.2150230617175488, + "grad_norm": 2.5924243927001953, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7405168414115906, + "num_tokens": 275936940.0, + "step": 11064 + }, + { + "epoch": 1.2151328794201626, + "grad_norm": 2.226346254348755, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7275408506393433, + "num_tokens": 275963292.0, + "step": 11065 + }, + { + "epoch": 1.215242697122776, + "grad_norm": 2.3762760162353516, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7116193175315857, + "num_tokens": 275987038.0, + "step": 11066 + }, + { + "epoch": 1.2153525148253899, + "grad_norm": 2.5061757564544678, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7203976511955261, + "num_tokens": 276008986.0, + "step": 11067 + }, + { + "epoch": 1.2154623325280034, + "grad_norm": 2.461625099182129, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7372250556945801, + "num_tokens": 276031809.0, + "step": 11068 + }, + { + "epoch": 1.2155721502306172, + "grad_norm": 2.237877607345581, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7125482559204102, + "num_tokens": 276059252.0, + "step": 11069 + }, + { + "epoch": 1.215681967933231, + "grad_norm": 2.7157907485961914, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7250608205795288, + "num_tokens": 276077056.0, + "step": 11070 + }, + { + "epoch": 1.2157917856358444, + "grad_norm": 2.170078754425049, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7078254818916321, + "num_tokens": 276105054.0, + "step": 11071 + }, + { + "epoch": 1.2159016033384582, + "grad_norm": 2.087205171585083, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7159469723701477, + "num_tokens": 276133672.0, + "step": 11072 + }, + { + "epoch": 1.2160114210410717, + "grad_norm": 2.444837808609009, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7369632124900818, + "num_tokens": 276154890.0, + "step": 11073 + }, + { + "epoch": 1.2161212387436855, + "grad_norm": 2.2168362140655518, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7164261341094971, + "num_tokens": 276181889.0, + "step": 11074 + }, + { + "epoch": 1.2162310564462993, + "grad_norm": 2.569851875305176, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7391433119773865, + "num_tokens": 276203612.0, + "step": 11075 + }, + { + "epoch": 1.2163408741489128, + "grad_norm": 2.3851237297058105, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7184996008872986, + "num_tokens": 276227465.0, + "step": 11076 + }, + { + "epoch": 1.2164506918515265, + "grad_norm": 2.6311309337615967, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7404108643531799, + "num_tokens": 276246670.0, + "step": 11077 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 2.153524160385132, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7303277254104614, + "num_tokens": 276272408.0, + "step": 11078 + }, + { + "epoch": 1.2166703272567538, + "grad_norm": 2.2137389183044434, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7347241640090942, + "num_tokens": 276296983.0, + "step": 11079 + }, + { + "epoch": 1.2167801449593674, + "grad_norm": 2.7961182594299316, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.721345067024231, + "num_tokens": 276316922.0, + "step": 11080 + }, + { + "epoch": 1.2168899626619811, + "grad_norm": 2.4115450382232666, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7256660461425781, + "num_tokens": 276342785.0, + "step": 11081 + }, + { + "epoch": 1.2169997803645947, + "grad_norm": 2.6225662231445312, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7424870729446411, + "num_tokens": 276363017.0, + "step": 11082 + }, + { + "epoch": 1.2171095980672084, + "grad_norm": 2.39888072013855, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7098674774169922, + "num_tokens": 276387430.0, + "step": 11083 + }, + { + "epoch": 1.2172194157698222, + "grad_norm": 2.2114927768707275, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7100974321365356, + "num_tokens": 276414036.0, + "step": 11084 + }, + { + "epoch": 1.2173292334724357, + "grad_norm": 2.322179079055786, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7090516686439514, + "num_tokens": 276439247.0, + "step": 11085 + }, + { + "epoch": 1.2174390511750495, + "grad_norm": 2.604710340499878, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7380894422531128, + "num_tokens": 276459610.0, + "step": 11086 + }, + { + "epoch": 1.217548868877663, + "grad_norm": 2.432896137237549, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7195666432380676, + "num_tokens": 276482097.0, + "step": 11087 + }, + { + "epoch": 1.2176586865802768, + "grad_norm": 2.4280948638916016, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7429848909378052, + "num_tokens": 276505468.0, + "step": 11088 + }, + { + "epoch": 1.2177685042828905, + "grad_norm": 2.491734266281128, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.733591616153717, + "num_tokens": 276525790.0, + "step": 11089 + }, + { + "epoch": 1.217878321985504, + "grad_norm": 2.397920608520508, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.732694149017334, + "num_tokens": 276549998.0, + "step": 11090 + }, + { + "epoch": 1.2179881396881178, + "grad_norm": 2.5007472038269043, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7224433422088623, + "num_tokens": 276570599.0, + "step": 11091 + }, + { + "epoch": 1.2180979573907313, + "grad_norm": 2.371493339538574, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7337184548377991, + "num_tokens": 276592180.0, + "step": 11092 + }, + { + "epoch": 1.218207775093345, + "grad_norm": 2.2015433311462402, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7097083330154419, + "num_tokens": 276618607.0, + "step": 11093 + }, + { + "epoch": 1.2183175927959586, + "grad_norm": 1.8522143363952637, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7040489315986633, + "num_tokens": 276652997.0, + "step": 11094 + }, + { + "epoch": 1.2184274104985724, + "grad_norm": 2.560615062713623, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7279911041259766, + "num_tokens": 276673221.0, + "step": 11095 + }, + { + "epoch": 1.218537228201186, + "grad_norm": 2.437556505203247, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7098090648651123, + "num_tokens": 276696590.0, + "step": 11096 + }, + { + "epoch": 1.2186470459037997, + "grad_norm": 2.417926788330078, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7086955904960632, + "num_tokens": 276720624.0, + "step": 11097 + }, + { + "epoch": 1.2187568636064134, + "grad_norm": 2.39562726020813, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7190591096878052, + "num_tokens": 276746673.0, + "step": 11098 + }, + { + "epoch": 1.218866681309027, + "grad_norm": 2.5354883670806885, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7354303598403931, + "num_tokens": 276766434.0, + "step": 11099 + }, + { + "epoch": 1.2189764990116407, + "grad_norm": 2.5160586833953857, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7318426966667175, + "num_tokens": 276788616.0, + "step": 11100 + }, + { + "epoch": 1.2190863167142543, + "grad_norm": 2.417393684387207, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7110676765441895, + "num_tokens": 276811918.0, + "step": 11101 + }, + { + "epoch": 1.219196134416868, + "grad_norm": 2.3509910106658936, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7241547107696533, + "num_tokens": 276835909.0, + "step": 11102 + }, + { + "epoch": 1.2193059521194818, + "grad_norm": 2.3308608531951904, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7021093368530273, + "num_tokens": 276862360.0, + "step": 11103 + }, + { + "epoch": 1.2194157698220953, + "grad_norm": 2.3558242321014404, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7154032588005066, + "num_tokens": 276885374.0, + "step": 11104 + }, + { + "epoch": 1.219525587524709, + "grad_norm": 2.2402687072753906, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6926106810569763, + "num_tokens": 276909602.0, + "step": 11105 + }, + { + "epoch": 1.2196354052273226, + "grad_norm": 2.44262957572937, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7026549577713013, + "num_tokens": 276932119.0, + "step": 11106 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 2.079587459564209, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.72996985912323, + "num_tokens": 276959380.0, + "step": 11107 + }, + { + "epoch": 1.21985504063255, + "grad_norm": 2.478975296020508, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7103064060211182, + "num_tokens": 276983103.0, + "step": 11108 + }, + { + "epoch": 1.2199648583351637, + "grad_norm": 2.125004291534424, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7110356092453003, + "num_tokens": 277013143.0, + "step": 11109 + }, + { + "epoch": 1.2200746760377772, + "grad_norm": 2.6137731075286865, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7241889238357544, + "num_tokens": 277033147.0, + "step": 11110 + }, + { + "epoch": 1.220184493740391, + "grad_norm": 2.409972667694092, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7257359027862549, + "num_tokens": 277055375.0, + "step": 11111 + }, + { + "epoch": 1.2202943114430047, + "grad_norm": 2.6208925247192383, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7019294500350952, + "num_tokens": 277077079.0, + "step": 11112 + }, + { + "epoch": 1.2204041291456182, + "grad_norm": 2.2426154613494873, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7352913618087769, + "num_tokens": 277102031.0, + "step": 11113 + }, + { + "epoch": 1.220513946848232, + "grad_norm": 2.208648681640625, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7054773569107056, + "num_tokens": 277130528.0, + "step": 11114 + }, + { + "epoch": 1.2206237645508455, + "grad_norm": 2.1766035556793213, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7156834006309509, + "num_tokens": 277157546.0, + "step": 11115 + }, + { + "epoch": 1.2207335822534593, + "grad_norm": 2.595923662185669, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7183904647827148, + "num_tokens": 277177209.0, + "step": 11116 + }, + { + "epoch": 1.2208433999560728, + "grad_norm": 2.2165472507476807, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.727337121963501, + "num_tokens": 277203297.0, + "step": 11117 + }, + { + "epoch": 1.2209532176586866, + "grad_norm": 2.603140115737915, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.724259614944458, + "num_tokens": 277224829.0, + "step": 11118 + }, + { + "epoch": 1.2210630353613001, + "grad_norm": 2.362407684326172, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7350027561187744, + "num_tokens": 277248034.0, + "step": 11119 + }, + { + "epoch": 1.2211728530639139, + "grad_norm": 2.510523796081543, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7085962295532227, + "num_tokens": 277270061.0, + "step": 11120 + }, + { + "epoch": 1.2212826707665276, + "grad_norm": 2.045001268386841, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7218061089515686, + "num_tokens": 277300694.0, + "step": 11121 + }, + { + "epoch": 1.2213924884691412, + "grad_norm": 2.1773793697357178, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7020806074142456, + "num_tokens": 277327873.0, + "step": 11122 + }, + { + "epoch": 1.221502306171755, + "grad_norm": 2.2745635509490967, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7272559404373169, + "num_tokens": 277353000.0, + "step": 11123 + }, + { + "epoch": 1.2216121238743685, + "grad_norm": 2.2639312744140625, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7104470133781433, + "num_tokens": 277379185.0, + "step": 11124 + }, + { + "epoch": 1.2217219415769822, + "grad_norm": 2.429011106491089, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6976262331008911, + "num_tokens": 277402659.0, + "step": 11125 + }, + { + "epoch": 1.221831759279596, + "grad_norm": 2.1310675144195557, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.748384952545166, + "num_tokens": 277430192.0, + "step": 11126 + }, + { + "epoch": 1.2219415769822095, + "grad_norm": 2.193765163421631, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6974557638168335, + "num_tokens": 277458680.0, + "step": 11127 + }, + { + "epoch": 1.2220513946848233, + "grad_norm": 2.3332297801971436, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7139201164245605, + "num_tokens": 277483320.0, + "step": 11128 + }, + { + "epoch": 1.2221612123874368, + "grad_norm": 2.1184985637664795, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7109748125076294, + "num_tokens": 277512499.0, + "step": 11129 + }, + { + "epoch": 1.2222710300900506, + "grad_norm": 2.584916591644287, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7138955593109131, + "num_tokens": 277534024.0, + "step": 11130 + }, + { + "epoch": 1.222380847792664, + "grad_norm": 2.499924659729004, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7265461683273315, + "num_tokens": 277555875.0, + "step": 11131 + }, + { + "epoch": 1.2224906654952779, + "grad_norm": 2.249030113220215, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.6973956227302551, + "num_tokens": 277581804.0, + "step": 11132 + }, + { + "epoch": 1.2226004831978914, + "grad_norm": 2.1997768878936768, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.735011637210846, + "num_tokens": 277606890.0, + "step": 11133 + }, + { + "epoch": 1.2227103009005051, + "grad_norm": 2.4110779762268066, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7321898341178894, + "num_tokens": 277629357.0, + "step": 11134 + }, + { + "epoch": 1.222820118603119, + "grad_norm": 2.3758292198181152, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7310589551925659, + "num_tokens": 277652848.0, + "step": 11135 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 2.177311420440674, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7028762102127075, + "num_tokens": 277681714.0, + "step": 11136 + }, + { + "epoch": 1.2230397540083462, + "grad_norm": 1.8625842332839966, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7232075333595276, + "num_tokens": 277714073.0, + "step": 11137 + }, + { + "epoch": 1.2231495717109597, + "grad_norm": 1.9626814126968384, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6934094429016113, + "num_tokens": 277746425.0, + "step": 11138 + }, + { + "epoch": 1.2232593894135735, + "grad_norm": 2.3251893520355225, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7027828693389893, + "num_tokens": 277770129.0, + "step": 11139 + }, + { + "epoch": 1.2233692071161872, + "grad_norm": 2.197845935821533, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.6998062133789062, + "num_tokens": 277796455.0, + "step": 11140 + }, + { + "epoch": 1.2234790248188008, + "grad_norm": 2.3319380283355713, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7264363765716553, + "num_tokens": 277818966.0, + "step": 11141 + }, + { + "epoch": 1.2235888425214145, + "grad_norm": 2.115281581878662, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7074456810951233, + "num_tokens": 277847104.0, + "step": 11142 + }, + { + "epoch": 1.223698660224028, + "grad_norm": 2.4307029247283936, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7134336233139038, + "num_tokens": 277869977.0, + "step": 11143 + }, + { + "epoch": 1.2238084779266418, + "grad_norm": 2.269365072250366, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7283072471618652, + "num_tokens": 277895608.0, + "step": 11144 + }, + { + "epoch": 1.2239182956292554, + "grad_norm": 2.3013620376586914, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7191988229751587, + "num_tokens": 277921289.0, + "step": 11145 + }, + { + "epoch": 1.2240281133318691, + "grad_norm": 2.540553092956543, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7243431806564331, + "num_tokens": 277943972.0, + "step": 11146 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 2.0321784019470215, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.737476110458374, + "num_tokens": 277970970.0, + "step": 11147 + }, + { + "epoch": 1.2242477487370964, + "grad_norm": 2.2665836811065674, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.745082437992096, + "num_tokens": 277994730.0, + "step": 11148 + }, + { + "epoch": 1.2243575664397102, + "grad_norm": 2.4270246028900146, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7171530723571777, + "num_tokens": 278019514.0, + "step": 11149 + }, + { + "epoch": 1.2244673841423237, + "grad_norm": 2.2379777431488037, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7196639776229858, + "num_tokens": 278045295.0, + "step": 11150 + }, + { + "epoch": 1.2245772018449375, + "grad_norm": 2.461937665939331, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7033635377883911, + "num_tokens": 278068255.0, + "step": 11151 + }, + { + "epoch": 1.224687019547551, + "grad_norm": 2.075756311416626, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7181159257888794, + "num_tokens": 278097323.0, + "step": 11152 + }, + { + "epoch": 1.2247968372501647, + "grad_norm": 2.2833170890808105, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7265346646308899, + "num_tokens": 278121250.0, + "step": 11153 + }, + { + "epoch": 1.2249066549527785, + "grad_norm": 2.1882357597351074, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7262508869171143, + "num_tokens": 278147198.0, + "step": 11154 + }, + { + "epoch": 1.225016472655392, + "grad_norm": 2.1126036643981934, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7168525457382202, + "num_tokens": 278176236.0, + "step": 11155 + }, + { + "epoch": 1.2251262903580058, + "grad_norm": 2.6537957191467285, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7196356058120728, + "num_tokens": 278194765.0, + "step": 11156 + }, + { + "epoch": 1.2252361080606193, + "grad_norm": 2.275151014328003, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6976901888847351, + "num_tokens": 278220167.0, + "step": 11157 + }, + { + "epoch": 1.225345925763233, + "grad_norm": 2.243912935256958, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.718994677066803, + "num_tokens": 278246768.0, + "step": 11158 + }, + { + "epoch": 1.2254557434658466, + "grad_norm": 2.174943685531616, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7104949951171875, + "num_tokens": 278272248.0, + "step": 11159 + }, + { + "epoch": 1.2255655611684604, + "grad_norm": 2.3634092807769775, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.734453022480011, + "num_tokens": 278295512.0, + "step": 11160 + }, + { + "epoch": 1.225675378871074, + "grad_norm": 2.3028171062469482, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7391444444656372, + "num_tokens": 278318509.0, + "step": 11161 + }, + { + "epoch": 1.2257851965736877, + "grad_norm": 2.2145678997039795, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7165793180465698, + "num_tokens": 278343743.0, + "step": 11162 + }, + { + "epoch": 1.2258950142763014, + "grad_norm": 2.283073663711548, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7325685620307922, + "num_tokens": 278368537.0, + "step": 11163 + }, + { + "epoch": 1.226004831978915, + "grad_norm": 2.4837582111358643, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7368848919868469, + "num_tokens": 278389172.0, + "step": 11164 + }, + { + "epoch": 1.2261146496815287, + "grad_norm": 2.5059781074523926, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7108303308486938, + "num_tokens": 278409974.0, + "step": 11165 + }, + { + "epoch": 1.2262244673841423, + "grad_norm": 2.118795871734619, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7162904143333435, + "num_tokens": 278439674.0, + "step": 11166 + }, + { + "epoch": 1.226334285086756, + "grad_norm": 2.6024210453033447, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7275292277336121, + "num_tokens": 278461977.0, + "step": 11167 + }, + { + "epoch": 1.2264441027893696, + "grad_norm": 2.051994562149048, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7197744846343994, + "num_tokens": 278490845.0, + "step": 11168 + }, + { + "epoch": 1.2265539204919833, + "grad_norm": 2.0345232486724854, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7050056457519531, + "num_tokens": 278519934.0, + "step": 11169 + }, + { + "epoch": 1.226663738194597, + "grad_norm": 2.2141923904418945, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7095332145690918, + "num_tokens": 278546091.0, + "step": 11170 + }, + { + "epoch": 1.2267735558972106, + "grad_norm": 2.3952383995056152, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7221323251724243, + "num_tokens": 278569941.0, + "step": 11171 + }, + { + "epoch": 1.2268833735998244, + "grad_norm": 2.2419919967651367, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.719639778137207, + "num_tokens": 278597277.0, + "step": 11172 + }, + { + "epoch": 1.226993191302438, + "grad_norm": 2.133963108062744, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.695785403251648, + "num_tokens": 278627370.0, + "step": 11173 + }, + { + "epoch": 1.2271030090050516, + "grad_norm": 2.2674031257629395, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7172764539718628, + "num_tokens": 278653672.0, + "step": 11174 + }, + { + "epoch": 1.2272128267076652, + "grad_norm": 2.4126906394958496, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7143062949180603, + "num_tokens": 278678953.0, + "step": 11175 + }, + { + "epoch": 1.227322644410279, + "grad_norm": 1.9776229858398438, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7272632122039795, + "num_tokens": 278709191.0, + "step": 11176 + }, + { + "epoch": 1.2274324621128927, + "grad_norm": 2.2088966369628906, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7046391367912292, + "num_tokens": 278736469.0, + "step": 11177 + }, + { + "epoch": 1.2275422798155062, + "grad_norm": 2.236692190170288, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7059330344200134, + "num_tokens": 278765705.0, + "step": 11178 + }, + { + "epoch": 1.22765209751812, + "grad_norm": 2.1331260204315186, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.697928786277771, + "num_tokens": 278796325.0, + "step": 11179 + }, + { + "epoch": 1.2277619152207335, + "grad_norm": 2.21933913230896, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7285321950912476, + "num_tokens": 278821329.0, + "step": 11180 + }, + { + "epoch": 1.2278717329233473, + "grad_norm": 2.09727144241333, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7000051736831665, + "num_tokens": 278850995.0, + "step": 11181 + }, + { + "epoch": 1.2279815506259608, + "grad_norm": 2.4462602138519287, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7238638401031494, + "num_tokens": 278874340.0, + "step": 11182 + }, + { + "epoch": 1.2280913683285746, + "grad_norm": 2.249717950820923, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7203414440155029, + "num_tokens": 278900069.0, + "step": 11183 + }, + { + "epoch": 1.228201186031188, + "grad_norm": 2.3507511615753174, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7051576375961304, + "num_tokens": 278924849.0, + "step": 11184 + }, + { + "epoch": 1.2283110037338019, + "grad_norm": 2.673903226852417, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7318699359893799, + "num_tokens": 278943666.0, + "step": 11185 + }, + { + "epoch": 1.2284208214364156, + "grad_norm": 2.116213798522949, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.730182409286499, + "num_tokens": 278971563.0, + "step": 11186 + }, + { + "epoch": 1.2285306391390292, + "grad_norm": 2.3051815032958984, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7119816541671753, + "num_tokens": 278995671.0, + "step": 11187 + }, + { + "epoch": 1.228640456841643, + "grad_norm": 2.613193988800049, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7043711543083191, + "num_tokens": 279017255.0, + "step": 11188 + }, + { + "epoch": 1.2287502745442564, + "grad_norm": 2.4702858924865723, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7256309986114502, + "num_tokens": 279038634.0, + "step": 11189 + }, + { + "epoch": 1.2288600922468702, + "grad_norm": 2.237521171569824, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7100183367729187, + "num_tokens": 279064588.0, + "step": 11190 + }, + { + "epoch": 1.228969909949484, + "grad_norm": 2.3770220279693604, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7173562049865723, + "num_tokens": 279089261.0, + "step": 11191 + }, + { + "epoch": 1.2290797276520975, + "grad_norm": 2.261775255203247, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7178192138671875, + "num_tokens": 279114713.0, + "step": 11192 + }, + { + "epoch": 1.2291895453547113, + "grad_norm": 2.2273519039154053, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6967295408248901, + "num_tokens": 279139951.0, + "step": 11193 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 2.168577194213867, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.734429657459259, + "num_tokens": 279165583.0, + "step": 11194 + }, + { + "epoch": 1.2294091807599385, + "grad_norm": 2.0617494583129883, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7162597179412842, + "num_tokens": 279192241.0, + "step": 11195 + }, + { + "epoch": 1.229518998462552, + "grad_norm": 2.2708351612091064, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7050341367721558, + "num_tokens": 279217922.0, + "step": 11196 + }, + { + "epoch": 1.2296288161651658, + "grad_norm": 2.3688998222351074, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7439948320388794, + "num_tokens": 279241025.0, + "step": 11197 + }, + { + "epoch": 1.2297386338677794, + "grad_norm": 2.238535165786743, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7093808054924011, + "num_tokens": 279266800.0, + "step": 11198 + }, + { + "epoch": 1.2298484515703931, + "grad_norm": 2.6553163528442383, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7255194187164307, + "num_tokens": 279286162.0, + "step": 11199 + }, + { + "epoch": 1.2299582692730069, + "grad_norm": 2.3208043575286865, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7143338322639465, + "num_tokens": 279310163.0, + "step": 11200 + }, + { + "epoch": 1.2300680869756204, + "grad_norm": 2.2625908851623535, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7157529592514038, + "num_tokens": 279337571.0, + "step": 11201 + }, + { + "epoch": 1.2301779046782342, + "grad_norm": 2.307828903198242, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7068542242050171, + "num_tokens": 279361291.0, + "step": 11202 + }, + { + "epoch": 1.2302877223808477, + "grad_norm": 2.215355396270752, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7238759994506836, + "num_tokens": 279386922.0, + "step": 11203 + }, + { + "epoch": 1.2303975400834615, + "grad_norm": 2.3308780193328857, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.711585521697998, + "num_tokens": 279411139.0, + "step": 11204 + }, + { + "epoch": 1.2305073577860752, + "grad_norm": 1.9994847774505615, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7081524133682251, + "num_tokens": 279444024.0, + "step": 11205 + }, + { + "epoch": 1.2306171754886888, + "grad_norm": 2.2686567306518555, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7321814894676208, + "num_tokens": 279468786.0, + "step": 11206 + }, + { + "epoch": 1.2307269931913025, + "grad_norm": 2.478870391845703, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.699729859828949, + "num_tokens": 279491291.0, + "step": 11207 + }, + { + "epoch": 1.230836810893916, + "grad_norm": 2.015730619430542, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7074805498123169, + "num_tokens": 279521792.0, + "step": 11208 + }, + { + "epoch": 1.2309466285965298, + "grad_norm": 2.1150031089782715, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7371029853820801, + "num_tokens": 279548056.0, + "step": 11209 + }, + { + "epoch": 1.2310564462991433, + "grad_norm": 2.3021628856658936, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7080244421958923, + "num_tokens": 279574701.0, + "step": 11210 + }, + { + "epoch": 1.231166264001757, + "grad_norm": 2.6662237644195557, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7237198352813721, + "num_tokens": 279593326.0, + "step": 11211 + }, + { + "epoch": 1.2312760817043706, + "grad_norm": 2.292422294616699, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.7446725368499756, + "num_tokens": 279617031.0, + "step": 11212 + }, + { + "epoch": 1.2313858994069844, + "grad_norm": 2.466733455657959, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7414674758911133, + "num_tokens": 279639401.0, + "step": 11213 + }, + { + "epoch": 1.2314957171095982, + "grad_norm": 2.24345326423645, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7137559652328491, + "num_tokens": 279663954.0, + "step": 11214 + }, + { + "epoch": 1.2316055348122117, + "grad_norm": 2.1872828006744385, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6979978084564209, + "num_tokens": 279690625.0, + "step": 11215 + }, + { + "epoch": 1.2317153525148254, + "grad_norm": 2.3364930152893066, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7084870934486389, + "num_tokens": 279714241.0, + "step": 11216 + }, + { + "epoch": 1.231825170217439, + "grad_norm": 1.995571255683899, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7202389240264893, + "num_tokens": 279746281.0, + "step": 11217 + }, + { + "epoch": 1.2319349879200527, + "grad_norm": 2.31911563873291, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7271284461021423, + "num_tokens": 279768881.0, + "step": 11218 + }, + { + "epoch": 1.2320448056226665, + "grad_norm": 2.528792142868042, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7272776365280151, + "num_tokens": 279790178.0, + "step": 11219 + }, + { + "epoch": 1.23215462332528, + "grad_norm": 2.1356866359710693, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7361148595809937, + "num_tokens": 279817265.0, + "step": 11220 + }, + { + "epoch": 1.2322644410278938, + "grad_norm": 2.475797414779663, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7287247776985168, + "num_tokens": 279839014.0, + "step": 11221 + }, + { + "epoch": 1.2323742587305073, + "grad_norm": 2.4882185459136963, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7492801547050476, + "num_tokens": 279861168.0, + "step": 11222 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 2.717534303665161, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7362017035484314, + "num_tokens": 279879495.0, + "step": 11223 + }, + { + "epoch": 1.2325938941357346, + "grad_norm": 2.3182170391082764, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.709516167640686, + "num_tokens": 279904641.0, + "step": 11224 + }, + { + "epoch": 1.2327037118383484, + "grad_norm": 2.059882164001465, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7181801795959473, + "num_tokens": 279934121.0, + "step": 11225 + }, + { + "epoch": 1.232813529540962, + "grad_norm": 2.138735771179199, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.712565541267395, + "num_tokens": 279964824.0, + "step": 11226 + }, + { + "epoch": 1.2329233472435757, + "grad_norm": 2.320080280303955, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6928187608718872, + "num_tokens": 279991027.0, + "step": 11227 + }, + { + "epoch": 1.2330331649461894, + "grad_norm": 2.318812608718872, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7246432900428772, + "num_tokens": 280014968.0, + "step": 11228 + }, + { + "epoch": 1.233142982648803, + "grad_norm": 2.2988622188568115, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7190567255020142, + "num_tokens": 280039218.0, + "step": 11229 + }, + { + "epoch": 1.2332528003514167, + "grad_norm": 2.6566476821899414, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7381943464279175, + "num_tokens": 280058317.0, + "step": 11230 + }, + { + "epoch": 1.2333626180540302, + "grad_norm": 2.73112154006958, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7287616729736328, + "num_tokens": 280077677.0, + "step": 11231 + }, + { + "epoch": 1.233472435756644, + "grad_norm": 2.300752878189087, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7187843322753906, + "num_tokens": 280103071.0, + "step": 11232 + }, + { + "epoch": 1.2335822534592575, + "grad_norm": 2.217794418334961, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.713981032371521, + "num_tokens": 280128272.0, + "step": 11233 + }, + { + "epoch": 1.2336920711618713, + "grad_norm": 2.299499988555908, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7341432571411133, + "num_tokens": 280154195.0, + "step": 11234 + }, + { + "epoch": 1.2338018888644848, + "grad_norm": 2.231793165206909, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7370775938034058, + "num_tokens": 280179672.0, + "step": 11235 + }, + { + "epoch": 1.2339117065670986, + "grad_norm": 2.534058094024658, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6974136829376221, + "num_tokens": 280202273.0, + "step": 11236 + }, + { + "epoch": 1.2340215242697123, + "grad_norm": 2.41334867477417, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7116878628730774, + "num_tokens": 280227041.0, + "step": 11237 + }, + { + "epoch": 1.2341313419723259, + "grad_norm": 2.3203272819519043, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7364616394042969, + "num_tokens": 280250635.0, + "step": 11238 + }, + { + "epoch": 1.2342411596749396, + "grad_norm": 2.4676496982574463, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7234139442443848, + "num_tokens": 280271979.0, + "step": 11239 + }, + { + "epoch": 1.2343509773775532, + "grad_norm": 2.755295991897583, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7290600538253784, + "num_tokens": 280288995.0, + "step": 11240 + }, + { + "epoch": 1.234460795080167, + "grad_norm": 2.4322400093078613, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7105151414871216, + "num_tokens": 280313693.0, + "step": 11241 + }, + { + "epoch": 1.2345706127827807, + "grad_norm": 2.136532783508301, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7254120111465454, + "num_tokens": 280340872.0, + "step": 11242 + }, + { + "epoch": 1.2346804304853942, + "grad_norm": 2.191455841064453, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7103680968284607, + "num_tokens": 280368330.0, + "step": 11243 + }, + { + "epoch": 1.234790248188008, + "grad_norm": 2.426898717880249, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.6964975595474243, + "num_tokens": 280391261.0, + "step": 11244 + }, + { + "epoch": 1.2349000658906215, + "grad_norm": 2.3504462242126465, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7318617105484009, + "num_tokens": 280413476.0, + "step": 11245 + }, + { + "epoch": 1.2350098835932353, + "grad_norm": 2.3009841442108154, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7339865565299988, + "num_tokens": 280437874.0, + "step": 11246 + }, + { + "epoch": 1.2351197012958488, + "grad_norm": 2.0642213821411133, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7086775898933411, + "num_tokens": 280468956.0, + "step": 11247 + }, + { + "epoch": 1.2352295189984626, + "grad_norm": 2.208566427230835, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7041610479354858, + "num_tokens": 280495082.0, + "step": 11248 + }, + { + "epoch": 1.235339336701076, + "grad_norm": 2.432375192642212, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7144476175308228, + "num_tokens": 280516750.0, + "step": 11249 + }, + { + "epoch": 1.2354491544036899, + "grad_norm": 2.3936190605163574, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7298808097839355, + "num_tokens": 280539795.0, + "step": 11250 + }, + { + "epoch": 1.2355589721063036, + "grad_norm": 2.0632948875427246, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.729296088218689, + "num_tokens": 280571099.0, + "step": 11251 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 2.449808120727539, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6994776129722595, + "num_tokens": 280594648.0, + "step": 11252 + }, + { + "epoch": 1.235778607511531, + "grad_norm": 2.048527240753174, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7074446678161621, + "num_tokens": 280624635.0, + "step": 11253 + }, + { + "epoch": 1.2358884252141444, + "grad_norm": 2.299097776412964, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7036371231079102, + "num_tokens": 280649737.0, + "step": 11254 + }, + { + "epoch": 1.2359982429167582, + "grad_norm": 2.4375951290130615, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7089464664459229, + "num_tokens": 280674545.0, + "step": 11255 + }, + { + "epoch": 1.236108060619372, + "grad_norm": 2.7430949211120605, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7327495813369751, + "num_tokens": 280692192.0, + "step": 11256 + }, + { + "epoch": 1.2362178783219855, + "grad_norm": 2.272268295288086, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7185591459274292, + "num_tokens": 280716893.0, + "step": 11257 + }, + { + "epoch": 1.2363276960245992, + "grad_norm": 2.2058236598968506, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7139415740966797, + "num_tokens": 280743742.0, + "step": 11258 + }, + { + "epoch": 1.2364375137272128, + "grad_norm": 2.278153896331787, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.713081955909729, + "num_tokens": 280768826.0, + "step": 11259 + }, + { + "epoch": 1.2365473314298265, + "grad_norm": 2.281202554702759, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7118197679519653, + "num_tokens": 280792848.0, + "step": 11260 + }, + { + "epoch": 1.23665714913244, + "grad_norm": 2.2644283771514893, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7166743278503418, + "num_tokens": 280818214.0, + "step": 11261 + }, + { + "epoch": 1.2367669668350538, + "grad_norm": 2.1166372299194336, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7149714231491089, + "num_tokens": 280847671.0, + "step": 11262 + }, + { + "epoch": 1.2368767845376674, + "grad_norm": 2.2146737575531006, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7107617855072021, + "num_tokens": 280875978.0, + "step": 11263 + }, + { + "epoch": 1.2369866022402811, + "grad_norm": 2.378577947616577, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7337136268615723, + "num_tokens": 280899928.0, + "step": 11264 + }, + { + "epoch": 1.2370964199428949, + "grad_norm": 2.249035120010376, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7087191343307495, + "num_tokens": 280925723.0, + "step": 11265 + }, + { + "epoch": 1.2372062376455084, + "grad_norm": 2.6350181102752686, + "learning_rate": 1e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.752207338809967, + "num_tokens": 280945100.0, + "step": 11266 + }, + { + "epoch": 1.2373160553481222, + "grad_norm": 2.4090993404388428, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7180110812187195, + "num_tokens": 280966769.0, + "step": 11267 + }, + { + "epoch": 1.2374258730507357, + "grad_norm": 2.3930461406707764, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7240586280822754, + "num_tokens": 280990032.0, + "step": 11268 + }, + { + "epoch": 1.2375356907533495, + "grad_norm": 2.2730355262756348, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7217850685119629, + "num_tokens": 281015902.0, + "step": 11269 + }, + { + "epoch": 1.2376455084559632, + "grad_norm": 2.2728779315948486, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7187681198120117, + "num_tokens": 281040463.0, + "step": 11270 + }, + { + "epoch": 1.2377553261585768, + "grad_norm": 2.2222793102264404, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7104791402816772, + "num_tokens": 281067374.0, + "step": 11271 + }, + { + "epoch": 1.2378651438611905, + "grad_norm": 2.4187328815460205, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.716899037361145, + "num_tokens": 281090806.0, + "step": 11272 + }, + { + "epoch": 1.237974961563804, + "grad_norm": 2.3726091384887695, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7187682390213013, + "num_tokens": 281114851.0, + "step": 11273 + }, + { + "epoch": 1.2380847792664178, + "grad_norm": 2.502682685852051, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7236775159835815, + "num_tokens": 281135735.0, + "step": 11274 + }, + { + "epoch": 1.2381945969690313, + "grad_norm": 2.4006569385528564, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7303333878517151, + "num_tokens": 281156889.0, + "step": 11275 + }, + { + "epoch": 1.238304414671645, + "grad_norm": 2.1267247200012207, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7128389477729797, + "num_tokens": 281184170.0, + "step": 11276 + }, + { + "epoch": 1.2384142323742586, + "grad_norm": 2.3729116916656494, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.721706211566925, + "num_tokens": 281207021.0, + "step": 11277 + }, + { + "epoch": 1.2385240500768724, + "grad_norm": 2.646416187286377, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7044490575790405, + "num_tokens": 281227080.0, + "step": 11278 + }, + { + "epoch": 1.2386338677794861, + "grad_norm": 2.6477510929107666, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7503889799118042, + "num_tokens": 281245735.0, + "step": 11279 + }, + { + "epoch": 1.2387436854820997, + "grad_norm": 2.5482747554779053, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7214153409004211, + "num_tokens": 281267090.0, + "step": 11280 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 2.021110773086548, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7138220071792603, + "num_tokens": 281298586.0, + "step": 11281 + }, + { + "epoch": 1.238963320887327, + "grad_norm": 2.0371015071868896, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7200440168380737, + "num_tokens": 281326037.0, + "step": 11282 + }, + { + "epoch": 1.2390731385899407, + "grad_norm": 2.650972604751587, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7201728224754333, + "num_tokens": 281345237.0, + "step": 11283 + }, + { + "epoch": 1.2391829562925545, + "grad_norm": 2.2134299278259277, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.723041296005249, + "num_tokens": 281370687.0, + "step": 11284 + }, + { + "epoch": 1.239292773995168, + "grad_norm": 2.369429349899292, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7070207595825195, + "num_tokens": 281396318.0, + "step": 11285 + }, + { + "epoch": 1.2394025916977818, + "grad_norm": 2.1495726108551025, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6787705421447754, + "num_tokens": 281426971.0, + "step": 11286 + }, + { + "epoch": 1.2395124094003953, + "grad_norm": 2.257892370223999, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7082301378250122, + "num_tokens": 281452204.0, + "step": 11287 + }, + { + "epoch": 1.239622227103009, + "grad_norm": 2.5761494636535645, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7322185039520264, + "num_tokens": 281472009.0, + "step": 11288 + }, + { + "epoch": 1.2397320448056226, + "grad_norm": 2.197667360305786, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.713880181312561, + "num_tokens": 281498581.0, + "step": 11289 + }, + { + "epoch": 1.2398418625082364, + "grad_norm": 2.2445969581604004, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7319765090942383, + "num_tokens": 281526431.0, + "step": 11290 + }, + { + "epoch": 1.23995168021085, + "grad_norm": 2.035914897918701, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7378069162368774, + "num_tokens": 281554323.0, + "step": 11291 + }, + { + "epoch": 1.2400614979134637, + "grad_norm": 2.613694429397583, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7279515266418457, + "num_tokens": 281577674.0, + "step": 11292 + }, + { + "epoch": 1.2401713156160774, + "grad_norm": 2.292693853378296, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7281774282455444, + "num_tokens": 281602831.0, + "step": 11293 + }, + { + "epoch": 1.240281133318691, + "grad_norm": 2.4150924682617188, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7060707807540894, + "num_tokens": 281627814.0, + "step": 11294 + }, + { + "epoch": 1.2403909510213047, + "grad_norm": 2.188464641571045, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6928904056549072, + "num_tokens": 281656296.0, + "step": 11295 + }, + { + "epoch": 1.2405007687239182, + "grad_norm": 2.475088357925415, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7391802668571472, + "num_tokens": 281678857.0, + "step": 11296 + }, + { + "epoch": 1.240610586426532, + "grad_norm": 2.3455677032470703, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7478756904602051, + "num_tokens": 281703803.0, + "step": 11297 + }, + { + "epoch": 1.2407204041291455, + "grad_norm": 2.010935068130493, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.705021858215332, + "num_tokens": 281735230.0, + "step": 11298 + }, + { + "epoch": 1.2408302218317593, + "grad_norm": 2.242825508117676, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7165263295173645, + "num_tokens": 281760253.0, + "step": 11299 + }, + { + "epoch": 1.2409400395343728, + "grad_norm": 2.3474667072296143, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7239416837692261, + "num_tokens": 281784712.0, + "step": 11300 + }, + { + "epoch": 1.2410498572369866, + "grad_norm": 2.6625993251800537, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.722594141960144, + "num_tokens": 281805237.0, + "step": 11301 + }, + { + "epoch": 1.2411596749396003, + "grad_norm": 2.3288309574127197, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7273243069648743, + "num_tokens": 281827711.0, + "step": 11302 + }, + { + "epoch": 1.2412694926422139, + "grad_norm": 2.379077434539795, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.709380030632019, + "num_tokens": 281854565.0, + "step": 11303 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 2.195354461669922, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6930899024009705, + "num_tokens": 281882805.0, + "step": 11304 + }, + { + "epoch": 1.2414891280474412, + "grad_norm": 2.3730835914611816, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7181791067123413, + "num_tokens": 281905895.0, + "step": 11305 + }, + { + "epoch": 1.241598945750055, + "grad_norm": 2.4783272743225098, + "learning_rate": 1e-06, + "loss": 0.7876, + "mean_token_accuracy": 0.7478028535842896, + "num_tokens": 281926739.0, + "step": 11306 + }, + { + "epoch": 1.2417087634526687, + "grad_norm": 2.393977642059326, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7205406427383423, + "num_tokens": 281948687.0, + "step": 11307 + }, + { + "epoch": 1.2418185811552822, + "grad_norm": 2.351102590560913, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.738353967666626, + "num_tokens": 281971556.0, + "step": 11308 + }, + { + "epoch": 1.241928398857896, + "grad_norm": 2.146798610687256, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7210773229598999, + "num_tokens": 281998135.0, + "step": 11309 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 2.2130696773529053, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7238091230392456, + "num_tokens": 282024001.0, + "step": 11310 + }, + { + "epoch": 1.2421480342631233, + "grad_norm": 2.4997308254241943, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7039958238601685, + "num_tokens": 282046574.0, + "step": 11311 + }, + { + "epoch": 1.2422578519657368, + "grad_norm": 2.3284358978271484, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7202317714691162, + "num_tokens": 282071493.0, + "step": 11312 + }, + { + "epoch": 1.2423676696683505, + "grad_norm": 2.2232439517974854, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7336591482162476, + "num_tokens": 282099926.0, + "step": 11313 + }, + { + "epoch": 1.242477487370964, + "grad_norm": 1.9612852334976196, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7049626111984253, + "num_tokens": 282128358.0, + "step": 11314 + }, + { + "epoch": 1.2425873050735778, + "grad_norm": 2.4077706336975098, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.727768063545227, + "num_tokens": 282151118.0, + "step": 11315 + }, + { + "epoch": 1.2426971227761916, + "grad_norm": 2.161766290664673, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7197892665863037, + "num_tokens": 282178534.0, + "step": 11316 + }, + { + "epoch": 1.2428069404788051, + "grad_norm": 2.243881940841675, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7439289093017578, + "num_tokens": 282201804.0, + "step": 11317 + }, + { + "epoch": 1.242916758181419, + "grad_norm": 2.444685935974121, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7251611948013306, + "num_tokens": 282223105.0, + "step": 11318 + }, + { + "epoch": 1.2430265758840324, + "grad_norm": 2.2374496459960938, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7415138483047485, + "num_tokens": 282246732.0, + "step": 11319 + }, + { + "epoch": 1.2431363935866462, + "grad_norm": 2.0561208724975586, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6978973150253296, + "num_tokens": 282277029.0, + "step": 11320 + }, + { + "epoch": 1.24324621128926, + "grad_norm": 2.1777706146240234, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7225905656814575, + "num_tokens": 282302264.0, + "step": 11321 + }, + { + "epoch": 1.2433560289918735, + "grad_norm": 2.7067129611968994, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7259761095046997, + "num_tokens": 282322208.0, + "step": 11322 + }, + { + "epoch": 1.2434658466944872, + "grad_norm": 2.116203784942627, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7254709601402283, + "num_tokens": 282351701.0, + "step": 11323 + }, + { + "epoch": 1.2435756643971008, + "grad_norm": 2.0781853199005127, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6995011568069458, + "num_tokens": 282381560.0, + "step": 11324 + }, + { + "epoch": 1.2436854820997145, + "grad_norm": 2.398874521255493, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7226850390434265, + "num_tokens": 282405081.0, + "step": 11325 + }, + { + "epoch": 1.243795299802328, + "grad_norm": 2.267406702041626, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7324983477592468, + "num_tokens": 282434497.0, + "step": 11326 + }, + { + "epoch": 1.2439051175049418, + "grad_norm": 2.24544095993042, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7165837287902832, + "num_tokens": 282461632.0, + "step": 11327 + }, + { + "epoch": 1.2440149352075554, + "grad_norm": 2.2893331050872803, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7249643206596375, + "num_tokens": 282485478.0, + "step": 11328 + }, + { + "epoch": 1.244124752910169, + "grad_norm": 2.2754712104797363, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7162940502166748, + "num_tokens": 282510202.0, + "step": 11329 + }, + { + "epoch": 1.2442345706127829, + "grad_norm": 2.245666027069092, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7279514074325562, + "num_tokens": 282536774.0, + "step": 11330 + }, + { + "epoch": 1.2443443883153964, + "grad_norm": 2.6915194988250732, + "learning_rate": 1e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7347942590713501, + "num_tokens": 282556919.0, + "step": 11331 + }, + { + "epoch": 1.2444542060180102, + "grad_norm": 2.1502630710601807, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7194802165031433, + "num_tokens": 282584915.0, + "step": 11332 + }, + { + "epoch": 1.2445640237206237, + "grad_norm": 2.2092840671539307, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7263870239257812, + "num_tokens": 282611686.0, + "step": 11333 + }, + { + "epoch": 1.2446738414232374, + "grad_norm": 2.4778525829315186, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7253975868225098, + "num_tokens": 282631597.0, + "step": 11334 + }, + { + "epoch": 1.2447836591258512, + "grad_norm": 2.422390937805176, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7217360734939575, + "num_tokens": 282655331.0, + "step": 11335 + }, + { + "epoch": 1.2448934768284647, + "grad_norm": 2.317077159881592, + "learning_rate": 1e-06, + "loss": 0.8291, + "mean_token_accuracy": 0.733900249004364, + "num_tokens": 282679837.0, + "step": 11336 + }, + { + "epoch": 1.2450032945310785, + "grad_norm": 2.214494228363037, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7274397015571594, + "num_tokens": 282706191.0, + "step": 11337 + }, + { + "epoch": 1.245113112233692, + "grad_norm": 2.274522304534912, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7027823328971863, + "num_tokens": 282731778.0, + "step": 11338 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 2.4104979038238525, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7288854718208313, + "num_tokens": 282751717.0, + "step": 11339 + }, + { + "epoch": 1.2453327476389193, + "grad_norm": 2.359966278076172, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7259447574615479, + "num_tokens": 282775904.0, + "step": 11340 + }, + { + "epoch": 1.245442565341533, + "grad_norm": 2.276125192642212, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.711388885974884, + "num_tokens": 282799583.0, + "step": 11341 + }, + { + "epoch": 1.2455523830441466, + "grad_norm": 2.010870933532715, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7022895216941833, + "num_tokens": 282828756.0, + "step": 11342 + }, + { + "epoch": 1.2456622007467604, + "grad_norm": 2.6456854343414307, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7494903802871704, + "num_tokens": 282848473.0, + "step": 11343 + }, + { + "epoch": 1.2457720184493741, + "grad_norm": 2.5999484062194824, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6966290473937988, + "num_tokens": 282871119.0, + "step": 11344 + }, + { + "epoch": 1.2458818361519877, + "grad_norm": 2.2313249111175537, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7137627601623535, + "num_tokens": 282896729.0, + "step": 11345 + }, + { + "epoch": 1.2459916538546014, + "grad_norm": 2.3339273929595947, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.717037558555603, + "num_tokens": 282919939.0, + "step": 11346 + }, + { + "epoch": 1.246101471557215, + "grad_norm": 2.4621706008911133, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7376376986503601, + "num_tokens": 282941691.0, + "step": 11347 + }, + { + "epoch": 1.2462112892598287, + "grad_norm": 2.169210195541382, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7262765765190125, + "num_tokens": 282968378.0, + "step": 11348 + }, + { + "epoch": 1.2463211069624422, + "grad_norm": 2.453385829925537, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7000182271003723, + "num_tokens": 282992042.0, + "step": 11349 + }, + { + "epoch": 1.246430924665056, + "grad_norm": 2.062788248062134, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7007790803909302, + "num_tokens": 283022057.0, + "step": 11350 + }, + { + "epoch": 1.2465407423676698, + "grad_norm": 2.162289619445801, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7159589529037476, + "num_tokens": 283048701.0, + "step": 11351 + }, + { + "epoch": 1.2466505600702833, + "grad_norm": 2.33872652053833, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7080966234207153, + "num_tokens": 283072540.0, + "step": 11352 + }, + { + "epoch": 1.246760377772897, + "grad_norm": 2.4437053203582764, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7393674850463867, + "num_tokens": 283094880.0, + "step": 11353 + }, + { + "epoch": 1.2468701954755106, + "grad_norm": 2.3524744510650635, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7151932716369629, + "num_tokens": 283118789.0, + "step": 11354 + }, + { + "epoch": 1.2469800131781243, + "grad_norm": 2.5289204120635986, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7336689233779907, + "num_tokens": 283138600.0, + "step": 11355 + }, + { + "epoch": 1.2470898308807379, + "grad_norm": 2.1649363040924072, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7131167054176331, + "num_tokens": 283166866.0, + "step": 11356 + }, + { + "epoch": 1.2471996485833516, + "grad_norm": 1.9863797426223755, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6984869241714478, + "num_tokens": 283200198.0, + "step": 11357 + }, + { + "epoch": 1.2473094662859654, + "grad_norm": 2.2228715419769287, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7215301990509033, + "num_tokens": 283226520.0, + "step": 11358 + }, + { + "epoch": 1.247419283988579, + "grad_norm": 2.210493564605713, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7254021763801575, + "num_tokens": 283251112.0, + "step": 11359 + }, + { + "epoch": 1.2475291016911927, + "grad_norm": 2.1008756160736084, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7051241993904114, + "num_tokens": 283280541.0, + "step": 11360 + }, + { + "epoch": 1.2476389193938062, + "grad_norm": 2.4269790649414062, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7374335527420044, + "num_tokens": 283302620.0, + "step": 11361 + }, + { + "epoch": 1.24774873709642, + "grad_norm": 2.061614513397217, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.711165189743042, + "num_tokens": 283330835.0, + "step": 11362 + }, + { + "epoch": 1.2478585547990335, + "grad_norm": 2.6666061878204346, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7528244256973267, + "num_tokens": 283349515.0, + "step": 11363 + }, + { + "epoch": 1.2479683725016473, + "grad_norm": 2.4857945442199707, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7247960567474365, + "num_tokens": 283371545.0, + "step": 11364 + }, + { + "epoch": 1.2480781902042608, + "grad_norm": 2.677107810974121, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7350013852119446, + "num_tokens": 283389728.0, + "step": 11365 + }, + { + "epoch": 1.2481880079068746, + "grad_norm": 2.260174512863159, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7228105068206787, + "num_tokens": 283413766.0, + "step": 11366 + }, + { + "epoch": 1.2482978256094883, + "grad_norm": 2.092771291732788, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7101497650146484, + "num_tokens": 283443944.0, + "step": 11367 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 2.1161346435546875, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7325464487075806, + "num_tokens": 283471041.0, + "step": 11368 + }, + { + "epoch": 1.2485174610147156, + "grad_norm": 2.478286027908325, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.729175865650177, + "num_tokens": 283492156.0, + "step": 11369 + }, + { + "epoch": 1.2486272787173291, + "grad_norm": 2.3565382957458496, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7062176465988159, + "num_tokens": 283516308.0, + "step": 11370 + }, + { + "epoch": 1.248737096419943, + "grad_norm": 2.101480722427368, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7089105844497681, + "num_tokens": 283546280.0, + "step": 11371 + }, + { + "epoch": 1.2488469141225567, + "grad_norm": 2.166486978530884, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7272711396217346, + "num_tokens": 283574557.0, + "step": 11372 + }, + { + "epoch": 1.2489567318251702, + "grad_norm": 2.2512831687927246, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6992120742797852, + "num_tokens": 283599627.0, + "step": 11373 + }, + { + "epoch": 1.249066549527784, + "grad_norm": 2.416508436203003, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7552834749221802, + "num_tokens": 283621990.0, + "step": 11374 + }, + { + "epoch": 1.2491763672303975, + "grad_norm": 2.4414429664611816, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.724155068397522, + "num_tokens": 283643824.0, + "step": 11375 + }, + { + "epoch": 1.2492861849330112, + "grad_norm": 2.2148141860961914, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6891368627548218, + "num_tokens": 283671291.0, + "step": 11376 + }, + { + "epoch": 1.2493960026356248, + "grad_norm": 2.455728530883789, + "learning_rate": 1e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7453733682632446, + "num_tokens": 283692347.0, + "step": 11377 + }, + { + "epoch": 1.2495058203382385, + "grad_norm": 2.5632035732269287, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7212249040603638, + "num_tokens": 283713250.0, + "step": 11378 + }, + { + "epoch": 1.249615638040852, + "grad_norm": 2.0836544036865234, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7132666707038879, + "num_tokens": 283740486.0, + "step": 11379 + }, + { + "epoch": 1.2497254557434658, + "grad_norm": 2.192490577697754, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7311372756958008, + "num_tokens": 283765702.0, + "step": 11380 + }, + { + "epoch": 1.2498352734460796, + "grad_norm": 2.224607229232788, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7082961201667786, + "num_tokens": 283791015.0, + "step": 11381 + }, + { + "epoch": 1.2499450911486931, + "grad_norm": 2.0904386043548584, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7097352147102356, + "num_tokens": 283817650.0, + "step": 11382 + }, + { + "epoch": 1.2500549088513069, + "grad_norm": 2.157221794128418, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7064074277877808, + "num_tokens": 283845235.0, + "step": 11383 + }, + { + "epoch": 1.2501647265539204, + "grad_norm": 2.07784366607666, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7226678133010864, + "num_tokens": 283870997.0, + "step": 11384 + }, + { + "epoch": 1.2502745442565342, + "grad_norm": 2.3467180728912354, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7196900844573975, + "num_tokens": 283894312.0, + "step": 11385 + }, + { + "epoch": 1.250384361959148, + "grad_norm": 2.703535556793213, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7191376090049744, + "num_tokens": 283915075.0, + "step": 11386 + }, + { + "epoch": 1.2504941796617615, + "grad_norm": 2.3958640098571777, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7171005010604858, + "num_tokens": 283936932.0, + "step": 11387 + }, + { + "epoch": 1.2506039973643752, + "grad_norm": 2.4554576873779297, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7205845713615417, + "num_tokens": 283958525.0, + "step": 11388 + }, + { + "epoch": 1.2507138150669888, + "grad_norm": 2.3369603157043457, + "learning_rate": 1e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7512696981430054, + "num_tokens": 283982880.0, + "step": 11389 + }, + { + "epoch": 1.2508236327696025, + "grad_norm": 2.000542402267456, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6844557523727417, + "num_tokens": 284015164.0, + "step": 11390 + }, + { + "epoch": 1.250933450472216, + "grad_norm": 2.1507482528686523, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7075502872467041, + "num_tokens": 284045679.0, + "step": 11391 + }, + { + "epoch": 1.2510432681748298, + "grad_norm": 2.300720691680908, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.728781521320343, + "num_tokens": 284068960.0, + "step": 11392 + }, + { + "epoch": 1.2511530858774433, + "grad_norm": 2.732314109802246, + "learning_rate": 1e-06, + "loss": 0.7345, + "mean_token_accuracy": 0.7551025748252869, + "num_tokens": 284086414.0, + "step": 11393 + }, + { + "epoch": 1.251262903580057, + "grad_norm": 2.212287187576294, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7412487268447876, + "num_tokens": 284112126.0, + "step": 11394 + }, + { + "epoch": 1.2513727212826709, + "grad_norm": 2.4989991188049316, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7336199283599854, + "num_tokens": 284133020.0, + "step": 11395 + }, + { + "epoch": 1.2514825389852844, + "grad_norm": 2.516958713531494, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7144836187362671, + "num_tokens": 284156851.0, + "step": 11396 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 2.4873199462890625, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.743438184261322, + "num_tokens": 284178866.0, + "step": 11397 + }, + { + "epoch": 1.2517021743905117, + "grad_norm": 2.0758447647094727, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7143716812133789, + "num_tokens": 284208221.0, + "step": 11398 + }, + { + "epoch": 1.2518119920931254, + "grad_norm": 2.264909505844116, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7277368307113647, + "num_tokens": 284234945.0, + "step": 11399 + }, + { + "epoch": 1.2519218097957392, + "grad_norm": 2.3952131271362305, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7314035892486572, + "num_tokens": 284258312.0, + "step": 11400 + }, + { + "epoch": 1.2520316274983527, + "grad_norm": 2.1696438789367676, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7113019227981567, + "num_tokens": 284285089.0, + "step": 11401 + }, + { + "epoch": 1.2521414452009663, + "grad_norm": 2.264820098876953, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7046200633049011, + "num_tokens": 284310746.0, + "step": 11402 + }, + { + "epoch": 1.25225126290358, + "grad_norm": 2.501568555831909, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7199389338493347, + "num_tokens": 284333181.0, + "step": 11403 + }, + { + "epoch": 1.2523610806061938, + "grad_norm": 2.443405866622925, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7311200499534607, + "num_tokens": 284354210.0, + "step": 11404 + }, + { + "epoch": 1.2524708983088073, + "grad_norm": 2.164180278778076, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7204223871231079, + "num_tokens": 284380759.0, + "step": 11405 + }, + { + "epoch": 1.252580716011421, + "grad_norm": 2.3255064487457275, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7347620129585266, + "num_tokens": 284405080.0, + "step": 11406 + }, + { + "epoch": 1.2526905337140346, + "grad_norm": 2.386950731277466, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7319740653038025, + "num_tokens": 284429494.0, + "step": 11407 + }, + { + "epoch": 1.2528003514166484, + "grad_norm": 2.1439712047576904, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7425524592399597, + "num_tokens": 284457850.0, + "step": 11408 + }, + { + "epoch": 1.2529101691192621, + "grad_norm": 2.5162644386291504, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7298934459686279, + "num_tokens": 284477913.0, + "step": 11409 + }, + { + "epoch": 1.2530199868218757, + "grad_norm": 2.4014739990234375, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.721031665802002, + "num_tokens": 284500719.0, + "step": 11410 + }, + { + "epoch": 1.2531298045244894, + "grad_norm": 2.1567835807800293, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7220537662506104, + "num_tokens": 284526819.0, + "step": 11411 + }, + { + "epoch": 1.253239622227103, + "grad_norm": 2.5329575538635254, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7004112005233765, + "num_tokens": 284548647.0, + "step": 11412 + }, + { + "epoch": 1.2533494399297167, + "grad_norm": 2.2288804054260254, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7345094680786133, + "num_tokens": 284574540.0, + "step": 11413 + }, + { + "epoch": 1.2534592576323305, + "grad_norm": 2.2358486652374268, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7211042642593384, + "num_tokens": 284601278.0, + "step": 11414 + }, + { + "epoch": 1.253569075334944, + "grad_norm": 2.127105712890625, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7229695320129395, + "num_tokens": 284630329.0, + "step": 11415 + }, + { + "epoch": 1.2536788930375575, + "grad_norm": 2.230278253555298, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7080504894256592, + "num_tokens": 284658724.0, + "step": 11416 + }, + { + "epoch": 1.2537887107401713, + "grad_norm": 2.0723018646240234, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7158740758895874, + "num_tokens": 284687320.0, + "step": 11417 + }, + { + "epoch": 1.253898528442785, + "grad_norm": 2.1938161849975586, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.727269172668457, + "num_tokens": 284713349.0, + "step": 11418 + }, + { + "epoch": 1.2540083461453986, + "grad_norm": 2.307879686355591, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7219418883323669, + "num_tokens": 284738453.0, + "step": 11419 + }, + { + "epoch": 1.2541181638480123, + "grad_norm": 2.100172758102417, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7174583673477173, + "num_tokens": 284766033.0, + "step": 11420 + }, + { + "epoch": 1.2542279815506259, + "grad_norm": 2.128235101699829, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7190291881561279, + "num_tokens": 284795020.0, + "step": 11421 + }, + { + "epoch": 1.2543377992532396, + "grad_norm": 2.118180274963379, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7134978771209717, + "num_tokens": 284824188.0, + "step": 11422 + }, + { + "epoch": 1.2544476169558534, + "grad_norm": 2.19606351852417, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7254990935325623, + "num_tokens": 284849865.0, + "step": 11423 + }, + { + "epoch": 1.254557434658467, + "grad_norm": 2.0916874408721924, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.724563717842102, + "num_tokens": 284877293.0, + "step": 11424 + }, + { + "epoch": 1.2546672523610807, + "grad_norm": 2.449833631515503, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7445200681686401, + "num_tokens": 284897920.0, + "step": 11425 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 2.5331594944000244, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7014478445053101, + "num_tokens": 284920199.0, + "step": 11426 + }, + { + "epoch": 1.254886887766308, + "grad_norm": 2.159736156463623, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.716065526008606, + "num_tokens": 284948312.0, + "step": 11427 + }, + { + "epoch": 1.2549967054689217, + "grad_norm": 2.531768560409546, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7360605001449585, + "num_tokens": 284967701.0, + "step": 11428 + }, + { + "epoch": 1.2551065231715353, + "grad_norm": 2.299117088317871, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7285348773002625, + "num_tokens": 284993304.0, + "step": 11429 + }, + { + "epoch": 1.2552163408741488, + "grad_norm": 2.0639450550079346, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6950458288192749, + "num_tokens": 285024728.0, + "step": 11430 + }, + { + "epoch": 1.2553261585767626, + "grad_norm": 2.6235504150390625, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7269613742828369, + "num_tokens": 285043451.0, + "step": 11431 + }, + { + "epoch": 1.2554359762793763, + "grad_norm": 2.436537981033325, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7338628172874451, + "num_tokens": 285066251.0, + "step": 11432 + }, + { + "epoch": 1.2555457939819898, + "grad_norm": 2.2774670124053955, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7294015884399414, + "num_tokens": 285091642.0, + "step": 11433 + }, + { + "epoch": 1.2556556116846036, + "grad_norm": 2.2252275943756104, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7150894403457642, + "num_tokens": 285116887.0, + "step": 11434 + }, + { + "epoch": 1.2557654293872171, + "grad_norm": 2.255711793899536, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7264852523803711, + "num_tokens": 285141539.0, + "step": 11435 + }, + { + "epoch": 1.255875247089831, + "grad_norm": 2.008957624435425, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6921234726905823, + "num_tokens": 285176176.0, + "step": 11436 + }, + { + "epoch": 1.2559850647924446, + "grad_norm": 2.2275030612945557, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7289491891860962, + "num_tokens": 285201309.0, + "step": 11437 + }, + { + "epoch": 1.2560948824950582, + "grad_norm": 2.1269357204437256, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7090861797332764, + "num_tokens": 285230347.0, + "step": 11438 + }, + { + "epoch": 1.256204700197672, + "grad_norm": 2.15421199798584, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7047559022903442, + "num_tokens": 285260109.0, + "step": 11439 + }, + { + "epoch": 1.2563145179002855, + "grad_norm": 2.5867393016815186, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7063847780227661, + "num_tokens": 285281013.0, + "step": 11440 + }, + { + "epoch": 1.2564243356028992, + "grad_norm": 2.225456714630127, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7172618508338928, + "num_tokens": 285305365.0, + "step": 11441 + }, + { + "epoch": 1.2565341533055128, + "grad_norm": 2.588245391845703, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.729609489440918, + "num_tokens": 285326313.0, + "step": 11442 + }, + { + "epoch": 1.2566439710081265, + "grad_norm": 2.0515644550323486, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7152103185653687, + "num_tokens": 285355411.0, + "step": 11443 + }, + { + "epoch": 1.25675378871074, + "grad_norm": 2.5524933338165283, + "learning_rate": 1e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7514148950576782, + "num_tokens": 285373769.0, + "step": 11444 + }, + { + "epoch": 1.2568636064133538, + "grad_norm": 2.233069896697998, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7327419519424438, + "num_tokens": 285397982.0, + "step": 11445 + }, + { + "epoch": 1.2569734241159676, + "grad_norm": 2.1900086402893066, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7349010705947876, + "num_tokens": 285423781.0, + "step": 11446 + }, + { + "epoch": 1.257083241818581, + "grad_norm": 2.1356191635131836, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7268291711807251, + "num_tokens": 285452455.0, + "step": 11447 + }, + { + "epoch": 1.2571930595211949, + "grad_norm": 2.260448694229126, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7130938768386841, + "num_tokens": 285477507.0, + "step": 11448 + }, + { + "epoch": 1.2573028772238084, + "grad_norm": 2.117112636566162, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7257674932479858, + "num_tokens": 285504786.0, + "step": 11449 + }, + { + "epoch": 1.2574126949264222, + "grad_norm": 2.511432647705078, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7257853746414185, + "num_tokens": 285525094.0, + "step": 11450 + }, + { + "epoch": 1.257522512629036, + "grad_norm": 1.9778156280517578, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.713426947593689, + "num_tokens": 285556677.0, + "step": 11451 + }, + { + "epoch": 1.2576323303316495, + "grad_norm": 2.4677422046661377, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7323316931724548, + "num_tokens": 285578219.0, + "step": 11452 + }, + { + "epoch": 1.257742148034263, + "grad_norm": 2.348759651184082, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7097413539886475, + "num_tokens": 285603046.0, + "step": 11453 + }, + { + "epoch": 1.2578519657368767, + "grad_norm": 2.7885513305664062, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7415119409561157, + "num_tokens": 285620780.0, + "step": 11454 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 2.154693126678467, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7196524143218994, + "num_tokens": 285646954.0, + "step": 11455 + }, + { + "epoch": 1.258071601142104, + "grad_norm": 2.218137264251709, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7274682521820068, + "num_tokens": 285673624.0, + "step": 11456 + }, + { + "epoch": 1.2581814188447178, + "grad_norm": 2.351780414581299, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7215803265571594, + "num_tokens": 285698873.0, + "step": 11457 + }, + { + "epoch": 1.2582912365473313, + "grad_norm": 2.111985921859741, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7295503616333008, + "num_tokens": 285727644.0, + "step": 11458 + }, + { + "epoch": 1.258401054249945, + "grad_norm": 1.9326860904693604, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.734052836894989, + "num_tokens": 285759630.0, + "step": 11459 + }, + { + "epoch": 1.2585108719525588, + "grad_norm": 1.9992179870605469, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7078539133071899, + "num_tokens": 285792338.0, + "step": 11460 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 2.1047606468200684, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7347018718719482, + "num_tokens": 285820951.0, + "step": 11461 + }, + { + "epoch": 1.2587305073577861, + "grad_norm": 2.4156978130340576, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7086296081542969, + "num_tokens": 285842941.0, + "step": 11462 + }, + { + "epoch": 1.2588403250603997, + "grad_norm": 2.3106138706207275, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7207853198051453, + "num_tokens": 285868820.0, + "step": 11463 + }, + { + "epoch": 1.2589501427630134, + "grad_norm": 1.9969407320022583, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6946079730987549, + "num_tokens": 285901631.0, + "step": 11464 + }, + { + "epoch": 1.2590599604656272, + "grad_norm": 2.607930898666382, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7440243363380432, + "num_tokens": 285920960.0, + "step": 11465 + }, + { + "epoch": 1.2591697781682407, + "grad_norm": 2.302809238433838, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7342847585678101, + "num_tokens": 285945120.0, + "step": 11466 + }, + { + "epoch": 1.2592795958708543, + "grad_norm": 2.0487053394317627, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.696727454662323, + "num_tokens": 285974666.0, + "step": 11467 + }, + { + "epoch": 1.259389413573468, + "grad_norm": 2.2633883953094482, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7411041259765625, + "num_tokens": 285996101.0, + "step": 11468 + }, + { + "epoch": 1.2594992312760818, + "grad_norm": 2.114114999771118, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7171447277069092, + "num_tokens": 286021193.0, + "step": 11469 + }, + { + "epoch": 1.2596090489786953, + "grad_norm": 2.3307275772094727, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6969597935676575, + "num_tokens": 286046032.0, + "step": 11470 + }, + { + "epoch": 1.259718866681309, + "grad_norm": 2.3058741092681885, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7492943406105042, + "num_tokens": 286070895.0, + "step": 11471 + }, + { + "epoch": 1.2598286843839226, + "grad_norm": 2.0978353023529053, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7337982058525085, + "num_tokens": 286099240.0, + "step": 11472 + }, + { + "epoch": 1.2599385020865363, + "grad_norm": 2.09338116645813, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7114280462265015, + "num_tokens": 286125810.0, + "step": 11473 + }, + { + "epoch": 1.26004831978915, + "grad_norm": 2.1385915279388428, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7284143567085266, + "num_tokens": 286151451.0, + "step": 11474 + }, + { + "epoch": 1.2601581374917636, + "grad_norm": 2.2075328826904297, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7286151051521301, + "num_tokens": 286177658.0, + "step": 11475 + }, + { + "epoch": 1.2602679551943774, + "grad_norm": 2.173778533935547, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7076540589332581, + "num_tokens": 286206820.0, + "step": 11476 + }, + { + "epoch": 1.260377772896991, + "grad_norm": 2.4707796573638916, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7421520948410034, + "num_tokens": 286228549.0, + "step": 11477 + }, + { + "epoch": 1.2604875905996047, + "grad_norm": 2.104999542236328, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7264416217803955, + "num_tokens": 286256824.0, + "step": 11478 + }, + { + "epoch": 1.2605974083022184, + "grad_norm": 2.690650224685669, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7080227732658386, + "num_tokens": 286277091.0, + "step": 11479 + }, + { + "epoch": 1.260707226004832, + "grad_norm": 2.380918025970459, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7337259650230408, + "num_tokens": 286299388.0, + "step": 11480 + }, + { + "epoch": 1.2608170437074455, + "grad_norm": 2.6909079551696777, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7169865369796753, + "num_tokens": 286319465.0, + "step": 11481 + }, + { + "epoch": 1.2609268614100593, + "grad_norm": 2.182384967803955, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7160912156105042, + "num_tokens": 286347007.0, + "step": 11482 + }, + { + "epoch": 1.261036679112673, + "grad_norm": 2.2206711769104004, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7049306631088257, + "num_tokens": 286373680.0, + "step": 11483 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 2.120628595352173, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7145278453826904, + "num_tokens": 286401774.0, + "step": 11484 + }, + { + "epoch": 1.2612563145179003, + "grad_norm": 2.262986183166504, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7071079611778259, + "num_tokens": 286428173.0, + "step": 11485 + }, + { + "epoch": 1.2613661322205139, + "grad_norm": 2.434767246246338, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7290650606155396, + "num_tokens": 286451163.0, + "step": 11486 + }, + { + "epoch": 1.2614759499231276, + "grad_norm": 2.295413017272949, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7442389726638794, + "num_tokens": 286475634.0, + "step": 11487 + }, + { + "epoch": 1.2615857676257414, + "grad_norm": 1.9962644577026367, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7191483378410339, + "num_tokens": 286507610.0, + "step": 11488 + }, + { + "epoch": 1.261695585328355, + "grad_norm": 2.416567087173462, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7508089542388916, + "num_tokens": 286526550.0, + "step": 11489 + }, + { + "epoch": 1.2618054030309687, + "grad_norm": 2.3593389987945557, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7159039974212646, + "num_tokens": 286549853.0, + "step": 11490 + }, + { + "epoch": 1.2619152207335822, + "grad_norm": 2.5823819637298584, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7267417907714844, + "num_tokens": 286568933.0, + "step": 11491 + }, + { + "epoch": 1.262025038436196, + "grad_norm": 2.3784573078155518, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.7618409991264343, + "num_tokens": 286589693.0, + "step": 11492 + }, + { + "epoch": 1.2621348561388097, + "grad_norm": 2.2446610927581787, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7105146646499634, + "num_tokens": 286616416.0, + "step": 11493 + }, + { + "epoch": 1.2622446738414232, + "grad_norm": 2.220136880874634, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7032380104064941, + "num_tokens": 286645122.0, + "step": 11494 + }, + { + "epoch": 1.2623544915440368, + "grad_norm": 2.290339708328247, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7180781960487366, + "num_tokens": 286672342.0, + "step": 11495 + }, + { + "epoch": 1.2624643092466505, + "grad_norm": 2.3458597660064697, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7064454555511475, + "num_tokens": 286697590.0, + "step": 11496 + }, + { + "epoch": 1.2625741269492643, + "grad_norm": 2.2343783378601074, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6884135007858276, + "num_tokens": 286723164.0, + "step": 11497 + }, + { + "epoch": 1.2626839446518778, + "grad_norm": 2.5398142337799072, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7286816835403442, + "num_tokens": 286743453.0, + "step": 11498 + }, + { + "epoch": 1.2627937623544916, + "grad_norm": 2.335329532623291, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7192555069923401, + "num_tokens": 286766189.0, + "step": 11499 + }, + { + "epoch": 1.2629035800571051, + "grad_norm": 2.39900541305542, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.709632158279419, + "num_tokens": 286788923.0, + "step": 11500 + }, + { + "epoch": 1.2630133977597189, + "grad_norm": 2.334132671356201, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7266699075698853, + "num_tokens": 286813610.0, + "step": 11501 + }, + { + "epoch": 1.2631232154623326, + "grad_norm": 2.338963508605957, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7289457321166992, + "num_tokens": 286838634.0, + "step": 11502 + }, + { + "epoch": 1.2632330331649462, + "grad_norm": 2.5264151096343994, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.719465434551239, + "num_tokens": 286860889.0, + "step": 11503 + }, + { + "epoch": 1.26334285086756, + "grad_norm": 2.0968246459960938, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7409994602203369, + "num_tokens": 286888709.0, + "step": 11504 + }, + { + "epoch": 1.2634526685701735, + "grad_norm": 2.284161329269409, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7164932489395142, + "num_tokens": 286914047.0, + "step": 11505 + }, + { + "epoch": 1.2635624862727872, + "grad_norm": 2.4385287761688232, + "learning_rate": 1e-06, + "loss": 0.7493, + "mean_token_accuracy": 0.7562494277954102, + "num_tokens": 286936242.0, + "step": 11506 + }, + { + "epoch": 1.2636723039754008, + "grad_norm": 2.43881893157959, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7325290441513062, + "num_tokens": 286957588.0, + "step": 11507 + }, + { + "epoch": 1.2637821216780145, + "grad_norm": 2.1764512062072754, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7101900577545166, + "num_tokens": 286984894.0, + "step": 11508 + }, + { + "epoch": 1.263891939380628, + "grad_norm": 2.388599157333374, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7134972810745239, + "num_tokens": 287007307.0, + "step": 11509 + }, + { + "epoch": 1.2640017570832418, + "grad_norm": 2.224916458129883, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7070643901824951, + "num_tokens": 287034331.0, + "step": 11510 + }, + { + "epoch": 1.2641115747858556, + "grad_norm": 2.7380080223083496, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7412625551223755, + "num_tokens": 287052690.0, + "step": 11511 + }, + { + "epoch": 1.264221392488469, + "grad_norm": 2.8553502559661865, + "learning_rate": 1e-06, + "loss": 0.7937, + "mean_token_accuracy": 0.750281572341919, + "num_tokens": 287068767.0, + "step": 11512 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 1.9383481740951538, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6923909187316895, + "num_tokens": 287103690.0, + "step": 11513 + }, + { + "epoch": 1.2644410278936964, + "grad_norm": 2.3746142387390137, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7218756675720215, + "num_tokens": 287128396.0, + "step": 11514 + }, + { + "epoch": 1.2645508455963101, + "grad_norm": 2.88081693649292, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7313277721405029, + "num_tokens": 287145553.0, + "step": 11515 + }, + { + "epoch": 1.264660663298924, + "grad_norm": 2.441896677017212, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7116739749908447, + "num_tokens": 287169343.0, + "step": 11516 + }, + { + "epoch": 1.2647704810015374, + "grad_norm": 2.156160354614258, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7319818139076233, + "num_tokens": 287196338.0, + "step": 11517 + }, + { + "epoch": 1.264880298704151, + "grad_norm": 2.5439023971557617, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7016338109970093, + "num_tokens": 287218171.0, + "step": 11518 + }, + { + "epoch": 1.2649901164067647, + "grad_norm": 2.3338074684143066, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7212475538253784, + "num_tokens": 287242977.0, + "step": 11519 + }, + { + "epoch": 1.2650999341093785, + "grad_norm": 2.340979814529419, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6950955986976624, + "num_tokens": 287268700.0, + "step": 11520 + }, + { + "epoch": 1.265209751811992, + "grad_norm": 2.538304328918457, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7342896461486816, + "num_tokens": 287290132.0, + "step": 11521 + }, + { + "epoch": 1.2653195695146058, + "grad_norm": 2.231245517730713, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7081158757209778, + "num_tokens": 287315813.0, + "step": 11522 + }, + { + "epoch": 1.2654293872172193, + "grad_norm": 2.0536606311798096, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7213351726531982, + "num_tokens": 287344370.0, + "step": 11523 + }, + { + "epoch": 1.265539204919833, + "grad_norm": 2.4123592376708984, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7377516031265259, + "num_tokens": 287364317.0, + "step": 11524 + }, + { + "epoch": 1.2656490226224468, + "grad_norm": 2.1929221153259277, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7117631435394287, + "num_tokens": 287391613.0, + "step": 11525 + }, + { + "epoch": 1.2657588403250604, + "grad_norm": 2.2607274055480957, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7368741035461426, + "num_tokens": 287414748.0, + "step": 11526 + }, + { + "epoch": 1.2658686580276741, + "grad_norm": 2.206665515899658, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6919868588447571, + "num_tokens": 287441208.0, + "step": 11527 + }, + { + "epoch": 1.2659784757302877, + "grad_norm": 2.4863855838775635, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7379812002182007, + "num_tokens": 287462345.0, + "step": 11528 + }, + { + "epoch": 1.2660882934329014, + "grad_norm": 2.608079433441162, + "learning_rate": 1e-06, + "loss": 0.767, + "mean_token_accuracy": 0.7521551847457886, + "num_tokens": 287480716.0, + "step": 11529 + }, + { + "epoch": 1.2661981111355152, + "grad_norm": 2.4785096645355225, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7535392642021179, + "num_tokens": 287503961.0, + "step": 11530 + }, + { + "epoch": 1.2663079288381287, + "grad_norm": 2.4596047401428223, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7216161489486694, + "num_tokens": 287526599.0, + "step": 11531 + }, + { + "epoch": 1.2664177465407422, + "grad_norm": 2.007946252822876, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7108287811279297, + "num_tokens": 287556837.0, + "step": 11532 + }, + { + "epoch": 1.266527564243356, + "grad_norm": 2.4806082248687744, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7278284430503845, + "num_tokens": 287580193.0, + "step": 11533 + }, + { + "epoch": 1.2666373819459698, + "grad_norm": 2.2351298332214355, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7332177758216858, + "num_tokens": 287605492.0, + "step": 11534 + }, + { + "epoch": 1.2667471996485833, + "grad_norm": 2.191230297088623, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.715251088142395, + "num_tokens": 287631411.0, + "step": 11535 + }, + { + "epoch": 1.266857017351197, + "grad_norm": 2.1238343715667725, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7382319569587708, + "num_tokens": 287662398.0, + "step": 11536 + }, + { + "epoch": 1.2669668350538106, + "grad_norm": 2.1073975563049316, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7012525796890259, + "num_tokens": 287692440.0, + "step": 11537 + }, + { + "epoch": 1.2670766527564243, + "grad_norm": 1.983184814453125, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7184593677520752, + "num_tokens": 287724388.0, + "step": 11538 + }, + { + "epoch": 1.267186470459038, + "grad_norm": 2.252286672592163, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7133665680885315, + "num_tokens": 287749128.0, + "step": 11539 + }, + { + "epoch": 1.2672962881616516, + "grad_norm": 2.4092135429382324, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7249656915664673, + "num_tokens": 287770541.0, + "step": 11540 + }, + { + "epoch": 1.2674061058642654, + "grad_norm": 2.361158609390259, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7333807349205017, + "num_tokens": 287794707.0, + "step": 11541 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 2.5336334705352783, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7367796301841736, + "num_tokens": 287816171.0, + "step": 11542 + }, + { + "epoch": 1.2676257412694927, + "grad_norm": 2.1977624893188477, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7258951663970947, + "num_tokens": 287842536.0, + "step": 11543 + }, + { + "epoch": 1.2677355589721064, + "grad_norm": 2.2685229778289795, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7283774018287659, + "num_tokens": 287867567.0, + "step": 11544 + }, + { + "epoch": 1.26784537667472, + "grad_norm": 2.115859031677246, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7207776308059692, + "num_tokens": 287893608.0, + "step": 11545 + }, + { + "epoch": 1.2679551943773335, + "grad_norm": 2.089240550994873, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7112840414047241, + "num_tokens": 287923082.0, + "step": 11546 + }, + { + "epoch": 1.2680650120799473, + "grad_norm": 2.15228271484375, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7130366563796997, + "num_tokens": 287951316.0, + "step": 11547 + }, + { + "epoch": 1.268174829782561, + "grad_norm": 2.0889456272125244, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7059571743011475, + "num_tokens": 287983863.0, + "step": 11548 + }, + { + "epoch": 1.2682846474851746, + "grad_norm": 2.2217090129852295, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7206724286079407, + "num_tokens": 288008887.0, + "step": 11549 + }, + { + "epoch": 1.2683944651877883, + "grad_norm": 2.326897382736206, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7055158019065857, + "num_tokens": 288035394.0, + "step": 11550 + }, + { + "epoch": 1.2685042828904018, + "grad_norm": 2.1799488067626953, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.735206663608551, + "num_tokens": 288060971.0, + "step": 11551 + }, + { + "epoch": 1.2686141005930156, + "grad_norm": 2.4143564701080322, + "learning_rate": 1e-06, + "loss": 0.7282, + "mean_token_accuracy": 0.761234700679779, + "num_tokens": 288081700.0, + "step": 11552 + }, + { + "epoch": 1.2687239182956294, + "grad_norm": 2.3629820346832275, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7381337881088257, + "num_tokens": 288103723.0, + "step": 11553 + }, + { + "epoch": 1.268833735998243, + "grad_norm": 2.2314705848693848, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7283946871757507, + "num_tokens": 288129835.0, + "step": 11554 + }, + { + "epoch": 1.2689435537008567, + "grad_norm": 2.506290912628174, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7109801769256592, + "num_tokens": 288152181.0, + "step": 11555 + }, + { + "epoch": 1.2690533714034702, + "grad_norm": 2.0939407348632812, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.716880738735199, + "num_tokens": 288179355.0, + "step": 11556 + }, + { + "epoch": 1.269163189106084, + "grad_norm": 2.247288465499878, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7095004320144653, + "num_tokens": 288205312.0, + "step": 11557 + }, + { + "epoch": 1.2692730068086975, + "grad_norm": 2.6045830249786377, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7331628799438477, + "num_tokens": 288224860.0, + "step": 11558 + }, + { + "epoch": 1.2693828245113112, + "grad_norm": 2.3522419929504395, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7169192433357239, + "num_tokens": 288251408.0, + "step": 11559 + }, + { + "epoch": 1.2694926422139248, + "grad_norm": 2.227627754211426, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7428525686264038, + "num_tokens": 288276583.0, + "step": 11560 + }, + { + "epoch": 1.2696024599165385, + "grad_norm": 2.4767704010009766, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7066898345947266, + "num_tokens": 288298145.0, + "step": 11561 + }, + { + "epoch": 1.2697122776191523, + "grad_norm": 2.2748007774353027, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7153221368789673, + "num_tokens": 288323271.0, + "step": 11562 + }, + { + "epoch": 1.2698220953217658, + "grad_norm": 2.2585949897766113, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7272241115570068, + "num_tokens": 288349266.0, + "step": 11563 + }, + { + "epoch": 1.2699319130243796, + "grad_norm": 2.1954345703125, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7411566972732544, + "num_tokens": 288375286.0, + "step": 11564 + }, + { + "epoch": 1.2700417307269931, + "grad_norm": 2.437246799468994, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7382373809814453, + "num_tokens": 288396418.0, + "step": 11565 + }, + { + "epoch": 1.2701515484296069, + "grad_norm": 2.240489959716797, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7159228324890137, + "num_tokens": 288423355.0, + "step": 11566 + }, + { + "epoch": 1.2702613661322206, + "grad_norm": 2.475376605987549, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7232333421707153, + "num_tokens": 288446532.0, + "step": 11567 + }, + { + "epoch": 1.2703711838348342, + "grad_norm": 2.5543289184570312, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.737015962600708, + "num_tokens": 288466326.0, + "step": 11568 + }, + { + "epoch": 1.270481001537448, + "grad_norm": 2.3572568893432617, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7167433500289917, + "num_tokens": 288488671.0, + "step": 11569 + }, + { + "epoch": 1.2705908192400615, + "grad_norm": 2.1140716075897217, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7075249552726746, + "num_tokens": 288516570.0, + "step": 11570 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 2.1412768363952637, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7326478362083435, + "num_tokens": 288542358.0, + "step": 11571 + }, + { + "epoch": 1.2708104546452887, + "grad_norm": 2.6220057010650635, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7226823568344116, + "num_tokens": 288562261.0, + "step": 11572 + }, + { + "epoch": 1.2709202723479025, + "grad_norm": 2.2851741313934326, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7363007664680481, + "num_tokens": 288587431.0, + "step": 11573 + }, + { + "epoch": 1.271030090050516, + "grad_norm": 2.3974249362945557, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7242760062217712, + "num_tokens": 288609928.0, + "step": 11574 + }, + { + "epoch": 1.2711399077531298, + "grad_norm": 2.4039645195007324, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7144830226898193, + "num_tokens": 288632260.0, + "step": 11575 + }, + { + "epoch": 1.2712497254557436, + "grad_norm": 1.9670023918151855, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6996661424636841, + "num_tokens": 288662543.0, + "step": 11576 + }, + { + "epoch": 1.271359543158357, + "grad_norm": 2.3285160064697266, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7220863103866577, + "num_tokens": 288687896.0, + "step": 11577 + }, + { + "epoch": 1.2714693608609708, + "grad_norm": 2.1546790599823, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.743019700050354, + "num_tokens": 288714729.0, + "step": 11578 + }, + { + "epoch": 1.2715791785635844, + "grad_norm": 2.409351348876953, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7194064855575562, + "num_tokens": 288738113.0, + "step": 11579 + }, + { + "epoch": 1.2716889962661981, + "grad_norm": 2.369767189025879, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7351648211479187, + "num_tokens": 288760996.0, + "step": 11580 + }, + { + "epoch": 1.271798813968812, + "grad_norm": 2.3871264457702637, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7440612316131592, + "num_tokens": 288782874.0, + "step": 11581 + }, + { + "epoch": 1.2719086316714254, + "grad_norm": 2.1746301651000977, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7216534614562988, + "num_tokens": 288810223.0, + "step": 11582 + }, + { + "epoch": 1.272018449374039, + "grad_norm": 2.228404998779297, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7325439453125, + "num_tokens": 288837267.0, + "step": 11583 + }, + { + "epoch": 1.2721282670766527, + "grad_norm": 2.327192544937134, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7425031661987305, + "num_tokens": 288861576.0, + "step": 11584 + }, + { + "epoch": 1.2722380847792665, + "grad_norm": 1.9874649047851562, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7173259854316711, + "num_tokens": 288890529.0, + "step": 11585 + }, + { + "epoch": 1.27234790248188, + "grad_norm": 2.003159284591675, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7177315950393677, + "num_tokens": 288921320.0, + "step": 11586 + }, + { + "epoch": 1.2724577201844938, + "grad_norm": 2.13942813873291, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7144721150398254, + "num_tokens": 288949429.0, + "step": 11587 + }, + { + "epoch": 1.2725675378871073, + "grad_norm": 2.190551280975342, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6890708208084106, + "num_tokens": 288975782.0, + "step": 11588 + }, + { + "epoch": 1.272677355589721, + "grad_norm": 2.325866222381592, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7099778652191162, + "num_tokens": 289000927.0, + "step": 11589 + }, + { + "epoch": 1.2727871732923348, + "grad_norm": 2.1259679794311523, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7350662350654602, + "num_tokens": 289028077.0, + "step": 11590 + }, + { + "epoch": 1.2728969909949484, + "grad_norm": 2.246706008911133, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7078127861022949, + "num_tokens": 289056518.0, + "step": 11591 + }, + { + "epoch": 1.273006808697562, + "grad_norm": 2.1137771606445312, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7172769904136658, + "num_tokens": 289084799.0, + "step": 11592 + }, + { + "epoch": 1.2731166264001756, + "grad_norm": 2.379831075668335, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7146953344345093, + "num_tokens": 289108375.0, + "step": 11593 + }, + { + "epoch": 1.2732264441027894, + "grad_norm": 2.7036759853363037, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7244300842285156, + "num_tokens": 289127290.0, + "step": 11594 + }, + { + "epoch": 1.2733362618054032, + "grad_norm": 2.360480308532715, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7173068523406982, + "num_tokens": 289150169.0, + "step": 11595 + }, + { + "epoch": 1.2734460795080167, + "grad_norm": 2.4502012729644775, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7157829999923706, + "num_tokens": 289172786.0, + "step": 11596 + }, + { + "epoch": 1.2735558972106302, + "grad_norm": 2.4636616706848145, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7142316699028015, + "num_tokens": 289195100.0, + "step": 11597 + }, + { + "epoch": 1.273665714913244, + "grad_norm": 2.1137807369232178, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7084227204322815, + "num_tokens": 289223823.0, + "step": 11598 + }, + { + "epoch": 1.2737755326158577, + "grad_norm": 3.0165135860443115, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7265951633453369, + "num_tokens": 289238838.0, + "step": 11599 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 2.4244377613067627, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7157748937606812, + "num_tokens": 289261645.0, + "step": 11600 + }, + { + "epoch": 1.273995168021085, + "grad_norm": 2.34432053565979, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7238638401031494, + "num_tokens": 289285217.0, + "step": 11601 + }, + { + "epoch": 1.2741049857236986, + "grad_norm": 2.6276323795318604, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7385712265968323, + "num_tokens": 289303802.0, + "step": 11602 + }, + { + "epoch": 1.2742148034263123, + "grad_norm": 2.300494432449341, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7318697571754456, + "num_tokens": 289328922.0, + "step": 11603 + }, + { + "epoch": 1.274324621128926, + "grad_norm": 2.448007345199585, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7134907245635986, + "num_tokens": 289351687.0, + "step": 11604 + }, + { + "epoch": 1.2744344388315396, + "grad_norm": 2.1993396282196045, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.714164137840271, + "num_tokens": 289378560.0, + "step": 11605 + }, + { + "epoch": 1.2745442565341534, + "grad_norm": 2.7482707500457764, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7205157279968262, + "num_tokens": 289397190.0, + "step": 11606 + }, + { + "epoch": 1.274654074236767, + "grad_norm": 2.2939395904541016, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7245218753814697, + "num_tokens": 289422557.0, + "step": 11607 + }, + { + "epoch": 1.2747638919393807, + "grad_norm": 2.226590633392334, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.706203818321228, + "num_tokens": 289449088.0, + "step": 11608 + }, + { + "epoch": 1.2748737096419944, + "grad_norm": 2.4106085300445557, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7382226586341858, + "num_tokens": 289471114.0, + "step": 11609 + }, + { + "epoch": 1.274983527344608, + "grad_norm": 2.2487120628356934, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7197633981704712, + "num_tokens": 289497424.0, + "step": 11610 + }, + { + "epoch": 1.2750933450472215, + "grad_norm": 2.243680000305176, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7288037538528442, + "num_tokens": 289522244.0, + "step": 11611 + }, + { + "epoch": 1.2752031627498353, + "grad_norm": 1.970458984375, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7164633274078369, + "num_tokens": 289553332.0, + "step": 11612 + }, + { + "epoch": 1.275312980452449, + "grad_norm": 2.1744930744171143, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6953054666519165, + "num_tokens": 289579125.0, + "step": 11613 + }, + { + "epoch": 1.2754227981550625, + "grad_norm": 2.3235926628112793, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7045309543609619, + "num_tokens": 289603139.0, + "step": 11614 + }, + { + "epoch": 1.2755326158576763, + "grad_norm": 2.17960786819458, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7185197472572327, + "num_tokens": 289630320.0, + "step": 11615 + }, + { + "epoch": 1.2756424335602898, + "grad_norm": 2.374783754348755, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7076801657676697, + "num_tokens": 289654502.0, + "step": 11616 + }, + { + "epoch": 1.2757522512629036, + "grad_norm": 2.403033971786499, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7246747016906738, + "num_tokens": 289675236.0, + "step": 11617 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 2.345149517059326, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7124351263046265, + "num_tokens": 289697899.0, + "step": 11618 + }, + { + "epoch": 1.2759718866681309, + "grad_norm": 2.244107484817505, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6938562393188477, + "num_tokens": 289724207.0, + "step": 11619 + }, + { + "epoch": 1.2760817043707446, + "grad_norm": 2.7412467002868652, + "learning_rate": 1e-06, + "loss": 0.757, + "mean_token_accuracy": 0.7542560696601868, + "num_tokens": 289741592.0, + "step": 11620 + }, + { + "epoch": 1.2761915220733582, + "grad_norm": 2.205077648162842, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7574089765548706, + "num_tokens": 289766043.0, + "step": 11621 + }, + { + "epoch": 1.276301339775972, + "grad_norm": 2.368610143661499, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7341479659080505, + "num_tokens": 289789240.0, + "step": 11622 + }, + { + "epoch": 1.2764111574785855, + "grad_norm": 2.1935455799102783, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7153899073600769, + "num_tokens": 289816514.0, + "step": 11623 + }, + { + "epoch": 1.2765209751811992, + "grad_norm": 2.2659237384796143, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7228668928146362, + "num_tokens": 289842020.0, + "step": 11624 + }, + { + "epoch": 1.2766307928838128, + "grad_norm": 2.615034341812134, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7317789793014526, + "num_tokens": 289862804.0, + "step": 11625 + }, + { + "epoch": 1.2767406105864265, + "grad_norm": 2.1780319213867188, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7105650305747986, + "num_tokens": 289891571.0, + "step": 11626 + }, + { + "epoch": 1.2768504282890403, + "grad_norm": 2.3168303966522217, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7043927907943726, + "num_tokens": 289915112.0, + "step": 11627 + }, + { + "epoch": 1.2769602459916538, + "grad_norm": 2.3716132640838623, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.6976227760314941, + "num_tokens": 289937378.0, + "step": 11628 + }, + { + "epoch": 1.2770700636942676, + "grad_norm": 2.3136026859283447, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7189181447029114, + "num_tokens": 289963174.0, + "step": 11629 + }, + { + "epoch": 1.277179881396881, + "grad_norm": 2.305424213409424, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7156721949577332, + "num_tokens": 289988367.0, + "step": 11630 + }, + { + "epoch": 1.2772896990994949, + "grad_norm": 2.1774322986602783, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7300385236740112, + "num_tokens": 290013893.0, + "step": 11631 + }, + { + "epoch": 1.2773995168021086, + "grad_norm": 2.712573528289795, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7312580347061157, + "num_tokens": 290031648.0, + "step": 11632 + }, + { + "epoch": 1.2775093345047221, + "grad_norm": 2.21197509765625, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6859665513038635, + "num_tokens": 290059557.0, + "step": 11633 + }, + { + "epoch": 1.2776191522073357, + "grad_norm": 2.872847080230713, + "learning_rate": 1e-06, + "loss": 0.8085, + "mean_token_accuracy": 0.7425535917282104, + "num_tokens": 290075987.0, + "step": 11634 + }, + { + "epoch": 1.2777289699099494, + "grad_norm": 2.1894774436950684, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7246854901313782, + "num_tokens": 290100393.0, + "step": 11635 + }, + { + "epoch": 1.2778387876125632, + "grad_norm": 2.1732587814331055, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7184255123138428, + "num_tokens": 290127519.0, + "step": 11636 + }, + { + "epoch": 1.2779486053151767, + "grad_norm": 2.3293371200561523, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7219091653823853, + "num_tokens": 290150428.0, + "step": 11637 + }, + { + "epoch": 1.2780584230177905, + "grad_norm": 2.2245736122131348, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7009477019309998, + "num_tokens": 290177765.0, + "step": 11638 + }, + { + "epoch": 1.278168240720404, + "grad_norm": 2.205509901046753, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7209470868110657, + "num_tokens": 290203745.0, + "step": 11639 + }, + { + "epoch": 1.2782780584230178, + "grad_norm": 2.120527744293213, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7150552272796631, + "num_tokens": 290229765.0, + "step": 11640 + }, + { + "epoch": 1.2783878761256315, + "grad_norm": 1.9757977724075317, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7220283150672913, + "num_tokens": 290260382.0, + "step": 11641 + }, + { + "epoch": 1.278497693828245, + "grad_norm": 2.2269468307495117, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7171066999435425, + "num_tokens": 290287186.0, + "step": 11642 + }, + { + "epoch": 1.2786075115308588, + "grad_norm": 2.2076165676116943, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7199975252151489, + "num_tokens": 290312702.0, + "step": 11643 + }, + { + "epoch": 1.2787173292334724, + "grad_norm": 2.3092446327209473, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7063025236129761, + "num_tokens": 290337591.0, + "step": 11644 + }, + { + "epoch": 1.2788271469360861, + "grad_norm": 2.1691548824310303, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7384558916091919, + "num_tokens": 290362730.0, + "step": 11645 + }, + { + "epoch": 1.2789369646386999, + "grad_norm": 2.6655333042144775, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7390289306640625, + "num_tokens": 290382230.0, + "step": 11646 + }, + { + "epoch": 1.2790467823413134, + "grad_norm": 2.13832950592041, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7265987396240234, + "num_tokens": 290408550.0, + "step": 11647 + }, + { + "epoch": 1.279156600043927, + "grad_norm": 2.1731698513031006, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7221322655677795, + "num_tokens": 290435343.0, + "step": 11648 + }, + { + "epoch": 1.2792664177465407, + "grad_norm": 2.740104913711548, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7305157780647278, + "num_tokens": 290453628.0, + "step": 11649 + }, + { + "epoch": 1.2793762354491545, + "grad_norm": 2.502244234085083, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7261883616447449, + "num_tokens": 290474596.0, + "step": 11650 + }, + { + "epoch": 1.279486053151768, + "grad_norm": 2.3579623699188232, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7241263389587402, + "num_tokens": 290497582.0, + "step": 11651 + }, + { + "epoch": 1.2795958708543818, + "grad_norm": 2.439347505569458, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7346142530441284, + "num_tokens": 290522862.0, + "step": 11652 + }, + { + "epoch": 1.2797056885569953, + "grad_norm": 2.4668805599212646, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7167550325393677, + "num_tokens": 290545741.0, + "step": 11653 + }, + { + "epoch": 1.279815506259609, + "grad_norm": 2.202970266342163, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7222538590431213, + "num_tokens": 290569582.0, + "step": 11654 + }, + { + "epoch": 1.2799253239622228, + "grad_norm": 2.2464890480041504, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6946465373039246, + "num_tokens": 290595972.0, + "step": 11655 + }, + { + "epoch": 1.2800351416648363, + "grad_norm": 2.2509348392486572, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7069427371025085, + "num_tokens": 290620940.0, + "step": 11656 + }, + { + "epoch": 1.28014495936745, + "grad_norm": 2.5570311546325684, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7464568614959717, + "num_tokens": 290640670.0, + "step": 11657 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 2.2127768993377686, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7078754305839539, + "num_tokens": 290668422.0, + "step": 11658 + }, + { + "epoch": 1.2803645947726774, + "grad_norm": 2.200105905532837, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7294952869415283, + "num_tokens": 290694793.0, + "step": 11659 + }, + { + "epoch": 1.2804744124752911, + "grad_norm": 2.549571990966797, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.728219211101532, + "num_tokens": 290714254.0, + "step": 11660 + }, + { + "epoch": 1.2805842301779047, + "grad_norm": 2.4697203636169434, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7217867970466614, + "num_tokens": 290737208.0, + "step": 11661 + }, + { + "epoch": 1.2806940478805182, + "grad_norm": 2.5052330493927, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7472308874130249, + "num_tokens": 290757007.0, + "step": 11662 + }, + { + "epoch": 1.280803865583132, + "grad_norm": 2.1944735050201416, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6999574303627014, + "num_tokens": 290783520.0, + "step": 11663 + }, + { + "epoch": 1.2809136832857457, + "grad_norm": 2.246016263961792, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6962617635726929, + "num_tokens": 290809185.0, + "step": 11664 + }, + { + "epoch": 1.2810235009883593, + "grad_norm": 2.358253002166748, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.691181480884552, + "num_tokens": 290834607.0, + "step": 11665 + }, + { + "epoch": 1.281133318690973, + "grad_norm": 2.0174834728240967, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7238988876342773, + "num_tokens": 290864861.0, + "step": 11666 + }, + { + "epoch": 1.2812431363935866, + "grad_norm": 2.278785228729248, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7299818396568298, + "num_tokens": 290889146.0, + "step": 11667 + }, + { + "epoch": 1.2813529540962003, + "grad_norm": 2.0709757804870605, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7030045986175537, + "num_tokens": 290919535.0, + "step": 11668 + }, + { + "epoch": 1.281462771798814, + "grad_norm": 2.238661766052246, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.721833348274231, + "num_tokens": 290945522.0, + "step": 11669 + }, + { + "epoch": 1.2815725895014276, + "grad_norm": 2.435091018676758, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.734315812587738, + "num_tokens": 290968454.0, + "step": 11670 + }, + { + "epoch": 1.2816824072040414, + "grad_norm": 2.276124954223633, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7250849604606628, + "num_tokens": 290994036.0, + "step": 11671 + }, + { + "epoch": 1.281792224906655, + "grad_norm": 2.1587259769439697, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7208998203277588, + "num_tokens": 291020844.0, + "step": 11672 + }, + { + "epoch": 1.2819020426092687, + "grad_norm": 2.7038285732269287, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.734699547290802, + "num_tokens": 291039511.0, + "step": 11673 + }, + { + "epoch": 1.2820118603118824, + "grad_norm": 2.951692819595337, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7606961727142334, + "num_tokens": 291055750.0, + "step": 11674 + }, + { + "epoch": 1.282121678014496, + "grad_norm": 2.4088237285614014, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7418229579925537, + "num_tokens": 291077160.0, + "step": 11675 + }, + { + "epoch": 1.2822314957171095, + "grad_norm": 2.2072243690490723, + "learning_rate": 1e-06, + "loss": 0.7539, + "mean_token_accuracy": 0.7605413794517517, + "num_tokens": 291100539.0, + "step": 11676 + }, + { + "epoch": 1.2823413134197232, + "grad_norm": 2.3123531341552734, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7444171905517578, + "num_tokens": 291123976.0, + "step": 11677 + }, + { + "epoch": 1.282451131122337, + "grad_norm": 2.2659497261047363, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7036935687065125, + "num_tokens": 291153459.0, + "step": 11678 + }, + { + "epoch": 1.2825609488249505, + "grad_norm": 2.3109068870544434, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7192937135696411, + "num_tokens": 291178220.0, + "step": 11679 + }, + { + "epoch": 1.2826707665275643, + "grad_norm": 2.1393463611602783, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7179976105690002, + "num_tokens": 291206155.0, + "step": 11680 + }, + { + "epoch": 1.2827805842301778, + "grad_norm": 2.5409159660339355, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7343930006027222, + "num_tokens": 291227652.0, + "step": 11681 + }, + { + "epoch": 1.2828904019327916, + "grad_norm": 2.1533873081207275, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7202613949775696, + "num_tokens": 291256032.0, + "step": 11682 + }, + { + "epoch": 1.2830002196354053, + "grad_norm": 2.0573267936706543, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7045144438743591, + "num_tokens": 291285008.0, + "step": 11683 + }, + { + "epoch": 1.2831100373380189, + "grad_norm": 2.41019868850708, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7273597717285156, + "num_tokens": 291307941.0, + "step": 11684 + }, + { + "epoch": 1.2832198550406326, + "grad_norm": 2.486987829208374, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7458191514015198, + "num_tokens": 291328262.0, + "step": 11685 + }, + { + "epoch": 1.2833296727432462, + "grad_norm": 2.400045156478882, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7154946327209473, + "num_tokens": 291353535.0, + "step": 11686 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 2.3910348415374756, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7100677490234375, + "num_tokens": 291376674.0, + "step": 11687 + }, + { + "epoch": 1.2835493081484735, + "grad_norm": 2.4437060356140137, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7167332172393799, + "num_tokens": 291399329.0, + "step": 11688 + }, + { + "epoch": 1.2836591258510872, + "grad_norm": 2.1499736309051514, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7083909511566162, + "num_tokens": 291425214.0, + "step": 11689 + }, + { + "epoch": 1.2837689435537007, + "grad_norm": 2.3741767406463623, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.715557336807251, + "num_tokens": 291447501.0, + "step": 11690 + }, + { + "epoch": 1.2838787612563145, + "grad_norm": 2.1036171913146973, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.744096577167511, + "num_tokens": 291475272.0, + "step": 11691 + }, + { + "epoch": 1.2839885789589283, + "grad_norm": 2.575146436691284, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.736844003200531, + "num_tokens": 291495991.0, + "step": 11692 + }, + { + "epoch": 1.2840983966615418, + "grad_norm": 2.0025246143341064, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.729897677898407, + "num_tokens": 291523678.0, + "step": 11693 + }, + { + "epoch": 1.2842082143641556, + "grad_norm": 2.276780128479004, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7116971611976624, + "num_tokens": 291548696.0, + "step": 11694 + }, + { + "epoch": 1.284318032066769, + "grad_norm": 2.6356706619262695, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7366659641265869, + "num_tokens": 291566523.0, + "step": 11695 + }, + { + "epoch": 1.2844278497693828, + "grad_norm": 2.334343433380127, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7294433116912842, + "num_tokens": 291591798.0, + "step": 11696 + }, + { + "epoch": 1.2845376674719966, + "grad_norm": 2.141144275665283, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7147653102874756, + "num_tokens": 291619777.0, + "step": 11697 + }, + { + "epoch": 1.2846474851746101, + "grad_norm": 2.4465126991271973, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7184183597564697, + "num_tokens": 291643048.0, + "step": 11698 + }, + { + "epoch": 1.2847573028772237, + "grad_norm": 2.226284980773926, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7115347981452942, + "num_tokens": 291668103.0, + "step": 11699 + }, + { + "epoch": 1.2848671205798374, + "grad_norm": 2.17039155960083, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7026088833808899, + "num_tokens": 291699117.0, + "step": 11700 + }, + { + "epoch": 1.2849769382824512, + "grad_norm": 2.377624034881592, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7267110347747803, + "num_tokens": 291723291.0, + "step": 11701 + }, + { + "epoch": 1.2850867559850647, + "grad_norm": 2.2088782787323, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7177120447158813, + "num_tokens": 291750553.0, + "step": 11702 + }, + { + "epoch": 1.2851965736876785, + "grad_norm": 2.089901924133301, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7286507487297058, + "num_tokens": 291779637.0, + "step": 11703 + }, + { + "epoch": 1.285306391390292, + "grad_norm": 2.0496888160705566, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7048032283782959, + "num_tokens": 291808632.0, + "step": 11704 + }, + { + "epoch": 1.2854162090929058, + "grad_norm": 2.3007426261901855, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7510396242141724, + "num_tokens": 291830875.0, + "step": 11705 + }, + { + "epoch": 1.2855260267955195, + "grad_norm": 2.054926633834839, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7492781281471252, + "num_tokens": 291858551.0, + "step": 11706 + }, + { + "epoch": 1.285635844498133, + "grad_norm": 2.48823881149292, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7355809211730957, + "num_tokens": 291879753.0, + "step": 11707 + }, + { + "epoch": 1.2857456622007468, + "grad_norm": 2.2807610034942627, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7360220551490784, + "num_tokens": 291904947.0, + "step": 11708 + }, + { + "epoch": 1.2858554799033604, + "grad_norm": 2.1380298137664795, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7261306047439575, + "num_tokens": 291931931.0, + "step": 11709 + }, + { + "epoch": 1.2859652976059741, + "grad_norm": 1.9457858800888062, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7286545634269714, + "num_tokens": 291966960.0, + "step": 11710 + }, + { + "epoch": 1.2860751153085879, + "grad_norm": 2.3921544551849365, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7213270664215088, + "num_tokens": 291991849.0, + "step": 11711 + }, + { + "epoch": 1.2861849330112014, + "grad_norm": 2.3503289222717285, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7146097421646118, + "num_tokens": 292016465.0, + "step": 11712 + }, + { + "epoch": 1.286294750713815, + "grad_norm": 2.3675856590270996, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.714582622051239, + "num_tokens": 292040246.0, + "step": 11713 + }, + { + "epoch": 1.2864045684164287, + "grad_norm": 2.2601771354675293, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7311831712722778, + "num_tokens": 292065399.0, + "step": 11714 + }, + { + "epoch": 1.2865143861190425, + "grad_norm": 2.1474671363830566, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7021103501319885, + "num_tokens": 292094232.0, + "step": 11715 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 2.5056557655334473, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7471781969070435, + "num_tokens": 292115174.0, + "step": 11716 + }, + { + "epoch": 1.2867340215242697, + "grad_norm": 2.4040651321411133, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7285758852958679, + "num_tokens": 292136377.0, + "step": 11717 + }, + { + "epoch": 1.2868438392268833, + "grad_norm": 2.2568182945251465, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7018677592277527, + "num_tokens": 292161772.0, + "step": 11718 + }, + { + "epoch": 1.286953656929497, + "grad_norm": 2.0308964252471924, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7075487375259399, + "num_tokens": 292191208.0, + "step": 11719 + }, + { + "epoch": 1.2870634746321108, + "grad_norm": 2.2983744144439697, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7402475476264954, + "num_tokens": 292214447.0, + "step": 11720 + }, + { + "epoch": 1.2871732923347243, + "grad_norm": 2.0564420223236084, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7171249389648438, + "num_tokens": 292243935.0, + "step": 11721 + }, + { + "epoch": 1.287283110037338, + "grad_norm": 2.365163803100586, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7137514352798462, + "num_tokens": 292268117.0, + "step": 11722 + }, + { + "epoch": 1.2873929277399516, + "grad_norm": 2.2237021923065186, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7431043386459351, + "num_tokens": 292294202.0, + "step": 11723 + }, + { + "epoch": 1.2875027454425654, + "grad_norm": 2.367846965789795, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7102187275886536, + "num_tokens": 292317266.0, + "step": 11724 + }, + { + "epoch": 1.2876125631451791, + "grad_norm": 2.093168020248413, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7096468806266785, + "num_tokens": 292347237.0, + "step": 11725 + }, + { + "epoch": 1.2877223808477927, + "grad_norm": 2.2172892093658447, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7238534092903137, + "num_tokens": 292374054.0, + "step": 11726 + }, + { + "epoch": 1.2878321985504062, + "grad_norm": 2.61647891998291, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7321935296058655, + "num_tokens": 292393374.0, + "step": 11727 + }, + { + "epoch": 1.28794201625302, + "grad_norm": 2.207550048828125, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6990035772323608, + "num_tokens": 292421992.0, + "step": 11728 + }, + { + "epoch": 1.2880518339556337, + "grad_norm": 2.3016393184661865, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7173532247543335, + "num_tokens": 292447717.0, + "step": 11729 + }, + { + "epoch": 1.2881616516582473, + "grad_norm": 2.5335164070129395, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.730383038520813, + "num_tokens": 292468707.0, + "step": 11730 + }, + { + "epoch": 1.288271469360861, + "grad_norm": 2.337167739868164, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.706504762172699, + "num_tokens": 292491792.0, + "step": 11731 + }, + { + "epoch": 1.2883812870634745, + "grad_norm": 2.116797685623169, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7088971734046936, + "num_tokens": 292520250.0, + "step": 11732 + }, + { + "epoch": 1.2884911047660883, + "grad_norm": 2.292574405670166, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.696045994758606, + "num_tokens": 292545951.0, + "step": 11733 + }, + { + "epoch": 1.288600922468702, + "grad_norm": 2.328031063079834, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7218167781829834, + "num_tokens": 292570271.0, + "step": 11734 + }, + { + "epoch": 1.2887107401713156, + "grad_norm": 2.3564136028289795, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7346256971359253, + "num_tokens": 292594188.0, + "step": 11735 + }, + { + "epoch": 1.2888205578739294, + "grad_norm": 1.9187030792236328, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.694868803024292, + "num_tokens": 292630095.0, + "step": 11736 + }, + { + "epoch": 1.2889303755765429, + "grad_norm": 1.9352071285247803, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7364135384559631, + "num_tokens": 292660229.0, + "step": 11737 + }, + { + "epoch": 1.2890401932791566, + "grad_norm": 2.505253553390503, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7437275648117065, + "num_tokens": 292681086.0, + "step": 11738 + }, + { + "epoch": 1.2891500109817704, + "grad_norm": 2.2118656635284424, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7263290882110596, + "num_tokens": 292707598.0, + "step": 11739 + }, + { + "epoch": 1.289259828684384, + "grad_norm": 2.2219345569610596, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6941331028938293, + "num_tokens": 292735389.0, + "step": 11740 + }, + { + "epoch": 1.2893696463869975, + "grad_norm": 2.0899605751037598, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.726425051689148, + "num_tokens": 292766662.0, + "step": 11741 + }, + { + "epoch": 1.2894794640896112, + "grad_norm": 2.25638747215271, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7036501169204712, + "num_tokens": 292793246.0, + "step": 11742 + }, + { + "epoch": 1.289589281792225, + "grad_norm": 2.094670057296753, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7256984710693359, + "num_tokens": 292821987.0, + "step": 11743 + }, + { + "epoch": 1.2896990994948385, + "grad_norm": 2.3557138442993164, + "learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7498922348022461, + "num_tokens": 292844738.0, + "step": 11744 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 2.2411296367645264, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7329814434051514, + "num_tokens": 292870392.0, + "step": 11745 + }, + { + "epoch": 1.2899187349000658, + "grad_norm": 2.4330663681030273, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7116175293922424, + "num_tokens": 292895462.0, + "step": 11746 + }, + { + "epoch": 1.2900285526026796, + "grad_norm": 2.45688796043396, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.732761025428772, + "num_tokens": 292917372.0, + "step": 11747 + }, + { + "epoch": 1.2901383703052933, + "grad_norm": 2.089937210083008, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.712578535079956, + "num_tokens": 292948839.0, + "step": 11748 + }, + { + "epoch": 1.2902481880079069, + "grad_norm": 2.381338119506836, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7182304859161377, + "num_tokens": 292971332.0, + "step": 11749 + }, + { + "epoch": 1.2903580057105206, + "grad_norm": 2.1466832160949707, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7360000610351562, + "num_tokens": 292997671.0, + "step": 11750 + }, + { + "epoch": 1.2904678234131342, + "grad_norm": 2.309471607208252, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7096288204193115, + "num_tokens": 293025056.0, + "step": 11751 + }, + { + "epoch": 1.290577641115748, + "grad_norm": 2.131930112838745, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.716828465461731, + "num_tokens": 293051580.0, + "step": 11752 + }, + { + "epoch": 1.2906874588183614, + "grad_norm": 2.3558952808380127, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7129499912261963, + "num_tokens": 293074843.0, + "step": 11753 + }, + { + "epoch": 1.2907972765209752, + "grad_norm": 2.0337753295898438, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7023782134056091, + "num_tokens": 293104137.0, + "step": 11754 + }, + { + "epoch": 1.2909070942235887, + "grad_norm": 2.635911703109741, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7373120188713074, + "num_tokens": 293123142.0, + "step": 11755 + }, + { + "epoch": 1.2910169119262025, + "grad_norm": 2.6412510871887207, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7313394546508789, + "num_tokens": 293144431.0, + "step": 11756 + }, + { + "epoch": 1.2911267296288162, + "grad_norm": 2.290599822998047, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7121012210845947, + "num_tokens": 293171104.0, + "step": 11757 + }, + { + "epoch": 1.2912365473314298, + "grad_norm": 2.4091243743896484, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7008698582649231, + "num_tokens": 293193390.0, + "step": 11758 + }, + { + "epoch": 1.2913463650340435, + "grad_norm": 2.533168315887451, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7418708801269531, + "num_tokens": 293212869.0, + "step": 11759 + }, + { + "epoch": 1.291456182736657, + "grad_norm": 2.2476751804351807, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.721403956413269, + "num_tokens": 293237432.0, + "step": 11760 + }, + { + "epoch": 1.2915660004392708, + "grad_norm": 2.2124218940734863, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.74588942527771, + "num_tokens": 293262384.0, + "step": 11761 + }, + { + "epoch": 1.2916758181418846, + "grad_norm": 2.585012674331665, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7488257884979248, + "num_tokens": 293282393.0, + "step": 11762 + }, + { + "epoch": 1.2917856358444981, + "grad_norm": 2.7899973392486572, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7405646443367004, + "num_tokens": 293299631.0, + "step": 11763 + }, + { + "epoch": 1.2918954535471117, + "grad_norm": 2.1799676418304443, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.726722240447998, + "num_tokens": 293325936.0, + "step": 11764 + }, + { + "epoch": 1.2920052712497254, + "grad_norm": 2.4499096870422363, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7253918647766113, + "num_tokens": 293348754.0, + "step": 11765 + }, + { + "epoch": 1.2921150889523392, + "grad_norm": 2.3128039836883545, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.6959534883499146, + "num_tokens": 293374928.0, + "step": 11766 + }, + { + "epoch": 1.2922249066549527, + "grad_norm": 2.0208709239959717, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.6961880922317505, + "num_tokens": 293407055.0, + "step": 11767 + }, + { + "epoch": 1.2923347243575665, + "grad_norm": 2.575652837753296, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7332729697227478, + "num_tokens": 293428141.0, + "step": 11768 + }, + { + "epoch": 1.29244454206018, + "grad_norm": 2.6521501541137695, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7144157886505127, + "num_tokens": 293449210.0, + "step": 11769 + }, + { + "epoch": 1.2925543597627938, + "grad_norm": 2.592050552368164, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7339690327644348, + "num_tokens": 293469812.0, + "step": 11770 + }, + { + "epoch": 1.2926641774654075, + "grad_norm": 2.1500933170318604, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7189590930938721, + "num_tokens": 293498707.0, + "step": 11771 + }, + { + "epoch": 1.292773995168021, + "grad_norm": 2.136824131011963, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7316519618034363, + "num_tokens": 293524276.0, + "step": 11772 + }, + { + "epoch": 1.2928838128706348, + "grad_norm": 2.349163055419922, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7393203377723694, + "num_tokens": 293547376.0, + "step": 11773 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 2.3413867950439453, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7017171382904053, + "num_tokens": 293571921.0, + "step": 11774 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 2.482513904571533, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7032829523086548, + "num_tokens": 293595622.0, + "step": 11775 + }, + { + "epoch": 1.2932132659784759, + "grad_norm": 2.230729341506958, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.717914342880249, + "num_tokens": 293621667.0, + "step": 11776 + }, + { + "epoch": 1.2933230836810894, + "grad_norm": 2.5785531997680664, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.741614580154419, + "num_tokens": 293640537.0, + "step": 11777 + }, + { + "epoch": 1.293432901383703, + "grad_norm": 2.040184497833252, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7323094010353088, + "num_tokens": 293672288.0, + "step": 11778 + }, + { + "epoch": 1.2935427190863167, + "grad_norm": 2.061554431915283, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7463719844818115, + "num_tokens": 293699610.0, + "step": 11779 + }, + { + "epoch": 1.2936525367889304, + "grad_norm": 2.3402795791625977, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7233843803405762, + "num_tokens": 293721327.0, + "step": 11780 + }, + { + "epoch": 1.293762354491544, + "grad_norm": 2.1502342224121094, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7342365980148315, + "num_tokens": 293745863.0, + "step": 11781 + }, + { + "epoch": 1.2938721721941577, + "grad_norm": 2.429764986038208, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7052919864654541, + "num_tokens": 293767840.0, + "step": 11782 + }, + { + "epoch": 1.2939819898967713, + "grad_norm": 2.5987493991851807, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7517804503440857, + "num_tokens": 293786534.0, + "step": 11783 + }, + { + "epoch": 1.294091807599385, + "grad_norm": 2.2785651683807373, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7167751789093018, + "num_tokens": 293812067.0, + "step": 11784 + }, + { + "epoch": 1.2942016253019988, + "grad_norm": 2.3892924785614014, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.702890157699585, + "num_tokens": 293835692.0, + "step": 11785 + }, + { + "epoch": 1.2943114430046123, + "grad_norm": 2.3074252605438232, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.736044704914093, + "num_tokens": 293859694.0, + "step": 11786 + }, + { + "epoch": 1.294421260707226, + "grad_norm": 2.02532696723938, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7107252478599548, + "num_tokens": 293889075.0, + "step": 11787 + }, + { + "epoch": 1.2945310784098396, + "grad_norm": 2.847714424133301, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7478821277618408, + "num_tokens": 293906400.0, + "step": 11788 + }, + { + "epoch": 1.2946408961124534, + "grad_norm": 2.4529900550842285, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.741057276725769, + "num_tokens": 293928101.0, + "step": 11789 + }, + { + "epoch": 1.2947507138150671, + "grad_norm": 2.295879364013672, + "learning_rate": 1e-06, + "loss": 0.7565, + "mean_token_accuracy": 0.762588620185852, + "num_tokens": 293950946.0, + "step": 11790 + }, + { + "epoch": 1.2948605315176807, + "grad_norm": 2.2820630073547363, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7260907292366028, + "num_tokens": 293977551.0, + "step": 11791 + }, + { + "epoch": 1.2949703492202942, + "grad_norm": 2.387742280960083, + "learning_rate": 1e-06, + "loss": 0.7886, + "mean_token_accuracy": 0.750571072101593, + "num_tokens": 293999486.0, + "step": 11792 + }, + { + "epoch": 1.295080166922908, + "grad_norm": 2.142498016357422, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6949913501739502, + "num_tokens": 294028000.0, + "step": 11793 + }, + { + "epoch": 1.2951899846255217, + "grad_norm": 2.335632085800171, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7498396039009094, + "num_tokens": 294049309.0, + "step": 11794 + }, + { + "epoch": 1.2952998023281352, + "grad_norm": 2.1196823120117188, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7123790979385376, + "num_tokens": 294076853.0, + "step": 11795 + }, + { + "epoch": 1.295409620030749, + "grad_norm": 2.3394150733947754, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7050837278366089, + "num_tokens": 294100746.0, + "step": 11796 + }, + { + "epoch": 1.2955194377333625, + "grad_norm": 2.386690855026245, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7224119305610657, + "num_tokens": 294122959.0, + "step": 11797 + }, + { + "epoch": 1.2956292554359763, + "grad_norm": 2.3348724842071533, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.718139111995697, + "num_tokens": 294145990.0, + "step": 11798 + }, + { + "epoch": 1.29573907313859, + "grad_norm": 2.4468038082122803, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7433362007141113, + "num_tokens": 294166676.0, + "step": 11799 + }, + { + "epoch": 1.2958488908412036, + "grad_norm": 2.409897565841675, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.745705783367157, + "num_tokens": 294187654.0, + "step": 11800 + }, + { + "epoch": 1.2959587085438173, + "grad_norm": 2.6372578144073486, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7381997108459473, + "num_tokens": 294206414.0, + "step": 11801 + }, + { + "epoch": 1.2960685262464309, + "grad_norm": 2.207714557647705, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7122213840484619, + "num_tokens": 294233492.0, + "step": 11802 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 2.447654962539673, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6909232139587402, + "num_tokens": 294255783.0, + "step": 11803 + }, + { + "epoch": 1.2962881616516582, + "grad_norm": 2.9563984870910645, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7353579998016357, + "num_tokens": 294272416.0, + "step": 11804 + }, + { + "epoch": 1.296397979354272, + "grad_norm": 2.5667989253997803, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7317209839820862, + "num_tokens": 294293866.0, + "step": 11805 + }, + { + "epoch": 1.2965077970568855, + "grad_norm": 2.202247381210327, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7107224464416504, + "num_tokens": 294320004.0, + "step": 11806 + }, + { + "epoch": 1.2966176147594992, + "grad_norm": 1.9851100444793701, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7122660875320435, + "num_tokens": 294350696.0, + "step": 11807 + }, + { + "epoch": 1.296727432462113, + "grad_norm": 2.0622897148132324, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7142740488052368, + "num_tokens": 294380155.0, + "step": 11808 + }, + { + "epoch": 1.2968372501647265, + "grad_norm": 2.3271901607513428, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7234531044960022, + "num_tokens": 294403446.0, + "step": 11809 + }, + { + "epoch": 1.2969470678673403, + "grad_norm": 2.4932515621185303, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7297024726867676, + "num_tokens": 294423087.0, + "step": 11810 + }, + { + "epoch": 1.2970568855699538, + "grad_norm": 2.3639681339263916, + "learning_rate": 1e-06, + "loss": 0.801, + "mean_token_accuracy": 0.7474133372306824, + "num_tokens": 294444380.0, + "step": 11811 + }, + { + "epoch": 1.2971667032725676, + "grad_norm": 2.473372220993042, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7238166332244873, + "num_tokens": 294464847.0, + "step": 11812 + }, + { + "epoch": 1.2972765209751813, + "grad_norm": 2.2077794075012207, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.721798300743103, + "num_tokens": 294490057.0, + "step": 11813 + }, + { + "epoch": 1.2973863386777948, + "grad_norm": 2.4374611377716064, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7024422883987427, + "num_tokens": 294514604.0, + "step": 11814 + }, + { + "epoch": 1.2974961563804086, + "grad_norm": 2.1784868240356445, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7423300743103027, + "num_tokens": 294540825.0, + "step": 11815 + }, + { + "epoch": 1.2976059740830221, + "grad_norm": 2.351619005203247, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7116721868515015, + "num_tokens": 294564958.0, + "step": 11816 + }, + { + "epoch": 1.297715791785636, + "grad_norm": 2.5360305309295654, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.712424635887146, + "num_tokens": 294586848.0, + "step": 11817 + }, + { + "epoch": 1.2978256094882494, + "grad_norm": 2.4342758655548096, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7147424817085266, + "num_tokens": 294608459.0, + "step": 11818 + }, + { + "epoch": 1.2979354271908632, + "grad_norm": 2.209329605102539, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7098421454429626, + "num_tokens": 294635590.0, + "step": 11819 + }, + { + "epoch": 1.2980452448934767, + "grad_norm": 2.1307129859924316, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7136586904525757, + "num_tokens": 294665003.0, + "step": 11820 + }, + { + "epoch": 1.2981550625960905, + "grad_norm": 2.016119956970215, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7447450757026672, + "num_tokens": 294693906.0, + "step": 11821 + }, + { + "epoch": 1.2982648802987042, + "grad_norm": 2.2184455394744873, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7503254413604736, + "num_tokens": 294717632.0, + "step": 11822 + }, + { + "epoch": 1.2983746980013178, + "grad_norm": 2.4610848426818848, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7242976427078247, + "num_tokens": 294741568.0, + "step": 11823 + }, + { + "epoch": 1.2984845157039315, + "grad_norm": 2.344606876373291, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7378409504890442, + "num_tokens": 294763845.0, + "step": 11824 + }, + { + "epoch": 1.298594333406545, + "grad_norm": 2.2729599475860596, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7079998254776001, + "num_tokens": 294789776.0, + "step": 11825 + }, + { + "epoch": 1.2987041511091588, + "grad_norm": 2.057431697845459, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7119689583778381, + "num_tokens": 294819145.0, + "step": 11826 + }, + { + "epoch": 1.2988139688117726, + "grad_norm": 2.2970328330993652, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7263635993003845, + "num_tokens": 294844258.0, + "step": 11827 + }, + { + "epoch": 1.2989237865143861, + "grad_norm": 2.906118154525757, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7348926663398743, + "num_tokens": 294861170.0, + "step": 11828 + }, + { + "epoch": 1.2990336042169996, + "grad_norm": 2.3940269947052, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7228013277053833, + "num_tokens": 294884078.0, + "step": 11829 + }, + { + "epoch": 1.2991434219196134, + "grad_norm": 2.1445300579071045, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7023097276687622, + "num_tokens": 294914105.0, + "step": 11830 + }, + { + "epoch": 1.2992532396222272, + "grad_norm": 2.21433687210083, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7300904393196106, + "num_tokens": 294940255.0, + "step": 11831 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 2.570988416671753, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.716231107711792, + "num_tokens": 294959641.0, + "step": 11832 + }, + { + "epoch": 1.2994728750274545, + "grad_norm": 2.11777663230896, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7045731544494629, + "num_tokens": 294989964.0, + "step": 11833 + }, + { + "epoch": 1.299582692730068, + "grad_norm": 2.194960594177246, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7295194268226624, + "num_tokens": 295015890.0, + "step": 11834 + }, + { + "epoch": 1.2996925104326817, + "grad_norm": 2.3419289588928223, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7025651931762695, + "num_tokens": 295042909.0, + "step": 11835 + }, + { + "epoch": 1.2998023281352955, + "grad_norm": 2.799929618835449, + "learning_rate": 1e-06, + "loss": 0.7212, + "mean_token_accuracy": 0.7719242572784424, + "num_tokens": 295059293.0, + "step": 11836 + }, + { + "epoch": 1.299912145837909, + "grad_norm": 2.535794496536255, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7134378552436829, + "num_tokens": 295080358.0, + "step": 11837 + }, + { + "epoch": 1.3000219635405228, + "grad_norm": 2.6951122283935547, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7391787767410278, + "num_tokens": 295100200.0, + "step": 11838 + }, + { + "epoch": 1.3001317812431363, + "grad_norm": 2.1162097454071045, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7078686952590942, + "num_tokens": 295128980.0, + "step": 11839 + }, + { + "epoch": 1.30024159894575, + "grad_norm": 2.159207344055176, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7056300640106201, + "num_tokens": 295155706.0, + "step": 11840 + }, + { + "epoch": 1.3003514166483638, + "grad_norm": 2.26019287109375, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.712912380695343, + "num_tokens": 295181199.0, + "step": 11841 + }, + { + "epoch": 1.3004612343509774, + "grad_norm": 2.231785774230957, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7383246421813965, + "num_tokens": 295207478.0, + "step": 11842 + }, + { + "epoch": 1.300571052053591, + "grad_norm": 2.26104474067688, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7148575186729431, + "num_tokens": 295233078.0, + "step": 11843 + }, + { + "epoch": 1.3006808697562047, + "grad_norm": 2.3169612884521484, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7502701282501221, + "num_tokens": 295256911.0, + "step": 11844 + }, + { + "epoch": 1.3007906874588184, + "grad_norm": 2.3465380668640137, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7370998859405518, + "num_tokens": 295280877.0, + "step": 11845 + }, + { + "epoch": 1.300900505161432, + "grad_norm": 2.701134443283081, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7246191501617432, + "num_tokens": 295300347.0, + "step": 11846 + }, + { + "epoch": 1.3010103228640457, + "grad_norm": 2.044896364212036, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.709148108959198, + "num_tokens": 295330868.0, + "step": 11847 + }, + { + "epoch": 1.3011201405666593, + "grad_norm": 2.2872257232666016, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7320584058761597, + "num_tokens": 295354877.0, + "step": 11848 + }, + { + "epoch": 1.301229958269273, + "grad_norm": 2.444795608520508, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7200467586517334, + "num_tokens": 295378668.0, + "step": 11849 + }, + { + "epoch": 1.3013397759718868, + "grad_norm": 2.204050064086914, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7302541732788086, + "num_tokens": 295406038.0, + "step": 11850 + }, + { + "epoch": 1.3014495936745003, + "grad_norm": 2.196809768676758, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7201496362686157, + "num_tokens": 295431432.0, + "step": 11851 + }, + { + "epoch": 1.301559411377114, + "grad_norm": 2.453153133392334, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.708437442779541, + "num_tokens": 295453356.0, + "step": 11852 + }, + { + "epoch": 1.3016692290797276, + "grad_norm": 2.089545726776123, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7344648838043213, + "num_tokens": 295480631.0, + "step": 11853 + }, + { + "epoch": 1.3017790467823414, + "grad_norm": 2.3664281368255615, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7089131474494934, + "num_tokens": 295504903.0, + "step": 11854 + }, + { + "epoch": 1.301888864484955, + "grad_norm": 2.325995683670044, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.728980302810669, + "num_tokens": 295528355.0, + "step": 11855 + }, + { + "epoch": 1.3019986821875686, + "grad_norm": 2.7465736865997314, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7562187910079956, + "num_tokens": 295545196.0, + "step": 11856 + }, + { + "epoch": 1.3021084998901822, + "grad_norm": 2.416358709335327, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7238864898681641, + "num_tokens": 295568361.0, + "step": 11857 + }, + { + "epoch": 1.302218317592796, + "grad_norm": 2.737912654876709, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7245157957077026, + "num_tokens": 295587015.0, + "step": 11858 + }, + { + "epoch": 1.3023281352954097, + "grad_norm": 2.2435219287872314, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.716286301612854, + "num_tokens": 295611986.0, + "step": 11859 + }, + { + "epoch": 1.3024379529980232, + "grad_norm": 2.631272077560425, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.730536937713623, + "num_tokens": 295632186.0, + "step": 11860 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 2.6082794666290283, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7485158443450928, + "num_tokens": 295650110.0, + "step": 11861 + }, + { + "epoch": 1.3026575884032505, + "grad_norm": 2.280360221862793, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6873949766159058, + "num_tokens": 295677831.0, + "step": 11862 + }, + { + "epoch": 1.3027674061058643, + "grad_norm": 2.3588459491729736, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7190365791320801, + "num_tokens": 295702246.0, + "step": 11863 + }, + { + "epoch": 1.302877223808478, + "grad_norm": 2.30230712890625, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7147802114486694, + "num_tokens": 295727386.0, + "step": 11864 + }, + { + "epoch": 1.3029870415110916, + "grad_norm": 2.3198838233947754, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7172708511352539, + "num_tokens": 295751028.0, + "step": 11865 + }, + { + "epoch": 1.3030968592137053, + "grad_norm": 2.0042426586151123, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7451748847961426, + "num_tokens": 295778749.0, + "step": 11866 + }, + { + "epoch": 1.3032066769163189, + "grad_norm": 2.1493537425994873, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7242002487182617, + "num_tokens": 295807691.0, + "step": 11867 + }, + { + "epoch": 1.3033164946189326, + "grad_norm": 2.3775599002838135, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7311130166053772, + "num_tokens": 295829081.0, + "step": 11868 + }, + { + "epoch": 1.3034263123215462, + "grad_norm": 2.4470419883728027, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7367385029792786, + "num_tokens": 295852172.0, + "step": 11869 + }, + { + "epoch": 1.30353613002416, + "grad_norm": 2.477638006210327, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7253597378730774, + "num_tokens": 295873281.0, + "step": 11870 + }, + { + "epoch": 1.3036459477267734, + "grad_norm": 1.9984288215637207, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.72154301404953, + "num_tokens": 295903311.0, + "step": 11871 + }, + { + "epoch": 1.3037557654293872, + "grad_norm": 2.265195608139038, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7281550168991089, + "num_tokens": 295929598.0, + "step": 11872 + }, + { + "epoch": 1.303865583132001, + "grad_norm": 2.134962320327759, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7254838943481445, + "num_tokens": 295958991.0, + "step": 11873 + }, + { + "epoch": 1.3039754008346145, + "grad_norm": 2.18350887298584, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7277116179466248, + "num_tokens": 295984836.0, + "step": 11874 + }, + { + "epoch": 1.3040852185372283, + "grad_norm": 2.2567248344421387, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7343477606773376, + "num_tokens": 296010387.0, + "step": 11875 + }, + { + "epoch": 1.3041950362398418, + "grad_norm": 2.1561124324798584, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7065601348876953, + "num_tokens": 296038211.0, + "step": 11876 + }, + { + "epoch": 1.3043048539424555, + "grad_norm": 2.333770990371704, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7182403206825256, + "num_tokens": 296062781.0, + "step": 11877 + }, + { + "epoch": 1.3044146716450693, + "grad_norm": 2.401838779449463, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7272672653198242, + "num_tokens": 296086014.0, + "step": 11878 + }, + { + "epoch": 1.3045244893476828, + "grad_norm": 2.6147191524505615, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7152507305145264, + "num_tokens": 296106960.0, + "step": 11879 + }, + { + "epoch": 1.3046343070502964, + "grad_norm": 2.187668800354004, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.6989499926567078, + "num_tokens": 296135450.0, + "step": 11880 + }, + { + "epoch": 1.3047441247529101, + "grad_norm": 2.3345558643341064, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.736679196357727, + "num_tokens": 296160009.0, + "step": 11881 + }, + { + "epoch": 1.3048539424555239, + "grad_norm": 2.345900297164917, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7114861011505127, + "num_tokens": 296182803.0, + "step": 11882 + }, + { + "epoch": 1.3049637601581374, + "grad_norm": 2.5158586502075195, + "learning_rate": 1e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7452559471130371, + "num_tokens": 296203149.0, + "step": 11883 + }, + { + "epoch": 1.3050735778607512, + "grad_norm": 2.376803159713745, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7233341932296753, + "num_tokens": 296226382.0, + "step": 11884 + }, + { + "epoch": 1.3051833955633647, + "grad_norm": 2.3632893562316895, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7114719152450562, + "num_tokens": 296249306.0, + "step": 11885 + }, + { + "epoch": 1.3052932132659785, + "grad_norm": 2.439739465713501, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7364348769187927, + "num_tokens": 296271039.0, + "step": 11886 + }, + { + "epoch": 1.3054030309685922, + "grad_norm": 2.1519739627838135, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7008973956108093, + "num_tokens": 296300351.0, + "step": 11887 + }, + { + "epoch": 1.3055128486712058, + "grad_norm": 2.267392635345459, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7264170050621033, + "num_tokens": 296326255.0, + "step": 11888 + }, + { + "epoch": 1.3056226663738195, + "grad_norm": 2.0473809242248535, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7271836400032043, + "num_tokens": 296355353.0, + "step": 11889 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 2.0187811851501465, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7041143178939819, + "num_tokens": 296385868.0, + "step": 11890 + }, + { + "epoch": 1.3058423017790468, + "grad_norm": 2.771305561065674, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.751471757888794, + "num_tokens": 296404258.0, + "step": 11891 + }, + { + "epoch": 1.3059521194816606, + "grad_norm": 2.231579542160034, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7457398176193237, + "num_tokens": 296431378.0, + "step": 11892 + }, + { + "epoch": 1.306061937184274, + "grad_norm": 2.1879069805145264, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7179900407791138, + "num_tokens": 296458997.0, + "step": 11893 + }, + { + "epoch": 1.3061717548868876, + "grad_norm": 2.1500370502471924, + "learning_rate": 1e-06, + "loss": 0.8157, + "mean_token_accuracy": 0.7412301301956177, + "num_tokens": 296485967.0, + "step": 11894 + }, + { + "epoch": 1.3062815725895014, + "grad_norm": 2.179046154022217, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7286819219589233, + "num_tokens": 296512759.0, + "step": 11895 + }, + { + "epoch": 1.3063913902921152, + "grad_norm": 2.44739031791687, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7240739464759827, + "num_tokens": 296533079.0, + "step": 11896 + }, + { + "epoch": 1.3065012079947287, + "grad_norm": 2.463712453842163, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7385556697845459, + "num_tokens": 296554964.0, + "step": 11897 + }, + { + "epoch": 1.3066110256973424, + "grad_norm": 2.357832908630371, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7335164546966553, + "num_tokens": 296581318.0, + "step": 11898 + }, + { + "epoch": 1.306720843399956, + "grad_norm": 2.247281312942505, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6983562111854553, + "num_tokens": 296605317.0, + "step": 11899 + }, + { + "epoch": 1.3068306611025697, + "grad_norm": 2.1084423065185547, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6941170692443848, + "num_tokens": 296633566.0, + "step": 11900 + }, + { + "epoch": 1.3069404788051835, + "grad_norm": 2.270109176635742, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7293176651000977, + "num_tokens": 296657032.0, + "step": 11901 + }, + { + "epoch": 1.307050296507797, + "grad_norm": 2.0982396602630615, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7045937776565552, + "num_tokens": 296685134.0, + "step": 11902 + }, + { + "epoch": 1.3071601142104108, + "grad_norm": 2.324197292327881, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.725121259689331, + "num_tokens": 296710033.0, + "step": 11903 + }, + { + "epoch": 1.3072699319130243, + "grad_norm": 2.371320962905884, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7251554131507874, + "num_tokens": 296733058.0, + "step": 11904 + }, + { + "epoch": 1.307379749615638, + "grad_norm": 2.2128701210021973, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7020397186279297, + "num_tokens": 296762854.0, + "step": 11905 + }, + { + "epoch": 1.3074895673182518, + "grad_norm": 2.4424023628234863, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7266939878463745, + "num_tokens": 296783169.0, + "step": 11906 + }, + { + "epoch": 1.3075993850208654, + "grad_norm": 2.1866261959075928, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.6963722705841064, + "num_tokens": 296810747.0, + "step": 11907 + }, + { + "epoch": 1.307709202723479, + "grad_norm": 2.01871657371521, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7074116468429565, + "num_tokens": 296840641.0, + "step": 11908 + }, + { + "epoch": 1.3078190204260927, + "grad_norm": 2.1785266399383545, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7094565629959106, + "num_tokens": 296868198.0, + "step": 11909 + }, + { + "epoch": 1.3079288381287064, + "grad_norm": 2.3151144981384277, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7121400833129883, + "num_tokens": 296892309.0, + "step": 11910 + }, + { + "epoch": 1.30803865583132, + "grad_norm": 2.4741172790527344, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7281376719474792, + "num_tokens": 296914535.0, + "step": 11911 + }, + { + "epoch": 1.3081484735339337, + "grad_norm": 2.6879820823669434, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7424572110176086, + "num_tokens": 296933834.0, + "step": 11912 + }, + { + "epoch": 1.3082582912365472, + "grad_norm": 2.2184524536132812, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7069627046585083, + "num_tokens": 296962071.0, + "step": 11913 + }, + { + "epoch": 1.308368108939161, + "grad_norm": 2.6532323360443115, + "learning_rate": 1e-06, + "loss": 0.7473, + "mean_token_accuracy": 0.755486011505127, + "num_tokens": 296979520.0, + "step": 11914 + }, + { + "epoch": 1.3084779266417748, + "grad_norm": 1.9747964143753052, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7103546857833862, + "num_tokens": 297010721.0, + "step": 11915 + }, + { + "epoch": 1.3085877443443883, + "grad_norm": 2.3230385780334473, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7047104835510254, + "num_tokens": 297035297.0, + "step": 11916 + }, + { + "epoch": 1.308697562047002, + "grad_norm": 2.3417248725891113, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7472665309906006, + "num_tokens": 297058669.0, + "step": 11917 + }, + { + "epoch": 1.3088073797496156, + "grad_norm": 1.9570236206054688, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.698199987411499, + "num_tokens": 297091091.0, + "step": 11918 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 2.278865337371826, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7236319780349731, + "num_tokens": 297116429.0, + "step": 11919 + }, + { + "epoch": 1.309027015154843, + "grad_norm": 2.2582428455352783, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7383589744567871, + "num_tokens": 297142237.0, + "step": 11920 + }, + { + "epoch": 1.3091368328574566, + "grad_norm": 2.097429037094116, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7056588530540466, + "num_tokens": 297170091.0, + "step": 11921 + }, + { + "epoch": 1.3092466505600702, + "grad_norm": 2.2832536697387695, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7220189571380615, + "num_tokens": 297196103.0, + "step": 11922 + }, + { + "epoch": 1.309356468262684, + "grad_norm": 2.5507872104644775, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7413655519485474, + "num_tokens": 297216296.0, + "step": 11923 + }, + { + "epoch": 1.3094662859652977, + "grad_norm": 2.288318395614624, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7470194101333618, + "num_tokens": 297242039.0, + "step": 11924 + }, + { + "epoch": 1.3095761036679112, + "grad_norm": 2.4166438579559326, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7310246825218201, + "num_tokens": 297265740.0, + "step": 11925 + }, + { + "epoch": 1.309685921370525, + "grad_norm": 2.207995653152466, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7112247347831726, + "num_tokens": 297290654.0, + "step": 11926 + }, + { + "epoch": 1.3097957390731385, + "grad_norm": 2.4660656452178955, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7433384656906128, + "num_tokens": 297312245.0, + "step": 11927 + }, + { + "epoch": 1.3099055567757523, + "grad_norm": 2.1525986194610596, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7297217845916748, + "num_tokens": 297339407.0, + "step": 11928 + }, + { + "epoch": 1.310015374478366, + "grad_norm": 1.9502745866775513, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6869699358940125, + "num_tokens": 297372323.0, + "step": 11929 + }, + { + "epoch": 1.3101251921809796, + "grad_norm": 2.158245801925659, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.6990190744400024, + "num_tokens": 297398475.0, + "step": 11930 + }, + { + "epoch": 1.3102350098835933, + "grad_norm": 2.143073797225952, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.714455783367157, + "num_tokens": 297429302.0, + "step": 11931 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 1.9861544370651245, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7262242436408997, + "num_tokens": 297459298.0, + "step": 11932 + }, + { + "epoch": 1.3104546452888206, + "grad_norm": 2.2237660884857178, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7060314416885376, + "num_tokens": 297486628.0, + "step": 11933 + }, + { + "epoch": 1.3105644629914341, + "grad_norm": 2.2776949405670166, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7009156346321106, + "num_tokens": 297511452.0, + "step": 11934 + }, + { + "epoch": 1.310674280694048, + "grad_norm": 2.214737892150879, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6882674694061279, + "num_tokens": 297541535.0, + "step": 11935 + }, + { + "epoch": 1.3107840983966614, + "grad_norm": 2.4475157260894775, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7145524024963379, + "num_tokens": 297564860.0, + "step": 11936 + }, + { + "epoch": 1.3108939160992752, + "grad_norm": 2.3465468883514404, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6991341710090637, + "num_tokens": 297588984.0, + "step": 11937 + }, + { + "epoch": 1.311003733801889, + "grad_norm": 2.254612922668457, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7353463172912598, + "num_tokens": 297613076.0, + "step": 11938 + }, + { + "epoch": 1.3111135515045025, + "grad_norm": 2.4389517307281494, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7285377979278564, + "num_tokens": 297635554.0, + "step": 11939 + }, + { + "epoch": 1.3112233692071162, + "grad_norm": 2.233980655670166, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7372004389762878, + "num_tokens": 297661896.0, + "step": 11940 + }, + { + "epoch": 1.3113331869097298, + "grad_norm": 2.0712194442749023, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6977168321609497, + "num_tokens": 297690863.0, + "step": 11941 + }, + { + "epoch": 1.3114430046123435, + "grad_norm": 2.2876734733581543, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7097713947296143, + "num_tokens": 297716747.0, + "step": 11942 + }, + { + "epoch": 1.3115528223149573, + "grad_norm": 2.4556643962860107, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7179738283157349, + "num_tokens": 297740136.0, + "step": 11943 + }, + { + "epoch": 1.3116626400175708, + "grad_norm": 2.2230851650238037, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.6974314451217651, + "num_tokens": 297766196.0, + "step": 11944 + }, + { + "epoch": 1.3117724577201844, + "grad_norm": 2.339120864868164, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7047315835952759, + "num_tokens": 297790714.0, + "step": 11945 + }, + { + "epoch": 1.3118822754227981, + "grad_norm": 2.257075309753418, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7008426189422607, + "num_tokens": 297816276.0, + "step": 11946 + }, + { + "epoch": 1.3119920931254119, + "grad_norm": 2.2086753845214844, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7151973843574524, + "num_tokens": 297840595.0, + "step": 11947 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 2.089608907699585, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7065784931182861, + "num_tokens": 297868565.0, + "step": 11948 + }, + { + "epoch": 1.3122117285306392, + "grad_norm": 2.2049436569213867, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6964705586433411, + "num_tokens": 297894129.0, + "step": 11949 + }, + { + "epoch": 1.3123215462332527, + "grad_norm": 2.3141281604766846, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.702812135219574, + "num_tokens": 297921627.0, + "step": 11950 + }, + { + "epoch": 1.3124313639358665, + "grad_norm": 2.252272844314575, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7269280552864075, + "num_tokens": 297945690.0, + "step": 11951 + }, + { + "epoch": 1.3125411816384802, + "grad_norm": 2.6122210025787354, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7308939695358276, + "num_tokens": 297964908.0, + "step": 11952 + }, + { + "epoch": 1.3126509993410937, + "grad_norm": 2.164512872695923, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7082388401031494, + "num_tokens": 297990313.0, + "step": 11953 + }, + { + "epoch": 1.3127608170437075, + "grad_norm": 2.2549335956573486, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6977626085281372, + "num_tokens": 298018251.0, + "step": 11954 + }, + { + "epoch": 1.312870634746321, + "grad_norm": 2.32139253616333, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7224912643432617, + "num_tokens": 298042449.0, + "step": 11955 + }, + { + "epoch": 1.3129804524489348, + "grad_norm": 2.685595750808716, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7274160385131836, + "num_tokens": 298061421.0, + "step": 11956 + }, + { + "epoch": 1.3130902701515486, + "grad_norm": 2.427567958831787, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7293428778648376, + "num_tokens": 298083301.0, + "step": 11957 + }, + { + "epoch": 1.313200087854162, + "grad_norm": 2.3555490970611572, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7381976842880249, + "num_tokens": 298104014.0, + "step": 11958 + }, + { + "epoch": 1.3133099055567756, + "grad_norm": 2.2409963607788086, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6960864067077637, + "num_tokens": 298130904.0, + "step": 11959 + }, + { + "epoch": 1.3134197232593894, + "grad_norm": 2.644582748413086, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7262545824050903, + "num_tokens": 298150199.0, + "step": 11960 + }, + { + "epoch": 1.3135295409620031, + "grad_norm": 2.0861358642578125, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7055650949478149, + "num_tokens": 298180153.0, + "step": 11961 + }, + { + "epoch": 1.3136393586646167, + "grad_norm": 2.15085506439209, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7091046571731567, + "num_tokens": 298210962.0, + "step": 11962 + }, + { + "epoch": 1.3137491763672304, + "grad_norm": 2.400089740753174, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7179948091506958, + "num_tokens": 298234967.0, + "step": 11963 + }, + { + "epoch": 1.313858994069844, + "grad_norm": 2.3336260318756104, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7499308586120605, + "num_tokens": 298258019.0, + "step": 11964 + }, + { + "epoch": 1.3139688117724577, + "grad_norm": 2.3567895889282227, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7023794651031494, + "num_tokens": 298283335.0, + "step": 11965 + }, + { + "epoch": 1.3140786294750715, + "grad_norm": 2.2225608825683594, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.706290066242218, + "num_tokens": 298308779.0, + "step": 11966 + }, + { + "epoch": 1.314188447177685, + "grad_norm": 2.206348419189453, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7342516183853149, + "num_tokens": 298334389.0, + "step": 11967 + }, + { + "epoch": 1.3142982648802988, + "grad_norm": 2.1270320415496826, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6972209811210632, + "num_tokens": 298362878.0, + "step": 11968 + }, + { + "epoch": 1.3144080825829123, + "grad_norm": 2.5624618530273438, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7298567891120911, + "num_tokens": 298383327.0, + "step": 11969 + }, + { + "epoch": 1.314517900285526, + "grad_norm": 2.139516592025757, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7122492790222168, + "num_tokens": 298412367.0, + "step": 11970 + }, + { + "epoch": 1.3146277179881398, + "grad_norm": 2.4192514419555664, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7083602547645569, + "num_tokens": 298435065.0, + "step": 11971 + }, + { + "epoch": 1.3147375356907534, + "grad_norm": 2.433974027633667, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7260892391204834, + "num_tokens": 298456618.0, + "step": 11972 + }, + { + "epoch": 1.314847353393367, + "grad_norm": 2.178290605545044, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6961249113082886, + "num_tokens": 298484883.0, + "step": 11973 + }, + { + "epoch": 1.3149571710959806, + "grad_norm": 2.519531011581421, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7156046628952026, + "num_tokens": 298506006.0, + "step": 11974 + }, + { + "epoch": 1.3150669887985944, + "grad_norm": 2.23561429977417, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7134537696838379, + "num_tokens": 298530319.0, + "step": 11975 + }, + { + "epoch": 1.315176806501208, + "grad_norm": 2.1411566734313965, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.709893524646759, + "num_tokens": 298557578.0, + "step": 11976 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 2.232511520385742, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7289639115333557, + "num_tokens": 298581393.0, + "step": 11977 + }, + { + "epoch": 1.3153964419064352, + "grad_norm": 2.014251470565796, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7019891142845154, + "num_tokens": 298612713.0, + "step": 11978 + }, + { + "epoch": 1.315506259609049, + "grad_norm": 2.5976858139038086, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7310734987258911, + "num_tokens": 298632545.0, + "step": 11979 + }, + { + "epoch": 1.3156160773116627, + "grad_norm": 2.176027297973633, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7112925052642822, + "num_tokens": 298657163.0, + "step": 11980 + }, + { + "epoch": 1.3157258950142763, + "grad_norm": 1.8815499544143677, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7299182415008545, + "num_tokens": 298691341.0, + "step": 11981 + }, + { + "epoch": 1.31583571271689, + "grad_norm": 2.5574636459350586, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7429986000061035, + "num_tokens": 298710757.0, + "step": 11982 + }, + { + "epoch": 1.3159455304195036, + "grad_norm": 2.245893716812134, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7257100343704224, + "num_tokens": 298737160.0, + "step": 11983 + }, + { + "epoch": 1.3160553481221173, + "grad_norm": 2.3035082817077637, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7293640375137329, + "num_tokens": 298763168.0, + "step": 11984 + }, + { + "epoch": 1.3161651658247309, + "grad_norm": 2.0714128017425537, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7252858877182007, + "num_tokens": 298790961.0, + "step": 11985 + }, + { + "epoch": 1.3162749835273446, + "grad_norm": 2.268030881881714, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.712281346321106, + "num_tokens": 298817889.0, + "step": 11986 + }, + { + "epoch": 1.3163848012299582, + "grad_norm": 2.910773754119873, + "learning_rate": 1e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7460294365882874, + "num_tokens": 298836380.0, + "step": 11987 + }, + { + "epoch": 1.316494618932572, + "grad_norm": 2.408761739730835, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.710482120513916, + "num_tokens": 298857810.0, + "step": 11988 + }, + { + "epoch": 1.3166044366351857, + "grad_norm": 2.1371381282806396, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7428276538848877, + "num_tokens": 298886277.0, + "step": 11989 + }, + { + "epoch": 1.3167142543377992, + "grad_norm": 2.0009915828704834, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7003155946731567, + "num_tokens": 298916733.0, + "step": 11990 + }, + { + "epoch": 1.316824072040413, + "grad_norm": 2.6239047050476074, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7424541711807251, + "num_tokens": 298937914.0, + "step": 11991 + }, + { + "epoch": 1.3169338897430265, + "grad_norm": 2.2712275981903076, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7033548951148987, + "num_tokens": 298964275.0, + "step": 11992 + }, + { + "epoch": 1.3170437074456403, + "grad_norm": 2.221315860748291, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7122131586074829, + "num_tokens": 298990338.0, + "step": 11993 + }, + { + "epoch": 1.317153525148254, + "grad_norm": 2.39206600189209, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7330291271209717, + "num_tokens": 299013906.0, + "step": 11994 + }, + { + "epoch": 1.3172633428508675, + "grad_norm": 2.1099936962127686, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7239780426025391, + "num_tokens": 299041594.0, + "step": 11995 + }, + { + "epoch": 1.3173731605534813, + "grad_norm": 2.445580244064331, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7224778532981873, + "num_tokens": 299063936.0, + "step": 11996 + }, + { + "epoch": 1.3174829782560948, + "grad_norm": 2.209160089492798, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7535313367843628, + "num_tokens": 299089078.0, + "step": 11997 + }, + { + "epoch": 1.3175927959587086, + "grad_norm": 2.154966115951538, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6898139119148254, + "num_tokens": 299117725.0, + "step": 11998 + }, + { + "epoch": 1.3177026136613221, + "grad_norm": 2.1684417724609375, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7071415185928345, + "num_tokens": 299144995.0, + "step": 11999 + }, + { + "epoch": 1.3178124313639359, + "grad_norm": 2.2718379497528076, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7222712635993958, + "num_tokens": 299170879.0, + "step": 12000 + }, + { + "epoch": 1.3179222490665494, + "grad_norm": 2.6080472469329834, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7274833917617798, + "num_tokens": 299190020.0, + "step": 12001 + }, + { + "epoch": 1.3180320667691632, + "grad_norm": 2.2845067977905273, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7113145589828491, + "num_tokens": 299215904.0, + "step": 12002 + }, + { + "epoch": 1.318141884471777, + "grad_norm": 2.219486713409424, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7215796709060669, + "num_tokens": 299239696.0, + "step": 12003 + }, + { + "epoch": 1.3182517021743905, + "grad_norm": 2.2397420406341553, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7015548944473267, + "num_tokens": 299265120.0, + "step": 12004 + }, + { + "epoch": 1.3183615198770042, + "grad_norm": 2.4556283950805664, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7210148572921753, + "num_tokens": 299288048.0, + "step": 12005 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 2.3901782035827637, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7076636552810669, + "num_tokens": 299313039.0, + "step": 12006 + }, + { + "epoch": 1.3185811552822315, + "grad_norm": 2.216620922088623, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.704948365688324, + "num_tokens": 299340848.0, + "step": 12007 + }, + { + "epoch": 1.3186909729848453, + "grad_norm": 2.3242592811584473, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7276890873908997, + "num_tokens": 299364361.0, + "step": 12008 + }, + { + "epoch": 1.3188007906874588, + "grad_norm": 2.3600432872772217, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7288423180580139, + "num_tokens": 299388389.0, + "step": 12009 + }, + { + "epoch": 1.3189106083900723, + "grad_norm": 2.4050869941711426, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7263404130935669, + "num_tokens": 299412140.0, + "step": 12010 + }, + { + "epoch": 1.319020426092686, + "grad_norm": 2.539257526397705, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7392725944519043, + "num_tokens": 299432124.0, + "step": 12011 + }, + { + "epoch": 1.3191302437952999, + "grad_norm": 2.5092079639434814, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.723652184009552, + "num_tokens": 299455325.0, + "step": 12012 + }, + { + "epoch": 1.3192400614979134, + "grad_norm": 2.1100552082061768, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7158733606338501, + "num_tokens": 299481957.0, + "step": 12013 + }, + { + "epoch": 1.3193498792005272, + "grad_norm": 2.4015262126922607, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7278505563735962, + "num_tokens": 299504467.0, + "step": 12014 + }, + { + "epoch": 1.3194596969031407, + "grad_norm": 2.442070484161377, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7362626791000366, + "num_tokens": 299526176.0, + "step": 12015 + }, + { + "epoch": 1.3195695146057544, + "grad_norm": 2.4997682571411133, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7354148626327515, + "num_tokens": 299548004.0, + "step": 12016 + }, + { + "epoch": 1.3196793323083682, + "grad_norm": 2.424464702606201, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7347679734230042, + "num_tokens": 299570706.0, + "step": 12017 + }, + { + "epoch": 1.3197891500109817, + "grad_norm": 2.394974946975708, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7283322215080261, + "num_tokens": 299593180.0, + "step": 12018 + }, + { + "epoch": 1.3198989677135955, + "grad_norm": 2.621368885040283, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.736030101776123, + "num_tokens": 299613280.0, + "step": 12019 + }, + { + "epoch": 1.320008785416209, + "grad_norm": 2.1821281909942627, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7354151010513306, + "num_tokens": 299638816.0, + "step": 12020 + }, + { + "epoch": 1.3201186031188228, + "grad_norm": 2.3947594165802, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7254718542098999, + "num_tokens": 299662203.0, + "step": 12021 + }, + { + "epoch": 1.3202284208214365, + "grad_norm": 2.187537431716919, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7348637580871582, + "num_tokens": 299688747.0, + "step": 12022 + }, + { + "epoch": 1.32033823852405, + "grad_norm": 2.3780689239501953, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7349634170532227, + "num_tokens": 299711109.0, + "step": 12023 + }, + { + "epoch": 1.3204480562266636, + "grad_norm": 2.384647846221924, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.702052652835846, + "num_tokens": 299735514.0, + "step": 12024 + }, + { + "epoch": 1.3205578739292774, + "grad_norm": 2.2096028327941895, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7033544778823853, + "num_tokens": 299763191.0, + "step": 12025 + }, + { + "epoch": 1.3206676916318911, + "grad_norm": 2.1678836345672607, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.727484405040741, + "num_tokens": 299789806.0, + "step": 12026 + }, + { + "epoch": 1.3207775093345047, + "grad_norm": 2.1288797855377197, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.705213725566864, + "num_tokens": 299818450.0, + "step": 12027 + }, + { + "epoch": 1.3208873270371184, + "grad_norm": 2.179323673248291, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7294721007347107, + "num_tokens": 299844724.0, + "step": 12028 + }, + { + "epoch": 1.320997144739732, + "grad_norm": 2.413017988204956, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.74410080909729, + "num_tokens": 299866718.0, + "step": 12029 + }, + { + "epoch": 1.3211069624423457, + "grad_norm": 2.498354196548462, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7092853784561157, + "num_tokens": 299889049.0, + "step": 12030 + }, + { + "epoch": 1.3212167801449595, + "grad_norm": 2.1776020526885986, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7231668829917908, + "num_tokens": 299915048.0, + "step": 12031 + }, + { + "epoch": 1.321326597847573, + "grad_norm": 2.5890872478485107, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7248550653457642, + "num_tokens": 299935134.0, + "step": 12032 + }, + { + "epoch": 1.3214364155501868, + "grad_norm": 2.159820079803467, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7407599687576294, + "num_tokens": 299960559.0, + "step": 12033 + }, + { + "epoch": 1.3215462332528003, + "grad_norm": 2.179579973220825, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7127991914749146, + "num_tokens": 299987919.0, + "step": 12034 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 2.3696913719177246, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7348418235778809, + "num_tokens": 300012226.0, + "step": 12035 + }, + { + "epoch": 1.3217658686580278, + "grad_norm": 2.2093348503112793, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7210100889205933, + "num_tokens": 300037370.0, + "step": 12036 + }, + { + "epoch": 1.3218756863606413, + "grad_norm": 2.2899913787841797, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7160798907279968, + "num_tokens": 300061439.0, + "step": 12037 + }, + { + "epoch": 1.3219855040632549, + "grad_norm": 2.288926839828491, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7379679083824158, + "num_tokens": 300086486.0, + "step": 12038 + }, + { + "epoch": 1.3220953217658686, + "grad_norm": 2.2344119548797607, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7146133184432983, + "num_tokens": 300114124.0, + "step": 12039 + }, + { + "epoch": 1.3222051394684824, + "grad_norm": 2.264918088912964, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7100663185119629, + "num_tokens": 300138256.0, + "step": 12040 + }, + { + "epoch": 1.322314957171096, + "grad_norm": 2.074390411376953, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7088949680328369, + "num_tokens": 300168082.0, + "step": 12041 + }, + { + "epoch": 1.3224247748737097, + "grad_norm": 2.115126132965088, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7031493186950684, + "num_tokens": 300195051.0, + "step": 12042 + }, + { + "epoch": 1.3225345925763232, + "grad_norm": 2.306389331817627, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7184861302375793, + "num_tokens": 300219483.0, + "step": 12043 + }, + { + "epoch": 1.322644410278937, + "grad_norm": 2.4835357666015625, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7369093298912048, + "num_tokens": 300241309.0, + "step": 12044 + }, + { + "epoch": 1.3227542279815507, + "grad_norm": 2.494295835494995, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7324167490005493, + "num_tokens": 300261738.0, + "step": 12045 + }, + { + "epoch": 1.3228640456841643, + "grad_norm": 2.197162628173828, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7148151993751526, + "num_tokens": 300289744.0, + "step": 12046 + }, + { + "epoch": 1.322973863386778, + "grad_norm": 2.3113741874694824, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7004772424697876, + "num_tokens": 300316936.0, + "step": 12047 + }, + { + "epoch": 1.3230836810893916, + "grad_norm": 2.2586658000946045, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7343097925186157, + "num_tokens": 300341372.0, + "step": 12048 + }, + { + "epoch": 1.3231934987920053, + "grad_norm": 2.1821253299713135, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7192893028259277, + "num_tokens": 300368582.0, + "step": 12049 + }, + { + "epoch": 1.3233033164946189, + "grad_norm": 2.1651034355163574, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7296830415725708, + "num_tokens": 300394888.0, + "step": 12050 + }, + { + "epoch": 1.3234131341972326, + "grad_norm": 2.3801448345184326, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7283557653427124, + "num_tokens": 300419516.0, + "step": 12051 + }, + { + "epoch": 1.3235229518998461, + "grad_norm": 2.230945110321045, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7331669926643372, + "num_tokens": 300444286.0, + "step": 12052 + }, + { + "epoch": 1.32363276960246, + "grad_norm": 2.0823302268981934, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7223338484764099, + "num_tokens": 300474149.0, + "step": 12053 + }, + { + "epoch": 1.3237425873050737, + "grad_norm": 2.255941152572632, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7193533778190613, + "num_tokens": 300501439.0, + "step": 12054 + }, + { + "epoch": 1.3238524050076872, + "grad_norm": 1.988031268119812, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7195984721183777, + "num_tokens": 300530469.0, + "step": 12055 + }, + { + "epoch": 1.323962222710301, + "grad_norm": 2.3440134525299072, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.728057861328125, + "num_tokens": 300552640.0, + "step": 12056 + }, + { + "epoch": 1.3240720404129145, + "grad_norm": 2.2068474292755127, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7321221828460693, + "num_tokens": 300577723.0, + "step": 12057 + }, + { + "epoch": 1.3241818581155282, + "grad_norm": 2.516923666000366, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7311387062072754, + "num_tokens": 300598807.0, + "step": 12058 + }, + { + "epoch": 1.324291675818142, + "grad_norm": 2.5363004207611084, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7256577014923096, + "num_tokens": 300620305.0, + "step": 12059 + }, + { + "epoch": 1.3244014935207555, + "grad_norm": 2.284479856491089, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7243013978004456, + "num_tokens": 300644869.0, + "step": 12060 + }, + { + "epoch": 1.324511311223369, + "grad_norm": 2.3606908321380615, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7282450199127197, + "num_tokens": 300669685.0, + "step": 12061 + }, + { + "epoch": 1.3246211289259828, + "grad_norm": 2.5650222301483154, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7295880913734436, + "num_tokens": 300690574.0, + "step": 12062 + }, + { + "epoch": 1.3247309466285966, + "grad_norm": 2.206181049346924, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.697062611579895, + "num_tokens": 300720660.0, + "step": 12063 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 2.4650397300720215, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.706748366355896, + "num_tokens": 300743502.0, + "step": 12064 + }, + { + "epoch": 1.3249505820338239, + "grad_norm": 1.8650996685028076, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7012999057769775, + "num_tokens": 300779933.0, + "step": 12065 + }, + { + "epoch": 1.3250603997364374, + "grad_norm": 2.4195125102996826, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7164413928985596, + "num_tokens": 300801686.0, + "step": 12066 + }, + { + "epoch": 1.3251702174390512, + "grad_norm": 2.5259499549865723, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7197877168655396, + "num_tokens": 300822693.0, + "step": 12067 + }, + { + "epoch": 1.325280035141665, + "grad_norm": 2.2640838623046875, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7210149168968201, + "num_tokens": 300848059.0, + "step": 12068 + }, + { + "epoch": 1.3253898528442785, + "grad_norm": 2.367786169052124, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7051515579223633, + "num_tokens": 300872803.0, + "step": 12069 + }, + { + "epoch": 1.3254996705468922, + "grad_norm": 2.114792823791504, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7210817933082581, + "num_tokens": 300898545.0, + "step": 12070 + }, + { + "epoch": 1.3256094882495058, + "grad_norm": 2.3664839267730713, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7183175683021545, + "num_tokens": 300921305.0, + "step": 12071 + }, + { + "epoch": 1.3257193059521195, + "grad_norm": 2.216488838195801, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7329022884368896, + "num_tokens": 300947406.0, + "step": 12072 + }, + { + "epoch": 1.3258291236547333, + "grad_norm": 2.056852340698242, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7119137048721313, + "num_tokens": 300977715.0, + "step": 12073 + }, + { + "epoch": 1.3259389413573468, + "grad_norm": 2.4811041355133057, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.7560247778892517, + "num_tokens": 300997509.0, + "step": 12074 + }, + { + "epoch": 1.3260487590599603, + "grad_norm": 2.4249627590179443, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7096259593963623, + "num_tokens": 301020270.0, + "step": 12075 + }, + { + "epoch": 1.326158576762574, + "grad_norm": 2.0543832778930664, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7396811842918396, + "num_tokens": 301049211.0, + "step": 12076 + }, + { + "epoch": 1.3262683944651878, + "grad_norm": 2.1699836254119873, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7081323862075806, + "num_tokens": 301078529.0, + "step": 12077 + }, + { + "epoch": 1.3263782121678014, + "grad_norm": 2.0725321769714355, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7273298501968384, + "num_tokens": 301107522.0, + "step": 12078 + }, + { + "epoch": 1.3264880298704151, + "grad_norm": 2.1739890575408936, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7063261270523071, + "num_tokens": 301135461.0, + "step": 12079 + }, + { + "epoch": 1.3265978475730287, + "grad_norm": 2.4649226665496826, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7154562473297119, + "num_tokens": 301157635.0, + "step": 12080 + }, + { + "epoch": 1.3267076652756424, + "grad_norm": 2.4167120456695557, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7336400151252747, + "num_tokens": 301179844.0, + "step": 12081 + }, + { + "epoch": 1.3268174829782562, + "grad_norm": 2.2242634296417236, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7452535629272461, + "num_tokens": 301204251.0, + "step": 12082 + }, + { + "epoch": 1.3269273006808697, + "grad_norm": 2.4129769802093506, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7275030612945557, + "num_tokens": 301226867.0, + "step": 12083 + }, + { + "epoch": 1.3270371183834835, + "grad_norm": 2.4885647296905518, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.712732195854187, + "num_tokens": 301249623.0, + "step": 12084 + }, + { + "epoch": 1.327146936086097, + "grad_norm": 2.1700916290283203, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7177954912185669, + "num_tokens": 301277471.0, + "step": 12085 + }, + { + "epoch": 1.3272567537887108, + "grad_norm": 2.291724443435669, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7366150617599487, + "num_tokens": 301300752.0, + "step": 12086 + }, + { + "epoch": 1.3273665714913245, + "grad_norm": 2.4072093963623047, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7466236352920532, + "num_tokens": 301321219.0, + "step": 12087 + }, + { + "epoch": 1.327476389193938, + "grad_norm": 2.3780245780944824, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7198143005371094, + "num_tokens": 301346421.0, + "step": 12088 + }, + { + "epoch": 1.3275862068965516, + "grad_norm": 2.235668182373047, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7262235879898071, + "num_tokens": 301372293.0, + "step": 12089 + }, + { + "epoch": 1.3276960245991654, + "grad_norm": 2.353414535522461, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7298071384429932, + "num_tokens": 301394286.0, + "step": 12090 + }, + { + "epoch": 1.3278058423017791, + "grad_norm": 2.209120035171509, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7262507677078247, + "num_tokens": 301419790.0, + "step": 12091 + }, + { + "epoch": 1.3279156600043927, + "grad_norm": 2.365398406982422, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7333881855010986, + "num_tokens": 301443010.0, + "step": 12092 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 2.0824472904205322, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7237436771392822, + "num_tokens": 301473285.0, + "step": 12093 + }, + { + "epoch": 1.32813529540962, + "grad_norm": 2.1249494552612305, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7214555144309998, + "num_tokens": 301499539.0, + "step": 12094 + }, + { + "epoch": 1.3282451131122337, + "grad_norm": 2.3091986179351807, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7077749967575073, + "num_tokens": 301526880.0, + "step": 12095 + }, + { + "epoch": 1.3283549308148475, + "grad_norm": 2.22574782371521, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7110693454742432, + "num_tokens": 301554986.0, + "step": 12096 + }, + { + "epoch": 1.328464748517461, + "grad_norm": 2.1138062477111816, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6900472044944763, + "num_tokens": 301584899.0, + "step": 12097 + }, + { + "epoch": 1.3285745662200747, + "grad_norm": 2.504818916320801, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.717777669429779, + "num_tokens": 301606712.0, + "step": 12098 + }, + { + "epoch": 1.3286843839226883, + "grad_norm": 2.2938365936279297, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7379529476165771, + "num_tokens": 301630533.0, + "step": 12099 + }, + { + "epoch": 1.328794201625302, + "grad_norm": 2.6316449642181396, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.714893102645874, + "num_tokens": 301649522.0, + "step": 12100 + }, + { + "epoch": 1.3289040193279158, + "grad_norm": 2.4840104579925537, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7203353643417358, + "num_tokens": 301671481.0, + "step": 12101 + }, + { + "epoch": 1.3290138370305293, + "grad_norm": 2.2409567832946777, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7144498825073242, + "num_tokens": 301696464.0, + "step": 12102 + }, + { + "epoch": 1.3291236547331429, + "grad_norm": 2.02451229095459, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7165096402168274, + "num_tokens": 301726452.0, + "step": 12103 + }, + { + "epoch": 1.3292334724357566, + "grad_norm": 2.511589527130127, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7099118232727051, + "num_tokens": 301749207.0, + "step": 12104 + }, + { + "epoch": 1.3293432901383704, + "grad_norm": 2.3045825958251953, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7342334389686584, + "num_tokens": 301773988.0, + "step": 12105 + }, + { + "epoch": 1.329453107840984, + "grad_norm": 2.3091440200805664, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7177698016166687, + "num_tokens": 301798307.0, + "step": 12106 + }, + { + "epoch": 1.3295629255435977, + "grad_norm": 2.0623724460601807, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7171756029129028, + "num_tokens": 301827824.0, + "step": 12107 + }, + { + "epoch": 1.3296727432462112, + "grad_norm": 2.3853797912597656, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7184620499610901, + "num_tokens": 301851736.0, + "step": 12108 + }, + { + "epoch": 1.329782560948825, + "grad_norm": 2.273500680923462, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.723537802696228, + "num_tokens": 301876396.0, + "step": 12109 + }, + { + "epoch": 1.3298923786514387, + "grad_norm": 2.6130459308624268, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7294214963912964, + "num_tokens": 301895509.0, + "step": 12110 + }, + { + "epoch": 1.3300021963540523, + "grad_norm": 2.5065395832061768, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7225821018218994, + "num_tokens": 301916291.0, + "step": 12111 + }, + { + "epoch": 1.330112014056666, + "grad_norm": 2.4974169731140137, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7372784614562988, + "num_tokens": 301936768.0, + "step": 12112 + }, + { + "epoch": 1.3302218317592795, + "grad_norm": 2.4127018451690674, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7128724455833435, + "num_tokens": 301961129.0, + "step": 12113 + }, + { + "epoch": 1.3303316494618933, + "grad_norm": 2.513378381729126, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7033290863037109, + "num_tokens": 301984482.0, + "step": 12114 + }, + { + "epoch": 1.3304414671645068, + "grad_norm": 2.298980474472046, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7261419296264648, + "num_tokens": 302008537.0, + "step": 12115 + }, + { + "epoch": 1.3305512848671206, + "grad_norm": 2.3811938762664795, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7053157687187195, + "num_tokens": 302034291.0, + "step": 12116 + }, + { + "epoch": 1.3306611025697341, + "grad_norm": 2.228271722793579, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7062764167785645, + "num_tokens": 302059989.0, + "step": 12117 + }, + { + "epoch": 1.330770920272348, + "grad_norm": 2.22658109664917, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7147558927536011, + "num_tokens": 302086931.0, + "step": 12118 + }, + { + "epoch": 1.3308807379749616, + "grad_norm": 2.6555285453796387, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.739608883857727, + "num_tokens": 302106103.0, + "step": 12119 + }, + { + "epoch": 1.3309905556775752, + "grad_norm": 2.3931984901428223, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6975873708724976, + "num_tokens": 302131665.0, + "step": 12120 + }, + { + "epoch": 1.331100373380189, + "grad_norm": 2.018195390701294, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7079014182090759, + "num_tokens": 302161804.0, + "step": 12121 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 2.3619801998138428, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7199081182479858, + "num_tokens": 302184614.0, + "step": 12122 + }, + { + "epoch": 1.3313200087854162, + "grad_norm": 2.211099863052368, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7278839945793152, + "num_tokens": 302209160.0, + "step": 12123 + }, + { + "epoch": 1.33142982648803, + "grad_norm": 2.0827739238739014, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7205814123153687, + "num_tokens": 302236450.0, + "step": 12124 + }, + { + "epoch": 1.3315396441906435, + "grad_norm": 2.157482385635376, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6993504166603088, + "num_tokens": 302264994.0, + "step": 12125 + }, + { + "epoch": 1.331649461893257, + "grad_norm": 2.488039493560791, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7204233407974243, + "num_tokens": 302286508.0, + "step": 12126 + }, + { + "epoch": 1.3317592795958708, + "grad_norm": 1.8349756002426147, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.705476701259613, + "num_tokens": 302319597.0, + "step": 12127 + }, + { + "epoch": 1.3318690972984846, + "grad_norm": 1.9562090635299683, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7212475538253784, + "num_tokens": 302351044.0, + "step": 12128 + }, + { + "epoch": 1.331978915001098, + "grad_norm": 2.136152505874634, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7148052453994751, + "num_tokens": 302378439.0, + "step": 12129 + }, + { + "epoch": 1.3320887327037119, + "grad_norm": 2.2700676918029785, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7148914337158203, + "num_tokens": 302405610.0, + "step": 12130 + }, + { + "epoch": 1.3321985504063254, + "grad_norm": 2.2533159255981445, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7173402905464172, + "num_tokens": 302433180.0, + "step": 12131 + }, + { + "epoch": 1.3323083681089392, + "grad_norm": 1.9598146677017212, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7115541100502014, + "num_tokens": 302466956.0, + "step": 12132 + }, + { + "epoch": 1.332418185811553, + "grad_norm": 2.216776132583618, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7141873836517334, + "num_tokens": 302494229.0, + "step": 12133 + }, + { + "epoch": 1.3325280035141664, + "grad_norm": 2.0660572052001953, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7052522301673889, + "num_tokens": 302525201.0, + "step": 12134 + }, + { + "epoch": 1.3326378212167802, + "grad_norm": 2.373206853866577, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6940374970436096, + "num_tokens": 302550452.0, + "step": 12135 + }, + { + "epoch": 1.3327476389193937, + "grad_norm": 2.134690761566162, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7072542905807495, + "num_tokens": 302579342.0, + "step": 12136 + }, + { + "epoch": 1.3328574566220075, + "grad_norm": 2.2433242797851562, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7154953479766846, + "num_tokens": 302606147.0, + "step": 12137 + }, + { + "epoch": 1.3329672743246213, + "grad_norm": 2.503267288208008, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7284667491912842, + "num_tokens": 302627336.0, + "step": 12138 + }, + { + "epoch": 1.3330770920272348, + "grad_norm": 2.4084510803222656, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7223157286643982, + "num_tokens": 302649717.0, + "step": 12139 + }, + { + "epoch": 1.3331869097298483, + "grad_norm": 1.982043743133545, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7384064197540283, + "num_tokens": 302680443.0, + "step": 12140 + }, + { + "epoch": 1.333296727432462, + "grad_norm": 2.022200345993042, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7098955512046814, + "num_tokens": 302710046.0, + "step": 12141 + }, + { + "epoch": 1.3334065451350758, + "grad_norm": 2.1273367404937744, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7259637117385864, + "num_tokens": 302737824.0, + "step": 12142 + }, + { + "epoch": 1.3335163628376894, + "grad_norm": 2.5055043697357178, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7082965970039368, + "num_tokens": 302759316.0, + "step": 12143 + }, + { + "epoch": 1.3336261805403031, + "grad_norm": 2.3037147521972656, + "learning_rate": 1e-06, + "loss": 0.8161, + "mean_token_accuracy": 0.74344801902771, + "num_tokens": 302782582.0, + "step": 12144 + }, + { + "epoch": 1.3337359982429167, + "grad_norm": 2.138256788253784, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7228333950042725, + "num_tokens": 302810108.0, + "step": 12145 + }, + { + "epoch": 1.3338458159455304, + "grad_norm": 2.1619796752929688, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7038867473602295, + "num_tokens": 302837404.0, + "step": 12146 + }, + { + "epoch": 1.3339556336481442, + "grad_norm": 2.3265163898468018, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7279219031333923, + "num_tokens": 302861390.0, + "step": 12147 + }, + { + "epoch": 1.3340654513507577, + "grad_norm": 2.4481189250946045, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7382303476333618, + "num_tokens": 302883869.0, + "step": 12148 + }, + { + "epoch": 1.3341752690533715, + "grad_norm": 2.0613648891448975, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6904914379119873, + "num_tokens": 302911980.0, + "step": 12149 + }, + { + "epoch": 1.334285086755985, + "grad_norm": 2.100759267807007, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6913520097732544, + "num_tokens": 302940754.0, + "step": 12150 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 2.656503200531006, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7248950600624084, + "num_tokens": 302965628.0, + "step": 12151 + }, + { + "epoch": 1.3345047221612125, + "grad_norm": 2.2542707920074463, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7075152397155762, + "num_tokens": 302990434.0, + "step": 12152 + }, + { + "epoch": 1.334614539863826, + "grad_norm": 2.5251834392547607, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7411163449287415, + "num_tokens": 303010652.0, + "step": 12153 + }, + { + "epoch": 1.3347243575664396, + "grad_norm": 2.417104959487915, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7180289030075073, + "num_tokens": 303035683.0, + "step": 12154 + }, + { + "epoch": 1.3348341752690533, + "grad_norm": 2.853515386581421, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7391424179077148, + "num_tokens": 303052261.0, + "step": 12155 + }, + { + "epoch": 1.334943992971667, + "grad_norm": 2.6257247924804688, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7089680433273315, + "num_tokens": 303072477.0, + "step": 12156 + }, + { + "epoch": 1.3350538106742806, + "grad_norm": 2.183598041534424, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7365624904632568, + "num_tokens": 303098144.0, + "step": 12157 + }, + { + "epoch": 1.3351636283768944, + "grad_norm": 2.336855888366699, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.737579345703125, + "num_tokens": 303120491.0, + "step": 12158 + }, + { + "epoch": 1.335273446079508, + "grad_norm": 2.269164800643921, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6924582719802856, + "num_tokens": 303145535.0, + "step": 12159 + }, + { + "epoch": 1.3353832637821217, + "grad_norm": 2.622683048248291, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7109911441802979, + "num_tokens": 303165593.0, + "step": 12160 + }, + { + "epoch": 1.3354930814847354, + "grad_norm": 2.2233715057373047, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7194795608520508, + "num_tokens": 303190861.0, + "step": 12161 + }, + { + "epoch": 1.335602899187349, + "grad_norm": 2.132192611694336, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7079867720603943, + "num_tokens": 303220634.0, + "step": 12162 + }, + { + "epoch": 1.3357127168899627, + "grad_norm": 2.257107973098755, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7168796062469482, + "num_tokens": 303246862.0, + "step": 12163 + }, + { + "epoch": 1.3358225345925763, + "grad_norm": 2.2445602416992188, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.711887776851654, + "num_tokens": 303273259.0, + "step": 12164 + }, + { + "epoch": 1.33593235229519, + "grad_norm": 2.25201416015625, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7156757116317749, + "num_tokens": 303299054.0, + "step": 12165 + }, + { + "epoch": 1.3360421699978036, + "grad_norm": 1.941802978515625, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7233939170837402, + "num_tokens": 303331488.0, + "step": 12166 + }, + { + "epoch": 1.3361519877004173, + "grad_norm": 1.9582046270370483, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6994240283966064, + "num_tokens": 303363454.0, + "step": 12167 + }, + { + "epoch": 1.3362618054030309, + "grad_norm": 2.0369653701782227, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7131569981575012, + "num_tokens": 303394598.0, + "step": 12168 + }, + { + "epoch": 1.3363716231056446, + "grad_norm": 2.5887210369110107, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7260270118713379, + "num_tokens": 303415555.0, + "step": 12169 + }, + { + "epoch": 1.3364814408082584, + "grad_norm": 2.047365665435791, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6920238733291626, + "num_tokens": 303447376.0, + "step": 12170 + }, + { + "epoch": 1.336591258510872, + "grad_norm": 1.9954627752304077, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6967289447784424, + "num_tokens": 303479368.0, + "step": 12171 + }, + { + "epoch": 1.3367010762134857, + "grad_norm": 3.1924006938934326, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7405668497085571, + "num_tokens": 303494665.0, + "step": 12172 + }, + { + "epoch": 1.3368108939160992, + "grad_norm": 1.959771990776062, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.6986593008041382, + "num_tokens": 303527442.0, + "step": 12173 + }, + { + "epoch": 1.336920711618713, + "grad_norm": 2.235037326812744, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7210197448730469, + "num_tokens": 303554212.0, + "step": 12174 + }, + { + "epoch": 1.3370305293213267, + "grad_norm": 2.661046266555786, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7329033613204956, + "num_tokens": 303572710.0, + "step": 12175 + }, + { + "epoch": 1.3371403470239402, + "grad_norm": 2.2577219009399414, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7235460877418518, + "num_tokens": 303596596.0, + "step": 12176 + }, + { + "epoch": 1.337250164726554, + "grad_norm": 2.035076856613159, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7418298721313477, + "num_tokens": 303622358.0, + "step": 12177 + }, + { + "epoch": 1.3373599824291675, + "grad_norm": 2.342146635055542, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7289667725563049, + "num_tokens": 303646929.0, + "step": 12178 + }, + { + "epoch": 1.3374698001317813, + "grad_norm": 2.2460386753082275, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7132660150527954, + "num_tokens": 303671190.0, + "step": 12179 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 1.9774402379989624, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7092081308364868, + "num_tokens": 303703128.0, + "step": 12180 + }, + { + "epoch": 1.3376894355370086, + "grad_norm": 2.0879807472229004, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7343995571136475, + "num_tokens": 303730725.0, + "step": 12181 + }, + { + "epoch": 1.3377992532396221, + "grad_norm": 2.4950404167175293, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7159750461578369, + "num_tokens": 303751981.0, + "step": 12182 + }, + { + "epoch": 1.3379090709422359, + "grad_norm": 2.4448678493499756, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7328790426254272, + "num_tokens": 303774222.0, + "step": 12183 + }, + { + "epoch": 1.3380188886448496, + "grad_norm": 2.22145938873291, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7196879386901855, + "num_tokens": 303797933.0, + "step": 12184 + }, + { + "epoch": 1.3381287063474632, + "grad_norm": 2.1165921688079834, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7105100154876709, + "num_tokens": 303826296.0, + "step": 12185 + }, + { + "epoch": 1.338238524050077, + "grad_norm": 2.5270450115203857, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.719529390335083, + "num_tokens": 303846862.0, + "step": 12186 + }, + { + "epoch": 1.3383483417526905, + "grad_norm": 2.232032537460327, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7301343679428101, + "num_tokens": 303873088.0, + "step": 12187 + }, + { + "epoch": 1.3384581594553042, + "grad_norm": 2.0581626892089844, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7233045101165771, + "num_tokens": 303900099.0, + "step": 12188 + }, + { + "epoch": 1.338567977157918, + "grad_norm": 2.230280876159668, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7555570602416992, + "num_tokens": 303925312.0, + "step": 12189 + }, + { + "epoch": 1.3386777948605315, + "grad_norm": 2.2521073818206787, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7181086540222168, + "num_tokens": 303951058.0, + "step": 12190 + }, + { + "epoch": 1.338787612563145, + "grad_norm": 2.506343126296997, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7229344844818115, + "num_tokens": 303971536.0, + "step": 12191 + }, + { + "epoch": 1.3388974302657588, + "grad_norm": 2.1874661445617676, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7207448482513428, + "num_tokens": 303998161.0, + "step": 12192 + }, + { + "epoch": 1.3390072479683726, + "grad_norm": 2.007534980773926, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7271074056625366, + "num_tokens": 304029899.0, + "step": 12193 + }, + { + "epoch": 1.339117065670986, + "grad_norm": 2.460214614868164, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7272613048553467, + "num_tokens": 304051564.0, + "step": 12194 + }, + { + "epoch": 1.3392268833735999, + "grad_norm": 2.256222724914551, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7223471403121948, + "num_tokens": 304077369.0, + "step": 12195 + }, + { + "epoch": 1.3393367010762134, + "grad_norm": 2.3548741340637207, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.70633864402771, + "num_tokens": 304101067.0, + "step": 12196 + }, + { + "epoch": 1.3394465187788271, + "grad_norm": 2.347956418991089, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7342860102653503, + "num_tokens": 304125600.0, + "step": 12197 + }, + { + "epoch": 1.339556336481441, + "grad_norm": 2.684340238571167, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7369227409362793, + "num_tokens": 304143893.0, + "step": 12198 + }, + { + "epoch": 1.3396661541840544, + "grad_norm": 1.9430115222930908, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.687515377998352, + "num_tokens": 304176957.0, + "step": 12199 + }, + { + "epoch": 1.3397759718866682, + "grad_norm": 2.2769100666046143, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.713200569152832, + "num_tokens": 304202254.0, + "step": 12200 + }, + { + "epoch": 1.3398857895892817, + "grad_norm": 2.1262876987457275, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7196428775787354, + "num_tokens": 304229726.0, + "step": 12201 + }, + { + "epoch": 1.3399956072918955, + "grad_norm": 2.030482530593872, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7070666551589966, + "num_tokens": 304261886.0, + "step": 12202 + }, + { + "epoch": 1.3401054249945092, + "grad_norm": 2.620464324951172, + "learning_rate": 1e-06, + "loss": 0.7538, + "mean_token_accuracy": 0.7620713710784912, + "num_tokens": 304280961.0, + "step": 12203 + }, + { + "epoch": 1.3402152426971228, + "grad_norm": 2.319561719894409, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7199251651763916, + "num_tokens": 304306321.0, + "step": 12204 + }, + { + "epoch": 1.3403250603997363, + "grad_norm": 2.2892003059387207, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7126289010047913, + "num_tokens": 304331239.0, + "step": 12205 + }, + { + "epoch": 1.34043487810235, + "grad_norm": 2.599416732788086, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7360691428184509, + "num_tokens": 304351035.0, + "step": 12206 + }, + { + "epoch": 1.3405446958049638, + "grad_norm": 2.258565902709961, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7085242867469788, + "num_tokens": 304377484.0, + "step": 12207 + }, + { + "epoch": 1.3406545135075774, + "grad_norm": 2.2563576698303223, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7420926094055176, + "num_tokens": 304403638.0, + "step": 12208 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 2.4131741523742676, + "learning_rate": 1e-06, + "loss": 0.7912, + "mean_token_accuracy": 0.7520633339881897, + "num_tokens": 304425775.0, + "step": 12209 + }, + { + "epoch": 1.3408741489128047, + "grad_norm": 2.438986301422119, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7230288982391357, + "num_tokens": 304448181.0, + "step": 12210 + }, + { + "epoch": 1.3409839666154184, + "grad_norm": 2.0598256587982178, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7161178588867188, + "num_tokens": 304475711.0, + "step": 12211 + }, + { + "epoch": 1.3410937843180322, + "grad_norm": 2.3954122066497803, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7307470440864563, + "num_tokens": 304498913.0, + "step": 12212 + }, + { + "epoch": 1.3412036020206457, + "grad_norm": 2.351585865020752, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7389565706253052, + "num_tokens": 304521394.0, + "step": 12213 + }, + { + "epoch": 1.3413134197232595, + "grad_norm": 2.2965290546417236, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7453182339668274, + "num_tokens": 304545586.0, + "step": 12214 + }, + { + "epoch": 1.341423237425873, + "grad_norm": 2.0718963146209717, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.695827305316925, + "num_tokens": 304572943.0, + "step": 12215 + }, + { + "epoch": 1.3415330551284868, + "grad_norm": 2.349416732788086, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7196723818778992, + "num_tokens": 304596165.0, + "step": 12216 + }, + { + "epoch": 1.3416428728311005, + "grad_norm": 2.321990728378296, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.715644121170044, + "num_tokens": 304621877.0, + "step": 12217 + }, + { + "epoch": 1.341752690533714, + "grad_norm": 2.9069204330444336, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7297529578208923, + "num_tokens": 304638613.0, + "step": 12218 + }, + { + "epoch": 1.3418625082363276, + "grad_norm": 2.1017961502075195, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7256085872650146, + "num_tokens": 304666363.0, + "step": 12219 + }, + { + "epoch": 1.3419723259389413, + "grad_norm": 2.376511335372925, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.709121584892273, + "num_tokens": 304689515.0, + "step": 12220 + }, + { + "epoch": 1.342082143641555, + "grad_norm": 2.5893237590789795, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7244640588760376, + "num_tokens": 304710131.0, + "step": 12221 + }, + { + "epoch": 1.3421919613441686, + "grad_norm": 1.9330965280532837, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6924248933792114, + "num_tokens": 304742941.0, + "step": 12222 + }, + { + "epoch": 1.3423017790467824, + "grad_norm": 2.2488760948181152, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7429258823394775, + "num_tokens": 304767198.0, + "step": 12223 + }, + { + "epoch": 1.342411596749396, + "grad_norm": 2.145643472671509, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7309207320213318, + "num_tokens": 304792536.0, + "step": 12224 + }, + { + "epoch": 1.3425214144520097, + "grad_norm": 2.240410327911377, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7456134557723999, + "num_tokens": 304816779.0, + "step": 12225 + }, + { + "epoch": 1.3426312321546234, + "grad_norm": 2.597463607788086, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7140170931816101, + "num_tokens": 304839510.0, + "step": 12226 + }, + { + "epoch": 1.342741049857237, + "grad_norm": 2.302824020385742, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7174925208091736, + "num_tokens": 304864607.0, + "step": 12227 + }, + { + "epoch": 1.3428508675598507, + "grad_norm": 2.721966028213501, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7191131114959717, + "num_tokens": 304883494.0, + "step": 12228 + }, + { + "epoch": 1.3429606852624643, + "grad_norm": 2.273082733154297, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7229852676391602, + "num_tokens": 304912734.0, + "step": 12229 + }, + { + "epoch": 1.343070502965078, + "grad_norm": 2.325472354888916, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7241950035095215, + "num_tokens": 304935912.0, + "step": 12230 + }, + { + "epoch": 1.3431803206676916, + "grad_norm": 2.371431350708008, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7332164645195007, + "num_tokens": 304959514.0, + "step": 12231 + }, + { + "epoch": 1.3432901383703053, + "grad_norm": 2.4129841327667236, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7308306694030762, + "num_tokens": 304981741.0, + "step": 12232 + }, + { + "epoch": 1.3433999560729188, + "grad_norm": 2.1235580444335938, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7122889757156372, + "num_tokens": 305011769.0, + "step": 12233 + }, + { + "epoch": 1.3435097737755326, + "grad_norm": 2.1024327278137207, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7000424861907959, + "num_tokens": 305039363.0, + "step": 12234 + }, + { + "epoch": 1.3436195914781464, + "grad_norm": 1.9809755086898804, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.712993323802948, + "num_tokens": 305069802.0, + "step": 12235 + }, + { + "epoch": 1.34372940918076, + "grad_norm": 2.220245838165283, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7117182612419128, + "num_tokens": 305096143.0, + "step": 12236 + }, + { + "epoch": 1.3438392268833736, + "grad_norm": 2.387331247329712, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7292795777320862, + "num_tokens": 305119436.0, + "step": 12237 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 2.2684426307678223, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6973720192909241, + "num_tokens": 305147094.0, + "step": 12238 + }, + { + "epoch": 1.344058862288601, + "grad_norm": 1.9404137134552002, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.710631251335144, + "num_tokens": 305180985.0, + "step": 12239 + }, + { + "epoch": 1.3441686799912147, + "grad_norm": 2.265334129333496, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7198023796081543, + "num_tokens": 305208593.0, + "step": 12240 + }, + { + "epoch": 1.3442784976938282, + "grad_norm": 2.2937774658203125, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7033771276473999, + "num_tokens": 305232921.0, + "step": 12241 + }, + { + "epoch": 1.3443883153964418, + "grad_norm": 2.2490248680114746, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7099137902259827, + "num_tokens": 305257512.0, + "step": 12242 + }, + { + "epoch": 1.3444981330990555, + "grad_norm": 2.6060166358947754, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7502649426460266, + "num_tokens": 305276499.0, + "step": 12243 + }, + { + "epoch": 1.3446079508016693, + "grad_norm": 1.9953802824020386, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7013688087463379, + "num_tokens": 305309470.0, + "step": 12244 + }, + { + "epoch": 1.3447177685042828, + "grad_norm": 2.235543966293335, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7192682027816772, + "num_tokens": 305336091.0, + "step": 12245 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 2.4261209964752197, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7401180863380432, + "num_tokens": 305356851.0, + "step": 12246 + }, + { + "epoch": 1.34493740390951, + "grad_norm": 2.3466997146606445, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7174031734466553, + "num_tokens": 305381691.0, + "step": 12247 + }, + { + "epoch": 1.3450472216121239, + "grad_norm": 2.4030210971832275, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6993204951286316, + "num_tokens": 305407416.0, + "step": 12248 + }, + { + "epoch": 1.3451570393147376, + "grad_norm": 2.0987963676452637, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6860228180885315, + "num_tokens": 305438082.0, + "step": 12249 + }, + { + "epoch": 1.3452668570173512, + "grad_norm": 2.114356279373169, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7037146091461182, + "num_tokens": 305469037.0, + "step": 12250 + }, + { + "epoch": 1.345376674719965, + "grad_norm": 2.6600241661071777, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7414589524269104, + "num_tokens": 305489798.0, + "step": 12251 + }, + { + "epoch": 1.3454864924225785, + "grad_norm": 2.438786268234253, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7183303236961365, + "num_tokens": 305514619.0, + "step": 12252 + }, + { + "epoch": 1.3455963101251922, + "grad_norm": 2.1524405479431152, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7235802412033081, + "num_tokens": 305541287.0, + "step": 12253 + }, + { + "epoch": 1.345706127827806, + "grad_norm": 2.395627737045288, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7134737372398376, + "num_tokens": 305563465.0, + "step": 12254 + }, + { + "epoch": 1.3458159455304195, + "grad_norm": 2.4893651008605957, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7415851950645447, + "num_tokens": 305584407.0, + "step": 12255 + }, + { + "epoch": 1.345925763233033, + "grad_norm": 2.366389036178589, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.731103777885437, + "num_tokens": 305607656.0, + "step": 12256 + }, + { + "epoch": 1.3460355809356468, + "grad_norm": 2.3382201194763184, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7175241708755493, + "num_tokens": 305631169.0, + "step": 12257 + }, + { + "epoch": 1.3461453986382605, + "grad_norm": 2.2464325428009033, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7152544260025024, + "num_tokens": 305659821.0, + "step": 12258 + }, + { + "epoch": 1.346255216340874, + "grad_norm": 2.3890492916107178, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7504495978355408, + "num_tokens": 305681671.0, + "step": 12259 + }, + { + "epoch": 1.3463650340434878, + "grad_norm": 2.3410604000091553, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7377146482467651, + "num_tokens": 305705479.0, + "step": 12260 + }, + { + "epoch": 1.3464748517461014, + "grad_norm": 2.6496262550354004, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7317575216293335, + "num_tokens": 305725842.0, + "step": 12261 + }, + { + "epoch": 1.3465846694487151, + "grad_norm": 2.318803071975708, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7204000353813171, + "num_tokens": 305750222.0, + "step": 12262 + }, + { + "epoch": 1.346694487151329, + "grad_norm": 2.0357749462127686, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7073231935501099, + "num_tokens": 305780262.0, + "step": 12263 + }, + { + "epoch": 1.3468043048539424, + "grad_norm": 2.2998294830322266, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7087267637252808, + "num_tokens": 305806897.0, + "step": 12264 + }, + { + "epoch": 1.3469141225565562, + "grad_norm": 2.1408653259277344, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7267124652862549, + "num_tokens": 305835293.0, + "step": 12265 + }, + { + "epoch": 1.3470239402591697, + "grad_norm": 2.323244571685791, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7233827114105225, + "num_tokens": 305859030.0, + "step": 12266 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 2.1373353004455566, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6939787864685059, + "num_tokens": 305888644.0, + "step": 12267 + }, + { + "epoch": 1.3472435756643972, + "grad_norm": 2.250811815261841, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6941940784454346, + "num_tokens": 305915171.0, + "step": 12268 + }, + { + "epoch": 1.3473533933670108, + "grad_norm": 2.615872621536255, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.6950318813323975, + "num_tokens": 305934673.0, + "step": 12269 + }, + { + "epoch": 1.3474632110696243, + "grad_norm": 2.211984395980835, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7021082639694214, + "num_tokens": 305960891.0, + "step": 12270 + }, + { + "epoch": 1.347573028772238, + "grad_norm": 1.9183610677719116, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.738571047782898, + "num_tokens": 305992466.0, + "step": 12271 + }, + { + "epoch": 1.3476828464748518, + "grad_norm": 2.3203518390655518, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7201027870178223, + "num_tokens": 306018079.0, + "step": 12272 + }, + { + "epoch": 1.3477926641774653, + "grad_norm": 2.2258968353271484, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.6989683508872986, + "num_tokens": 306043851.0, + "step": 12273 + }, + { + "epoch": 1.347902481880079, + "grad_norm": 2.348719358444214, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.702639102935791, + "num_tokens": 306067894.0, + "step": 12274 + }, + { + "epoch": 1.3480122995826926, + "grad_norm": 2.150057315826416, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7148714065551758, + "num_tokens": 306095825.0, + "step": 12275 + }, + { + "epoch": 1.3481221172853064, + "grad_norm": 2.103969097137451, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7440043687820435, + "num_tokens": 306122464.0, + "step": 12276 + }, + { + "epoch": 1.3482319349879202, + "grad_norm": 2.3056395053863525, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7193540334701538, + "num_tokens": 306147007.0, + "step": 12277 + }, + { + "epoch": 1.3483417526905337, + "grad_norm": 2.171804666519165, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7027011513710022, + "num_tokens": 306176920.0, + "step": 12278 + }, + { + "epoch": 1.3484515703931474, + "grad_norm": 2.0920443534851074, + "learning_rate": 1e-06, + "loss": 1.0507, + "mean_token_accuracy": 0.6950401067733765, + "num_tokens": 306208840.0, + "step": 12279 + }, + { + "epoch": 1.348561388095761, + "grad_norm": 2.0831961631774902, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6982432007789612, + "num_tokens": 306239678.0, + "step": 12280 + }, + { + "epoch": 1.3486712057983747, + "grad_norm": 2.4650113582611084, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.727843701839447, + "num_tokens": 306260341.0, + "step": 12281 + }, + { + "epoch": 1.3487810235009885, + "grad_norm": 2.1877009868621826, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7198886871337891, + "num_tokens": 306286335.0, + "step": 12282 + }, + { + "epoch": 1.348890841203602, + "grad_norm": 2.0406205654144287, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7371193170547485, + "num_tokens": 306315273.0, + "step": 12283 + }, + { + "epoch": 1.3490006589062156, + "grad_norm": 2.3067710399627686, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7212803959846497, + "num_tokens": 306338486.0, + "step": 12284 + }, + { + "epoch": 1.3491104766088293, + "grad_norm": 2.0911872386932373, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7051544189453125, + "num_tokens": 306367083.0, + "step": 12285 + }, + { + "epoch": 1.349220294311443, + "grad_norm": 2.33076810836792, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7059431076049805, + "num_tokens": 306392067.0, + "step": 12286 + }, + { + "epoch": 1.3493301120140566, + "grad_norm": 2.4244918823242188, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7352586984634399, + "num_tokens": 306414204.0, + "step": 12287 + }, + { + "epoch": 1.3494399297166704, + "grad_norm": 2.427766799926758, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.743553638458252, + "num_tokens": 306434935.0, + "step": 12288 + }, + { + "epoch": 1.349549747419284, + "grad_norm": 2.306641101837158, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7375155687332153, + "num_tokens": 306457095.0, + "step": 12289 + }, + { + "epoch": 1.3496595651218977, + "grad_norm": 2.1737518310546875, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7090847492218018, + "num_tokens": 306482959.0, + "step": 12290 + }, + { + "epoch": 1.3497693828245114, + "grad_norm": 2.3743896484375, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7225481271743774, + "num_tokens": 306506729.0, + "step": 12291 + }, + { + "epoch": 1.349879200527125, + "grad_norm": 2.088585138320923, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7199669480323792, + "num_tokens": 306534378.0, + "step": 12292 + }, + { + "epoch": 1.3499890182297387, + "grad_norm": 2.3823554515838623, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7117947340011597, + "num_tokens": 306558071.0, + "step": 12293 + }, + { + "epoch": 1.3500988359323522, + "grad_norm": 2.565934181213379, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7209593057632446, + "num_tokens": 306578702.0, + "step": 12294 + }, + { + "epoch": 1.350208653634966, + "grad_norm": 2.5364363193511963, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7308598756790161, + "num_tokens": 306598453.0, + "step": 12295 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 2.258235454559326, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7388288974761963, + "num_tokens": 306625127.0, + "step": 12296 + }, + { + "epoch": 1.3504282890401933, + "grad_norm": 2.3200948238372803, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7232224345207214, + "num_tokens": 306651114.0, + "step": 12297 + }, + { + "epoch": 1.3505381067428068, + "grad_norm": 1.9421292543411255, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7235814332962036, + "num_tokens": 306683700.0, + "step": 12298 + }, + { + "epoch": 1.3506479244454206, + "grad_norm": 2.1341848373413086, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7048660516738892, + "num_tokens": 306710090.0, + "step": 12299 + }, + { + "epoch": 1.3507577421480343, + "grad_norm": 2.311811685562134, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7376061677932739, + "num_tokens": 306733073.0, + "step": 12300 + }, + { + "epoch": 1.3508675598506479, + "grad_norm": 2.127781867980957, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6902725696563721, + "num_tokens": 306761663.0, + "step": 12301 + }, + { + "epoch": 1.3509773775532616, + "grad_norm": 2.6658971309661865, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7269750833511353, + "num_tokens": 306781453.0, + "step": 12302 + }, + { + "epoch": 1.3510871952558752, + "grad_norm": 2.2646358013153076, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7165775299072266, + "num_tokens": 306807800.0, + "step": 12303 + }, + { + "epoch": 1.351197012958489, + "grad_norm": 2.4371280670166016, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7246428728103638, + "num_tokens": 306830164.0, + "step": 12304 + }, + { + "epoch": 1.3513068306611027, + "grad_norm": 2.3814306259155273, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7146695256233215, + "num_tokens": 306853111.0, + "step": 12305 + }, + { + "epoch": 1.3514166483637162, + "grad_norm": 2.8102502822875977, + "learning_rate": 1e-06, + "loss": 0.7404, + "mean_token_accuracy": 0.7605761289596558, + "num_tokens": 306869793.0, + "step": 12306 + }, + { + "epoch": 1.3515264660663298, + "grad_norm": 2.2471683025360107, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6883963346481323, + "num_tokens": 306896172.0, + "step": 12307 + }, + { + "epoch": 1.3516362837689435, + "grad_norm": 2.5219740867614746, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.732788622379303, + "num_tokens": 306916345.0, + "step": 12308 + }, + { + "epoch": 1.3517461014715573, + "grad_norm": 2.3376123905181885, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7113289833068848, + "num_tokens": 306940961.0, + "step": 12309 + }, + { + "epoch": 1.3518559191741708, + "grad_norm": 2.483407497406006, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7131090760231018, + "num_tokens": 306963168.0, + "step": 12310 + }, + { + "epoch": 1.3519657368767846, + "grad_norm": 2.3339502811431885, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7257471680641174, + "num_tokens": 306988267.0, + "step": 12311 + }, + { + "epoch": 1.352075554579398, + "grad_norm": 2.3229713439941406, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7226086258888245, + "num_tokens": 307010947.0, + "step": 12312 + }, + { + "epoch": 1.3521853722820119, + "grad_norm": 2.225546360015869, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7177503108978271, + "num_tokens": 307036248.0, + "step": 12313 + }, + { + "epoch": 1.3522951899846256, + "grad_norm": 2.094578266143799, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.730094313621521, + "num_tokens": 307064645.0, + "step": 12314 + }, + { + "epoch": 1.3524050076872391, + "grad_norm": 2.5407674312591553, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.725350022315979, + "num_tokens": 307087370.0, + "step": 12315 + }, + { + "epoch": 1.352514825389853, + "grad_norm": 2.2719151973724365, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7369409203529358, + "num_tokens": 307114976.0, + "step": 12316 + }, + { + "epoch": 1.3526246430924664, + "grad_norm": 2.2432920932769775, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7223251461982727, + "num_tokens": 307141018.0, + "step": 12317 + }, + { + "epoch": 1.3527344607950802, + "grad_norm": 2.4080469608306885, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7273104786872864, + "num_tokens": 307163788.0, + "step": 12318 + }, + { + "epoch": 1.352844278497694, + "grad_norm": 2.277214765548706, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7128139138221741, + "num_tokens": 307188925.0, + "step": 12319 + }, + { + "epoch": 1.3529540962003075, + "grad_norm": 2.272852659225464, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7144824266433716, + "num_tokens": 307213898.0, + "step": 12320 + }, + { + "epoch": 1.353063913902921, + "grad_norm": 2.2842068672180176, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.725559651851654, + "num_tokens": 307237823.0, + "step": 12321 + }, + { + "epoch": 1.3531737316055348, + "grad_norm": 2.0606093406677246, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7217311859130859, + "num_tokens": 307267932.0, + "step": 12322 + }, + { + "epoch": 1.3532835493081485, + "grad_norm": 2.3422110080718994, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7395315170288086, + "num_tokens": 307290717.0, + "step": 12323 + }, + { + "epoch": 1.353393367010762, + "grad_norm": 2.316895008087158, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7311040163040161, + "num_tokens": 307313671.0, + "step": 12324 + }, + { + "epoch": 1.3535031847133758, + "grad_norm": 2.1791093349456787, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7114325165748596, + "num_tokens": 307338778.0, + "step": 12325 + }, + { + "epoch": 1.3536130024159894, + "grad_norm": 2.5487987995147705, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7196000814437866, + "num_tokens": 307360935.0, + "step": 12326 + }, + { + "epoch": 1.3537228201186031, + "grad_norm": 2.2855851650238037, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7273042798042297, + "num_tokens": 307384765.0, + "step": 12327 + }, + { + "epoch": 1.3538326378212169, + "grad_norm": 2.298910140991211, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7142113447189331, + "num_tokens": 307408990.0, + "step": 12328 + }, + { + "epoch": 1.3539424555238304, + "grad_norm": 2.2340610027313232, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7210924625396729, + "num_tokens": 307435129.0, + "step": 12329 + }, + { + "epoch": 1.3540522732264442, + "grad_norm": 2.220311164855957, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7288179397583008, + "num_tokens": 307460654.0, + "step": 12330 + }, + { + "epoch": 1.3541620909290577, + "grad_norm": 2.161506175994873, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7143768668174744, + "num_tokens": 307488759.0, + "step": 12331 + }, + { + "epoch": 1.3542719086316715, + "grad_norm": 2.3751468658447266, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7187303304672241, + "num_tokens": 307512776.0, + "step": 12332 + }, + { + "epoch": 1.3543817263342852, + "grad_norm": 2.341038465499878, + "learning_rate": 1e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7430205345153809, + "num_tokens": 307535454.0, + "step": 12333 + }, + { + "epoch": 1.3544915440368988, + "grad_norm": 2.3435702323913574, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7183665037155151, + "num_tokens": 307564092.0, + "step": 12334 + }, + { + "epoch": 1.3546013617395123, + "grad_norm": 2.1437060832977295, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7022128105163574, + "num_tokens": 307592184.0, + "step": 12335 + }, + { + "epoch": 1.354711179442126, + "grad_norm": 2.816176414489746, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7386085987091064, + "num_tokens": 307609870.0, + "step": 12336 + }, + { + "epoch": 1.3548209971447398, + "grad_norm": 2.2793407440185547, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7096250057220459, + "num_tokens": 307636299.0, + "step": 12337 + }, + { + "epoch": 1.3549308148473533, + "grad_norm": 2.544762372970581, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7191821336746216, + "num_tokens": 307656740.0, + "step": 12338 + }, + { + "epoch": 1.355040632549967, + "grad_norm": 2.0648727416992188, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.73136305809021, + "num_tokens": 307686151.0, + "step": 12339 + }, + { + "epoch": 1.3551504502525806, + "grad_norm": 2.424682855606079, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7169821262359619, + "num_tokens": 307711124.0, + "step": 12340 + }, + { + "epoch": 1.3552602679551944, + "grad_norm": 2.1761999130249023, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7042657136917114, + "num_tokens": 307737182.0, + "step": 12341 + }, + { + "epoch": 1.3553700856578081, + "grad_norm": 2.5656278133392334, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7325602769851685, + "num_tokens": 307755934.0, + "step": 12342 + }, + { + "epoch": 1.3554799033604217, + "grad_norm": 2.201906442642212, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7316504120826721, + "num_tokens": 307782022.0, + "step": 12343 + }, + { + "epoch": 1.3555897210630354, + "grad_norm": 2.055351734161377, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7156695127487183, + "num_tokens": 307812086.0, + "step": 12344 + }, + { + "epoch": 1.355699538765649, + "grad_norm": 2.3980205059051514, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7326159477233887, + "num_tokens": 307834164.0, + "step": 12345 + }, + { + "epoch": 1.3558093564682627, + "grad_norm": 2.395777463912964, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7242882251739502, + "num_tokens": 307856592.0, + "step": 12346 + }, + { + "epoch": 1.3559191741708763, + "grad_norm": 2.6386327743530273, + "learning_rate": 1e-06, + "loss": 0.8027, + "mean_token_accuracy": 0.7429941892623901, + "num_tokens": 307874857.0, + "step": 12347 + }, + { + "epoch": 1.35602899187349, + "grad_norm": 2.314337968826294, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7264969348907471, + "num_tokens": 307898682.0, + "step": 12348 + }, + { + "epoch": 1.3561388095761036, + "grad_norm": 2.344282388687134, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7201570272445679, + "num_tokens": 307922765.0, + "step": 12349 + }, + { + "epoch": 1.3562486272787173, + "grad_norm": 2.257781982421875, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7328858375549316, + "num_tokens": 307949299.0, + "step": 12350 + }, + { + "epoch": 1.356358444981331, + "grad_norm": 2.1445188522338867, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7120460867881775, + "num_tokens": 307976626.0, + "step": 12351 + }, + { + "epoch": 1.3564682626839446, + "grad_norm": 2.3995959758758545, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7666729092597961, + "num_tokens": 307997132.0, + "step": 12352 + }, + { + "epoch": 1.3565780803865584, + "grad_norm": 2.11690092086792, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.6974102854728699, + "num_tokens": 308026537.0, + "step": 12353 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 2.0821480751037598, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7171902656555176, + "num_tokens": 308055276.0, + "step": 12354 + }, + { + "epoch": 1.3567977157917857, + "grad_norm": 2.2051479816436768, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7357972264289856, + "num_tokens": 308081138.0, + "step": 12355 + }, + { + "epoch": 1.3569075334943994, + "grad_norm": 2.5660576820373535, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7207372784614563, + "num_tokens": 308101457.0, + "step": 12356 + }, + { + "epoch": 1.357017351197013, + "grad_norm": 2.434723377227783, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7230564951896667, + "num_tokens": 308125300.0, + "step": 12357 + }, + { + "epoch": 1.3571271688996267, + "grad_norm": 2.216696262359619, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7301031351089478, + "num_tokens": 308151928.0, + "step": 12358 + }, + { + "epoch": 1.3572369866022402, + "grad_norm": 2.5639498233795166, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.744043231010437, + "num_tokens": 308173235.0, + "step": 12359 + }, + { + "epoch": 1.357346804304854, + "grad_norm": 2.0698580741882324, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.694586992263794, + "num_tokens": 308201435.0, + "step": 12360 + }, + { + "epoch": 1.3574566220074675, + "grad_norm": 2.333608627319336, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.716080904006958, + "num_tokens": 308225130.0, + "step": 12361 + }, + { + "epoch": 1.3575664397100813, + "grad_norm": 2.3209238052368164, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7151390910148621, + "num_tokens": 308251932.0, + "step": 12362 + }, + { + "epoch": 1.3576762574126948, + "grad_norm": 2.450434684753418, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7504634857177734, + "num_tokens": 308273571.0, + "step": 12363 + }, + { + "epoch": 1.3577860751153086, + "grad_norm": 2.410705327987671, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7276555299758911, + "num_tokens": 308294779.0, + "step": 12364 + }, + { + "epoch": 1.3578958928179223, + "grad_norm": 2.194516181945801, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7525961399078369, + "num_tokens": 308319895.0, + "step": 12365 + }, + { + "epoch": 1.3580057105205359, + "grad_norm": 2.6171298027038574, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7206730842590332, + "num_tokens": 308340607.0, + "step": 12366 + }, + { + "epoch": 1.3581155282231496, + "grad_norm": 2.2764222621917725, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.720234751701355, + "num_tokens": 308367565.0, + "step": 12367 + }, + { + "epoch": 1.3582253459257632, + "grad_norm": 2.458498954772949, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7436189651489258, + "num_tokens": 308387979.0, + "step": 12368 + }, + { + "epoch": 1.358335163628377, + "grad_norm": 2.5799434185028076, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7064669132232666, + "num_tokens": 308409834.0, + "step": 12369 + }, + { + "epoch": 1.3584449813309907, + "grad_norm": 2.1047160625457764, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6989427804946899, + "num_tokens": 308439819.0, + "step": 12370 + }, + { + "epoch": 1.3585547990336042, + "grad_norm": 2.1175358295440674, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7140761613845825, + "num_tokens": 308468655.0, + "step": 12371 + }, + { + "epoch": 1.3586646167362177, + "grad_norm": 2.845402479171753, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7153503894805908, + "num_tokens": 308488793.0, + "step": 12372 + }, + { + "epoch": 1.3587744344388315, + "grad_norm": 2.2657313346862793, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6970025300979614, + "num_tokens": 308515036.0, + "step": 12373 + }, + { + "epoch": 1.3588842521414453, + "grad_norm": 2.17777156829834, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7044649720191956, + "num_tokens": 308540102.0, + "step": 12374 + }, + { + "epoch": 1.3589940698440588, + "grad_norm": 2.3486366271972656, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7358514070510864, + "num_tokens": 308563302.0, + "step": 12375 + }, + { + "epoch": 1.3591038875466726, + "grad_norm": 2.280937671661377, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7291508913040161, + "num_tokens": 308588842.0, + "step": 12376 + }, + { + "epoch": 1.359213705249286, + "grad_norm": 2.234257221221924, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7272233366966248, + "num_tokens": 308612320.0, + "step": 12377 + }, + { + "epoch": 1.3593235229518998, + "grad_norm": 2.4027938842773438, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7292284965515137, + "num_tokens": 308635835.0, + "step": 12378 + }, + { + "epoch": 1.3594333406545136, + "grad_norm": 2.2951035499572754, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7442008256912231, + "num_tokens": 308658967.0, + "step": 12379 + }, + { + "epoch": 1.3595431583571271, + "grad_norm": 2.541889190673828, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7155715823173523, + "num_tokens": 308680322.0, + "step": 12380 + }, + { + "epoch": 1.359652976059741, + "grad_norm": 2.180938243865967, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7235857844352722, + "num_tokens": 308708066.0, + "step": 12381 + }, + { + "epoch": 1.3597627937623544, + "grad_norm": 2.6504099369049072, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7220073342323303, + "num_tokens": 308727312.0, + "step": 12382 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 2.5460617542266846, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.699449360370636, + "num_tokens": 308751044.0, + "step": 12383 + }, + { + "epoch": 1.359982429167582, + "grad_norm": 2.4608912467956543, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7088175415992737, + "num_tokens": 308774549.0, + "step": 12384 + }, + { + "epoch": 1.3600922468701955, + "grad_norm": 2.365774393081665, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7061666250228882, + "num_tokens": 308799522.0, + "step": 12385 + }, + { + "epoch": 1.360202064572809, + "grad_norm": 2.2242445945739746, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.695858895778656, + "num_tokens": 308828131.0, + "step": 12386 + }, + { + "epoch": 1.3603118822754228, + "grad_norm": 2.388267993927002, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7394275069236755, + "num_tokens": 308851422.0, + "step": 12387 + }, + { + "epoch": 1.3604216999780365, + "grad_norm": 2.0946767330169678, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7108132839202881, + "num_tokens": 308881029.0, + "step": 12388 + }, + { + "epoch": 1.36053151768065, + "grad_norm": 2.420945167541504, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.7532688975334167, + "num_tokens": 308902533.0, + "step": 12389 + }, + { + "epoch": 1.3606413353832638, + "grad_norm": 1.9835644960403442, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7451814413070679, + "num_tokens": 308933757.0, + "step": 12390 + }, + { + "epoch": 1.3607511530858774, + "grad_norm": 2.374581813812256, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7159205675125122, + "num_tokens": 308956165.0, + "step": 12391 + }, + { + "epoch": 1.360860970788491, + "grad_norm": 2.1966140270233154, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6956733465194702, + "num_tokens": 308982559.0, + "step": 12392 + }, + { + "epoch": 1.3609707884911049, + "grad_norm": 2.216508626937866, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.708065927028656, + "num_tokens": 309009185.0, + "step": 12393 + }, + { + "epoch": 1.3610806061937184, + "grad_norm": 2.483839273452759, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7398195266723633, + "num_tokens": 309030249.0, + "step": 12394 + }, + { + "epoch": 1.3611904238963322, + "grad_norm": 2.2217752933502197, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7399744987487793, + "num_tokens": 309055315.0, + "step": 12395 + }, + { + "epoch": 1.3613002415989457, + "grad_norm": 2.4173505306243896, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7292510867118835, + "num_tokens": 309078542.0, + "step": 12396 + }, + { + "epoch": 1.3614100593015594, + "grad_norm": 2.231968402862549, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7052788138389587, + "num_tokens": 309104789.0, + "step": 12397 + }, + { + "epoch": 1.3615198770041732, + "grad_norm": 2.074087142944336, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7075110673904419, + "num_tokens": 309135411.0, + "step": 12398 + }, + { + "epoch": 1.3616296947067867, + "grad_norm": 2.1649105548858643, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6938678026199341, + "num_tokens": 309164974.0, + "step": 12399 + }, + { + "epoch": 1.3617395124094003, + "grad_norm": 2.213376045227051, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.713095486164093, + "num_tokens": 309192543.0, + "step": 12400 + }, + { + "epoch": 1.361849330112014, + "grad_norm": 2.5904407501220703, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7150653600692749, + "num_tokens": 309212799.0, + "step": 12401 + }, + { + "epoch": 1.3619591478146278, + "grad_norm": 2.2832581996917725, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7261078953742981, + "num_tokens": 309236108.0, + "step": 12402 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 2.4510762691497803, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7049422860145569, + "num_tokens": 309258775.0, + "step": 12403 + }, + { + "epoch": 1.362178783219855, + "grad_norm": 2.2343852519989014, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7214308977127075, + "num_tokens": 309285793.0, + "step": 12404 + }, + { + "epoch": 1.3622886009224686, + "grad_norm": 2.2386679649353027, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6949217915534973, + "num_tokens": 309314441.0, + "step": 12405 + }, + { + "epoch": 1.3623984186250824, + "grad_norm": 2.35512638092041, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7480496168136597, + "num_tokens": 309337188.0, + "step": 12406 + }, + { + "epoch": 1.3625082363276961, + "grad_norm": 2.6756982803344727, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7281293869018555, + "num_tokens": 309356829.0, + "step": 12407 + }, + { + "epoch": 1.3626180540303097, + "grad_norm": 2.4933464527130127, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7184910774230957, + "num_tokens": 309379605.0, + "step": 12408 + }, + { + "epoch": 1.3627278717329234, + "grad_norm": 2.15523362159729, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7270444631576538, + "num_tokens": 309407422.0, + "step": 12409 + }, + { + "epoch": 1.362837689435537, + "grad_norm": 2.2920620441436768, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7227952480316162, + "num_tokens": 309435395.0, + "step": 12410 + }, + { + "epoch": 1.3629475071381507, + "grad_norm": 2.3852734565734863, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7346357703208923, + "num_tokens": 309459236.0, + "step": 12411 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 2.3201143741607666, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7106373906135559, + "num_tokens": 309483069.0, + "step": 12412 + }, + { + "epoch": 1.363167142543378, + "grad_norm": 2.3704774379730225, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7285549640655518, + "num_tokens": 309504621.0, + "step": 12413 + }, + { + "epoch": 1.3632769602459915, + "grad_norm": 2.3522861003875732, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7337327599525452, + "num_tokens": 309526754.0, + "step": 12414 + }, + { + "epoch": 1.3633867779486053, + "grad_norm": 2.269691228866577, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7103486061096191, + "num_tokens": 309551801.0, + "step": 12415 + }, + { + "epoch": 1.363496595651219, + "grad_norm": 2.2540149688720703, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.70066899061203, + "num_tokens": 309579586.0, + "step": 12416 + }, + { + "epoch": 1.3636064133538326, + "grad_norm": 2.260051727294922, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7223783731460571, + "num_tokens": 309603807.0, + "step": 12417 + }, + { + "epoch": 1.3637162310564463, + "grad_norm": 1.9596071243286133, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.681154727935791, + "num_tokens": 309637786.0, + "step": 12418 + }, + { + "epoch": 1.3638260487590599, + "grad_norm": 2.032716751098633, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7117595672607422, + "num_tokens": 309668620.0, + "step": 12419 + }, + { + "epoch": 1.3639358664616736, + "grad_norm": 2.24021315574646, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7285206317901611, + "num_tokens": 309694985.0, + "step": 12420 + }, + { + "epoch": 1.3640456841642874, + "grad_norm": 2.1754047870635986, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7230654954910278, + "num_tokens": 309722316.0, + "step": 12421 + }, + { + "epoch": 1.364155501866901, + "grad_norm": 2.0305051803588867, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7089849710464478, + "num_tokens": 309753845.0, + "step": 12422 + }, + { + "epoch": 1.3642653195695145, + "grad_norm": 2.4151737689971924, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7208558320999146, + "num_tokens": 309776500.0, + "step": 12423 + }, + { + "epoch": 1.3643751372721282, + "grad_norm": 2.1614935398101807, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7149100303649902, + "num_tokens": 309803563.0, + "step": 12424 + }, + { + "epoch": 1.364484954974742, + "grad_norm": 2.739813804626465, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7249354720115662, + "num_tokens": 309821874.0, + "step": 12425 + }, + { + "epoch": 1.3645947726773555, + "grad_norm": 2.33410382270813, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.6963266134262085, + "num_tokens": 309848182.0, + "step": 12426 + }, + { + "epoch": 1.3647045903799693, + "grad_norm": 2.5350074768066406, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7103531956672668, + "num_tokens": 309871365.0, + "step": 12427 + }, + { + "epoch": 1.3648144080825828, + "grad_norm": 2.194014310836792, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6952207088470459, + "num_tokens": 309901544.0, + "step": 12428 + }, + { + "epoch": 1.3649242257851966, + "grad_norm": 2.4944257736206055, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7269235253334045, + "num_tokens": 309923633.0, + "step": 12429 + }, + { + "epoch": 1.3650340434878103, + "grad_norm": 2.2428009510040283, + "learning_rate": 1e-06, + "loss": 0.75, + "mean_token_accuracy": 0.7588733434677124, + "num_tokens": 309946287.0, + "step": 12430 + }, + { + "epoch": 1.3651438611904239, + "grad_norm": 2.0055487155914307, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7079951167106628, + "num_tokens": 309976710.0, + "step": 12431 + }, + { + "epoch": 1.3652536788930376, + "grad_norm": 2.3001017570495605, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7138586640357971, + "num_tokens": 309999489.0, + "step": 12432 + }, + { + "epoch": 1.3653634965956511, + "grad_norm": 2.2629663944244385, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7299399375915527, + "num_tokens": 310022588.0, + "step": 12433 + }, + { + "epoch": 1.365473314298265, + "grad_norm": 2.347181558609009, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7235622406005859, + "num_tokens": 310045327.0, + "step": 12434 + }, + { + "epoch": 1.3655831320008787, + "grad_norm": 2.317348003387451, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7056607007980347, + "num_tokens": 310070806.0, + "step": 12435 + }, + { + "epoch": 1.3656929497034922, + "grad_norm": 2.2289867401123047, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7170808911323547, + "num_tokens": 310094968.0, + "step": 12436 + }, + { + "epoch": 1.3658027674061057, + "grad_norm": 2.337479591369629, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6974896788597107, + "num_tokens": 310119791.0, + "step": 12437 + }, + { + "epoch": 1.3659125851087195, + "grad_norm": 2.3898093700408936, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7303045988082886, + "num_tokens": 310142743.0, + "step": 12438 + }, + { + "epoch": 1.3660224028113332, + "grad_norm": 2.5250236988067627, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.722516655921936, + "num_tokens": 310164338.0, + "step": 12439 + }, + { + "epoch": 1.3661322205139468, + "grad_norm": 2.349851369857788, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7095904350280762, + "num_tokens": 310189701.0, + "step": 12440 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 2.402184009552002, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.713970959186554, + "num_tokens": 310213968.0, + "step": 12441 + }, + { + "epoch": 1.366351855919174, + "grad_norm": 2.208160877227783, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6973164677619934, + "num_tokens": 310240300.0, + "step": 12442 + }, + { + "epoch": 1.3664616736217878, + "grad_norm": 2.429366111755371, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7444021105766296, + "num_tokens": 310261028.0, + "step": 12443 + }, + { + "epoch": 1.3665714913244016, + "grad_norm": 2.5895886421203613, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7230614423751831, + "num_tokens": 310280712.0, + "step": 12444 + }, + { + "epoch": 1.3666813090270151, + "grad_norm": 2.5460431575775146, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7379854917526245, + "num_tokens": 310300144.0, + "step": 12445 + }, + { + "epoch": 1.3667911267296289, + "grad_norm": 2.271254777908325, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7214335203170776, + "num_tokens": 310325217.0, + "step": 12446 + }, + { + "epoch": 1.3669009444322424, + "grad_norm": 2.7208316326141357, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7115381956100464, + "num_tokens": 310344602.0, + "step": 12447 + }, + { + "epoch": 1.3670107621348562, + "grad_norm": 2.659470558166504, + "learning_rate": 1e-06, + "loss": 0.7499, + "mean_token_accuracy": 0.756647527217865, + "num_tokens": 310363436.0, + "step": 12448 + }, + { + "epoch": 1.36712057983747, + "grad_norm": 2.3020761013031006, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7307696342468262, + "num_tokens": 310386559.0, + "step": 12449 + }, + { + "epoch": 1.3672303975400835, + "grad_norm": 2.334240436553955, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7329014539718628, + "num_tokens": 310407971.0, + "step": 12450 + }, + { + "epoch": 1.367340215242697, + "grad_norm": 2.3844809532165527, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7247283458709717, + "num_tokens": 310430597.0, + "step": 12451 + }, + { + "epoch": 1.3674500329453108, + "grad_norm": 2.4529294967651367, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7236582636833191, + "num_tokens": 310452681.0, + "step": 12452 + }, + { + "epoch": 1.3675598506479245, + "grad_norm": 2.1765024662017822, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7062674760818481, + "num_tokens": 310480291.0, + "step": 12453 + }, + { + "epoch": 1.367669668350538, + "grad_norm": 2.1875948905944824, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.731318473815918, + "num_tokens": 310507416.0, + "step": 12454 + }, + { + "epoch": 1.3677794860531518, + "grad_norm": 2.3934340476989746, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7140849828720093, + "num_tokens": 310530279.0, + "step": 12455 + }, + { + "epoch": 1.3678893037557653, + "grad_norm": 2.1706831455230713, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6933767795562744, + "num_tokens": 310560831.0, + "step": 12456 + }, + { + "epoch": 1.367999121458379, + "grad_norm": 2.0344696044921875, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7135530710220337, + "num_tokens": 310591061.0, + "step": 12457 + }, + { + "epoch": 1.3681089391609929, + "grad_norm": 2.001840829849243, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7300295829772949, + "num_tokens": 310620609.0, + "step": 12458 + }, + { + "epoch": 1.3682187568636064, + "grad_norm": 2.294247627258301, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7090712785720825, + "num_tokens": 310644390.0, + "step": 12459 + }, + { + "epoch": 1.3683285745662201, + "grad_norm": 2.3519744873046875, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7349071502685547, + "num_tokens": 310668136.0, + "step": 12460 + }, + { + "epoch": 1.3684383922688337, + "grad_norm": 2.395723581314087, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6967027187347412, + "num_tokens": 310691103.0, + "step": 12461 + }, + { + "epoch": 1.3685482099714474, + "grad_norm": 2.4567112922668457, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7145765423774719, + "num_tokens": 310714424.0, + "step": 12462 + }, + { + "epoch": 1.3686580276740612, + "grad_norm": 2.0979702472686768, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7244449853897095, + "num_tokens": 310741084.0, + "step": 12463 + }, + { + "epoch": 1.3687678453766747, + "grad_norm": 2.1687889099121094, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7209131717681885, + "num_tokens": 310768036.0, + "step": 12464 + }, + { + "epoch": 1.3688776630792883, + "grad_norm": 2.334468126296997, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7056845426559448, + "num_tokens": 310792925.0, + "step": 12465 + }, + { + "epoch": 1.368987480781902, + "grad_norm": 2.7107858657836914, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.729231059551239, + "num_tokens": 310811642.0, + "step": 12466 + }, + { + "epoch": 1.3690972984845158, + "grad_norm": 2.2967567443847656, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7287705540657043, + "num_tokens": 310834033.0, + "step": 12467 + }, + { + "epoch": 1.3692071161871293, + "grad_norm": 2.3573222160339355, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6988790035247803, + "num_tokens": 310859394.0, + "step": 12468 + }, + { + "epoch": 1.369316933889743, + "grad_norm": 2.3758840560913086, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7353768348693848, + "num_tokens": 310882866.0, + "step": 12469 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 2.021132469177246, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7141658067703247, + "num_tokens": 310916931.0, + "step": 12470 + }, + { + "epoch": 1.3695365692949704, + "grad_norm": 2.139497756958008, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7177650928497314, + "num_tokens": 310944872.0, + "step": 12471 + }, + { + "epoch": 1.3696463869975841, + "grad_norm": 2.1616241931915283, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7207728624343872, + "num_tokens": 310970156.0, + "step": 12472 + }, + { + "epoch": 1.3697562047001977, + "grad_norm": 2.1841773986816406, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7214546799659729, + "num_tokens": 310997133.0, + "step": 12473 + }, + { + "epoch": 1.3698660224028114, + "grad_norm": 2.555957555770874, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7185868620872498, + "num_tokens": 311019166.0, + "step": 12474 + }, + { + "epoch": 1.369975840105425, + "grad_norm": 2.0741918087005615, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.716499924659729, + "num_tokens": 311046001.0, + "step": 12475 + }, + { + "epoch": 1.3700856578080387, + "grad_norm": 2.387334108352661, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7312077283859253, + "num_tokens": 311069031.0, + "step": 12476 + }, + { + "epoch": 1.3701954755106522, + "grad_norm": 2.2885704040527344, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7171540856361389, + "num_tokens": 311093742.0, + "step": 12477 + }, + { + "epoch": 1.370305293213266, + "grad_norm": 1.9941787719726562, + "learning_rate": 1e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7394306063652039, + "num_tokens": 311122481.0, + "step": 12478 + }, + { + "epoch": 1.3704151109158795, + "grad_norm": 2.2779500484466553, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7165142297744751, + "num_tokens": 311146738.0, + "step": 12479 + }, + { + "epoch": 1.3705249286184933, + "grad_norm": 2.3135342597961426, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.739859938621521, + "num_tokens": 311169983.0, + "step": 12480 + }, + { + "epoch": 1.370634746321107, + "grad_norm": 2.3174026012420654, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7226821184158325, + "num_tokens": 311192739.0, + "step": 12481 + }, + { + "epoch": 1.3707445640237206, + "grad_norm": 2.1865406036376953, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7126054763793945, + "num_tokens": 311220002.0, + "step": 12482 + }, + { + "epoch": 1.3708543817263343, + "grad_norm": 2.214930295944214, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7172759175300598, + "num_tokens": 311247256.0, + "step": 12483 + }, + { + "epoch": 1.3709641994289479, + "grad_norm": 2.098919153213501, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7316948175430298, + "num_tokens": 311276255.0, + "step": 12484 + }, + { + "epoch": 1.3710740171315616, + "grad_norm": 2.4436233043670654, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7254489064216614, + "num_tokens": 311298026.0, + "step": 12485 + }, + { + "epoch": 1.3711838348341754, + "grad_norm": 2.21655011177063, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.744321882724762, + "num_tokens": 311323321.0, + "step": 12486 + }, + { + "epoch": 1.371293652536789, + "grad_norm": 2.111576557159424, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7291684150695801, + "num_tokens": 311350178.0, + "step": 12487 + }, + { + "epoch": 1.3714034702394025, + "grad_norm": 2.3146917819976807, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7137033343315125, + "num_tokens": 311375156.0, + "step": 12488 + }, + { + "epoch": 1.3715132879420162, + "grad_norm": 2.291062355041504, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7334966063499451, + "num_tokens": 311399410.0, + "step": 12489 + }, + { + "epoch": 1.37162310564463, + "grad_norm": 2.471008539199829, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7237120866775513, + "num_tokens": 311421380.0, + "step": 12490 + }, + { + "epoch": 1.3717329233472435, + "grad_norm": 2.363679885864258, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7280646562576294, + "num_tokens": 311444083.0, + "step": 12491 + }, + { + "epoch": 1.3718427410498573, + "grad_norm": 2.207024335861206, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7400438785552979, + "num_tokens": 311469090.0, + "step": 12492 + }, + { + "epoch": 1.3719525587524708, + "grad_norm": 2.474442958831787, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7283863425254822, + "num_tokens": 311491913.0, + "step": 12493 + }, + { + "epoch": 1.3720623764550846, + "grad_norm": 2.426478862762451, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7059427499771118, + "num_tokens": 311515491.0, + "step": 12494 + }, + { + "epoch": 1.3721721941576983, + "grad_norm": 2.420360803604126, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7413046360015869, + "num_tokens": 311537225.0, + "step": 12495 + }, + { + "epoch": 1.3722820118603118, + "grad_norm": 2.3046302795410156, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7199375629425049, + "num_tokens": 311560961.0, + "step": 12496 + }, + { + "epoch": 1.3723918295629256, + "grad_norm": 2.1322240829467773, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7166168689727783, + "num_tokens": 311588762.0, + "step": 12497 + }, + { + "epoch": 1.3725016472655391, + "grad_norm": 2.1497440338134766, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7384903430938721, + "num_tokens": 311613660.0, + "step": 12498 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 2.429887056350708, + "learning_rate": 1e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7440606355667114, + "num_tokens": 311635958.0, + "step": 12499 + }, + { + "epoch": 1.3727212826707667, + "grad_norm": 2.1759581565856934, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7197485566139221, + "num_tokens": 311665073.0, + "step": 12500 + }, + { + "epoch": 1.3728311003733802, + "grad_norm": 2.8259806632995605, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7297490835189819, + "num_tokens": 311682399.0, + "step": 12501 + }, + { + "epoch": 1.3729409180759937, + "grad_norm": 2.466677665710449, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.719835102558136, + "num_tokens": 311703222.0, + "step": 12502 + }, + { + "epoch": 1.3730507357786075, + "grad_norm": 2.402489423751831, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7252307534217834, + "num_tokens": 311726183.0, + "step": 12503 + }, + { + "epoch": 1.3731605534812212, + "grad_norm": 2.0922060012817383, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.712421178817749, + "num_tokens": 311756423.0, + "step": 12504 + }, + { + "epoch": 1.3732703711838348, + "grad_norm": 2.2697019577026367, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7083718776702881, + "num_tokens": 311783838.0, + "step": 12505 + }, + { + "epoch": 1.3733801888864485, + "grad_norm": 2.1045045852661133, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7217342853546143, + "num_tokens": 311815415.0, + "step": 12506 + }, + { + "epoch": 1.373490006589062, + "grad_norm": 2.382486581802368, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7196851968765259, + "num_tokens": 311838254.0, + "step": 12507 + }, + { + "epoch": 1.3735998242916758, + "grad_norm": 2.1648051738739014, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7169474363327026, + "num_tokens": 311864539.0, + "step": 12508 + }, + { + "epoch": 1.3737096419942896, + "grad_norm": 2.162696361541748, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7411171197891235, + "num_tokens": 311890719.0, + "step": 12509 + }, + { + "epoch": 1.3738194596969031, + "grad_norm": 2.4156298637390137, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7132341265678406, + "num_tokens": 311914206.0, + "step": 12510 + }, + { + "epoch": 1.3739292773995169, + "grad_norm": 2.453504800796509, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7189886569976807, + "num_tokens": 311936072.0, + "step": 12511 + }, + { + "epoch": 1.3740390951021304, + "grad_norm": 2.1698617935180664, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.708448052406311, + "num_tokens": 311963675.0, + "step": 12512 + }, + { + "epoch": 1.3741489128047442, + "grad_norm": 2.2344813346862793, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6875208020210266, + "num_tokens": 311990817.0, + "step": 12513 + }, + { + "epoch": 1.374258730507358, + "grad_norm": 2.1921844482421875, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6872657537460327, + "num_tokens": 312018573.0, + "step": 12514 + }, + { + "epoch": 1.3743685482099715, + "grad_norm": 2.326812982559204, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7199642658233643, + "num_tokens": 312041124.0, + "step": 12515 + }, + { + "epoch": 1.374478365912585, + "grad_norm": 2.370944023132324, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7245535850524902, + "num_tokens": 312065699.0, + "step": 12516 + }, + { + "epoch": 1.3745881836151987, + "grad_norm": 2.099304676055908, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7005182504653931, + "num_tokens": 312095253.0, + "step": 12517 + }, + { + "epoch": 1.3746980013178125, + "grad_norm": 2.4941344261169434, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7237353324890137, + "num_tokens": 312116238.0, + "step": 12518 + }, + { + "epoch": 1.374807819020426, + "grad_norm": 2.1618807315826416, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.724492609500885, + "num_tokens": 312145476.0, + "step": 12519 + }, + { + "epoch": 1.3749176367230398, + "grad_norm": 2.0184171199798584, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7197564840316772, + "num_tokens": 312175789.0, + "step": 12520 + }, + { + "epoch": 1.3750274544256533, + "grad_norm": 2.5209150314331055, + "learning_rate": 1e-06, + "loss": 0.7704, + "mean_token_accuracy": 0.7529996037483215, + "num_tokens": 312195168.0, + "step": 12521 + }, + { + "epoch": 1.375137272128267, + "grad_norm": 2.307170867919922, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.72686767578125, + "num_tokens": 312217745.0, + "step": 12522 + }, + { + "epoch": 1.3752470898308808, + "grad_norm": 2.1820521354675293, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7029462456703186, + "num_tokens": 312243496.0, + "step": 12523 + }, + { + "epoch": 1.3753569075334944, + "grad_norm": 2.5086629390716553, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7224286794662476, + "num_tokens": 312266466.0, + "step": 12524 + }, + { + "epoch": 1.3754667252361081, + "grad_norm": 2.2312283515930176, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.752631664276123, + "num_tokens": 312291002.0, + "step": 12525 + }, + { + "epoch": 1.3755765429387217, + "grad_norm": 2.2108752727508545, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7042496204376221, + "num_tokens": 312316469.0, + "step": 12526 + }, + { + "epoch": 1.3756863606413354, + "grad_norm": 1.994461178779602, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7547892332077026, + "num_tokens": 312345411.0, + "step": 12527 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 2.1462416648864746, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7060797810554504, + "num_tokens": 312372658.0, + "step": 12528 + }, + { + "epoch": 1.3759059960465627, + "grad_norm": 2.4764111042022705, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.71720290184021, + "num_tokens": 312396097.0, + "step": 12529 + }, + { + "epoch": 1.3760158137491763, + "grad_norm": 2.343777894973755, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7370139360427856, + "num_tokens": 312418583.0, + "step": 12530 + }, + { + "epoch": 1.37612563145179, + "grad_norm": 2.305081844329834, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7317617535591125, + "num_tokens": 312443910.0, + "step": 12531 + }, + { + "epoch": 1.3762354491544038, + "grad_norm": 2.3469152450561523, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7402746677398682, + "num_tokens": 312466933.0, + "step": 12532 + }, + { + "epoch": 1.3763452668570173, + "grad_norm": 2.072716236114502, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7011696696281433, + "num_tokens": 312498101.0, + "step": 12533 + }, + { + "epoch": 1.376455084559631, + "grad_norm": 2.031071186065674, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7279633283615112, + "num_tokens": 312526425.0, + "step": 12534 + }, + { + "epoch": 1.3765649022622446, + "grad_norm": 2.3091259002685547, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7344863414764404, + "num_tokens": 312549242.0, + "step": 12535 + }, + { + "epoch": 1.3766747199648584, + "grad_norm": 2.435948133468628, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7242290377616882, + "num_tokens": 312570957.0, + "step": 12536 + }, + { + "epoch": 1.376784537667472, + "grad_norm": 2.238603115081787, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7109956741333008, + "num_tokens": 312597155.0, + "step": 12537 + }, + { + "epoch": 1.3768943553700856, + "grad_norm": 2.0696003437042236, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7085927724838257, + "num_tokens": 312625436.0, + "step": 12538 + }, + { + "epoch": 1.3770041730726994, + "grad_norm": 2.228386640548706, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7048617005348206, + "num_tokens": 312652473.0, + "step": 12539 + }, + { + "epoch": 1.377113990775313, + "grad_norm": 2.4252147674560547, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7047650814056396, + "num_tokens": 312677200.0, + "step": 12540 + }, + { + "epoch": 1.3772238084779267, + "grad_norm": 2.3198723793029785, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7496225833892822, + "num_tokens": 312701514.0, + "step": 12541 + }, + { + "epoch": 1.3773336261805402, + "grad_norm": 2.208080768585205, + "learning_rate": 1e-06, + "loss": 0.7384, + "mean_token_accuracy": 0.7676289677619934, + "num_tokens": 312725463.0, + "step": 12542 + }, + { + "epoch": 1.377443443883154, + "grad_norm": 2.192479372024536, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7185268402099609, + "num_tokens": 312753499.0, + "step": 12543 + }, + { + "epoch": 1.3775532615857675, + "grad_norm": 2.3850319385528564, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7369204163551331, + "num_tokens": 312774932.0, + "step": 12544 + }, + { + "epoch": 1.3776630792883813, + "grad_norm": 2.462368965148926, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7063214778900146, + "num_tokens": 312797076.0, + "step": 12545 + }, + { + "epoch": 1.377772896990995, + "grad_norm": 2.1559057235717773, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.727110743522644, + "num_tokens": 312824080.0, + "step": 12546 + }, + { + "epoch": 1.3778827146936086, + "grad_norm": 2.042025566101074, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6978142261505127, + "num_tokens": 312855685.0, + "step": 12547 + }, + { + "epoch": 1.3779925323962223, + "grad_norm": 2.302358627319336, + "learning_rate": 1e-06, + "loss": 0.7944, + "mean_token_accuracy": 0.7466492056846619, + "num_tokens": 312879016.0, + "step": 12548 + }, + { + "epoch": 1.3781023500988359, + "grad_norm": 2.5050759315490723, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7250529527664185, + "num_tokens": 312898395.0, + "step": 12549 + }, + { + "epoch": 1.3782121678014496, + "grad_norm": 2.292158603668213, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7301708459854126, + "num_tokens": 312922096.0, + "step": 12550 + }, + { + "epoch": 1.3783219855040634, + "grad_norm": 2.406939744949341, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7080647945404053, + "num_tokens": 312947450.0, + "step": 12551 + }, + { + "epoch": 1.378431803206677, + "grad_norm": 2.5970895290374756, + "learning_rate": 1e-06, + "loss": 0.7188, + "mean_token_accuracy": 0.7783641815185547, + "num_tokens": 312967308.0, + "step": 12552 + }, + { + "epoch": 1.3785416209092904, + "grad_norm": 2.506204128265381, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7100281715393066, + "num_tokens": 312991506.0, + "step": 12553 + }, + { + "epoch": 1.3786514386119042, + "grad_norm": 2.2709908485412598, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7522246837615967, + "num_tokens": 313014579.0, + "step": 12554 + }, + { + "epoch": 1.378761256314518, + "grad_norm": 2.8714663982391357, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7370330691337585, + "num_tokens": 313032300.0, + "step": 12555 + }, + { + "epoch": 1.3788710740171315, + "grad_norm": 2.679168224334717, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7269002199172974, + "num_tokens": 313052446.0, + "step": 12556 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 2.3429956436157227, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7318636178970337, + "num_tokens": 313076046.0, + "step": 12557 + }, + { + "epoch": 1.3790907094223588, + "grad_norm": 2.3865678310394287, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7097679376602173, + "num_tokens": 313100295.0, + "step": 12558 + }, + { + "epoch": 1.3792005271249725, + "grad_norm": 2.327699899673462, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7485661506652832, + "num_tokens": 313124124.0, + "step": 12559 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 2.3611063957214355, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.724403440952301, + "num_tokens": 313148927.0, + "step": 12560 + }, + { + "epoch": 1.3794201625301998, + "grad_norm": 2.4787464141845703, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7322453260421753, + "num_tokens": 313170150.0, + "step": 12561 + }, + { + "epoch": 1.3795299802328136, + "grad_norm": 2.297649621963501, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7232351303100586, + "num_tokens": 313194584.0, + "step": 12562 + }, + { + "epoch": 1.3796397979354271, + "grad_norm": 2.2842373847961426, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7167583107948303, + "num_tokens": 313221595.0, + "step": 12563 + }, + { + "epoch": 1.3797496156380409, + "grad_norm": 2.214461088180542, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7042810916900635, + "num_tokens": 313248354.0, + "step": 12564 + }, + { + "epoch": 1.3798594333406546, + "grad_norm": 2.163519859313965, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.753720760345459, + "num_tokens": 313274240.0, + "step": 12565 + }, + { + "epoch": 1.3799692510432682, + "grad_norm": 2.1286754608154297, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7106131315231323, + "num_tokens": 313303127.0, + "step": 12566 + }, + { + "epoch": 1.3800790687458817, + "grad_norm": 2.497260093688965, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7353032827377319, + "num_tokens": 313323734.0, + "step": 12567 + }, + { + "epoch": 1.3801888864484955, + "grad_norm": 2.223172187805176, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6996058821678162, + "num_tokens": 313351019.0, + "step": 12568 + }, + { + "epoch": 1.3802987041511092, + "grad_norm": 2.2471466064453125, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7063809633255005, + "num_tokens": 313376267.0, + "step": 12569 + }, + { + "epoch": 1.3804085218537228, + "grad_norm": 1.945264458656311, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.73957359790802, + "num_tokens": 313408971.0, + "step": 12570 + }, + { + "epoch": 1.3805183395563365, + "grad_norm": 2.3641908168792725, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7345066070556641, + "num_tokens": 313432080.0, + "step": 12571 + }, + { + "epoch": 1.38062815725895, + "grad_norm": 2.197052240371704, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7289133071899414, + "num_tokens": 313458718.0, + "step": 12572 + }, + { + "epoch": 1.3807379749615638, + "grad_norm": 2.1958608627319336, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7228926420211792, + "num_tokens": 313486406.0, + "step": 12573 + }, + { + "epoch": 1.3808477926641776, + "grad_norm": 2.2823333740234375, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7167870998382568, + "num_tokens": 313512407.0, + "step": 12574 + }, + { + "epoch": 1.380957610366791, + "grad_norm": 2.4219117164611816, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.725560188293457, + "num_tokens": 313534047.0, + "step": 12575 + }, + { + "epoch": 1.3810674280694049, + "grad_norm": 2.2469522953033447, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7224600315093994, + "num_tokens": 313560092.0, + "step": 12576 + }, + { + "epoch": 1.3811772457720184, + "grad_norm": 2.158907890319824, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7316625118255615, + "num_tokens": 313588089.0, + "step": 12577 + }, + { + "epoch": 1.3812870634746321, + "grad_norm": 2.294562339782715, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7176007628440857, + "num_tokens": 313613641.0, + "step": 12578 + }, + { + "epoch": 1.381396881177246, + "grad_norm": 2.311534881591797, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7073357105255127, + "num_tokens": 313638131.0, + "step": 12579 + }, + { + "epoch": 1.3815066988798594, + "grad_norm": 2.177320718765259, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.706023633480072, + "num_tokens": 313667442.0, + "step": 12580 + }, + { + "epoch": 1.381616516582473, + "grad_norm": 2.5519137382507324, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7373145818710327, + "num_tokens": 313688252.0, + "step": 12581 + }, + { + "epoch": 1.3817263342850867, + "grad_norm": 2.1371686458587646, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7003473043441772, + "num_tokens": 313719124.0, + "step": 12582 + }, + { + "epoch": 1.3818361519877005, + "grad_norm": 2.15559458732605, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7347553968429565, + "num_tokens": 313747969.0, + "step": 12583 + }, + { + "epoch": 1.381945969690314, + "grad_norm": 2.2416727542877197, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.734439492225647, + "num_tokens": 313773695.0, + "step": 12584 + }, + { + "epoch": 1.3820557873929278, + "grad_norm": 2.427373170852661, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7185543775558472, + "num_tokens": 313796233.0, + "step": 12585 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 2.633171558380127, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.731713056564331, + "num_tokens": 313817442.0, + "step": 12586 + }, + { + "epoch": 1.382275422798155, + "grad_norm": 2.1736552715301514, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.706558346748352, + "num_tokens": 313845724.0, + "step": 12587 + }, + { + "epoch": 1.3823852405007688, + "grad_norm": 2.482548952102661, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7142653465270996, + "num_tokens": 313869807.0, + "step": 12588 + }, + { + "epoch": 1.3824950582033824, + "grad_norm": 2.3901069164276123, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7359985113143921, + "num_tokens": 313891680.0, + "step": 12589 + }, + { + "epoch": 1.3826048759059961, + "grad_norm": 2.098816156387329, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7194976806640625, + "num_tokens": 313919117.0, + "step": 12590 + }, + { + "epoch": 1.3827146936086097, + "grad_norm": 2.3180863857269287, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7058241367340088, + "num_tokens": 313943725.0, + "step": 12591 + }, + { + "epoch": 1.3828245113112234, + "grad_norm": 1.8752061128616333, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7229450941085815, + "num_tokens": 313973417.0, + "step": 12592 + }, + { + "epoch": 1.382934329013837, + "grad_norm": 2.4682250022888184, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7420368194580078, + "num_tokens": 313993945.0, + "step": 12593 + }, + { + "epoch": 1.3830441467164507, + "grad_norm": 2.6717798709869385, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7156135439872742, + "num_tokens": 314013810.0, + "step": 12594 + }, + { + "epoch": 1.3831539644190642, + "grad_norm": 2.6009445190429688, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7241462469100952, + "num_tokens": 314033928.0, + "step": 12595 + }, + { + "epoch": 1.383263782121678, + "grad_norm": 2.612298011779785, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7354767322540283, + "num_tokens": 314052786.0, + "step": 12596 + }, + { + "epoch": 1.3833735998242918, + "grad_norm": 2.195674180984497, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7028219103813171, + "num_tokens": 314082246.0, + "step": 12597 + }, + { + "epoch": 1.3834834175269053, + "grad_norm": 2.210191249847412, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.725415825843811, + "num_tokens": 314107310.0, + "step": 12598 + }, + { + "epoch": 1.383593235229519, + "grad_norm": 2.2888681888580322, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7122175097465515, + "num_tokens": 314130962.0, + "step": 12599 + }, + { + "epoch": 1.3837030529321326, + "grad_norm": 2.3451039791107178, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7186899185180664, + "num_tokens": 314154482.0, + "step": 12600 + }, + { + "epoch": 1.3838128706347463, + "grad_norm": 2.0942342281341553, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7370606660842896, + "num_tokens": 314182944.0, + "step": 12601 + }, + { + "epoch": 1.38392268833736, + "grad_norm": 2.558523178100586, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7185192704200745, + "num_tokens": 314203264.0, + "step": 12602 + }, + { + "epoch": 1.3840325060399736, + "grad_norm": 2.1682817935943604, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7066771984100342, + "num_tokens": 314230068.0, + "step": 12603 + }, + { + "epoch": 1.3841423237425874, + "grad_norm": 2.200071096420288, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7291163802146912, + "num_tokens": 314255923.0, + "step": 12604 + }, + { + "epoch": 1.384252141445201, + "grad_norm": 2.2339658737182617, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7596979737281799, + "num_tokens": 314280141.0, + "step": 12605 + }, + { + "epoch": 1.3843619591478147, + "grad_norm": 2.509188175201416, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7534086108207703, + "num_tokens": 314299165.0, + "step": 12606 + }, + { + "epoch": 1.3844717768504282, + "grad_norm": 2.076022148132324, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7228038311004639, + "num_tokens": 314328846.0, + "step": 12607 + }, + { + "epoch": 1.384581594553042, + "grad_norm": 2.2891058921813965, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6933532357215881, + "num_tokens": 314355567.0, + "step": 12608 + }, + { + "epoch": 1.3846914122556555, + "grad_norm": 2.003589391708374, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7069886922836304, + "num_tokens": 314387873.0, + "step": 12609 + }, + { + "epoch": 1.3848012299582693, + "grad_norm": 2.1091086864471436, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6854652166366577, + "num_tokens": 314417544.0, + "step": 12610 + }, + { + "epoch": 1.384911047660883, + "grad_norm": 2.0424673557281494, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7187430262565613, + "num_tokens": 314446348.0, + "step": 12611 + }, + { + "epoch": 1.3850208653634966, + "grad_norm": 2.5253214836120605, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7467136383056641, + "num_tokens": 314466041.0, + "step": 12612 + }, + { + "epoch": 1.3851306830661103, + "grad_norm": 2.1926732063293457, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7065779566764832, + "num_tokens": 314495185.0, + "step": 12613 + }, + { + "epoch": 1.3852405007687238, + "grad_norm": 2.384756565093994, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7185860872268677, + "num_tokens": 314518722.0, + "step": 12614 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 2.0835886001586914, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6984275579452515, + "num_tokens": 314550724.0, + "step": 12615 + }, + { + "epoch": 1.3854601361739514, + "grad_norm": 2.270275354385376, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7151292562484741, + "num_tokens": 314576332.0, + "step": 12616 + }, + { + "epoch": 1.385569953876565, + "grad_norm": 2.5362777709960938, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7027449607849121, + "num_tokens": 314598409.0, + "step": 12617 + }, + { + "epoch": 1.3856797715791784, + "grad_norm": 2.244131565093994, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7177819013595581, + "num_tokens": 314624882.0, + "step": 12618 + }, + { + "epoch": 1.3857895892817922, + "grad_norm": 2.371258497238159, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7244242429733276, + "num_tokens": 314647407.0, + "step": 12619 + }, + { + "epoch": 1.385899406984406, + "grad_norm": 2.2325079441070557, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7169910669326782, + "num_tokens": 314672594.0, + "step": 12620 + }, + { + "epoch": 1.3860092246870195, + "grad_norm": 1.9509910345077515, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7102312445640564, + "num_tokens": 314704367.0, + "step": 12621 + }, + { + "epoch": 1.3861190423896332, + "grad_norm": 2.4270637035369873, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7093803286552429, + "num_tokens": 314728138.0, + "step": 12622 + }, + { + "epoch": 1.3862288600922468, + "grad_norm": 2.592674493789673, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7328180074691772, + "num_tokens": 314754318.0, + "step": 12623 + }, + { + "epoch": 1.3863386777948605, + "grad_norm": 2.320908784866333, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7134337425231934, + "num_tokens": 314779804.0, + "step": 12624 + }, + { + "epoch": 1.3864484954974743, + "grad_norm": 2.1772091388702393, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7109957933425903, + "num_tokens": 314807391.0, + "step": 12625 + }, + { + "epoch": 1.3865583132000878, + "grad_norm": 2.4321353435516357, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.736750066280365, + "num_tokens": 314827720.0, + "step": 12626 + }, + { + "epoch": 1.3866681309027016, + "grad_norm": 2.173999071121216, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6968017816543579, + "num_tokens": 314857784.0, + "step": 12627 + }, + { + "epoch": 1.3867779486053151, + "grad_norm": 2.541264057159424, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7427022457122803, + "num_tokens": 314879087.0, + "step": 12628 + }, + { + "epoch": 1.3868877663079289, + "grad_norm": 2.363797664642334, + "learning_rate": 1e-06, + "loss": 0.7861, + "mean_token_accuracy": 0.7491836547851562, + "num_tokens": 314902004.0, + "step": 12629 + }, + { + "epoch": 1.3869975840105426, + "grad_norm": 2.2067341804504395, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7366093993186951, + "num_tokens": 314925982.0, + "step": 12630 + }, + { + "epoch": 1.3871074017131562, + "grad_norm": 2.2508087158203125, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7266943454742432, + "num_tokens": 314951070.0, + "step": 12631 + }, + { + "epoch": 1.3872172194157697, + "grad_norm": 2.0302555561065674, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7173094153404236, + "num_tokens": 314980298.0, + "step": 12632 + }, + { + "epoch": 1.3873270371183835, + "grad_norm": 2.0994949340820312, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7010969519615173, + "num_tokens": 315010391.0, + "step": 12633 + }, + { + "epoch": 1.3874368548209972, + "grad_norm": 2.260218381881714, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.723773717880249, + "num_tokens": 315034987.0, + "step": 12634 + }, + { + "epoch": 1.3875466725236107, + "grad_norm": 2.562229633331299, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7227945327758789, + "num_tokens": 315055159.0, + "step": 12635 + }, + { + "epoch": 1.3876564902262245, + "grad_norm": 2.2383105754852295, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6808145046234131, + "num_tokens": 315082613.0, + "step": 12636 + }, + { + "epoch": 1.387766307928838, + "grad_norm": 2.6568238735198975, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7220062017440796, + "num_tokens": 315102877.0, + "step": 12637 + }, + { + "epoch": 1.3878761256314518, + "grad_norm": 2.1941981315612793, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7115718722343445, + "num_tokens": 315132638.0, + "step": 12638 + }, + { + "epoch": 1.3879859433340656, + "grad_norm": 2.2336199283599854, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.728075385093689, + "num_tokens": 315156597.0, + "step": 12639 + }, + { + "epoch": 1.388095761036679, + "grad_norm": 2.1471657752990723, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7172381281852722, + "num_tokens": 315184870.0, + "step": 12640 + }, + { + "epoch": 1.3882055787392928, + "grad_norm": 2.6165764331817627, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7432920932769775, + "num_tokens": 315203704.0, + "step": 12641 + }, + { + "epoch": 1.3883153964419064, + "grad_norm": 2.3874526023864746, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7436002492904663, + "num_tokens": 315225660.0, + "step": 12642 + }, + { + "epoch": 1.3884252141445201, + "grad_norm": 2.414252996444702, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.722617506980896, + "num_tokens": 315248016.0, + "step": 12643 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 2.726085901260376, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7284562587738037, + "num_tokens": 315267757.0, + "step": 12644 + }, + { + "epoch": 1.3886448495497474, + "grad_norm": 2.145707607269287, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7376520037651062, + "num_tokens": 315295809.0, + "step": 12645 + }, + { + "epoch": 1.388754667252361, + "grad_norm": 2.2795701026916504, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7124233245849609, + "num_tokens": 315322046.0, + "step": 12646 + }, + { + "epoch": 1.3888644849549747, + "grad_norm": 2.427877902984619, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7223286628723145, + "num_tokens": 315344071.0, + "step": 12647 + }, + { + "epoch": 1.3889743026575885, + "grad_norm": 2.3454854488372803, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7236186265945435, + "num_tokens": 315368074.0, + "step": 12648 + }, + { + "epoch": 1.389084120360202, + "grad_norm": 2.7265655994415283, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7458250522613525, + "num_tokens": 315387714.0, + "step": 12649 + }, + { + "epoch": 1.3891939380628158, + "grad_norm": 2.3394525051116943, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6954867839813232, + "num_tokens": 315413706.0, + "step": 12650 + }, + { + "epoch": 1.3893037557654293, + "grad_norm": 2.1079912185668945, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7224897146224976, + "num_tokens": 315442448.0, + "step": 12651 + }, + { + "epoch": 1.389413573468043, + "grad_norm": 2.231759548187256, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7170350551605225, + "num_tokens": 315470677.0, + "step": 12652 + }, + { + "epoch": 1.3895233911706568, + "grad_norm": 2.0451507568359375, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7054176926612854, + "num_tokens": 315499362.0, + "step": 12653 + }, + { + "epoch": 1.3896332088732704, + "grad_norm": 2.4064016342163086, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7212780117988586, + "num_tokens": 315524102.0, + "step": 12654 + }, + { + "epoch": 1.389743026575884, + "grad_norm": 1.9919434785842896, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7043554782867432, + "num_tokens": 315556283.0, + "step": 12655 + }, + { + "epoch": 1.3898528442784976, + "grad_norm": 2.6171369552612305, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7187099456787109, + "num_tokens": 315576042.0, + "step": 12656 + }, + { + "epoch": 1.3899626619811114, + "grad_norm": 2.0803167819976807, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7316948175430298, + "num_tokens": 315602535.0, + "step": 12657 + }, + { + "epoch": 1.390072479683725, + "grad_norm": 2.0826570987701416, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7156645059585571, + "num_tokens": 315633512.0, + "step": 12658 + }, + { + "epoch": 1.3901822973863387, + "grad_norm": 2.339942455291748, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7216575145721436, + "num_tokens": 315655964.0, + "step": 12659 + }, + { + "epoch": 1.3902921150889522, + "grad_norm": 2.338153839111328, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7463215589523315, + "num_tokens": 315679062.0, + "step": 12660 + }, + { + "epoch": 1.390401932791566, + "grad_norm": 2.4608843326568604, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7322665452957153, + "num_tokens": 315702471.0, + "step": 12661 + }, + { + "epoch": 1.3905117504941797, + "grad_norm": 2.402186155319214, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7432503700256348, + "num_tokens": 315725488.0, + "step": 12662 + }, + { + "epoch": 1.3906215681967933, + "grad_norm": 2.305532455444336, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7334088683128357, + "num_tokens": 315749702.0, + "step": 12663 + }, + { + "epoch": 1.390731385899407, + "grad_norm": 2.0710365772247314, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7089439034461975, + "num_tokens": 315779356.0, + "step": 12664 + }, + { + "epoch": 1.3908412036020206, + "grad_norm": 2.6354146003723145, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7248809933662415, + "num_tokens": 315800453.0, + "step": 12665 + }, + { + "epoch": 1.3909510213046343, + "grad_norm": 2.3158209323883057, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.727896511554718, + "num_tokens": 315823176.0, + "step": 12666 + }, + { + "epoch": 1.391060839007248, + "grad_norm": 2.202169418334961, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7265247702598572, + "num_tokens": 315851145.0, + "step": 12667 + }, + { + "epoch": 1.3911706567098616, + "grad_norm": 2.2052319049835205, + "learning_rate": 1e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7409224510192871, + "num_tokens": 315876501.0, + "step": 12668 + }, + { + "epoch": 1.3912804744124752, + "grad_norm": 2.3320741653442383, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6971100568771362, + "num_tokens": 315900294.0, + "step": 12669 + }, + { + "epoch": 1.391390292115089, + "grad_norm": 1.845983624458313, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7016232013702393, + "num_tokens": 315933068.0, + "step": 12670 + }, + { + "epoch": 1.3915001098177027, + "grad_norm": 2.1726644039154053, + "learning_rate": 1e-06, + "loss": 0.7951, + "mean_token_accuracy": 0.7428928017616272, + "num_tokens": 315959570.0, + "step": 12671 + }, + { + "epoch": 1.3916099275203162, + "grad_norm": 2.433537244796753, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.721535861492157, + "num_tokens": 315984085.0, + "step": 12672 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 2.1009130477905273, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7141954898834229, + "num_tokens": 316010678.0, + "step": 12673 + }, + { + "epoch": 1.3918295629255435, + "grad_norm": 2.3983724117279053, + "learning_rate": 1e-06, + "loss": 0.7839, + "mean_token_accuracy": 0.7430817484855652, + "num_tokens": 316030952.0, + "step": 12674 + }, + { + "epoch": 1.3919393806281573, + "grad_norm": 2.179168701171875, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7351608276367188, + "num_tokens": 316056002.0, + "step": 12675 + }, + { + "epoch": 1.392049198330771, + "grad_norm": 2.1929609775543213, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7372572422027588, + "num_tokens": 316081972.0, + "step": 12676 + }, + { + "epoch": 1.3921590160333845, + "grad_norm": 2.357804298400879, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7207680940628052, + "num_tokens": 316106740.0, + "step": 12677 + }, + { + "epoch": 1.3922688337359983, + "grad_norm": 2.4747653007507324, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7355821132659912, + "num_tokens": 316127518.0, + "step": 12678 + }, + { + "epoch": 1.3923786514386118, + "grad_norm": 2.3745429515838623, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7138956785202026, + "num_tokens": 316150850.0, + "step": 12679 + }, + { + "epoch": 1.3924884691412256, + "grad_norm": 2.2449193000793457, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7405290603637695, + "num_tokens": 316173728.0, + "step": 12680 + }, + { + "epoch": 1.3925982868438394, + "grad_norm": 2.372992753982544, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7298255562782288, + "num_tokens": 316196492.0, + "step": 12681 + }, + { + "epoch": 1.3927081045464529, + "grad_norm": 2.32890248298645, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.739361047744751, + "num_tokens": 316220284.0, + "step": 12682 + }, + { + "epoch": 1.3928179222490664, + "grad_norm": 2.5519628524780273, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7375457286834717, + "num_tokens": 316240149.0, + "step": 12683 + }, + { + "epoch": 1.3929277399516802, + "grad_norm": 2.3111815452575684, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7246290445327759, + "num_tokens": 316263646.0, + "step": 12684 + }, + { + "epoch": 1.393037557654294, + "grad_norm": 2.306910991668701, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7517761588096619, + "num_tokens": 316285631.0, + "step": 12685 + }, + { + "epoch": 1.3931473753569075, + "grad_norm": 2.077817916870117, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7071707248687744, + "num_tokens": 316316430.0, + "step": 12686 + }, + { + "epoch": 1.3932571930595212, + "grad_norm": 2.446061849594116, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7208542823791504, + "num_tokens": 316340456.0, + "step": 12687 + }, + { + "epoch": 1.3933670107621348, + "grad_norm": 2.821021556854248, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7368662357330322, + "num_tokens": 316358611.0, + "step": 12688 + }, + { + "epoch": 1.3934768284647485, + "grad_norm": 2.151759147644043, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7122846841812134, + "num_tokens": 316387409.0, + "step": 12689 + }, + { + "epoch": 1.3935866461673623, + "grad_norm": 2.518932580947876, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7333541512489319, + "num_tokens": 316408734.0, + "step": 12690 + }, + { + "epoch": 1.3936964638699758, + "grad_norm": 2.2550063133239746, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7142109274864197, + "num_tokens": 316435296.0, + "step": 12691 + }, + { + "epoch": 1.3938062815725896, + "grad_norm": 2.35703182220459, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7283318042755127, + "num_tokens": 316457515.0, + "step": 12692 + }, + { + "epoch": 1.393916099275203, + "grad_norm": 2.2545900344848633, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7453473806381226, + "num_tokens": 316482650.0, + "step": 12693 + }, + { + "epoch": 1.3940259169778169, + "grad_norm": 2.1111977100372314, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7317689657211304, + "num_tokens": 316509707.0, + "step": 12694 + }, + { + "epoch": 1.3941357346804306, + "grad_norm": 2.2262837886810303, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7257656455039978, + "num_tokens": 316534177.0, + "step": 12695 + }, + { + "epoch": 1.3942455523830442, + "grad_norm": 2.261118173599243, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.715979278087616, + "num_tokens": 316559114.0, + "step": 12696 + }, + { + "epoch": 1.3943553700856577, + "grad_norm": 2.627516031265259, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7451845407485962, + "num_tokens": 316579523.0, + "step": 12697 + }, + { + "epoch": 1.3944651877882714, + "grad_norm": 2.298311471939087, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7222217917442322, + "num_tokens": 316603859.0, + "step": 12698 + }, + { + "epoch": 1.3945750054908852, + "grad_norm": 2.7415919303894043, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7318552732467651, + "num_tokens": 316622403.0, + "step": 12699 + }, + { + "epoch": 1.3946848231934987, + "grad_norm": 2.3694143295288086, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7238761186599731, + "num_tokens": 316645142.0, + "step": 12700 + }, + { + "epoch": 1.3947946408961125, + "grad_norm": 2.734055280685425, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7342494130134583, + "num_tokens": 316664768.0, + "step": 12701 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 2.7793517112731934, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7342244386672974, + "num_tokens": 316683524.0, + "step": 12702 + }, + { + "epoch": 1.3950142763013398, + "grad_norm": 1.9614920616149902, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.6957314014434814, + "num_tokens": 316717217.0, + "step": 12703 + }, + { + "epoch": 1.3951240940039535, + "grad_norm": 2.12418532371521, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7098314762115479, + "num_tokens": 316747381.0, + "step": 12704 + }, + { + "epoch": 1.395233911706567, + "grad_norm": 2.2685797214508057, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7444994449615479, + "num_tokens": 316771545.0, + "step": 12705 + }, + { + "epoch": 1.3953437294091808, + "grad_norm": 2.078169584274292, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.729934573173523, + "num_tokens": 316797094.0, + "step": 12706 + }, + { + "epoch": 1.3954535471117944, + "grad_norm": 2.108398199081421, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7004534006118774, + "num_tokens": 316825434.0, + "step": 12707 + }, + { + "epoch": 1.3955633648144081, + "grad_norm": 2.402195930480957, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7278441190719604, + "num_tokens": 316851774.0, + "step": 12708 + }, + { + "epoch": 1.3956731825170219, + "grad_norm": 2.1559135913848877, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7027072906494141, + "num_tokens": 316880260.0, + "step": 12709 + }, + { + "epoch": 1.3957830002196354, + "grad_norm": 2.1593403816223145, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7322585582733154, + "num_tokens": 316907536.0, + "step": 12710 + }, + { + "epoch": 1.395892817922249, + "grad_norm": 2.321436882019043, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7179849147796631, + "num_tokens": 316932526.0, + "step": 12711 + }, + { + "epoch": 1.3960026356248627, + "grad_norm": 2.360409736633301, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7045419216156006, + "num_tokens": 316957891.0, + "step": 12712 + }, + { + "epoch": 1.3961124533274765, + "grad_norm": 2.0235066413879395, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7034576535224915, + "num_tokens": 316987484.0, + "step": 12713 + }, + { + "epoch": 1.39622227103009, + "grad_norm": 2.072263717651367, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7158401012420654, + "num_tokens": 317018085.0, + "step": 12714 + }, + { + "epoch": 1.3963320887327038, + "grad_norm": 2.094628095626831, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7036042213439941, + "num_tokens": 317048153.0, + "step": 12715 + }, + { + "epoch": 1.3964419064353173, + "grad_norm": 2.6697163581848145, + "learning_rate": 1e-06, + "loss": 0.8041, + "mean_token_accuracy": 0.7480252981185913, + "num_tokens": 317066135.0, + "step": 12716 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 2.36395001411438, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.713536262512207, + "num_tokens": 317091044.0, + "step": 12717 + }, + { + "epoch": 1.3966615418405448, + "grad_norm": 2.1493310928344727, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.698214590549469, + "num_tokens": 317120011.0, + "step": 12718 + }, + { + "epoch": 1.3967713595431583, + "grad_norm": 2.138514518737793, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7038651704788208, + "num_tokens": 317148020.0, + "step": 12719 + }, + { + "epoch": 1.396881177245772, + "grad_norm": 2.1956369876861572, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7172611355781555, + "num_tokens": 317173975.0, + "step": 12720 + }, + { + "epoch": 1.3969909949483856, + "grad_norm": 1.9560086727142334, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7241860628128052, + "num_tokens": 317205315.0, + "step": 12721 + }, + { + "epoch": 1.3971008126509994, + "grad_norm": 2.2957065105438232, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7379032373428345, + "num_tokens": 317231088.0, + "step": 12722 + }, + { + "epoch": 1.397210630353613, + "grad_norm": 2.2669174671173096, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7220921516418457, + "num_tokens": 317256406.0, + "step": 12723 + }, + { + "epoch": 1.3973204480562267, + "grad_norm": 2.118811845779419, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7293533086776733, + "num_tokens": 317283534.0, + "step": 12724 + }, + { + "epoch": 1.3974302657588402, + "grad_norm": 2.1292061805725098, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7154587507247925, + "num_tokens": 317311640.0, + "step": 12725 + }, + { + "epoch": 1.397540083461454, + "grad_norm": 2.0998971462249756, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7118561267852783, + "num_tokens": 317342060.0, + "step": 12726 + }, + { + "epoch": 1.3976499011640677, + "grad_norm": 2.300415515899658, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7139275670051575, + "num_tokens": 317368681.0, + "step": 12727 + }, + { + "epoch": 1.3977597188666813, + "grad_norm": 2.2660439014434814, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6858115196228027, + "num_tokens": 317393586.0, + "step": 12728 + }, + { + "epoch": 1.397869536569295, + "grad_norm": 2.121486186981201, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6952917575836182, + "num_tokens": 317422103.0, + "step": 12729 + }, + { + "epoch": 1.3979793542719086, + "grad_norm": 2.068655490875244, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7065930366516113, + "num_tokens": 317453044.0, + "step": 12730 + }, + { + "epoch": 1.3980891719745223, + "grad_norm": 2.6718292236328125, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7378858327865601, + "num_tokens": 317472542.0, + "step": 12731 + }, + { + "epoch": 1.398198989677136, + "grad_norm": 2.45674729347229, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7533460855484009, + "num_tokens": 317492645.0, + "step": 12732 + }, + { + "epoch": 1.3983088073797496, + "grad_norm": 2.258298635482788, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.714680016040802, + "num_tokens": 317518695.0, + "step": 12733 + }, + { + "epoch": 1.3984186250823631, + "grad_norm": 2.177469253540039, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7391126155853271, + "num_tokens": 317544492.0, + "step": 12734 + }, + { + "epoch": 1.398528442784977, + "grad_norm": 2.5756778717041016, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7156935930252075, + "num_tokens": 317564762.0, + "step": 12735 + }, + { + "epoch": 1.3986382604875907, + "grad_norm": 2.1581645011901855, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6918044090270996, + "num_tokens": 317592590.0, + "step": 12736 + }, + { + "epoch": 1.3987480781902042, + "grad_norm": 2.380808115005493, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7191485166549683, + "num_tokens": 317615780.0, + "step": 12737 + }, + { + "epoch": 1.398857895892818, + "grad_norm": 2.3887939453125, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7166182994842529, + "num_tokens": 317637811.0, + "step": 12738 + }, + { + "epoch": 1.3989677135954315, + "grad_norm": 2.093876361846924, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7077270746231079, + "num_tokens": 317665485.0, + "step": 12739 + }, + { + "epoch": 1.3990775312980452, + "grad_norm": 2.625805139541626, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7182446718215942, + "num_tokens": 317686116.0, + "step": 12740 + }, + { + "epoch": 1.399187349000659, + "grad_norm": 2.3541033267974854, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7085303068161011, + "num_tokens": 317710149.0, + "step": 12741 + }, + { + "epoch": 1.3992971667032725, + "grad_norm": 2.228442907333374, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7050064206123352, + "num_tokens": 317735297.0, + "step": 12742 + }, + { + "epoch": 1.3994069844058863, + "grad_norm": 2.4065916538238525, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7200985550880432, + "num_tokens": 317759353.0, + "step": 12743 + }, + { + "epoch": 1.3995168021084998, + "grad_norm": 2.3382883071899414, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7202056050300598, + "num_tokens": 317783849.0, + "step": 12744 + }, + { + "epoch": 1.3996266198111136, + "grad_norm": 2.3349320888519287, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7212432622909546, + "num_tokens": 317809140.0, + "step": 12745 + }, + { + "epoch": 1.3997364375137273, + "grad_norm": 2.4254400730133057, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7073405385017395, + "num_tokens": 317831842.0, + "step": 12746 + }, + { + "epoch": 1.3998462552163409, + "grad_norm": 2.062910318374634, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7131980657577515, + "num_tokens": 317860676.0, + "step": 12747 + }, + { + "epoch": 1.3999560729189544, + "grad_norm": 2.121638298034668, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6845626831054688, + "num_tokens": 317888429.0, + "step": 12748 + }, + { + "epoch": 1.4000658906215682, + "grad_norm": 2.330960273742676, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7236793041229248, + "num_tokens": 317912777.0, + "step": 12749 + }, + { + "epoch": 1.400175708324182, + "grad_norm": 2.3403637409210205, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7224830389022827, + "num_tokens": 317935324.0, + "step": 12750 + }, + { + "epoch": 1.4002855260267955, + "grad_norm": 2.351942777633667, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7151491641998291, + "num_tokens": 317958745.0, + "step": 12751 + }, + { + "epoch": 1.4003953437294092, + "grad_norm": 2.2917063236236572, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7391796112060547, + "num_tokens": 317982245.0, + "step": 12752 + }, + { + "epoch": 1.4005051614320227, + "grad_norm": 2.1709632873535156, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7324246168136597, + "num_tokens": 318007852.0, + "step": 12753 + }, + { + "epoch": 1.4006149791346365, + "grad_norm": 2.1796658039093018, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7125774621963501, + "num_tokens": 318032417.0, + "step": 12754 + }, + { + "epoch": 1.4007247968372503, + "grad_norm": 2.302720546722412, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7447904348373413, + "num_tokens": 318055386.0, + "step": 12755 + }, + { + "epoch": 1.4008346145398638, + "grad_norm": 2.196364164352417, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6973782181739807, + "num_tokens": 318083522.0, + "step": 12756 + }, + { + "epoch": 1.4009444322424776, + "grad_norm": 2.720411777496338, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7154666185379028, + "num_tokens": 318102899.0, + "step": 12757 + }, + { + "epoch": 1.401054249945091, + "grad_norm": 2.2694976329803467, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7009775638580322, + "num_tokens": 318129057.0, + "step": 12758 + }, + { + "epoch": 1.4011640676477048, + "grad_norm": 2.281209707260132, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7221262454986572, + "num_tokens": 318153792.0, + "step": 12759 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 2.195687770843506, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7473413944244385, + "num_tokens": 318178978.0, + "step": 12760 + }, + { + "epoch": 1.4013837030529321, + "grad_norm": 2.1812491416931152, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7056176066398621, + "num_tokens": 318207441.0, + "step": 12761 + }, + { + "epoch": 1.4014935207555457, + "grad_norm": 2.364041805267334, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7176015377044678, + "num_tokens": 318229219.0, + "step": 12762 + }, + { + "epoch": 1.4016033384581594, + "grad_norm": 2.2509825229644775, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7019689083099365, + "num_tokens": 318255383.0, + "step": 12763 + }, + { + "epoch": 1.4017131561607732, + "grad_norm": 2.484077215194702, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7166876792907715, + "num_tokens": 318277202.0, + "step": 12764 + }, + { + "epoch": 1.4018229738633867, + "grad_norm": 2.1731202602386475, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7218883633613586, + "num_tokens": 318305917.0, + "step": 12765 + }, + { + "epoch": 1.4019327915660005, + "grad_norm": 2.501910448074341, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7258466482162476, + "num_tokens": 318326421.0, + "step": 12766 + }, + { + "epoch": 1.402042609268614, + "grad_norm": 2.38248348236084, + "learning_rate": 1e-06, + "loss": 0.7599, + "mean_token_accuracy": 0.7558767795562744, + "num_tokens": 318347388.0, + "step": 12767 + }, + { + "epoch": 1.4021524269712278, + "grad_norm": 2.324942111968994, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7228612899780273, + "num_tokens": 318370709.0, + "step": 12768 + }, + { + "epoch": 1.4022622446738415, + "grad_norm": 2.322929620742798, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7347655296325684, + "num_tokens": 318392643.0, + "step": 12769 + }, + { + "epoch": 1.402372062376455, + "grad_norm": 2.2496838569641113, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7242648601531982, + "num_tokens": 318416891.0, + "step": 12770 + }, + { + "epoch": 1.4024818800790688, + "grad_norm": 2.229412794113159, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.710365355014801, + "num_tokens": 318442412.0, + "step": 12771 + }, + { + "epoch": 1.4025916977816824, + "grad_norm": 2.394864082336426, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7503137588500977, + "num_tokens": 318465818.0, + "step": 12772 + }, + { + "epoch": 1.4027015154842961, + "grad_norm": 2.4672200679779053, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.725275456905365, + "num_tokens": 318488594.0, + "step": 12773 + }, + { + "epoch": 1.4028113331869096, + "grad_norm": 2.0264031887054443, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7346463799476624, + "num_tokens": 318517634.0, + "step": 12774 + }, + { + "epoch": 1.4029211508895234, + "grad_norm": 2.2421669960021973, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7313302755355835, + "num_tokens": 318542342.0, + "step": 12775 + }, + { + "epoch": 1.403030968592137, + "grad_norm": 2.31217622756958, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7094416618347168, + "num_tokens": 318567360.0, + "step": 12776 + }, + { + "epoch": 1.4031407862947507, + "grad_norm": 2.448842763900757, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7580941915512085, + "num_tokens": 318588074.0, + "step": 12777 + }, + { + "epoch": 1.4032506039973645, + "grad_norm": 2.1757240295410156, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7167956829071045, + "num_tokens": 318614666.0, + "step": 12778 + }, + { + "epoch": 1.403360421699978, + "grad_norm": 2.0856287479400635, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7099319100379944, + "num_tokens": 318643655.0, + "step": 12779 + }, + { + "epoch": 1.4034702394025917, + "grad_norm": 1.9338055849075317, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6886016130447388, + "num_tokens": 318676988.0, + "step": 12780 + }, + { + "epoch": 1.4035800571052053, + "grad_norm": 1.9680730104446411, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7200655937194824, + "num_tokens": 318707575.0, + "step": 12781 + }, + { + "epoch": 1.403689874807819, + "grad_norm": 2.0033657550811768, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6944934129714966, + "num_tokens": 318736865.0, + "step": 12782 + }, + { + "epoch": 1.4037996925104328, + "grad_norm": 2.438143491744995, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7198032140731812, + "num_tokens": 318759580.0, + "step": 12783 + }, + { + "epoch": 1.4039095102130463, + "grad_norm": 2.1651816368103027, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.6995801329612732, + "num_tokens": 318787866.0, + "step": 12784 + }, + { + "epoch": 1.40401932791566, + "grad_norm": 2.1547043323516846, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.709002673625946, + "num_tokens": 318814529.0, + "step": 12785 + }, + { + "epoch": 1.4041291456182736, + "grad_norm": 2.004718542098999, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.6982240676879883, + "num_tokens": 318845761.0, + "step": 12786 + }, + { + "epoch": 1.4042389633208874, + "grad_norm": 2.4787797927856445, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7407747507095337, + "num_tokens": 318866946.0, + "step": 12787 + }, + { + "epoch": 1.404348781023501, + "grad_norm": 2.432293176651001, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7390613555908203, + "num_tokens": 318888958.0, + "step": 12788 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 2.061018943786621, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.71832275390625, + "num_tokens": 318916253.0, + "step": 12789 + }, + { + "epoch": 1.4045684164287282, + "grad_norm": 2.2511613368988037, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7281656265258789, + "num_tokens": 318942615.0, + "step": 12790 + }, + { + "epoch": 1.404678234131342, + "grad_norm": 2.626035451889038, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7194862365722656, + "num_tokens": 318962254.0, + "step": 12791 + }, + { + "epoch": 1.4047880518339557, + "grad_norm": 2.1172354221343994, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7141000032424927, + "num_tokens": 318990659.0, + "step": 12792 + }, + { + "epoch": 1.4048978695365693, + "grad_norm": 2.472079038619995, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7315483689308167, + "num_tokens": 319012507.0, + "step": 12793 + }, + { + "epoch": 1.405007687239183, + "grad_norm": 2.668112277984619, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7245270013809204, + "num_tokens": 319032682.0, + "step": 12794 + }, + { + "epoch": 1.4051175049417965, + "grad_norm": 2.2748663425445557, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7123057842254639, + "num_tokens": 319058644.0, + "step": 12795 + }, + { + "epoch": 1.4052273226444103, + "grad_norm": 2.2656443119049072, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.721663773059845, + "num_tokens": 319084164.0, + "step": 12796 + }, + { + "epoch": 1.405337140347024, + "grad_norm": 2.532521963119507, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7282921671867371, + "num_tokens": 319104294.0, + "step": 12797 + }, + { + "epoch": 1.4054469580496376, + "grad_norm": 2.636204719543457, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7259257435798645, + "num_tokens": 319124093.0, + "step": 12798 + }, + { + "epoch": 1.4055567757522511, + "grad_norm": 2.2155210971832275, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6984838843345642, + "num_tokens": 319152744.0, + "step": 12799 + }, + { + "epoch": 1.4056665934548649, + "grad_norm": 2.545228958129883, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7360857725143433, + "num_tokens": 319173355.0, + "step": 12800 + }, + { + "epoch": 1.4057764111574786, + "grad_norm": 2.3522565364837646, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7193589806556702, + "num_tokens": 319198708.0, + "step": 12801 + }, + { + "epoch": 1.4058862288600922, + "grad_norm": 2.1750786304473877, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7070976495742798, + "num_tokens": 319226552.0, + "step": 12802 + }, + { + "epoch": 1.405996046562706, + "grad_norm": 2.461954355239868, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7123421430587769, + "num_tokens": 319248795.0, + "step": 12803 + }, + { + "epoch": 1.4061058642653195, + "grad_norm": 2.1469178199768066, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.714927077293396, + "num_tokens": 319276721.0, + "step": 12804 + }, + { + "epoch": 1.4062156819679332, + "grad_norm": 1.9703480005264282, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7258539199829102, + "num_tokens": 319305170.0, + "step": 12805 + }, + { + "epoch": 1.406325499670547, + "grad_norm": 2.484560489654541, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7319682836532593, + "num_tokens": 319326389.0, + "step": 12806 + }, + { + "epoch": 1.4064353173731605, + "grad_norm": 2.6546871662139893, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7378596067428589, + "num_tokens": 319345645.0, + "step": 12807 + }, + { + "epoch": 1.4065451350757743, + "grad_norm": 2.172452449798584, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7351047992706299, + "num_tokens": 319370021.0, + "step": 12808 + }, + { + "epoch": 1.4066549527783878, + "grad_norm": 2.2482314109802246, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7211934924125671, + "num_tokens": 319394430.0, + "step": 12809 + }, + { + "epoch": 1.4067647704810016, + "grad_norm": 2.648056983947754, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7397118210792542, + "num_tokens": 319414546.0, + "step": 12810 + }, + { + "epoch": 1.4068745881836153, + "grad_norm": 2.2217912673950195, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.6943880319595337, + "num_tokens": 319441721.0, + "step": 12811 + }, + { + "epoch": 1.4069844058862289, + "grad_norm": 2.6772923469543457, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.729810893535614, + "num_tokens": 319462368.0, + "step": 12812 + }, + { + "epoch": 1.4070942235888424, + "grad_norm": 2.4316956996917725, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.727075457572937, + "num_tokens": 319483987.0, + "step": 12813 + }, + { + "epoch": 1.4072040412914562, + "grad_norm": 2.317918062210083, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7079102993011475, + "num_tokens": 319511090.0, + "step": 12814 + }, + { + "epoch": 1.40731385899407, + "grad_norm": 2.4581146240234375, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7251428961753845, + "num_tokens": 319534244.0, + "step": 12815 + }, + { + "epoch": 1.4074236766966834, + "grad_norm": 2.1799659729003906, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7164161801338196, + "num_tokens": 319562354.0, + "step": 12816 + }, + { + "epoch": 1.4075334943992972, + "grad_norm": 2.4345850944519043, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.718825101852417, + "num_tokens": 319585404.0, + "step": 12817 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 2.1191189289093018, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7212209701538086, + "num_tokens": 319613042.0, + "step": 12818 + }, + { + "epoch": 1.4077531298045245, + "grad_norm": 2.0696561336517334, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7032659649848938, + "num_tokens": 319644090.0, + "step": 12819 + }, + { + "epoch": 1.4078629475071383, + "grad_norm": 2.4997925758361816, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7297263145446777, + "num_tokens": 319663714.0, + "step": 12820 + }, + { + "epoch": 1.4079727652097518, + "grad_norm": 2.5087549686431885, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7516909837722778, + "num_tokens": 319682896.0, + "step": 12821 + }, + { + "epoch": 1.4080825829123655, + "grad_norm": 2.169508934020996, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7331153154373169, + "num_tokens": 319707774.0, + "step": 12822 + }, + { + "epoch": 1.408192400614979, + "grad_norm": 2.243511199951172, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.744057297706604, + "num_tokens": 319730838.0, + "step": 12823 + }, + { + "epoch": 1.4083022183175928, + "grad_norm": 2.5406086444854736, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7427400350570679, + "num_tokens": 319752302.0, + "step": 12824 + }, + { + "epoch": 1.4084120360202066, + "grad_norm": 2.7490460872650146, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7324378490447998, + "num_tokens": 319770224.0, + "step": 12825 + }, + { + "epoch": 1.4085218537228201, + "grad_norm": 2.196981906890869, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7187223434448242, + "num_tokens": 319795145.0, + "step": 12826 + }, + { + "epoch": 1.4086316714254337, + "grad_norm": 2.447071075439453, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.710045337677002, + "num_tokens": 319817795.0, + "step": 12827 + }, + { + "epoch": 1.4087414891280474, + "grad_norm": 2.2110435962677, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7204979658126831, + "num_tokens": 319844373.0, + "step": 12828 + }, + { + "epoch": 1.4088513068306612, + "grad_norm": 1.9413583278656006, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7090741395950317, + "num_tokens": 319878379.0, + "step": 12829 + }, + { + "epoch": 1.4089611245332747, + "grad_norm": 2.364175319671631, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7397978901863098, + "num_tokens": 319901684.0, + "step": 12830 + }, + { + "epoch": 1.4090709422358885, + "grad_norm": 2.23862624168396, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7063247561454773, + "num_tokens": 319927783.0, + "step": 12831 + }, + { + "epoch": 1.409180759938502, + "grad_norm": 2.3166894912719727, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7206803560256958, + "num_tokens": 319951278.0, + "step": 12832 + }, + { + "epoch": 1.4092905776411158, + "grad_norm": 2.1372992992401123, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.710953414440155, + "num_tokens": 319978659.0, + "step": 12833 + }, + { + "epoch": 1.4094003953437295, + "grad_norm": 2.5416159629821777, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7292649745941162, + "num_tokens": 319998747.0, + "step": 12834 + }, + { + "epoch": 1.409510213046343, + "grad_norm": 2.3560216426849365, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7366572618484497, + "num_tokens": 320021883.0, + "step": 12835 + }, + { + "epoch": 1.4096200307489568, + "grad_norm": 2.204674005508423, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7289870977401733, + "num_tokens": 320047326.0, + "step": 12836 + }, + { + "epoch": 1.4097298484515703, + "grad_norm": 2.4654176235198975, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7195439338684082, + "num_tokens": 320069234.0, + "step": 12837 + }, + { + "epoch": 1.409839666154184, + "grad_norm": 2.012192487716675, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7182562351226807, + "num_tokens": 320099236.0, + "step": 12838 + }, + { + "epoch": 1.4099494838567976, + "grad_norm": 2.2074570655822754, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7471765279769897, + "num_tokens": 320123822.0, + "step": 12839 + }, + { + "epoch": 1.4100593015594114, + "grad_norm": 2.062803030014038, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6994616985321045, + "num_tokens": 320154263.0, + "step": 12840 + }, + { + "epoch": 1.410169119262025, + "grad_norm": 2.3265273571014404, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7309338450431824, + "num_tokens": 320177860.0, + "step": 12841 + }, + { + "epoch": 1.4102789369646387, + "grad_norm": 2.04518723487854, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7031144499778748, + "num_tokens": 320209755.0, + "step": 12842 + }, + { + "epoch": 1.4103887546672524, + "grad_norm": 2.2429542541503906, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7033490538597107, + "num_tokens": 320234314.0, + "step": 12843 + }, + { + "epoch": 1.410498572369866, + "grad_norm": 2.241569757461548, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7044128775596619, + "num_tokens": 320260919.0, + "step": 12844 + }, + { + "epoch": 1.4106083900724797, + "grad_norm": 2.0381150245666504, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7027209997177124, + "num_tokens": 320290384.0, + "step": 12845 + }, + { + "epoch": 1.4107182077750933, + "grad_norm": 2.274075746536255, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7112674713134766, + "num_tokens": 320317297.0, + "step": 12846 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 2.008241653442383, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7106122970581055, + "num_tokens": 320347283.0, + "step": 12847 + }, + { + "epoch": 1.4109378431803208, + "grad_norm": 2.2963058948516846, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.703387975692749, + "num_tokens": 320372942.0, + "step": 12848 + }, + { + "epoch": 1.4110476608829343, + "grad_norm": 2.1756691932678223, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7131116390228271, + "num_tokens": 320401696.0, + "step": 12849 + }, + { + "epoch": 1.4111574785855479, + "grad_norm": 2.1564648151397705, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6944217681884766, + "num_tokens": 320430589.0, + "step": 12850 + }, + { + "epoch": 1.4112672962881616, + "grad_norm": 2.3158180713653564, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7224283814430237, + "num_tokens": 320455049.0, + "step": 12851 + }, + { + "epoch": 1.4113771139907754, + "grad_norm": 2.630323886871338, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7350211143493652, + "num_tokens": 320476802.0, + "step": 12852 + }, + { + "epoch": 1.411486931693389, + "grad_norm": 2.220759868621826, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7142665386199951, + "num_tokens": 320504598.0, + "step": 12853 + }, + { + "epoch": 1.4115967493960027, + "grad_norm": 2.571413993835449, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7471253275871277, + "num_tokens": 320525192.0, + "step": 12854 + }, + { + "epoch": 1.4117065670986162, + "grad_norm": 2.1100871562957764, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7318719625473022, + "num_tokens": 320553311.0, + "step": 12855 + }, + { + "epoch": 1.41181638480123, + "grad_norm": 2.0441091060638428, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7152812480926514, + "num_tokens": 320586338.0, + "step": 12856 + }, + { + "epoch": 1.4119262025038437, + "grad_norm": 1.8975460529327393, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7308551073074341, + "num_tokens": 320619972.0, + "step": 12857 + }, + { + "epoch": 1.4120360202064572, + "grad_norm": 2.4534976482391357, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7355093955993652, + "num_tokens": 320641106.0, + "step": 12858 + }, + { + "epoch": 1.412145837909071, + "grad_norm": 2.437095880508423, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7398486137390137, + "num_tokens": 320663671.0, + "step": 12859 + }, + { + "epoch": 1.4122556556116845, + "grad_norm": 2.0758702754974365, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6889185905456543, + "num_tokens": 320695582.0, + "step": 12860 + }, + { + "epoch": 1.4123654733142983, + "grad_norm": 2.3004672527313232, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7149841785430908, + "num_tokens": 320721162.0, + "step": 12861 + }, + { + "epoch": 1.412475291016912, + "grad_norm": 2.2877116203308105, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7086164951324463, + "num_tokens": 320745971.0, + "step": 12862 + }, + { + "epoch": 1.4125851087195256, + "grad_norm": 2.337592601776123, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7166395783424377, + "num_tokens": 320769509.0, + "step": 12863 + }, + { + "epoch": 1.4126949264221391, + "grad_norm": 2.201446056365967, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7216565608978271, + "num_tokens": 320795886.0, + "step": 12864 + }, + { + "epoch": 1.4128047441247529, + "grad_norm": 2.522726535797119, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7233588695526123, + "num_tokens": 320816809.0, + "step": 12865 + }, + { + "epoch": 1.4129145618273666, + "grad_norm": 2.30737566947937, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7330915927886963, + "num_tokens": 320839660.0, + "step": 12866 + }, + { + "epoch": 1.4130243795299802, + "grad_norm": 2.38739013671875, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7321642637252808, + "num_tokens": 320864379.0, + "step": 12867 + }, + { + "epoch": 1.413134197232594, + "grad_norm": 2.2093677520751953, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7308975458145142, + "num_tokens": 320892260.0, + "step": 12868 + }, + { + "epoch": 1.4132440149352075, + "grad_norm": 1.869795799255371, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7151998281478882, + "num_tokens": 320929373.0, + "step": 12869 + }, + { + "epoch": 1.4133538326378212, + "grad_norm": 1.9708716869354248, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6878253221511841, + "num_tokens": 320963978.0, + "step": 12870 + }, + { + "epoch": 1.413463650340435, + "grad_norm": 2.2840561866760254, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7109569311141968, + "num_tokens": 320989537.0, + "step": 12871 + }, + { + "epoch": 1.4135734680430485, + "grad_norm": 2.557305335998535, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7166210412979126, + "num_tokens": 321009789.0, + "step": 12872 + }, + { + "epoch": 1.4136832857456623, + "grad_norm": 2.3066608905792236, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7249571084976196, + "num_tokens": 321033928.0, + "step": 12873 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 2.3656461238861084, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7531003952026367, + "num_tokens": 321056684.0, + "step": 12874 + }, + { + "epoch": 1.4139029211508896, + "grad_norm": 2.4361164569854736, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7428973913192749, + "num_tokens": 321078294.0, + "step": 12875 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 2.3825955390930176, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.730430543422699, + "num_tokens": 321099856.0, + "step": 12876 + }, + { + "epoch": 1.4141225565561168, + "grad_norm": 1.980739951133728, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7294882535934448, + "num_tokens": 321129389.0, + "step": 12877 + }, + { + "epoch": 1.4142323742587304, + "grad_norm": 2.449402332305908, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7259452939033508, + "num_tokens": 321152755.0, + "step": 12878 + }, + { + "epoch": 1.4143421919613441, + "grad_norm": 2.3660333156585693, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7151079177856445, + "num_tokens": 321177856.0, + "step": 12879 + }, + { + "epoch": 1.414452009663958, + "grad_norm": 2.1278789043426514, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.739149272441864, + "num_tokens": 321203853.0, + "step": 12880 + }, + { + "epoch": 1.4145618273665714, + "grad_norm": 1.877528429031372, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7004590630531311, + "num_tokens": 321237650.0, + "step": 12881 + }, + { + "epoch": 1.4146716450691852, + "grad_norm": 2.1756625175476074, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7135047912597656, + "num_tokens": 321264392.0, + "step": 12882 + }, + { + "epoch": 1.4147814627717987, + "grad_norm": 2.5697689056396484, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7174195647239685, + "num_tokens": 321285263.0, + "step": 12883 + }, + { + "epoch": 1.4148912804744125, + "grad_norm": 1.970572829246521, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7040935754776001, + "num_tokens": 321317728.0, + "step": 12884 + }, + { + "epoch": 1.4150010981770262, + "grad_norm": 2.3872928619384766, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7356243133544922, + "num_tokens": 321341333.0, + "step": 12885 + }, + { + "epoch": 1.4151109158796398, + "grad_norm": 2.133831262588501, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7069824934005737, + "num_tokens": 321370914.0, + "step": 12886 + }, + { + "epoch": 1.4152207335822535, + "grad_norm": 2.3590476512908936, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7213759422302246, + "num_tokens": 321395091.0, + "step": 12887 + }, + { + "epoch": 1.415330551284867, + "grad_norm": 2.132120370864868, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7165228724479675, + "num_tokens": 321422350.0, + "step": 12888 + }, + { + "epoch": 1.4154403689874808, + "grad_norm": 2.071164846420288, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.75636225938797, + "num_tokens": 321450224.0, + "step": 12889 + }, + { + "epoch": 1.4155501866900946, + "grad_norm": 2.3058793544769287, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7315019369125366, + "num_tokens": 321474514.0, + "step": 12890 + }, + { + "epoch": 1.4156600043927081, + "grad_norm": 2.362283945083618, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7379380464553833, + "num_tokens": 321497623.0, + "step": 12891 + }, + { + "epoch": 1.4157698220953217, + "grad_norm": 1.8529078960418701, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7261189222335815, + "num_tokens": 321531809.0, + "step": 12892 + }, + { + "epoch": 1.4158796397979354, + "grad_norm": 2.321854829788208, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7152571678161621, + "num_tokens": 321555384.0, + "step": 12893 + }, + { + "epoch": 1.4159894575005492, + "grad_norm": 2.1338160037994385, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7188624739646912, + "num_tokens": 321583801.0, + "step": 12894 + }, + { + "epoch": 1.4160992752031627, + "grad_norm": 2.4317684173583984, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7091875672340393, + "num_tokens": 321607038.0, + "step": 12895 + }, + { + "epoch": 1.4162090929057765, + "grad_norm": 2.4427528381347656, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7145787477493286, + "num_tokens": 321628998.0, + "step": 12896 + }, + { + "epoch": 1.41631891060839, + "grad_norm": 2.101884603500366, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7087920904159546, + "num_tokens": 321657626.0, + "step": 12897 + }, + { + "epoch": 1.4164287283110037, + "grad_norm": 2.5319671630859375, + "learning_rate": 1e-06, + "loss": 0.8093, + "mean_token_accuracy": 0.7446736097335815, + "num_tokens": 321676816.0, + "step": 12898 + }, + { + "epoch": 1.4165385460136175, + "grad_norm": 2.01841139793396, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7072633504867554, + "num_tokens": 321710164.0, + "step": 12899 + }, + { + "epoch": 1.416648363716231, + "grad_norm": 2.276862859725952, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7020833492279053, + "num_tokens": 321735034.0, + "step": 12900 + }, + { + "epoch": 1.4167581814188448, + "grad_norm": 2.1691551208496094, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.720523476600647, + "num_tokens": 321761823.0, + "step": 12901 + }, + { + "epoch": 1.4168679991214583, + "grad_norm": 2.2299742698669434, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7088167071342468, + "num_tokens": 321789835.0, + "step": 12902 + }, + { + "epoch": 1.416977816824072, + "grad_norm": 2.1353976726531982, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7098861336708069, + "num_tokens": 321818230.0, + "step": 12903 + }, + { + "epoch": 1.4170876345266856, + "grad_norm": 2.426558017730713, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7333714962005615, + "num_tokens": 321839226.0, + "step": 12904 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 2.0924150943756104, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7208729982376099, + "num_tokens": 321866067.0, + "step": 12905 + }, + { + "epoch": 1.417307269931913, + "grad_norm": 2.222541332244873, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7442593574523926, + "num_tokens": 321889793.0, + "step": 12906 + }, + { + "epoch": 1.4174170876345267, + "grad_norm": 2.1857399940490723, + "learning_rate": 1e-06, + "loss": 0.7822, + "mean_token_accuracy": 0.7547136545181274, + "num_tokens": 321916119.0, + "step": 12907 + }, + { + "epoch": 1.4175269053371404, + "grad_norm": 2.364694833755493, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7412309646606445, + "num_tokens": 321936364.0, + "step": 12908 + }, + { + "epoch": 1.417636723039754, + "grad_norm": 2.3489551544189453, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7129780054092407, + "num_tokens": 321960237.0, + "step": 12909 + }, + { + "epoch": 1.4177465407423677, + "grad_norm": 1.9404152631759644, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7170475721359253, + "num_tokens": 321994424.0, + "step": 12910 + }, + { + "epoch": 1.4178563584449813, + "grad_norm": 2.1201226711273193, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7181164622306824, + "num_tokens": 322025389.0, + "step": 12911 + }, + { + "epoch": 1.417966176147595, + "grad_norm": 2.278219223022461, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7097650766372681, + "num_tokens": 322050029.0, + "step": 12912 + }, + { + "epoch": 1.4180759938502088, + "grad_norm": 2.62398362159729, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7103656530380249, + "num_tokens": 322069095.0, + "step": 12913 + }, + { + "epoch": 1.4181858115528223, + "grad_norm": 2.226426601409912, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.710370659828186, + "num_tokens": 322098511.0, + "step": 12914 + }, + { + "epoch": 1.4182956292554358, + "grad_norm": 2.12825345993042, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7142167091369629, + "num_tokens": 322126161.0, + "step": 12915 + }, + { + "epoch": 1.4184054469580496, + "grad_norm": 2.397887706756592, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7251191139221191, + "num_tokens": 322147997.0, + "step": 12916 + }, + { + "epoch": 1.4185152646606634, + "grad_norm": 2.290393114089966, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7326797246932983, + "num_tokens": 322172569.0, + "step": 12917 + }, + { + "epoch": 1.418625082363277, + "grad_norm": 2.281663179397583, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7159674167633057, + "num_tokens": 322198801.0, + "step": 12918 + }, + { + "epoch": 1.4187349000658906, + "grad_norm": 2.3386898040771484, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7244171500205994, + "num_tokens": 322222191.0, + "step": 12919 + }, + { + "epoch": 1.4188447177685042, + "grad_norm": 2.4109315872192383, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7429802417755127, + "num_tokens": 322245585.0, + "step": 12920 + }, + { + "epoch": 1.418954535471118, + "grad_norm": 2.084678888320923, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7200421094894409, + "num_tokens": 322273437.0, + "step": 12921 + }, + { + "epoch": 1.4190643531737317, + "grad_norm": 2.064307689666748, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7346988916397095, + "num_tokens": 322300438.0, + "step": 12922 + }, + { + "epoch": 1.4191741708763452, + "grad_norm": 2.4476563930511475, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7257952690124512, + "num_tokens": 322323563.0, + "step": 12923 + }, + { + "epoch": 1.419283988578959, + "grad_norm": 2.262599468231201, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6917433738708496, + "num_tokens": 322350372.0, + "step": 12924 + }, + { + "epoch": 1.4193938062815725, + "grad_norm": 2.11230731010437, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.737958550453186, + "num_tokens": 322376460.0, + "step": 12925 + }, + { + "epoch": 1.4195036239841863, + "grad_norm": 2.3826096057891846, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.721267819404602, + "num_tokens": 322399397.0, + "step": 12926 + }, + { + "epoch": 1.4196134416868, + "grad_norm": 2.1270737648010254, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7186110615730286, + "num_tokens": 322428109.0, + "step": 12927 + }, + { + "epoch": 1.4197232593894136, + "grad_norm": 2.187012195587158, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7231355309486389, + "num_tokens": 322454523.0, + "step": 12928 + }, + { + "epoch": 1.419833077092027, + "grad_norm": 2.147937774658203, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7144967317581177, + "num_tokens": 322480042.0, + "step": 12929 + }, + { + "epoch": 1.4199428947946409, + "grad_norm": 2.0490217208862305, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7184854745864868, + "num_tokens": 322508958.0, + "step": 12930 + }, + { + "epoch": 1.4200527124972546, + "grad_norm": 2.1666433811187744, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7302238941192627, + "num_tokens": 322533873.0, + "step": 12931 + }, + { + "epoch": 1.4201625301998682, + "grad_norm": 2.5039618015289307, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.724005937576294, + "num_tokens": 322554223.0, + "step": 12932 + }, + { + "epoch": 1.420272347902482, + "grad_norm": 2.0733790397644043, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6843291521072388, + "num_tokens": 322585599.0, + "step": 12933 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 2.233588933944702, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7176980972290039, + "num_tokens": 322612618.0, + "step": 12934 + }, + { + "epoch": 1.4204919833077092, + "grad_norm": 2.458237409591675, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7209475040435791, + "num_tokens": 322634199.0, + "step": 12935 + }, + { + "epoch": 1.420601801010323, + "grad_norm": 2.656163454055786, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7366371154785156, + "num_tokens": 322652564.0, + "step": 12936 + }, + { + "epoch": 1.4207116187129365, + "grad_norm": 2.2092058658599854, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7141292691230774, + "num_tokens": 322682236.0, + "step": 12937 + }, + { + "epoch": 1.4208214364155503, + "grad_norm": 3.0857067108154297, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7127236723899841, + "num_tokens": 322698908.0, + "step": 12938 + }, + { + "epoch": 1.4209312541181638, + "grad_norm": 2.6180872917175293, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7142115831375122, + "num_tokens": 322720016.0, + "step": 12939 + }, + { + "epoch": 1.4210410718207775, + "grad_norm": 2.1522436141967773, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7047447562217712, + "num_tokens": 322746459.0, + "step": 12940 + }, + { + "epoch": 1.4211508895233913, + "grad_norm": 2.3356518745422363, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7089693546295166, + "num_tokens": 322772801.0, + "step": 12941 + }, + { + "epoch": 1.4212607072260048, + "grad_norm": 2.321972608566284, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7190441489219666, + "num_tokens": 322797061.0, + "step": 12942 + }, + { + "epoch": 1.4213705249286184, + "grad_norm": 2.218838691711426, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7274319529533386, + "num_tokens": 322824327.0, + "step": 12943 + }, + { + "epoch": 1.4214803426312321, + "grad_norm": 2.1278035640716553, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.739425778388977, + "num_tokens": 322851725.0, + "step": 12944 + }, + { + "epoch": 1.4215901603338459, + "grad_norm": 2.3131651878356934, + "learning_rate": 1e-06, + "loss": 0.7357, + "mean_token_accuracy": 0.7565563917160034, + "num_tokens": 322873835.0, + "step": 12945 + }, + { + "epoch": 1.4216999780364594, + "grad_norm": 2.454481601715088, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6947981119155884, + "num_tokens": 322897671.0, + "step": 12946 + }, + { + "epoch": 1.4218097957390732, + "grad_norm": 2.398369789123535, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7136383056640625, + "num_tokens": 322921746.0, + "step": 12947 + }, + { + "epoch": 1.4219196134416867, + "grad_norm": 2.4503555297851562, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7351770401000977, + "num_tokens": 322943880.0, + "step": 12948 + }, + { + "epoch": 1.4220294311443005, + "grad_norm": 2.341862440109253, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7061183452606201, + "num_tokens": 322971082.0, + "step": 12949 + }, + { + "epoch": 1.4221392488469142, + "grad_norm": 2.5077686309814453, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7030378580093384, + "num_tokens": 322993797.0, + "step": 12950 + }, + { + "epoch": 1.4222490665495278, + "grad_norm": 2.1823716163635254, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7234221696853638, + "num_tokens": 323018447.0, + "step": 12951 + }, + { + "epoch": 1.4223588842521415, + "grad_norm": 2.4558894634246826, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7282060384750366, + "num_tokens": 323039109.0, + "step": 12952 + }, + { + "epoch": 1.422468701954755, + "grad_norm": 2.4647815227508545, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7211731672286987, + "num_tokens": 323062793.0, + "step": 12953 + }, + { + "epoch": 1.4225785196573688, + "grad_norm": 2.696485757827759, + "learning_rate": 1e-06, + "loss": 0.7955, + "mean_token_accuracy": 0.7436560988426208, + "num_tokens": 323080623.0, + "step": 12954 + }, + { + "epoch": 1.4226883373599823, + "grad_norm": 2.496826648712158, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7067639827728271, + "num_tokens": 323103097.0, + "step": 12955 + }, + { + "epoch": 1.422798155062596, + "grad_norm": 2.295555353164673, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7126757502555847, + "num_tokens": 323130986.0, + "step": 12956 + }, + { + "epoch": 1.4229079727652096, + "grad_norm": 2.318715810775757, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7280694246292114, + "num_tokens": 323155462.0, + "step": 12957 + }, + { + "epoch": 1.4230177904678234, + "grad_norm": 2.290114641189575, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7058200240135193, + "num_tokens": 323180467.0, + "step": 12958 + }, + { + "epoch": 1.4231276081704372, + "grad_norm": 2.4194114208221436, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7227113842964172, + "num_tokens": 323203081.0, + "step": 12959 + }, + { + "epoch": 1.4232374258730507, + "grad_norm": 2.5008556842803955, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7148492932319641, + "num_tokens": 323224945.0, + "step": 12960 + }, + { + "epoch": 1.4233472435756644, + "grad_norm": 2.2005558013916016, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7161224484443665, + "num_tokens": 323251740.0, + "step": 12961 + }, + { + "epoch": 1.423457061278278, + "grad_norm": 2.3163294792175293, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7283177971839905, + "num_tokens": 323276303.0, + "step": 12962 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 2.3872969150543213, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7329190969467163, + "num_tokens": 323298675.0, + "step": 12963 + }, + { + "epoch": 1.4236766966835055, + "grad_norm": 2.3391811847686768, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7387984395027161, + "num_tokens": 323321138.0, + "step": 12964 + }, + { + "epoch": 1.423786514386119, + "grad_norm": 2.427962303161621, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7110877633094788, + "num_tokens": 323345141.0, + "step": 12965 + }, + { + "epoch": 1.4238963320887328, + "grad_norm": 2.5141220092773438, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.743901789188385, + "num_tokens": 323366429.0, + "step": 12966 + }, + { + "epoch": 1.4240061497913463, + "grad_norm": 2.411508798599243, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7260996699333191, + "num_tokens": 323390332.0, + "step": 12967 + }, + { + "epoch": 1.42411596749396, + "grad_norm": 2.1901323795318604, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7121483683586121, + "num_tokens": 323417934.0, + "step": 12968 + }, + { + "epoch": 1.4242257851965736, + "grad_norm": 2.2759244441986084, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7100921273231506, + "num_tokens": 323442453.0, + "step": 12969 + }, + { + "epoch": 1.4243356028991874, + "grad_norm": 2.158097267150879, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7008504867553711, + "num_tokens": 323468742.0, + "step": 12970 + }, + { + "epoch": 1.424445420601801, + "grad_norm": 2.4877915382385254, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.702734112739563, + "num_tokens": 323491355.0, + "step": 12971 + }, + { + "epoch": 1.4245552383044147, + "grad_norm": 2.3192827701568604, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7223654389381409, + "num_tokens": 323513528.0, + "step": 12972 + }, + { + "epoch": 1.4246650560070284, + "grad_norm": 2.5925192832946777, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7274534106254578, + "num_tokens": 323533601.0, + "step": 12973 + }, + { + "epoch": 1.424774873709642, + "grad_norm": 1.9649286270141602, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7180901765823364, + "num_tokens": 323563919.0, + "step": 12974 + }, + { + "epoch": 1.4248846914122557, + "grad_norm": 2.455784797668457, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7235332131385803, + "num_tokens": 323588620.0, + "step": 12975 + }, + { + "epoch": 1.4249945091148692, + "grad_norm": 2.5720624923706055, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7047104835510254, + "num_tokens": 323612269.0, + "step": 12976 + }, + { + "epoch": 1.425104326817483, + "grad_norm": 2.5013020038604736, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.715451717376709, + "num_tokens": 323633989.0, + "step": 12977 + }, + { + "epoch": 1.4252141445200968, + "grad_norm": 2.3402583599090576, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7116668224334717, + "num_tokens": 323658023.0, + "step": 12978 + }, + { + "epoch": 1.4253239622227103, + "grad_norm": 2.5609195232391357, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7201710939407349, + "num_tokens": 323678468.0, + "step": 12979 + }, + { + "epoch": 1.4254337799253238, + "grad_norm": 2.5255320072174072, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7341616153717041, + "num_tokens": 323700477.0, + "step": 12980 + }, + { + "epoch": 1.4255435976279376, + "grad_norm": 2.504702091217041, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7132712602615356, + "num_tokens": 323721842.0, + "step": 12981 + }, + { + "epoch": 1.4256534153305513, + "grad_norm": 2.5766830444335938, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7330138683319092, + "num_tokens": 323742511.0, + "step": 12982 + }, + { + "epoch": 1.4257632330331649, + "grad_norm": 2.1960132122039795, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7248406410217285, + "num_tokens": 323769199.0, + "step": 12983 + }, + { + "epoch": 1.4258730507357786, + "grad_norm": 2.255589246749878, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7243099212646484, + "num_tokens": 323796202.0, + "step": 12984 + }, + { + "epoch": 1.4259828684383922, + "grad_norm": 2.224156141281128, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6964879035949707, + "num_tokens": 323823527.0, + "step": 12985 + }, + { + "epoch": 1.426092686141006, + "grad_norm": 2.0860023498535156, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7092286348342896, + "num_tokens": 323854545.0, + "step": 12986 + }, + { + "epoch": 1.4262025038436197, + "grad_norm": 2.3071255683898926, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7366059422492981, + "num_tokens": 323879179.0, + "step": 12987 + }, + { + "epoch": 1.4263123215462332, + "grad_norm": 2.282033920288086, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7317146062850952, + "num_tokens": 323904039.0, + "step": 12988 + }, + { + "epoch": 1.426422139248847, + "grad_norm": 2.605433702468872, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7288732528686523, + "num_tokens": 323924662.0, + "step": 12989 + }, + { + "epoch": 1.4265319569514605, + "grad_norm": 2.099837064743042, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7024693489074707, + "num_tokens": 323954122.0, + "step": 12990 + }, + { + "epoch": 1.4266417746540743, + "grad_norm": 2.4376003742218018, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.6958922147750854, + "num_tokens": 323978881.0, + "step": 12991 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 2.353259325027466, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.729540228843689, + "num_tokens": 324001841.0, + "step": 12992 + }, + { + "epoch": 1.4268614100593016, + "grad_norm": 2.23374342918396, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7327661514282227, + "num_tokens": 324026586.0, + "step": 12993 + }, + { + "epoch": 1.426971227761915, + "grad_norm": 2.2141594886779785, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7124438881874084, + "num_tokens": 324052002.0, + "step": 12994 + }, + { + "epoch": 1.4270810454645289, + "grad_norm": 2.721994638442993, + "learning_rate": 1e-06, + "loss": 0.788, + "mean_token_accuracy": 0.7446228265762329, + "num_tokens": 324068565.0, + "step": 12995 + }, + { + "epoch": 1.4271908631671426, + "grad_norm": 2.4095842838287354, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7302051782608032, + "num_tokens": 324091781.0, + "step": 12996 + }, + { + "epoch": 1.4273006808697561, + "grad_norm": 2.296741247177124, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6768820881843567, + "num_tokens": 324119364.0, + "step": 12997 + }, + { + "epoch": 1.42741049857237, + "grad_norm": 2.1794309616088867, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7193459272384644, + "num_tokens": 324149006.0, + "step": 12998 + }, + { + "epoch": 1.4275203162749834, + "grad_norm": 2.423670530319214, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7064957022666931, + "num_tokens": 324172595.0, + "step": 12999 + }, + { + "epoch": 1.4276301339775972, + "grad_norm": 2.2764153480529785, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7293863296508789, + "num_tokens": 324196592.0, + "step": 13000 + }, + { + "epoch": 1.427739951680211, + "grad_norm": 2.1074275970458984, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7161529064178467, + "num_tokens": 324224115.0, + "step": 13001 + }, + { + "epoch": 1.4278497693828245, + "grad_norm": 2.47461199760437, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7386935353279114, + "num_tokens": 324244185.0, + "step": 13002 + }, + { + "epoch": 1.4279595870854382, + "grad_norm": 2.2575883865356445, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7289642095565796, + "num_tokens": 324267626.0, + "step": 13003 + }, + { + "epoch": 1.4280694047880518, + "grad_norm": 2.5203728675842285, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7009677886962891, + "num_tokens": 324290282.0, + "step": 13004 + }, + { + "epoch": 1.4281792224906655, + "grad_norm": 2.109266996383667, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7104690074920654, + "num_tokens": 324317338.0, + "step": 13005 + }, + { + "epoch": 1.4282890401932793, + "grad_norm": 2.3532509803771973, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7052333950996399, + "num_tokens": 324342461.0, + "step": 13006 + }, + { + "epoch": 1.4283988578958928, + "grad_norm": 2.014049530029297, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7124826908111572, + "num_tokens": 324372764.0, + "step": 13007 + }, + { + "epoch": 1.4285086755985064, + "grad_norm": 2.3505094051361084, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7235180735588074, + "num_tokens": 324395653.0, + "step": 13008 + }, + { + "epoch": 1.4286184933011201, + "grad_norm": 2.1524150371551514, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7136865258216858, + "num_tokens": 324423686.0, + "step": 13009 + }, + { + "epoch": 1.4287283110037339, + "grad_norm": 2.2826907634735107, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.6992823481559753, + "num_tokens": 324449804.0, + "step": 13010 + }, + { + "epoch": 1.4288381287063474, + "grad_norm": 2.195990562438965, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7298752069473267, + "num_tokens": 324476432.0, + "step": 13011 + }, + { + "epoch": 1.4289479464089612, + "grad_norm": 2.205540657043457, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7124090790748596, + "num_tokens": 324501781.0, + "step": 13012 + }, + { + "epoch": 1.4290577641115747, + "grad_norm": 2.1109488010406494, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6972206234931946, + "num_tokens": 324531742.0, + "step": 13013 + }, + { + "epoch": 1.4291675818141885, + "grad_norm": 2.3629090785980225, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7079563140869141, + "num_tokens": 324556476.0, + "step": 13014 + }, + { + "epoch": 1.4292773995168022, + "grad_norm": 2.379772186279297, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7150753140449524, + "num_tokens": 324578842.0, + "step": 13015 + }, + { + "epoch": 1.4293872172194158, + "grad_norm": 2.359920024871826, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7168314456939697, + "num_tokens": 324604149.0, + "step": 13016 + }, + { + "epoch": 1.4294970349220295, + "grad_norm": 2.1861865520477295, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7095855474472046, + "num_tokens": 324632866.0, + "step": 13017 + }, + { + "epoch": 1.429606852624643, + "grad_norm": 2.529937267303467, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7342379093170166, + "num_tokens": 324653459.0, + "step": 13018 + }, + { + "epoch": 1.4297166703272568, + "grad_norm": 2.468496561050415, + "learning_rate": 1e-06, + "loss": 0.7611, + "mean_token_accuracy": 0.7554863095283508, + "num_tokens": 324673893.0, + "step": 13019 + }, + { + "epoch": 1.4298264880298703, + "grad_norm": 2.0574285984039307, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7178329229354858, + "num_tokens": 324702884.0, + "step": 13020 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 2.480468511581421, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.722221851348877, + "num_tokens": 324725659.0, + "step": 13021 + }, + { + "epoch": 1.4300461234350976, + "grad_norm": 2.1096222400665283, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7203729152679443, + "num_tokens": 324754856.0, + "step": 13022 + }, + { + "epoch": 1.4301559411377114, + "grad_norm": 2.333118200302124, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.702549159526825, + "num_tokens": 324780767.0, + "step": 13023 + }, + { + "epoch": 1.4302657588403251, + "grad_norm": 2.4380695819854736, + "learning_rate": 1e-06, + "loss": 0.7541, + "mean_token_accuracy": 0.7589259147644043, + "num_tokens": 324800759.0, + "step": 13024 + }, + { + "epoch": 1.4303755765429387, + "grad_norm": 2.411292314529419, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7212182879447937, + "num_tokens": 324824252.0, + "step": 13025 + }, + { + "epoch": 1.4304853942455524, + "grad_norm": 2.136054277420044, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7130626440048218, + "num_tokens": 324852146.0, + "step": 13026 + }, + { + "epoch": 1.430595211948166, + "grad_norm": 2.2819600105285645, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7360079288482666, + "num_tokens": 324876067.0, + "step": 13027 + }, + { + "epoch": 1.4307050296507797, + "grad_norm": 2.20251727104187, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7263404130935669, + "num_tokens": 324903308.0, + "step": 13028 + }, + { + "epoch": 1.4308148473533935, + "grad_norm": 2.0268328189849854, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6963058114051819, + "num_tokens": 324935906.0, + "step": 13029 + }, + { + "epoch": 1.430924665056007, + "grad_norm": 2.1562955379486084, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7155442237854004, + "num_tokens": 324963301.0, + "step": 13030 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 2.3895792961120605, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7239488959312439, + "num_tokens": 324986417.0, + "step": 13031 + }, + { + "epoch": 1.4311443004612343, + "grad_norm": 2.110675096511841, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7337254285812378, + "num_tokens": 325013054.0, + "step": 13032 + }, + { + "epoch": 1.431254118163848, + "grad_norm": 2.163364887237549, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7060420513153076, + "num_tokens": 325040141.0, + "step": 13033 + }, + { + "epoch": 1.4313639358664616, + "grad_norm": 2.2483367919921875, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7083084583282471, + "num_tokens": 325064005.0, + "step": 13034 + }, + { + "epoch": 1.4314737535690754, + "grad_norm": 2.4708027839660645, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7227916717529297, + "num_tokens": 325087165.0, + "step": 13035 + }, + { + "epoch": 1.431583571271689, + "grad_norm": 2.4788577556610107, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7297636270523071, + "num_tokens": 325109723.0, + "step": 13036 + }, + { + "epoch": 1.4316933889743026, + "grad_norm": 2.0937252044677734, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7175351977348328, + "num_tokens": 325138480.0, + "step": 13037 + }, + { + "epoch": 1.4318032066769164, + "grad_norm": 2.1219561100006104, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7379486560821533, + "num_tokens": 325166826.0, + "step": 13038 + }, + { + "epoch": 1.43191302437953, + "grad_norm": 2.228041410446167, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6974509358406067, + "num_tokens": 325193123.0, + "step": 13039 + }, + { + "epoch": 1.4320228420821437, + "grad_norm": 2.1109297275543213, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7306517362594604, + "num_tokens": 325220700.0, + "step": 13040 + }, + { + "epoch": 1.4321326597847572, + "grad_norm": 2.2156965732574463, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6950247287750244, + "num_tokens": 325249370.0, + "step": 13041 + }, + { + "epoch": 1.432242477487371, + "grad_norm": 2.1970431804656982, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7201824188232422, + "num_tokens": 325276993.0, + "step": 13042 + }, + { + "epoch": 1.4323522951899847, + "grad_norm": 2.1241488456726074, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7361430525779724, + "num_tokens": 325302757.0, + "step": 13043 + }, + { + "epoch": 1.4324621128925983, + "grad_norm": 2.361414670944214, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.713942289352417, + "num_tokens": 325326499.0, + "step": 13044 + }, + { + "epoch": 1.4325719305952118, + "grad_norm": 2.3622748851776123, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7229140996932983, + "num_tokens": 325348435.0, + "step": 13045 + }, + { + "epoch": 1.4326817482978256, + "grad_norm": 1.7831523418426514, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7194987535476685, + "num_tokens": 325383464.0, + "step": 13046 + }, + { + "epoch": 1.4327915660004393, + "grad_norm": 2.1867871284484863, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.70550137758255, + "num_tokens": 325408625.0, + "step": 13047 + }, + { + "epoch": 1.4329013837030529, + "grad_norm": 2.3579230308532715, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7362200021743774, + "num_tokens": 325432490.0, + "step": 13048 + }, + { + "epoch": 1.4330112014056666, + "grad_norm": 2.6004891395568848, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7167043685913086, + "num_tokens": 325453702.0, + "step": 13049 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 1.891776442527771, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7231351733207703, + "num_tokens": 325488168.0, + "step": 13050 + }, + { + "epoch": 1.433230836810894, + "grad_norm": 2.411717176437378, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7272463440895081, + "num_tokens": 325511035.0, + "step": 13051 + }, + { + "epoch": 1.4333406545135077, + "grad_norm": 2.2849738597869873, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7114243507385254, + "num_tokens": 325535433.0, + "step": 13052 + }, + { + "epoch": 1.4334504722161212, + "grad_norm": 2.3923287391662598, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7255908846855164, + "num_tokens": 325559183.0, + "step": 13053 + }, + { + "epoch": 1.433560289918735, + "grad_norm": 2.6529228687286377, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7382696270942688, + "num_tokens": 325577315.0, + "step": 13054 + }, + { + "epoch": 1.4336701076213485, + "grad_norm": 2.419956684112549, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7314155101776123, + "num_tokens": 325599318.0, + "step": 13055 + }, + { + "epoch": 1.4337799253239623, + "grad_norm": 2.3510019779205322, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.710071325302124, + "num_tokens": 325623526.0, + "step": 13056 + }, + { + "epoch": 1.433889743026576, + "grad_norm": 2.321091413497925, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7238607406616211, + "num_tokens": 325646672.0, + "step": 13057 + }, + { + "epoch": 1.4339995607291895, + "grad_norm": 2.3204407691955566, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7178738117218018, + "num_tokens": 325670503.0, + "step": 13058 + }, + { + "epoch": 1.434109378431803, + "grad_norm": 2.2967746257781982, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7159677743911743, + "num_tokens": 325695374.0, + "step": 13059 + }, + { + "epoch": 1.4342191961344168, + "grad_norm": 2.412609815597534, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.706718385219574, + "num_tokens": 325721169.0, + "step": 13060 + }, + { + "epoch": 1.4343290138370306, + "grad_norm": 2.4450700283050537, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7179844379425049, + "num_tokens": 325745470.0, + "step": 13061 + }, + { + "epoch": 1.4344388315396441, + "grad_norm": 2.500845193862915, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7255350351333618, + "num_tokens": 325769594.0, + "step": 13062 + }, + { + "epoch": 1.434548649242258, + "grad_norm": 2.572269916534424, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7074438333511353, + "num_tokens": 325791815.0, + "step": 13063 + }, + { + "epoch": 1.4346584669448714, + "grad_norm": 2.5605576038360596, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7380260229110718, + "num_tokens": 325809848.0, + "step": 13064 + }, + { + "epoch": 1.4347682846474852, + "grad_norm": 2.5269858837127686, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.721548855304718, + "num_tokens": 325831265.0, + "step": 13065 + }, + { + "epoch": 1.434878102350099, + "grad_norm": 2.3187544345855713, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7219987511634827, + "num_tokens": 325856115.0, + "step": 13066 + }, + { + "epoch": 1.4349879200527125, + "grad_norm": 2.3505759239196777, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7093927264213562, + "num_tokens": 325878789.0, + "step": 13067 + }, + { + "epoch": 1.4350977377553262, + "grad_norm": 2.288752317428589, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7215591073036194, + "num_tokens": 325902826.0, + "step": 13068 + }, + { + "epoch": 1.4352075554579398, + "grad_norm": 2.8616604804992676, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7483059167861938, + "num_tokens": 325921404.0, + "step": 13069 + }, + { + "epoch": 1.4353173731605535, + "grad_norm": 2.0439186096191406, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7304419279098511, + "num_tokens": 325950643.0, + "step": 13070 + }, + { + "epoch": 1.4354271908631673, + "grad_norm": 2.1358089447021484, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7127853631973267, + "num_tokens": 325980970.0, + "step": 13071 + }, + { + "epoch": 1.4355370085657808, + "grad_norm": 2.4267241954803467, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7197032570838928, + "num_tokens": 326004262.0, + "step": 13072 + }, + { + "epoch": 1.4356468262683943, + "grad_norm": 2.713552713394165, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7295302152633667, + "num_tokens": 326024033.0, + "step": 13073 + }, + { + "epoch": 1.435756643971008, + "grad_norm": 2.3778114318847656, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7365349531173706, + "num_tokens": 326044616.0, + "step": 13074 + }, + { + "epoch": 1.4358664616736219, + "grad_norm": 2.2594969272613525, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7121864557266235, + "num_tokens": 326070323.0, + "step": 13075 + }, + { + "epoch": 1.4359762793762354, + "grad_norm": 2.0294759273529053, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7530932426452637, + "num_tokens": 326100628.0, + "step": 13076 + }, + { + "epoch": 1.4360860970788492, + "grad_norm": 2.0486466884613037, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6897463798522949, + "num_tokens": 326131228.0, + "step": 13077 + }, + { + "epoch": 1.4361959147814627, + "grad_norm": 2.596902370452881, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7302406430244446, + "num_tokens": 326152358.0, + "step": 13078 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 2.189788341522217, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7079018354415894, + "num_tokens": 326178536.0, + "step": 13079 + }, + { + "epoch": 1.4364155501866902, + "grad_norm": 2.649786949157715, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7321941256523132, + "num_tokens": 326197984.0, + "step": 13080 + }, + { + "epoch": 1.4365253678893037, + "grad_norm": 2.7359955310821533, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7328908443450928, + "num_tokens": 326217051.0, + "step": 13081 + }, + { + "epoch": 1.4366351855919175, + "grad_norm": 2.627964735031128, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7238528728485107, + "num_tokens": 326235505.0, + "step": 13082 + }, + { + "epoch": 1.436745003294531, + "grad_norm": 2.3710122108459473, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7143478393554688, + "num_tokens": 326260647.0, + "step": 13083 + }, + { + "epoch": 1.4368548209971448, + "grad_norm": 2.1753926277160645, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7080515623092651, + "num_tokens": 326288420.0, + "step": 13084 + }, + { + "epoch": 1.4369646386997583, + "grad_norm": 2.2264585494995117, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7259777784347534, + "num_tokens": 326313992.0, + "step": 13085 + }, + { + "epoch": 1.437074456402372, + "grad_norm": 2.379854917526245, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7214435935020447, + "num_tokens": 326337833.0, + "step": 13086 + }, + { + "epoch": 1.4371842741049856, + "grad_norm": 2.5214483737945557, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7194121479988098, + "num_tokens": 326358639.0, + "step": 13087 + }, + { + "epoch": 1.4372940918075994, + "grad_norm": 2.578136444091797, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7066439390182495, + "num_tokens": 326378704.0, + "step": 13088 + }, + { + "epoch": 1.4374039095102131, + "grad_norm": 2.227250337600708, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7310727834701538, + "num_tokens": 326404235.0, + "step": 13089 + }, + { + "epoch": 1.4375137272128267, + "grad_norm": 2.2646970748901367, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7315623760223389, + "num_tokens": 326430785.0, + "step": 13090 + }, + { + "epoch": 1.4376235449154404, + "grad_norm": 2.421529531478882, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7377890348434448, + "num_tokens": 326453333.0, + "step": 13091 + }, + { + "epoch": 1.437733362618054, + "grad_norm": 2.47902512550354, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7197258472442627, + "num_tokens": 326474383.0, + "step": 13092 + }, + { + "epoch": 1.4378431803206677, + "grad_norm": 2.4813120365142822, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7043360471725464, + "num_tokens": 326495366.0, + "step": 13093 + }, + { + "epoch": 1.4379529980232815, + "grad_norm": 2.232849359512329, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7213740348815918, + "num_tokens": 326520126.0, + "step": 13094 + }, + { + "epoch": 1.438062815725895, + "grad_norm": 2.2000508308410645, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7327481508255005, + "num_tokens": 326547267.0, + "step": 13095 + }, + { + "epoch": 1.4381726334285085, + "grad_norm": 2.339418888092041, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7245160341262817, + "num_tokens": 326570949.0, + "step": 13096 + }, + { + "epoch": 1.4382824511311223, + "grad_norm": 2.162688732147217, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7043423056602478, + "num_tokens": 326600202.0, + "step": 13097 + }, + { + "epoch": 1.438392268833736, + "grad_norm": 2.4619789123535156, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.725492000579834, + "num_tokens": 326622492.0, + "step": 13098 + }, + { + "epoch": 1.4385020865363496, + "grad_norm": 2.830167770385742, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7281577587127686, + "num_tokens": 326641626.0, + "step": 13099 + }, + { + "epoch": 1.4386119042389633, + "grad_norm": 2.3659555912017822, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7282301187515259, + "num_tokens": 326664513.0, + "step": 13100 + }, + { + "epoch": 1.4387217219415769, + "grad_norm": 2.3666234016418457, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7259764671325684, + "num_tokens": 326687328.0, + "step": 13101 + }, + { + "epoch": 1.4388315396441906, + "grad_norm": 2.1846799850463867, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7187254428863525, + "num_tokens": 326715995.0, + "step": 13102 + }, + { + "epoch": 1.4389413573468044, + "grad_norm": 2.283261299133301, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7159703373908997, + "num_tokens": 326740411.0, + "step": 13103 + }, + { + "epoch": 1.439051175049418, + "grad_norm": 1.9400382041931152, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7399533987045288, + "num_tokens": 326771311.0, + "step": 13104 + }, + { + "epoch": 1.4391609927520317, + "grad_norm": 1.9087908267974854, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7396999001502991, + "num_tokens": 326803047.0, + "step": 13105 + }, + { + "epoch": 1.4392708104546452, + "grad_norm": 1.9475641250610352, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7122679948806763, + "num_tokens": 326836137.0, + "step": 13106 + }, + { + "epoch": 1.439380628157259, + "grad_norm": 2.1936495304107666, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.722017228603363, + "num_tokens": 326863573.0, + "step": 13107 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 2.0387046337127686, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.7013736367225647, + "num_tokens": 326892826.0, + "step": 13108 + }, + { + "epoch": 1.4396002635624863, + "grad_norm": 2.1168367862701416, + "learning_rate": 1e-06, + "loss": 1.0806, + "mean_token_accuracy": 0.6795974969863892, + "num_tokens": 326921017.0, + "step": 13109 + }, + { + "epoch": 1.4397100812650998, + "grad_norm": 2.2358481884002686, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7082980871200562, + "num_tokens": 326946915.0, + "step": 13110 + }, + { + "epoch": 1.4398198989677136, + "grad_norm": 2.1933021545410156, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7377785444259644, + "num_tokens": 326970687.0, + "step": 13111 + }, + { + "epoch": 1.4399297166703273, + "grad_norm": 2.181034564971924, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7169657945632935, + "num_tokens": 326998755.0, + "step": 13112 + }, + { + "epoch": 1.4400395343729409, + "grad_norm": 2.273974895477295, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7150475382804871, + "num_tokens": 327023520.0, + "step": 13113 + }, + { + "epoch": 1.4401493520755546, + "grad_norm": 2.389211416244507, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7138707637786865, + "num_tokens": 327048112.0, + "step": 13114 + }, + { + "epoch": 1.4402591697781681, + "grad_norm": 2.1182665824890137, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7103135585784912, + "num_tokens": 327075482.0, + "step": 13115 + }, + { + "epoch": 1.440368987480782, + "grad_norm": 2.609187602996826, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.732446551322937, + "num_tokens": 327095782.0, + "step": 13116 + }, + { + "epoch": 1.4404788051833957, + "grad_norm": 2.4078996181488037, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7373510599136353, + "num_tokens": 327117591.0, + "step": 13117 + }, + { + "epoch": 1.4405886228860092, + "grad_norm": 2.20465087890625, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7196018695831299, + "num_tokens": 327142387.0, + "step": 13118 + }, + { + "epoch": 1.440698440588623, + "grad_norm": 2.2508575916290283, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6967819929122925, + "num_tokens": 327171163.0, + "step": 13119 + }, + { + "epoch": 1.4408082582912365, + "grad_norm": 2.584847927093506, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7219062447547913, + "num_tokens": 327192240.0, + "step": 13120 + }, + { + "epoch": 1.4409180759938502, + "grad_norm": 2.318570137023926, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7178231477737427, + "num_tokens": 327216474.0, + "step": 13121 + }, + { + "epoch": 1.441027893696464, + "grad_norm": 2.271207332611084, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7137703895568848, + "num_tokens": 327241311.0, + "step": 13122 + }, + { + "epoch": 1.4411377113990775, + "grad_norm": 2.503610610961914, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7306891083717346, + "num_tokens": 327261879.0, + "step": 13123 + }, + { + "epoch": 1.441247529101691, + "grad_norm": 2.152703285217285, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7123793363571167, + "num_tokens": 327288701.0, + "step": 13124 + }, + { + "epoch": 1.4413573468043048, + "grad_norm": 2.3297224044799805, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7015225887298584, + "num_tokens": 327313088.0, + "step": 13125 + }, + { + "epoch": 1.4414671645069186, + "grad_norm": 2.0564136505126953, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.721493661403656, + "num_tokens": 327342209.0, + "step": 13126 + }, + { + "epoch": 1.4415769822095321, + "grad_norm": 2.6270031929016113, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7239542007446289, + "num_tokens": 327364178.0, + "step": 13127 + }, + { + "epoch": 1.4416867999121459, + "grad_norm": 2.1671688556671143, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7062427997589111, + "num_tokens": 327390088.0, + "step": 13128 + }, + { + "epoch": 1.4417966176147594, + "grad_norm": 2.21411395072937, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7023122906684875, + "num_tokens": 327420642.0, + "step": 13129 + }, + { + "epoch": 1.4419064353173732, + "grad_norm": 2.4416861534118652, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7420281767845154, + "num_tokens": 327442604.0, + "step": 13130 + }, + { + "epoch": 1.442016253019987, + "grad_norm": 2.2247121334075928, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7260714769363403, + "num_tokens": 327467645.0, + "step": 13131 + }, + { + "epoch": 1.4421260707226005, + "grad_norm": 2.274160385131836, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7233280539512634, + "num_tokens": 327490580.0, + "step": 13132 + }, + { + "epoch": 1.4422358884252142, + "grad_norm": 2.3583405017852783, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7309370040893555, + "num_tokens": 327513418.0, + "step": 13133 + }, + { + "epoch": 1.4423457061278278, + "grad_norm": 2.2117297649383545, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.707125186920166, + "num_tokens": 327540416.0, + "step": 13134 + }, + { + "epoch": 1.4424555238304415, + "grad_norm": 2.2698073387145996, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7288525104522705, + "num_tokens": 327564595.0, + "step": 13135 + }, + { + "epoch": 1.442565341533055, + "grad_norm": 2.134716510772705, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7194684743881226, + "num_tokens": 327591643.0, + "step": 13136 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 2.2637789249420166, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7014328241348267, + "num_tokens": 327617036.0, + "step": 13137 + }, + { + "epoch": 1.4427849769382823, + "grad_norm": 2.4900524616241455, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.72529137134552, + "num_tokens": 327638451.0, + "step": 13138 + }, + { + "epoch": 1.442894794640896, + "grad_norm": 2.209575653076172, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7314509153366089, + "num_tokens": 327664328.0, + "step": 13139 + }, + { + "epoch": 1.4430046123435099, + "grad_norm": 2.4205949306488037, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7195080518722534, + "num_tokens": 327687772.0, + "step": 13140 + }, + { + "epoch": 1.4431144300461234, + "grad_norm": 2.69844388961792, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7346764802932739, + "num_tokens": 327706488.0, + "step": 13141 + }, + { + "epoch": 1.4432242477487371, + "grad_norm": 2.006098508834839, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6989784240722656, + "num_tokens": 327740317.0, + "step": 13142 + }, + { + "epoch": 1.4433340654513507, + "grad_norm": 2.5530028343200684, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7234604954719543, + "num_tokens": 327761931.0, + "step": 13143 + }, + { + "epoch": 1.4434438831539644, + "grad_norm": 2.396667242050171, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7190465927124023, + "num_tokens": 327786733.0, + "step": 13144 + }, + { + "epoch": 1.4435537008565782, + "grad_norm": 1.9290192127227783, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7063442468643188, + "num_tokens": 327823224.0, + "step": 13145 + }, + { + "epoch": 1.4436635185591917, + "grad_norm": 2.3169496059417725, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.708679735660553, + "num_tokens": 327848790.0, + "step": 13146 + }, + { + "epoch": 1.4437733362618055, + "grad_norm": 2.6498782634735107, + "learning_rate": 1e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7517152428627014, + "num_tokens": 327866554.0, + "step": 13147 + }, + { + "epoch": 1.443883153964419, + "grad_norm": 2.1613147258758545, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6973979473114014, + "num_tokens": 327891988.0, + "step": 13148 + }, + { + "epoch": 1.4439929716670328, + "grad_norm": 2.235121726989746, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7261686325073242, + "num_tokens": 327917484.0, + "step": 13149 + }, + { + "epoch": 1.4441027893696463, + "grad_norm": 2.371967077255249, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7056660056114197, + "num_tokens": 327941028.0, + "step": 13150 + }, + { + "epoch": 1.44421260707226, + "grad_norm": 2.0773441791534424, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7257779836654663, + "num_tokens": 327968751.0, + "step": 13151 + }, + { + "epoch": 1.4443224247748736, + "grad_norm": 2.0847856998443604, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7226834297180176, + "num_tokens": 327997605.0, + "step": 13152 + }, + { + "epoch": 1.4444322424774874, + "grad_norm": 2.335136651992798, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7307347059249878, + "num_tokens": 328021266.0, + "step": 13153 + }, + { + "epoch": 1.4445420601801011, + "grad_norm": 2.7706470489501953, + "learning_rate": 1e-06, + "loss": 0.7674, + "mean_token_accuracy": 0.7480421662330627, + "num_tokens": 328039399.0, + "step": 13154 + }, + { + "epoch": 1.4446518778827147, + "grad_norm": 2.4952757358551025, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7228381633758545, + "num_tokens": 328062045.0, + "step": 13155 + }, + { + "epoch": 1.4447616955853284, + "grad_norm": 2.1205267906188965, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7053354978561401, + "num_tokens": 328089837.0, + "step": 13156 + }, + { + "epoch": 1.444871513287942, + "grad_norm": 2.4322688579559326, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7363554239273071, + "num_tokens": 328111955.0, + "step": 13157 + }, + { + "epoch": 1.4449813309905557, + "grad_norm": 2.31139874458313, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7474526166915894, + "num_tokens": 328137384.0, + "step": 13158 + }, + { + "epoch": 1.4450911486931695, + "grad_norm": 2.623987913131714, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7380023002624512, + "num_tokens": 328158522.0, + "step": 13159 + }, + { + "epoch": 1.445200966395783, + "grad_norm": 2.0901083946228027, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7328715324401855, + "num_tokens": 328184539.0, + "step": 13160 + }, + { + "epoch": 1.4453107840983965, + "grad_norm": 2.2562499046325684, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7436084151268005, + "num_tokens": 328208356.0, + "step": 13161 + }, + { + "epoch": 1.4454206018010103, + "grad_norm": 2.4830143451690674, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7307554483413696, + "num_tokens": 328230636.0, + "step": 13162 + }, + { + "epoch": 1.445530419503624, + "grad_norm": 2.4266016483306885, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.742853045463562, + "num_tokens": 328254885.0, + "step": 13163 + }, + { + "epoch": 1.4456402372062376, + "grad_norm": 2.1204674243927, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7311645746231079, + "num_tokens": 328284543.0, + "step": 13164 + }, + { + "epoch": 1.4457500549088513, + "grad_norm": 2.334801435470581, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7365431785583496, + "num_tokens": 328308961.0, + "step": 13165 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 2.5998120307922363, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7449711561203003, + "num_tokens": 328328219.0, + "step": 13166 + }, + { + "epoch": 1.4459696903140786, + "grad_norm": 2.3398654460906982, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7256501317024231, + "num_tokens": 328351707.0, + "step": 13167 + }, + { + "epoch": 1.4460795080166924, + "grad_norm": 2.3139023780822754, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7147659063339233, + "num_tokens": 328375855.0, + "step": 13168 + }, + { + "epoch": 1.446189325719306, + "grad_norm": 2.2039623260498047, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.725884199142456, + "num_tokens": 328401048.0, + "step": 13169 + }, + { + "epoch": 1.4462991434219197, + "grad_norm": 2.2807037830352783, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7084755897521973, + "num_tokens": 328427490.0, + "step": 13170 + }, + { + "epoch": 1.4464089611245332, + "grad_norm": 2.603839159011841, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7181407809257507, + "num_tokens": 328448152.0, + "step": 13171 + }, + { + "epoch": 1.446518778827147, + "grad_norm": 2.486403703689575, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.727756679058075, + "num_tokens": 328470203.0, + "step": 13172 + }, + { + "epoch": 1.4466285965297607, + "grad_norm": 2.173555850982666, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7316845655441284, + "num_tokens": 328496316.0, + "step": 13173 + }, + { + "epoch": 1.4467384142323743, + "grad_norm": 2.6706697940826416, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7435543537139893, + "num_tokens": 328514235.0, + "step": 13174 + }, + { + "epoch": 1.4468482319349878, + "grad_norm": 2.190584421157837, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7114059925079346, + "num_tokens": 328544276.0, + "step": 13175 + }, + { + "epoch": 1.4469580496376016, + "grad_norm": 2.140800714492798, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7338727712631226, + "num_tokens": 328574086.0, + "step": 13176 + }, + { + "epoch": 1.4470678673402153, + "grad_norm": 2.582167387008667, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7433814406394958, + "num_tokens": 328593410.0, + "step": 13177 + }, + { + "epoch": 1.4471776850428288, + "grad_norm": 2.2616496086120605, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7290666699409485, + "num_tokens": 328617961.0, + "step": 13178 + }, + { + "epoch": 1.4472875027454426, + "grad_norm": 2.6613168716430664, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7252802848815918, + "num_tokens": 328638488.0, + "step": 13179 + }, + { + "epoch": 1.4473973204480561, + "grad_norm": 2.1887571811676025, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7151591777801514, + "num_tokens": 328666969.0, + "step": 13180 + }, + { + "epoch": 1.44750713815067, + "grad_norm": 2.824563980102539, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7474332451820374, + "num_tokens": 328685484.0, + "step": 13181 + }, + { + "epoch": 1.4476169558532836, + "grad_norm": 2.517226219177246, + "learning_rate": 1e-06, + "loss": 0.7837, + "mean_token_accuracy": 0.749571681022644, + "num_tokens": 328705122.0, + "step": 13182 + }, + { + "epoch": 1.4477267735558972, + "grad_norm": 2.2923946380615234, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7109238505363464, + "num_tokens": 328730744.0, + "step": 13183 + }, + { + "epoch": 1.447836591258511, + "grad_norm": 2.2437796592712402, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7458899617195129, + "num_tokens": 328755129.0, + "step": 13184 + }, + { + "epoch": 1.4479464089611245, + "grad_norm": 2.0486090183258057, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7397081851959229, + "num_tokens": 328782567.0, + "step": 13185 + }, + { + "epoch": 1.4480562266637382, + "grad_norm": 2.279470682144165, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7243324518203735, + "num_tokens": 328808761.0, + "step": 13186 + }, + { + "epoch": 1.448166044366352, + "grad_norm": 2.3122291564941406, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7311307787895203, + "num_tokens": 328832571.0, + "step": 13187 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 2.2541966438293457, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7122704386711121, + "num_tokens": 328859658.0, + "step": 13188 + }, + { + "epoch": 1.448385679771579, + "grad_norm": 2.1551194190979004, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7084949612617493, + "num_tokens": 328888172.0, + "step": 13189 + }, + { + "epoch": 1.4484954974741928, + "grad_norm": 2.174405813217163, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7254418730735779, + "num_tokens": 328915053.0, + "step": 13190 + }, + { + "epoch": 1.4486053151768066, + "grad_norm": 2.1782867908477783, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7020715475082397, + "num_tokens": 328940265.0, + "step": 13191 + }, + { + "epoch": 1.44871513287942, + "grad_norm": 2.255080223083496, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7299656867980957, + "num_tokens": 328965077.0, + "step": 13192 + }, + { + "epoch": 1.4488249505820339, + "grad_norm": 2.344301223754883, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7257211804389954, + "num_tokens": 328988323.0, + "step": 13193 + }, + { + "epoch": 1.4489347682846474, + "grad_norm": 2.158102035522461, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6897757649421692, + "num_tokens": 329017316.0, + "step": 13194 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 2.053994655609131, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7034152746200562, + "num_tokens": 329048497.0, + "step": 13195 + }, + { + "epoch": 1.449154403689875, + "grad_norm": 2.7096400260925293, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7336187362670898, + "num_tokens": 329067754.0, + "step": 13196 + }, + { + "epoch": 1.4492642213924884, + "grad_norm": 2.3546884059906006, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7234309315681458, + "num_tokens": 329091514.0, + "step": 13197 + }, + { + "epoch": 1.4493740390951022, + "grad_norm": 2.1744492053985596, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7422311902046204, + "num_tokens": 329119283.0, + "step": 13198 + }, + { + "epoch": 1.4494838567977157, + "grad_norm": 2.212118625640869, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7128058671951294, + "num_tokens": 329145421.0, + "step": 13199 + }, + { + "epoch": 1.4495936745003295, + "grad_norm": 1.9519809484481812, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7595359086990356, + "num_tokens": 329173359.0, + "step": 13200 + }, + { + "epoch": 1.449703492202943, + "grad_norm": 2.398287773132324, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7318423390388489, + "num_tokens": 329197101.0, + "step": 13201 + }, + { + "epoch": 1.4498133099055568, + "grad_norm": 2.3022918701171875, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6979058384895325, + "num_tokens": 329221889.0, + "step": 13202 + }, + { + "epoch": 1.4499231276081703, + "grad_norm": 2.167895555496216, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.699398934841156, + "num_tokens": 329249599.0, + "step": 13203 + }, + { + "epoch": 1.450032945310784, + "grad_norm": 2.2422285079956055, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7127257585525513, + "num_tokens": 329275602.0, + "step": 13204 + }, + { + "epoch": 1.4501427630133978, + "grad_norm": 2.32902455329895, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7042476534843445, + "num_tokens": 329299914.0, + "step": 13205 + }, + { + "epoch": 1.4502525807160114, + "grad_norm": 2.687859296798706, + "learning_rate": 1e-06, + "loss": 0.7809, + "mean_token_accuracy": 0.7615805864334106, + "num_tokens": 329321391.0, + "step": 13206 + }, + { + "epoch": 1.4503623984186251, + "grad_norm": 2.432215929031372, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7211625576019287, + "num_tokens": 329344600.0, + "step": 13207 + }, + { + "epoch": 1.4504722161212387, + "grad_norm": 2.0854086875915527, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7324634194374084, + "num_tokens": 329373189.0, + "step": 13208 + }, + { + "epoch": 1.4505820338238524, + "grad_norm": 2.0505905151367188, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7292934656143188, + "num_tokens": 329401620.0, + "step": 13209 + }, + { + "epoch": 1.4506918515264662, + "grad_norm": 2.3417654037475586, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7298229932785034, + "num_tokens": 329424389.0, + "step": 13210 + }, + { + "epoch": 1.4508016692290797, + "grad_norm": 2.2566874027252197, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7156323194503784, + "num_tokens": 329449084.0, + "step": 13211 + }, + { + "epoch": 1.4509114869316933, + "grad_norm": 2.2971861362457275, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7100987434387207, + "num_tokens": 329475555.0, + "step": 13212 + }, + { + "epoch": 1.451021304634307, + "grad_norm": 2.460855484008789, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7272704839706421, + "num_tokens": 329497157.0, + "step": 13213 + }, + { + "epoch": 1.4511311223369208, + "grad_norm": 2.191272258758545, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7212398052215576, + "num_tokens": 329524516.0, + "step": 13214 + }, + { + "epoch": 1.4512409400395343, + "grad_norm": 2.1790659427642822, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6850186586380005, + "num_tokens": 329552373.0, + "step": 13215 + }, + { + "epoch": 1.451350757742148, + "grad_norm": 2.4697442054748535, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7247333526611328, + "num_tokens": 329573933.0, + "step": 13216 + }, + { + "epoch": 1.4514605754447616, + "grad_norm": 2.0579636096954346, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7194355726242065, + "num_tokens": 329602753.0, + "step": 13217 + }, + { + "epoch": 1.4515703931473753, + "grad_norm": 2.3626396656036377, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7228643298149109, + "num_tokens": 329626223.0, + "step": 13218 + }, + { + "epoch": 1.451680210849989, + "grad_norm": 2.388633966445923, + "learning_rate": 1e-06, + "loss": 0.7369, + "mean_token_accuracy": 0.7599488496780396, + "num_tokens": 329648282.0, + "step": 13219 + }, + { + "epoch": 1.4517900285526026, + "grad_norm": 2.6529390811920166, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7213176488876343, + "num_tokens": 329668972.0, + "step": 13220 + }, + { + "epoch": 1.4518998462552164, + "grad_norm": 2.160064458847046, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7172585725784302, + "num_tokens": 329695840.0, + "step": 13221 + }, + { + "epoch": 1.45200966395783, + "grad_norm": 2.537970781326294, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6878818273544312, + "num_tokens": 329718448.0, + "step": 13222 + }, + { + "epoch": 1.4521194816604437, + "grad_norm": 2.32639217376709, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7221367955207825, + "num_tokens": 329741707.0, + "step": 13223 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 2.1140360832214355, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6782962083816528, + "num_tokens": 329771042.0, + "step": 13224 + }, + { + "epoch": 1.452339117065671, + "grad_norm": 2.1525228023529053, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6972012519836426, + "num_tokens": 329799830.0, + "step": 13225 + }, + { + "epoch": 1.4524489347682845, + "grad_norm": 2.596712589263916, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7271918058395386, + "num_tokens": 329821065.0, + "step": 13226 + }, + { + "epoch": 1.4525587524708983, + "grad_norm": 2.303832530975342, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7319892644882202, + "num_tokens": 329842756.0, + "step": 13227 + }, + { + "epoch": 1.452668570173512, + "grad_norm": 2.015226364135742, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7244916558265686, + "num_tokens": 329872550.0, + "step": 13228 + }, + { + "epoch": 1.4527783878761256, + "grad_norm": 2.194028377532959, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.71924889087677, + "num_tokens": 329899402.0, + "step": 13229 + }, + { + "epoch": 1.4528882055787393, + "grad_norm": 2.3291492462158203, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7154783010482788, + "num_tokens": 329924490.0, + "step": 13230 + }, + { + "epoch": 1.4529980232813529, + "grad_norm": 2.4299798011779785, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7122728824615479, + "num_tokens": 329947027.0, + "step": 13231 + }, + { + "epoch": 1.4531078409839666, + "grad_norm": 2.1711630821228027, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7014758586883545, + "num_tokens": 329973725.0, + "step": 13232 + }, + { + "epoch": 1.4532176586865804, + "grad_norm": 2.323134422302246, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7467025518417358, + "num_tokens": 329996220.0, + "step": 13233 + }, + { + "epoch": 1.453327476389194, + "grad_norm": 1.898917555809021, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7268053293228149, + "num_tokens": 330027862.0, + "step": 13234 + }, + { + "epoch": 1.4534372940918077, + "grad_norm": 2.2855026721954346, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7123658657073975, + "num_tokens": 330053450.0, + "step": 13235 + }, + { + "epoch": 1.4535471117944212, + "grad_norm": 2.0386135578155518, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.730898916721344, + "num_tokens": 330082150.0, + "step": 13236 + }, + { + "epoch": 1.453656929497035, + "grad_norm": 2.2085013389587402, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7030189037322998, + "num_tokens": 330110128.0, + "step": 13237 + }, + { + "epoch": 1.4537667471996487, + "grad_norm": 1.919677734375, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7294304966926575, + "num_tokens": 330143565.0, + "step": 13238 + }, + { + "epoch": 1.4538765649022622, + "grad_norm": 2.1669328212738037, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6954552531242371, + "num_tokens": 330173758.0, + "step": 13239 + }, + { + "epoch": 1.4539863826048758, + "grad_norm": 2.2284579277038574, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7344567179679871, + "num_tokens": 330198723.0, + "step": 13240 + }, + { + "epoch": 1.4540962003074895, + "grad_norm": 2.2239108085632324, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7228565216064453, + "num_tokens": 330225335.0, + "step": 13241 + }, + { + "epoch": 1.4542060180101033, + "grad_norm": 2.304992437362671, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6990048289299011, + "num_tokens": 330250797.0, + "step": 13242 + }, + { + "epoch": 1.4543158357127168, + "grad_norm": 2.534027576446533, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7389069199562073, + "num_tokens": 330272037.0, + "step": 13243 + }, + { + "epoch": 1.4544256534153306, + "grad_norm": 2.277369499206543, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7132545113563538, + "num_tokens": 330297631.0, + "step": 13244 + }, + { + "epoch": 1.4545354711179441, + "grad_norm": 2.343317985534668, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7346516847610474, + "num_tokens": 330321626.0, + "step": 13245 + }, + { + "epoch": 1.4546452888205579, + "grad_norm": 2.124690532684326, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7189186811447144, + "num_tokens": 330350767.0, + "step": 13246 + }, + { + "epoch": 1.4547551065231716, + "grad_norm": 2.29586124420166, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7300466299057007, + "num_tokens": 330375525.0, + "step": 13247 + }, + { + "epoch": 1.4548649242257852, + "grad_norm": 2.314284563064575, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7106373310089111, + "num_tokens": 330399798.0, + "step": 13248 + }, + { + "epoch": 1.454974741928399, + "grad_norm": 2.0323827266693115, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7095613479614258, + "num_tokens": 330431155.0, + "step": 13249 + }, + { + "epoch": 1.4550845596310125, + "grad_norm": 2.3035120964050293, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7150478363037109, + "num_tokens": 330456217.0, + "step": 13250 + }, + { + "epoch": 1.4551943773336262, + "grad_norm": 2.0455496311187744, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.722027063369751, + "num_tokens": 330485323.0, + "step": 13251 + }, + { + "epoch": 1.45530419503624, + "grad_norm": 2.3155100345611572, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7097277641296387, + "num_tokens": 330509174.0, + "step": 13252 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 2.346820116043091, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7519162893295288, + "num_tokens": 330530893.0, + "step": 13253 + }, + { + "epoch": 1.455523830441467, + "grad_norm": 1.9912219047546387, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6980574727058411, + "num_tokens": 330562922.0, + "step": 13254 + }, + { + "epoch": 1.4556336481440808, + "grad_norm": 2.0532727241516113, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7042096257209778, + "num_tokens": 330593703.0, + "step": 13255 + }, + { + "epoch": 1.4557434658466946, + "grad_norm": 2.3837709426879883, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.724425196647644, + "num_tokens": 330615770.0, + "step": 13256 + }, + { + "epoch": 1.455853283549308, + "grad_norm": 2.0919430255889893, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.7395403385162354, + "num_tokens": 330642007.0, + "step": 13257 + }, + { + "epoch": 1.4559631012519219, + "grad_norm": 2.22583270072937, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7131212949752808, + "num_tokens": 330668330.0, + "step": 13258 + }, + { + "epoch": 1.4560729189545354, + "grad_norm": 2.362309217453003, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7321010231971741, + "num_tokens": 330691822.0, + "step": 13259 + }, + { + "epoch": 1.4561827366571491, + "grad_norm": 2.477287530899048, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7559110522270203, + "num_tokens": 330713045.0, + "step": 13260 + }, + { + "epoch": 1.456292554359763, + "grad_norm": 2.4327569007873535, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.723289966583252, + "num_tokens": 330737582.0, + "step": 13261 + }, + { + "epoch": 1.4564023720623764, + "grad_norm": 2.2559680938720703, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7217239141464233, + "num_tokens": 330764270.0, + "step": 13262 + }, + { + "epoch": 1.4565121897649902, + "grad_norm": 2.7178406715393066, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7302713394165039, + "num_tokens": 330783698.0, + "step": 13263 + }, + { + "epoch": 1.4566220074676037, + "grad_norm": 2.234421968460083, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7369722127914429, + "num_tokens": 330807798.0, + "step": 13264 + }, + { + "epoch": 1.4567318251702175, + "grad_norm": 2.272596836090088, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.6951947212219238, + "num_tokens": 330833128.0, + "step": 13265 + }, + { + "epoch": 1.456841642872831, + "grad_norm": 2.4769933223724365, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7209097146987915, + "num_tokens": 330855606.0, + "step": 13266 + }, + { + "epoch": 1.4569514605754448, + "grad_norm": 2.2169058322906494, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7346509695053101, + "num_tokens": 330879895.0, + "step": 13267 + }, + { + "epoch": 1.4570612782780583, + "grad_norm": 1.834791660308838, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6940843462944031, + "num_tokens": 330918343.0, + "step": 13268 + }, + { + "epoch": 1.457171095980672, + "grad_norm": 2.2321391105651855, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.725109875202179, + "num_tokens": 330943494.0, + "step": 13269 + }, + { + "epoch": 1.4572809136832858, + "grad_norm": 2.142584800720215, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7190435528755188, + "num_tokens": 330971678.0, + "step": 13270 + }, + { + "epoch": 1.4573907313858994, + "grad_norm": 2.2367026805877686, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7234046459197998, + "num_tokens": 330996358.0, + "step": 13271 + }, + { + "epoch": 1.4575005490885131, + "grad_norm": 2.171271800994873, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7151501774787903, + "num_tokens": 331023260.0, + "step": 13272 + }, + { + "epoch": 1.4576103667911267, + "grad_norm": 2.4447391033172607, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7033227682113647, + "num_tokens": 331048705.0, + "step": 13273 + }, + { + "epoch": 1.4577201844937404, + "grad_norm": 2.408385992050171, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7015997171401978, + "num_tokens": 331073587.0, + "step": 13274 + }, + { + "epoch": 1.4578300021963542, + "grad_norm": 2.1790640354156494, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7294403314590454, + "num_tokens": 331101236.0, + "step": 13275 + }, + { + "epoch": 1.4579398198989677, + "grad_norm": 2.2623708248138428, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7411655187606812, + "num_tokens": 331124579.0, + "step": 13276 + }, + { + "epoch": 1.4580496376015812, + "grad_norm": 1.9566816091537476, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7153615951538086, + "num_tokens": 331157654.0, + "step": 13277 + }, + { + "epoch": 1.458159455304195, + "grad_norm": 2.0898213386535645, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7403444051742554, + "num_tokens": 331185993.0, + "step": 13278 + }, + { + "epoch": 1.4582692730068088, + "grad_norm": 2.217622995376587, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6944040060043335, + "num_tokens": 331213356.0, + "step": 13279 + }, + { + "epoch": 1.4583790907094223, + "grad_norm": 2.474351644515991, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7262542247772217, + "num_tokens": 331235288.0, + "step": 13280 + }, + { + "epoch": 1.458488908412036, + "grad_norm": 2.1593897342681885, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7025235891342163, + "num_tokens": 331263631.0, + "step": 13281 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 2.3334271907806396, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7107280492782593, + "num_tokens": 331288267.0, + "step": 13282 + }, + { + "epoch": 1.4587085438172633, + "grad_norm": 2.374295473098755, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7177374362945557, + "num_tokens": 331312008.0, + "step": 13283 + }, + { + "epoch": 1.458818361519877, + "grad_norm": 2.3810718059539795, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7294962406158447, + "num_tokens": 331335984.0, + "step": 13284 + }, + { + "epoch": 1.4589281792224906, + "grad_norm": 2.688838481903076, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7291325926780701, + "num_tokens": 331356387.0, + "step": 13285 + }, + { + "epoch": 1.4590379969251044, + "grad_norm": 2.5395805835723877, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7204355001449585, + "num_tokens": 331377919.0, + "step": 13286 + }, + { + "epoch": 1.459147814627718, + "grad_norm": 2.363563060760498, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7017078399658203, + "num_tokens": 331403041.0, + "step": 13287 + }, + { + "epoch": 1.4592576323303317, + "grad_norm": 2.2525250911712646, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7171382308006287, + "num_tokens": 331427443.0, + "step": 13288 + }, + { + "epoch": 1.4593674500329454, + "grad_norm": 2.222463369369507, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7244556546211243, + "num_tokens": 331454556.0, + "step": 13289 + }, + { + "epoch": 1.459477267735559, + "grad_norm": 2.4490954875946045, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7140941619873047, + "num_tokens": 331478136.0, + "step": 13290 + }, + { + "epoch": 1.4595870854381725, + "grad_norm": 2.010425090789795, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6951735019683838, + "num_tokens": 331507984.0, + "step": 13291 + }, + { + "epoch": 1.4596969031407863, + "grad_norm": 2.234248638153076, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7309455871582031, + "num_tokens": 331534447.0, + "step": 13292 + }, + { + "epoch": 1.4598067208434, + "grad_norm": 2.347956418991089, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7218449115753174, + "num_tokens": 331556649.0, + "step": 13293 + }, + { + "epoch": 1.4599165385460136, + "grad_norm": 1.9966862201690674, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7366647124290466, + "num_tokens": 331584067.0, + "step": 13294 + }, + { + "epoch": 1.4600263562486273, + "grad_norm": 2.504357099533081, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7497085332870483, + "num_tokens": 331602841.0, + "step": 13295 + }, + { + "epoch": 1.4601361739512408, + "grad_norm": 2.1555685997009277, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6922620534896851, + "num_tokens": 331632120.0, + "step": 13296 + }, + { + "epoch": 1.4602459916538546, + "grad_norm": 2.1351940631866455, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7116084694862366, + "num_tokens": 331659850.0, + "step": 13297 + }, + { + "epoch": 1.4603558093564684, + "grad_norm": 2.176605224609375, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.730116605758667, + "num_tokens": 331689269.0, + "step": 13298 + }, + { + "epoch": 1.460465627059082, + "grad_norm": 1.8470872640609741, + "learning_rate": 1e-06, + "loss": 0.863, + "mean_token_accuracy": 0.7288129329681396, + "num_tokens": 331725482.0, + "step": 13299 + }, + { + "epoch": 1.4605754447616957, + "grad_norm": 2.4823312759399414, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7282088994979858, + "num_tokens": 331746359.0, + "step": 13300 + }, + { + "epoch": 1.4606852624643092, + "grad_norm": 1.9880837202072144, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7168566584587097, + "num_tokens": 331775377.0, + "step": 13301 + }, + { + "epoch": 1.460795080166923, + "grad_norm": 2.414669990539551, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7236096858978271, + "num_tokens": 331799739.0, + "step": 13302 + }, + { + "epoch": 1.4609048978695367, + "grad_norm": 2.165253162384033, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7252326011657715, + "num_tokens": 331827441.0, + "step": 13303 + }, + { + "epoch": 1.4610147155721502, + "grad_norm": 2.4406309127807617, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7277743816375732, + "num_tokens": 331848637.0, + "step": 13304 + }, + { + "epoch": 1.4611245332747638, + "grad_norm": 2.311147689819336, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7218502759933472, + "num_tokens": 331874087.0, + "step": 13305 + }, + { + "epoch": 1.4612343509773775, + "grad_norm": 2.246004104614258, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6929460763931274, + "num_tokens": 331904110.0, + "step": 13306 + }, + { + "epoch": 1.4613441686799913, + "grad_norm": 2.4084019660949707, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7348009347915649, + "num_tokens": 331925980.0, + "step": 13307 + }, + { + "epoch": 1.4614539863826048, + "grad_norm": 2.0805935859680176, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.7395740151405334, + "num_tokens": 331953622.0, + "step": 13308 + }, + { + "epoch": 1.4615638040852186, + "grad_norm": 2.5509982109069824, + "learning_rate": 1e-06, + "loss": 0.7805, + "mean_token_accuracy": 0.7465218305587769, + "num_tokens": 331972657.0, + "step": 13309 + }, + { + "epoch": 1.4616736217878321, + "grad_norm": 2.2000656127929688, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7135053873062134, + "num_tokens": 331997112.0, + "step": 13310 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 2.127253293991089, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7050259113311768, + "num_tokens": 332026329.0, + "step": 13311 + }, + { + "epoch": 1.4618932571930596, + "grad_norm": 2.2025158405303955, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7157570719718933, + "num_tokens": 332053690.0, + "step": 13312 + }, + { + "epoch": 1.4620030748956732, + "grad_norm": 2.317121982574463, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.725986123085022, + "num_tokens": 332078543.0, + "step": 13313 + }, + { + "epoch": 1.462112892598287, + "grad_norm": 2.409083843231201, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6935397386550903, + "num_tokens": 332102243.0, + "step": 13314 + }, + { + "epoch": 1.4622227103009005, + "grad_norm": 2.314755916595459, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7183914184570312, + "num_tokens": 332126204.0, + "step": 13315 + }, + { + "epoch": 1.4623325280035142, + "grad_norm": 2.3606574535369873, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7176756858825684, + "num_tokens": 332149826.0, + "step": 13316 + }, + { + "epoch": 1.462442345706128, + "grad_norm": 2.65124773979187, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7207421064376831, + "num_tokens": 332171099.0, + "step": 13317 + }, + { + "epoch": 1.4625521634087415, + "grad_norm": 2.3682477474212646, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7113432884216309, + "num_tokens": 332195528.0, + "step": 13318 + }, + { + "epoch": 1.462661981111355, + "grad_norm": 2.4043216705322266, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7144007682800293, + "num_tokens": 332220623.0, + "step": 13319 + }, + { + "epoch": 1.4627717988139688, + "grad_norm": 2.3064699172973633, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7290358543395996, + "num_tokens": 332247111.0, + "step": 13320 + }, + { + "epoch": 1.4628816165165826, + "grad_norm": 2.4482946395874023, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7407323718070984, + "num_tokens": 332268165.0, + "step": 13321 + }, + { + "epoch": 1.462991434219196, + "grad_norm": 2.1858768463134766, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7127429246902466, + "num_tokens": 332297707.0, + "step": 13322 + }, + { + "epoch": 1.4631012519218098, + "grad_norm": 2.297757625579834, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.719771146774292, + "num_tokens": 332322328.0, + "step": 13323 + }, + { + "epoch": 1.4632110696244234, + "grad_norm": 2.436643123626709, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7447590231895447, + "num_tokens": 332345484.0, + "step": 13324 + }, + { + "epoch": 1.4633208873270371, + "grad_norm": 2.441141366958618, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7294514775276184, + "num_tokens": 332368972.0, + "step": 13325 + }, + { + "epoch": 1.463430705029651, + "grad_norm": 2.484173536300659, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7329567074775696, + "num_tokens": 332390049.0, + "step": 13326 + }, + { + "epoch": 1.4635405227322644, + "grad_norm": 2.2242228984832764, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7150872945785522, + "num_tokens": 332416841.0, + "step": 13327 + }, + { + "epoch": 1.4636503404348782, + "grad_norm": 2.145272731781006, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7089614272117615, + "num_tokens": 332445769.0, + "step": 13328 + }, + { + "epoch": 1.4637601581374917, + "grad_norm": 2.3298180103302, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7094125151634216, + "num_tokens": 332469143.0, + "step": 13329 + }, + { + "epoch": 1.4638699758401055, + "grad_norm": 2.494696617126465, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7460682988166809, + "num_tokens": 332489798.0, + "step": 13330 + }, + { + "epoch": 1.463979793542719, + "grad_norm": 2.30332612991333, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7162958383560181, + "num_tokens": 332516739.0, + "step": 13331 + }, + { + "epoch": 1.4640896112453328, + "grad_norm": 2.1908018589019775, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7138202786445618, + "num_tokens": 332543561.0, + "step": 13332 + }, + { + "epoch": 1.4641994289479463, + "grad_norm": 2.6007049083709717, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7188514471054077, + "num_tokens": 332564860.0, + "step": 13333 + }, + { + "epoch": 1.46430924665056, + "grad_norm": 2.4299910068511963, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7406433820724487, + "num_tokens": 332589147.0, + "step": 13334 + }, + { + "epoch": 1.4644190643531738, + "grad_norm": 2.383497476577759, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7015212178230286, + "num_tokens": 332613469.0, + "step": 13335 + }, + { + "epoch": 1.4645288820557874, + "grad_norm": 2.4959769248962402, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7361170053482056, + "num_tokens": 332634562.0, + "step": 13336 + }, + { + "epoch": 1.464638699758401, + "grad_norm": 2.2220730781555176, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.726822555065155, + "num_tokens": 332659706.0, + "step": 13337 + }, + { + "epoch": 1.4647485174610146, + "grad_norm": 2.3751049041748047, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7470320463180542, + "num_tokens": 332683177.0, + "step": 13338 + }, + { + "epoch": 1.4648583351636284, + "grad_norm": 2.2578577995300293, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7262743711471558, + "num_tokens": 332709507.0, + "step": 13339 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 2.7285563945770264, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7240131497383118, + "num_tokens": 332728812.0, + "step": 13340 + }, + { + "epoch": 1.4650779705688557, + "grad_norm": 2.6713333129882812, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7273563742637634, + "num_tokens": 332749258.0, + "step": 13341 + }, + { + "epoch": 1.4651877882714692, + "grad_norm": 2.5273032188415527, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7026980519294739, + "num_tokens": 332771734.0, + "step": 13342 + }, + { + "epoch": 1.465297605974083, + "grad_norm": 2.2473669052124023, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7115349769592285, + "num_tokens": 332798406.0, + "step": 13343 + }, + { + "epoch": 1.4654074236766967, + "grad_norm": 2.1592557430267334, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7313170433044434, + "num_tokens": 332826589.0, + "step": 13344 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 2.0448219776153564, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6949498057365417, + "num_tokens": 332857385.0, + "step": 13345 + }, + { + "epoch": 1.465627059081924, + "grad_norm": 2.1283376216888428, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.722812831401825, + "num_tokens": 332885289.0, + "step": 13346 + }, + { + "epoch": 1.4657368767845376, + "grad_norm": 2.5645580291748047, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7449480295181274, + "num_tokens": 332904476.0, + "step": 13347 + }, + { + "epoch": 1.4658466944871513, + "grad_norm": 2.074948787689209, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.715186357498169, + "num_tokens": 332933066.0, + "step": 13348 + }, + { + "epoch": 1.465956512189765, + "grad_norm": 2.2909886837005615, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7378131747245789, + "num_tokens": 332957441.0, + "step": 13349 + }, + { + "epoch": 1.4660663298923786, + "grad_norm": 2.2122209072113037, + "learning_rate": 1e-06, + "loss": 0.7726, + "mean_token_accuracy": 0.7477395534515381, + "num_tokens": 332982026.0, + "step": 13350 + }, + { + "epoch": 1.4661761475949924, + "grad_norm": 1.9704231023788452, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7107431292533875, + "num_tokens": 333012580.0, + "step": 13351 + }, + { + "epoch": 1.466285965297606, + "grad_norm": 2.2931573390960693, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7141982316970825, + "num_tokens": 333037962.0, + "step": 13352 + }, + { + "epoch": 1.4663957830002197, + "grad_norm": 2.4856510162353516, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7260977625846863, + "num_tokens": 333059091.0, + "step": 13353 + }, + { + "epoch": 1.4665056007028334, + "grad_norm": 2.202393054962158, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7183349132537842, + "num_tokens": 333087203.0, + "step": 13354 + }, + { + "epoch": 1.466615418405447, + "grad_norm": 2.197678565979004, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7331627011299133, + "num_tokens": 333115966.0, + "step": 13355 + }, + { + "epoch": 1.4667252361080605, + "grad_norm": 2.2890899181365967, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7248363494873047, + "num_tokens": 333141233.0, + "step": 13356 + }, + { + "epoch": 1.4668350538106742, + "grad_norm": 2.175105333328247, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6972366571426392, + "num_tokens": 333171607.0, + "step": 13357 + }, + { + "epoch": 1.466944871513288, + "grad_norm": 2.187746524810791, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7111240029335022, + "num_tokens": 333198177.0, + "step": 13358 + }, + { + "epoch": 1.4670546892159015, + "grad_norm": 2.470442056655884, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7383385896682739, + "num_tokens": 333219330.0, + "step": 13359 + }, + { + "epoch": 1.4671645069185153, + "grad_norm": 2.3224282264709473, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7072737216949463, + "num_tokens": 333243597.0, + "step": 13360 + }, + { + "epoch": 1.4672743246211288, + "grad_norm": 2.516692638397217, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.707530677318573, + "num_tokens": 333265025.0, + "step": 13361 + }, + { + "epoch": 1.4673841423237426, + "grad_norm": 2.373016595840454, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7387551665306091, + "num_tokens": 333286853.0, + "step": 13362 + }, + { + "epoch": 1.4674939600263563, + "grad_norm": 2.0564017295837402, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6927604675292969, + "num_tokens": 333314634.0, + "step": 13363 + }, + { + "epoch": 1.4676037777289699, + "grad_norm": 2.300415277481079, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7226164937019348, + "num_tokens": 333339328.0, + "step": 13364 + }, + { + "epoch": 1.4677135954315836, + "grad_norm": 1.9822598695755005, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7281829118728638, + "num_tokens": 333369111.0, + "step": 13365 + }, + { + "epoch": 1.4678234131341972, + "grad_norm": 2.3701887130737305, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7335382699966431, + "num_tokens": 333392355.0, + "step": 13366 + }, + { + "epoch": 1.467933230836811, + "grad_norm": 2.810737371444702, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7562815546989441, + "num_tokens": 333410501.0, + "step": 13367 + }, + { + "epoch": 1.4680430485394247, + "grad_norm": 2.2033743858337402, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7083771228790283, + "num_tokens": 333436400.0, + "step": 13368 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 2.610377073287964, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7029613256454468, + "num_tokens": 333457360.0, + "step": 13369 + }, + { + "epoch": 1.4682626839446518, + "grad_norm": 2.2912049293518066, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7238451242446899, + "num_tokens": 333481764.0, + "step": 13370 + }, + { + "epoch": 1.4683725016472655, + "grad_norm": 2.265951633453369, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7366136312484741, + "num_tokens": 333507859.0, + "step": 13371 + }, + { + "epoch": 1.4684823193498793, + "grad_norm": 2.3922817707061768, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7242379188537598, + "num_tokens": 333529918.0, + "step": 13372 + }, + { + "epoch": 1.4685921370524928, + "grad_norm": 2.1174588203430176, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7052600383758545, + "num_tokens": 333558745.0, + "step": 13373 + }, + { + "epoch": 1.4687019547551066, + "grad_norm": 2.498035430908203, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7154940366744995, + "num_tokens": 333580201.0, + "step": 13374 + }, + { + "epoch": 1.46881177245772, + "grad_norm": 2.604487657546997, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7276279330253601, + "num_tokens": 333601733.0, + "step": 13375 + }, + { + "epoch": 1.4689215901603339, + "grad_norm": 2.3272650241851807, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7061090469360352, + "num_tokens": 333628130.0, + "step": 13376 + }, + { + "epoch": 1.4690314078629476, + "grad_norm": 2.017306089401245, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7147026658058167, + "num_tokens": 333658997.0, + "step": 13377 + }, + { + "epoch": 1.4691412255655611, + "grad_norm": 2.303562641143799, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7496194243431091, + "num_tokens": 333680984.0, + "step": 13378 + }, + { + "epoch": 1.469251043268175, + "grad_norm": 2.172023296356201, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7225169539451599, + "num_tokens": 333707094.0, + "step": 13379 + }, + { + "epoch": 1.4693608609707884, + "grad_norm": 2.210491180419922, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7374117374420166, + "num_tokens": 333732906.0, + "step": 13380 + }, + { + "epoch": 1.4694706786734022, + "grad_norm": 2.294635534286499, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7221540212631226, + "num_tokens": 333757773.0, + "step": 13381 + }, + { + "epoch": 1.4695804963760157, + "grad_norm": 2.3475284576416016, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7282307147979736, + "num_tokens": 333781804.0, + "step": 13382 + }, + { + "epoch": 1.4696903140786295, + "grad_norm": 2.053692579269409, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7168513536453247, + "num_tokens": 333813106.0, + "step": 13383 + }, + { + "epoch": 1.469800131781243, + "grad_norm": 1.9374173879623413, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7243768572807312, + "num_tokens": 333846152.0, + "step": 13384 + }, + { + "epoch": 1.4699099494838568, + "grad_norm": 2.1443610191345215, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7203332781791687, + "num_tokens": 333872085.0, + "step": 13385 + }, + { + "epoch": 1.4700197671864705, + "grad_norm": 2.031256675720215, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6967986822128296, + "num_tokens": 333903831.0, + "step": 13386 + }, + { + "epoch": 1.470129584889084, + "grad_norm": 2.2969400882720947, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7172895669937134, + "num_tokens": 333930254.0, + "step": 13387 + }, + { + "epoch": 1.4702394025916978, + "grad_norm": 2.244565010070801, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7334458827972412, + "num_tokens": 333953536.0, + "step": 13388 + }, + { + "epoch": 1.4703492202943114, + "grad_norm": 2.0287740230560303, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7061666250228882, + "num_tokens": 333986597.0, + "step": 13389 + }, + { + "epoch": 1.4704590379969251, + "grad_norm": 2.3792529106140137, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7255189418792725, + "num_tokens": 334011386.0, + "step": 13390 + }, + { + "epoch": 1.4705688556995389, + "grad_norm": 2.50091290473938, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7150425314903259, + "num_tokens": 334035680.0, + "step": 13391 + }, + { + "epoch": 1.4706786734021524, + "grad_norm": 2.141847848892212, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7201725244522095, + "num_tokens": 334064128.0, + "step": 13392 + }, + { + "epoch": 1.4707884911047662, + "grad_norm": 2.3446218967437744, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7118387222290039, + "num_tokens": 334087831.0, + "step": 13393 + }, + { + "epoch": 1.4708983088073797, + "grad_norm": 2.0303452014923096, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6946735382080078, + "num_tokens": 334118868.0, + "step": 13394 + }, + { + "epoch": 1.4710081265099935, + "grad_norm": 2.7682254314422607, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7437158226966858, + "num_tokens": 334137451.0, + "step": 13395 + }, + { + "epoch": 1.471117944212607, + "grad_norm": 2.195370674133301, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7134963870048523, + "num_tokens": 334165877.0, + "step": 13396 + }, + { + "epoch": 1.4712277619152208, + "grad_norm": 2.5456862449645996, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7218954563140869, + "num_tokens": 334187112.0, + "step": 13397 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 2.1657683849334717, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7184890508651733, + "num_tokens": 334213878.0, + "step": 13398 + }, + { + "epoch": 1.471447397320448, + "grad_norm": 2.6488819122314453, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7225620746612549, + "num_tokens": 334232054.0, + "step": 13399 + }, + { + "epoch": 1.4715572150230618, + "grad_norm": 2.0009520053863525, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7130666375160217, + "num_tokens": 334263556.0, + "step": 13400 + }, + { + "epoch": 1.4716670327256753, + "grad_norm": 2.560880422592163, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7366107702255249, + "num_tokens": 334284602.0, + "step": 13401 + }, + { + "epoch": 1.471776850428289, + "grad_norm": 2.160848379135132, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.724124550819397, + "num_tokens": 334311406.0, + "step": 13402 + }, + { + "epoch": 1.4718866681309026, + "grad_norm": 2.3907248973846436, + "learning_rate": 1e-06, + "loss": 0.7452, + "mean_token_accuracy": 0.7684775590896606, + "num_tokens": 334332559.0, + "step": 13403 + }, + { + "epoch": 1.4719964858335164, + "grad_norm": 2.431130886077881, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7080341577529907, + "num_tokens": 334356906.0, + "step": 13404 + }, + { + "epoch": 1.4721063035361301, + "grad_norm": 2.3374972343444824, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7086838483810425, + "num_tokens": 334380941.0, + "step": 13405 + }, + { + "epoch": 1.4722161212387437, + "grad_norm": 2.6468617916107178, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7152977585792542, + "num_tokens": 334401902.0, + "step": 13406 + }, + { + "epoch": 1.4723259389413572, + "grad_norm": 2.374350070953369, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.725584864616394, + "num_tokens": 334425650.0, + "step": 13407 + }, + { + "epoch": 1.472435756643971, + "grad_norm": 2.1880106925964355, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6966010332107544, + "num_tokens": 334456470.0, + "step": 13408 + }, + { + "epoch": 1.4725455743465847, + "grad_norm": 2.09212327003479, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7334094047546387, + "num_tokens": 334483170.0, + "step": 13409 + }, + { + "epoch": 1.4726553920491983, + "grad_norm": 2.016167640686035, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7137471437454224, + "num_tokens": 334513820.0, + "step": 13410 + }, + { + "epoch": 1.472765209751812, + "grad_norm": 2.3180887699127197, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7218129634857178, + "num_tokens": 334536340.0, + "step": 13411 + }, + { + "epoch": 1.4728750274544256, + "grad_norm": 1.8392773866653442, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7111093997955322, + "num_tokens": 334569306.0, + "step": 13412 + }, + { + "epoch": 1.4729848451570393, + "grad_norm": 2.1359734535217285, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7269338965415955, + "num_tokens": 334597249.0, + "step": 13413 + }, + { + "epoch": 1.473094662859653, + "grad_norm": 2.0423147678375244, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7191232442855835, + "num_tokens": 334625067.0, + "step": 13414 + }, + { + "epoch": 1.4732044805622666, + "grad_norm": 2.080796957015991, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7126731276512146, + "num_tokens": 334655601.0, + "step": 13415 + }, + { + "epoch": 1.4733142982648804, + "grad_norm": 2.2948288917541504, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7447506785392761, + "num_tokens": 334680628.0, + "step": 13416 + }, + { + "epoch": 1.473424115967494, + "grad_norm": 2.483956813812256, + "learning_rate": 1e-06, + "loss": 0.8086, + "mean_token_accuracy": 0.7427983283996582, + "num_tokens": 334701419.0, + "step": 13417 + }, + { + "epoch": 1.4735339336701077, + "grad_norm": 2.1015446186065674, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7189024686813354, + "num_tokens": 334728304.0, + "step": 13418 + }, + { + "epoch": 1.4736437513727214, + "grad_norm": 1.9656211137771606, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7131426334381104, + "num_tokens": 334759896.0, + "step": 13419 + }, + { + "epoch": 1.473753569075335, + "grad_norm": 2.216351270675659, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7193307876586914, + "num_tokens": 334783786.0, + "step": 13420 + }, + { + "epoch": 1.4738633867779485, + "grad_norm": 2.3559465408325195, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7154663801193237, + "num_tokens": 334807654.0, + "step": 13421 + }, + { + "epoch": 1.4739732044805622, + "grad_norm": 2.077768087387085, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.707573413848877, + "num_tokens": 334837811.0, + "step": 13422 + }, + { + "epoch": 1.474083022183176, + "grad_norm": 2.7955799102783203, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7396750450134277, + "num_tokens": 334853755.0, + "step": 13423 + }, + { + "epoch": 1.4741928398857895, + "grad_norm": 2.700822591781616, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7377623915672302, + "num_tokens": 334873775.0, + "step": 13424 + }, + { + "epoch": 1.4743026575884033, + "grad_norm": 1.991155743598938, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6827971339225769, + "num_tokens": 334906668.0, + "step": 13425 + }, + { + "epoch": 1.4744124752910168, + "grad_norm": 2.2799863815307617, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7017416954040527, + "num_tokens": 334933152.0, + "step": 13426 + }, + { + "epoch": 1.4745222929936306, + "grad_norm": 2.500593900680542, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7120108008384705, + "num_tokens": 334955659.0, + "step": 13427 + }, + { + "epoch": 1.4746321106962443, + "grad_norm": 2.1780874729156494, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7237203121185303, + "num_tokens": 334982773.0, + "step": 13428 + }, + { + "epoch": 1.4747419283988579, + "grad_norm": 2.062574863433838, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.722211480140686, + "num_tokens": 335010993.0, + "step": 13429 + }, + { + "epoch": 1.4748517461014716, + "grad_norm": 2.2744953632354736, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7047017812728882, + "num_tokens": 335037582.0, + "step": 13430 + }, + { + "epoch": 1.4749615638040852, + "grad_norm": 2.2736992835998535, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7213822603225708, + "num_tokens": 335062274.0, + "step": 13431 + }, + { + "epoch": 1.475071381506699, + "grad_norm": 2.681063652038574, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7323637008666992, + "num_tokens": 335080865.0, + "step": 13432 + }, + { + "epoch": 1.4751811992093127, + "grad_norm": 2.3028464317321777, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7138251662254333, + "num_tokens": 335105679.0, + "step": 13433 + }, + { + "epoch": 1.4752910169119262, + "grad_norm": 2.006560802459717, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.699647068977356, + "num_tokens": 335136108.0, + "step": 13434 + }, + { + "epoch": 1.4754008346145397, + "grad_norm": 2.51275372505188, + "learning_rate": 1e-06, + "loss": 0.804, + "mean_token_accuracy": 0.7437236309051514, + "num_tokens": 335156813.0, + "step": 13435 + }, + { + "epoch": 1.4755106523171535, + "grad_norm": 2.4389545917510986, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.733765184879303, + "num_tokens": 335178422.0, + "step": 13436 + }, + { + "epoch": 1.4756204700197673, + "grad_norm": 2.2145538330078125, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7156318426132202, + "num_tokens": 335204859.0, + "step": 13437 + }, + { + "epoch": 1.4757302877223808, + "grad_norm": 2.3726980686187744, + "learning_rate": 1e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7436714768409729, + "num_tokens": 335226223.0, + "step": 13438 + }, + { + "epoch": 1.4758401054249946, + "grad_norm": 2.1035866737365723, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7023347616195679, + "num_tokens": 335254687.0, + "step": 13439 + }, + { + "epoch": 1.475949923127608, + "grad_norm": 2.028104782104492, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7248468995094299, + "num_tokens": 335286321.0, + "step": 13440 + }, + { + "epoch": 1.4760597408302218, + "grad_norm": 2.500885248184204, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7237991094589233, + "num_tokens": 335308167.0, + "step": 13441 + }, + { + "epoch": 1.4761695585328356, + "grad_norm": 2.0933845043182373, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6928753852844238, + "num_tokens": 335339341.0, + "step": 13442 + }, + { + "epoch": 1.4762793762354491, + "grad_norm": 2.5069737434387207, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7267964482307434, + "num_tokens": 335361459.0, + "step": 13443 + }, + { + "epoch": 1.476389193938063, + "grad_norm": 2.060060501098633, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.729677677154541, + "num_tokens": 335390694.0, + "step": 13444 + }, + { + "epoch": 1.4764990116406764, + "grad_norm": 2.477090358734131, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7059985399246216, + "num_tokens": 335414947.0, + "step": 13445 + }, + { + "epoch": 1.4766088293432902, + "grad_norm": 2.3300247192382812, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7175804972648621, + "num_tokens": 335441492.0, + "step": 13446 + }, + { + "epoch": 1.4767186470459037, + "grad_norm": 2.539125442504883, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7311438918113708, + "num_tokens": 335461604.0, + "step": 13447 + }, + { + "epoch": 1.4768284647485175, + "grad_norm": 2.243177890777588, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7276936769485474, + "num_tokens": 335487586.0, + "step": 13448 + }, + { + "epoch": 1.476938282451131, + "grad_norm": 2.242272138595581, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7266456484794617, + "num_tokens": 335514206.0, + "step": 13449 + }, + { + "epoch": 1.4770481001537448, + "grad_norm": 2.7784695625305176, + "learning_rate": 1e-06, + "loss": 0.7705, + "mean_token_accuracy": 0.7449283599853516, + "num_tokens": 335531643.0, + "step": 13450 + }, + { + "epoch": 1.4771579178563585, + "grad_norm": 2.3756206035614014, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7178332805633545, + "num_tokens": 335555707.0, + "step": 13451 + }, + { + "epoch": 1.477267735558972, + "grad_norm": 2.475321054458618, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7206388711929321, + "num_tokens": 335577275.0, + "step": 13452 + }, + { + "epoch": 1.4773775532615858, + "grad_norm": 2.565567970275879, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7273274064064026, + "num_tokens": 335597279.0, + "step": 13453 + }, + { + "epoch": 1.4774873709641994, + "grad_norm": 2.221526861190796, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7104430198669434, + "num_tokens": 335623449.0, + "step": 13454 + }, + { + "epoch": 1.477597188666813, + "grad_norm": 2.640202760696411, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7257798910140991, + "num_tokens": 335642945.0, + "step": 13455 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 2.6745445728302, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7120510935783386, + "num_tokens": 335662560.0, + "step": 13456 + }, + { + "epoch": 1.4778168240720404, + "grad_norm": 1.961698293685913, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7105855941772461, + "num_tokens": 335693203.0, + "step": 13457 + }, + { + "epoch": 1.477926641774654, + "grad_norm": 2.1864230632781982, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7324507236480713, + "num_tokens": 335719280.0, + "step": 13458 + }, + { + "epoch": 1.4780364594772677, + "grad_norm": 2.2678964138031006, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7171049118041992, + "num_tokens": 335742648.0, + "step": 13459 + }, + { + "epoch": 1.4781462771798815, + "grad_norm": 1.8238030672073364, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7051087617874146, + "num_tokens": 335778757.0, + "step": 13460 + }, + { + "epoch": 1.478256094882495, + "grad_norm": 2.173053503036499, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7013864517211914, + "num_tokens": 335805523.0, + "step": 13461 + }, + { + "epoch": 1.4783659125851087, + "grad_norm": 2.223483085632324, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7035611867904663, + "num_tokens": 335831880.0, + "step": 13462 + }, + { + "epoch": 1.4784757302877223, + "grad_norm": 2.0966100692749023, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7191483378410339, + "num_tokens": 335859534.0, + "step": 13463 + }, + { + "epoch": 1.478585547990336, + "grad_norm": 2.32851243019104, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7074077129364014, + "num_tokens": 335884360.0, + "step": 13464 + }, + { + "epoch": 1.4786953656929498, + "grad_norm": 2.4523510932922363, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7381997108459473, + "num_tokens": 335905614.0, + "step": 13465 + }, + { + "epoch": 1.4788051833955633, + "grad_norm": 2.1515753269195557, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6923660039901733, + "num_tokens": 335933631.0, + "step": 13466 + }, + { + "epoch": 1.478915001098177, + "grad_norm": 2.258777141571045, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7408289313316345, + "num_tokens": 335958385.0, + "step": 13467 + }, + { + "epoch": 1.4790248188007906, + "grad_norm": 2.3267552852630615, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7216527462005615, + "num_tokens": 335983374.0, + "step": 13468 + }, + { + "epoch": 1.4791346365034044, + "grad_norm": 2.2332417964935303, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7036683559417725, + "num_tokens": 336010014.0, + "step": 13469 + }, + { + "epoch": 1.4792444542060181, + "grad_norm": 2.0826306343078613, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7066251039505005, + "num_tokens": 336042238.0, + "step": 13470 + }, + { + "epoch": 1.4793542719086317, + "grad_norm": 2.215902090072632, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7211723327636719, + "num_tokens": 336067732.0, + "step": 13471 + }, + { + "epoch": 1.4794640896112452, + "grad_norm": 2.4689555168151855, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7064430713653564, + "num_tokens": 336092386.0, + "step": 13472 + }, + { + "epoch": 1.479573907313859, + "grad_norm": 2.498154640197754, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7370270490646362, + "num_tokens": 336115198.0, + "step": 13473 + }, + { + "epoch": 1.4796837250164727, + "grad_norm": 2.4785077571868896, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7432793378829956, + "num_tokens": 336135909.0, + "step": 13474 + }, + { + "epoch": 1.4797935427190863, + "grad_norm": 2.091827630996704, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7215666770935059, + "num_tokens": 336167818.0, + "step": 13475 + }, + { + "epoch": 1.4799033604217, + "grad_norm": 2.1896328926086426, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.737701416015625, + "num_tokens": 336192658.0, + "step": 13476 + }, + { + "epoch": 1.4800131781243135, + "grad_norm": 2.816687822341919, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7064992189407349, + "num_tokens": 336212318.0, + "step": 13477 + }, + { + "epoch": 1.4801229958269273, + "grad_norm": 2.209855556488037, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7202847003936768, + "num_tokens": 336241168.0, + "step": 13478 + }, + { + "epoch": 1.480232813529541, + "grad_norm": 2.1444051265716553, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7240012884140015, + "num_tokens": 336266508.0, + "step": 13479 + }, + { + "epoch": 1.4803426312321546, + "grad_norm": 2.591265916824341, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7382619380950928, + "num_tokens": 336286332.0, + "step": 13480 + }, + { + "epoch": 1.4804524489347684, + "grad_norm": 2.6432149410247803, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7315745949745178, + "num_tokens": 336306336.0, + "step": 13481 + }, + { + "epoch": 1.4805622666373819, + "grad_norm": 2.20316743850708, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6929007768630981, + "num_tokens": 336335302.0, + "step": 13482 + }, + { + "epoch": 1.4806720843399956, + "grad_norm": 2.594597101211548, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7218708992004395, + "num_tokens": 336355363.0, + "step": 13483 + }, + { + "epoch": 1.4807819020426094, + "grad_norm": 2.129446268081665, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7214616537094116, + "num_tokens": 336385214.0, + "step": 13484 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 2.343627452850342, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7421448230743408, + "num_tokens": 336410449.0, + "step": 13485 + }, + { + "epoch": 1.4810015374478365, + "grad_norm": 2.394463062286377, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.731292724609375, + "num_tokens": 336432702.0, + "step": 13486 + }, + { + "epoch": 1.4811113551504502, + "grad_norm": 2.1920831203460693, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7202222943305969, + "num_tokens": 336458963.0, + "step": 13487 + }, + { + "epoch": 1.481221172853064, + "grad_norm": 2.3454484939575195, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7321597337722778, + "num_tokens": 336481904.0, + "step": 13488 + }, + { + "epoch": 1.4813309905556775, + "grad_norm": 2.204838275909424, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7300980091094971, + "num_tokens": 336505704.0, + "step": 13489 + }, + { + "epoch": 1.4814408082582913, + "grad_norm": 2.232248544692993, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.720561683177948, + "num_tokens": 336528514.0, + "step": 13490 + }, + { + "epoch": 1.4815506259609048, + "grad_norm": 2.3963754177093506, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7274144291877747, + "num_tokens": 336552202.0, + "step": 13491 + }, + { + "epoch": 1.4816604436635186, + "grad_norm": 2.4403505325317383, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7290395498275757, + "num_tokens": 336573664.0, + "step": 13492 + }, + { + "epoch": 1.4817702613661323, + "grad_norm": 2.2274580001831055, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7183538675308228, + "num_tokens": 336599764.0, + "step": 13493 + }, + { + "epoch": 1.4818800790687459, + "grad_norm": 2.8146660327911377, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7348045110702515, + "num_tokens": 336618780.0, + "step": 13494 + }, + { + "epoch": 1.4819898967713596, + "grad_norm": 2.391808271408081, + "learning_rate": 1e-06, + "loss": 0.7389, + "mean_token_accuracy": 0.7590003609657288, + "num_tokens": 336638187.0, + "step": 13495 + }, + { + "epoch": 1.4820997144739732, + "grad_norm": 2.1439712047576904, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7164985537528992, + "num_tokens": 336665138.0, + "step": 13496 + }, + { + "epoch": 1.482209532176587, + "grad_norm": 2.1721720695495605, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7284256815910339, + "num_tokens": 336692058.0, + "step": 13497 + }, + { + "epoch": 1.4823193498792007, + "grad_norm": 2.3202064037323, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7392521500587463, + "num_tokens": 336714871.0, + "step": 13498 + }, + { + "epoch": 1.4824291675818142, + "grad_norm": 2.4570679664611816, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7305481433868408, + "num_tokens": 336737557.0, + "step": 13499 + }, + { + "epoch": 1.4825389852844277, + "grad_norm": 2.1791374683380127, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7081694006919861, + "num_tokens": 336765191.0, + "step": 13500 + }, + { + "epoch": 1.4826488029870415, + "grad_norm": 2.336899757385254, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7201098799705505, + "num_tokens": 336789162.0, + "step": 13501 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 2.1651928424835205, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7232505679130554, + "num_tokens": 336816887.0, + "step": 13502 + }, + { + "epoch": 1.4828684383922688, + "grad_norm": 2.3629183769226074, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7280659675598145, + "num_tokens": 336840541.0, + "step": 13503 + }, + { + "epoch": 1.4829782560948825, + "grad_norm": 2.1563897132873535, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7078269720077515, + "num_tokens": 336869688.0, + "step": 13504 + }, + { + "epoch": 1.483088073797496, + "grad_norm": 2.055185079574585, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7173762917518616, + "num_tokens": 336899324.0, + "step": 13505 + }, + { + "epoch": 1.4831978915001098, + "grad_norm": 2.0343024730682373, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7119348049163818, + "num_tokens": 336928182.0, + "step": 13506 + }, + { + "epoch": 1.4833077092027236, + "grad_norm": 2.1311466693878174, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6947742700576782, + "num_tokens": 336957476.0, + "step": 13507 + }, + { + "epoch": 1.4834175269053371, + "grad_norm": 2.3569133281707764, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7096503973007202, + "num_tokens": 336979948.0, + "step": 13508 + }, + { + "epoch": 1.4835273446079509, + "grad_norm": 2.043597459793091, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7112753391265869, + "num_tokens": 337010463.0, + "step": 13509 + }, + { + "epoch": 1.4836371623105644, + "grad_norm": 2.2354860305786133, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.734847366809845, + "num_tokens": 337036612.0, + "step": 13510 + }, + { + "epoch": 1.4837469800131782, + "grad_norm": 1.9939684867858887, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7195941805839539, + "num_tokens": 337066651.0, + "step": 13511 + }, + { + "epoch": 1.4838567977157917, + "grad_norm": 2.8753159046173096, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7306677103042603, + "num_tokens": 337085176.0, + "step": 13512 + }, + { + "epoch": 1.4839666154184055, + "grad_norm": 2.4435927867889404, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7432812452316284, + "num_tokens": 337107593.0, + "step": 13513 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 2.237680196762085, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7334594130516052, + "num_tokens": 337134025.0, + "step": 13514 + }, + { + "epoch": 1.4841862508236328, + "grad_norm": 2.5703117847442627, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7313354015350342, + "num_tokens": 337155525.0, + "step": 13515 + }, + { + "epoch": 1.4842960685262465, + "grad_norm": 2.6455655097961426, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7276820540428162, + "num_tokens": 337176421.0, + "step": 13516 + }, + { + "epoch": 1.48440588622886, + "grad_norm": 2.597973346710205, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7384738922119141, + "num_tokens": 337197020.0, + "step": 13517 + }, + { + "epoch": 1.4845157039314738, + "grad_norm": 2.3987386226654053, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.73024982213974, + "num_tokens": 337219307.0, + "step": 13518 + }, + { + "epoch": 1.4846255216340873, + "grad_norm": 2.2950735092163086, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7235218286514282, + "num_tokens": 337243535.0, + "step": 13519 + }, + { + "epoch": 1.484735339336701, + "grad_norm": 2.4045424461364746, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7341886758804321, + "num_tokens": 337266282.0, + "step": 13520 + }, + { + "epoch": 1.4848451570393149, + "grad_norm": 2.461061477661133, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7299132347106934, + "num_tokens": 337288145.0, + "step": 13521 + }, + { + "epoch": 1.4849549747419284, + "grad_norm": 2.178278684616089, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7222753763198853, + "num_tokens": 337314300.0, + "step": 13522 + }, + { + "epoch": 1.485064792444542, + "grad_norm": 2.102875232696533, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7137892246246338, + "num_tokens": 337342211.0, + "step": 13523 + }, + { + "epoch": 1.4851746101471557, + "grad_norm": 2.150670289993286, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7189698815345764, + "num_tokens": 337367732.0, + "step": 13524 + }, + { + "epoch": 1.4852844278497694, + "grad_norm": 2.3422350883483887, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7054219245910645, + "num_tokens": 337394054.0, + "step": 13525 + }, + { + "epoch": 1.485394245552383, + "grad_norm": 2.3858470916748047, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7080932259559631, + "num_tokens": 337416712.0, + "step": 13526 + }, + { + "epoch": 1.4855040632549967, + "grad_norm": 2.323326826095581, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7284157276153564, + "num_tokens": 337439686.0, + "step": 13527 + }, + { + "epoch": 1.4856138809576103, + "grad_norm": 2.292518138885498, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7035986185073853, + "num_tokens": 337465624.0, + "step": 13528 + }, + { + "epoch": 1.485723698660224, + "grad_norm": 2.319023609161377, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7090710401535034, + "num_tokens": 337491070.0, + "step": 13529 + }, + { + "epoch": 1.4858335163628378, + "grad_norm": 2.2888803482055664, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7062998414039612, + "num_tokens": 337516099.0, + "step": 13530 + }, + { + "epoch": 1.4859433340654513, + "grad_norm": 2.456719160079956, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7182302474975586, + "num_tokens": 337538706.0, + "step": 13531 + }, + { + "epoch": 1.486053151768065, + "grad_norm": 2.605736494064331, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7413250207901001, + "num_tokens": 337557139.0, + "step": 13532 + }, + { + "epoch": 1.4861629694706786, + "grad_norm": 2.443815231323242, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7099664211273193, + "num_tokens": 337579572.0, + "step": 13533 + }, + { + "epoch": 1.4862727871732924, + "grad_norm": 2.1584410667419434, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7202390432357788, + "num_tokens": 337609320.0, + "step": 13534 + }, + { + "epoch": 1.4863826048759061, + "grad_norm": 2.356429100036621, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7378449440002441, + "num_tokens": 337633799.0, + "step": 13535 + }, + { + "epoch": 1.4864924225785197, + "grad_norm": 2.2262394428253174, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7043106555938721, + "num_tokens": 337662562.0, + "step": 13536 + }, + { + "epoch": 1.4866022402811332, + "grad_norm": 2.277249813079834, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.732974112033844, + "num_tokens": 337684745.0, + "step": 13537 + }, + { + "epoch": 1.486712057983747, + "grad_norm": 2.182551622390747, + "learning_rate": 1e-06, + "loss": 0.7668, + "mean_token_accuracy": 0.7649657726287842, + "num_tokens": 337711053.0, + "step": 13538 + }, + { + "epoch": 1.4868218756863607, + "grad_norm": 2.141998529434204, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.72137451171875, + "num_tokens": 337740345.0, + "step": 13539 + }, + { + "epoch": 1.4869316933889742, + "grad_norm": 2.120803117752075, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7029659748077393, + "num_tokens": 337770641.0, + "step": 13540 + }, + { + "epoch": 1.487041511091588, + "grad_norm": 2.0691845417022705, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7015591263771057, + "num_tokens": 337799674.0, + "step": 13541 + }, + { + "epoch": 1.4871513287942015, + "grad_norm": 2.249539852142334, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7161133885383606, + "num_tokens": 337823670.0, + "step": 13542 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 1.995784044265747, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7177560329437256, + "num_tokens": 337856094.0, + "step": 13543 + }, + { + "epoch": 1.487370964199429, + "grad_norm": 2.5448124408721924, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7503669857978821, + "num_tokens": 337876576.0, + "step": 13544 + }, + { + "epoch": 1.4874807819020426, + "grad_norm": 2.451822280883789, + "learning_rate": 1e-06, + "loss": 0.7629, + "mean_token_accuracy": 0.7500358819961548, + "num_tokens": 337897868.0, + "step": 13545 + }, + { + "epoch": 1.4875905996046563, + "grad_norm": 2.06410813331604, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.710637629032135, + "num_tokens": 337926704.0, + "step": 13546 + }, + { + "epoch": 1.4877004173072699, + "grad_norm": 2.3058865070343018, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.715583324432373, + "num_tokens": 337952377.0, + "step": 13547 + }, + { + "epoch": 1.4878102350098836, + "grad_norm": 2.4943039417266846, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7178295850753784, + "num_tokens": 337973623.0, + "step": 13548 + }, + { + "epoch": 1.4879200527124974, + "grad_norm": 2.3280913829803467, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7113198637962341, + "num_tokens": 337998724.0, + "step": 13549 + }, + { + "epoch": 1.488029870415111, + "grad_norm": 2.520911693572998, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7209874987602234, + "num_tokens": 338020368.0, + "step": 13550 + }, + { + "epoch": 1.4881396881177245, + "grad_norm": 2.2856733798980713, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7257546186447144, + "num_tokens": 338043925.0, + "step": 13551 + }, + { + "epoch": 1.4882495058203382, + "grad_norm": 2.2311081886291504, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7179561257362366, + "num_tokens": 338068007.0, + "step": 13552 + }, + { + "epoch": 1.488359323522952, + "grad_norm": 2.232938289642334, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.737040638923645, + "num_tokens": 338093850.0, + "step": 13553 + }, + { + "epoch": 1.4884691412255655, + "grad_norm": 2.4683361053466797, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7254787683486938, + "num_tokens": 338115447.0, + "step": 13554 + }, + { + "epoch": 1.4885789589281793, + "grad_norm": 2.456697940826416, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7207320928573608, + "num_tokens": 338139944.0, + "step": 13555 + }, + { + "epoch": 1.4886887766307928, + "grad_norm": 2.3853647708892822, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.717684268951416, + "num_tokens": 338164072.0, + "step": 13556 + }, + { + "epoch": 1.4887985943334066, + "grad_norm": 2.0159881114959717, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7092716097831726, + "num_tokens": 338195966.0, + "step": 13557 + }, + { + "epoch": 1.4889084120360203, + "grad_norm": 2.3830976486206055, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.731784462928772, + "num_tokens": 338218007.0, + "step": 13558 + }, + { + "epoch": 1.4890182297386338, + "grad_norm": 2.328397274017334, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7196286916732788, + "num_tokens": 338241349.0, + "step": 13559 + }, + { + "epoch": 1.4891280474412476, + "grad_norm": 2.3752331733703613, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7201242446899414, + "num_tokens": 338264725.0, + "step": 13560 + }, + { + "epoch": 1.4892378651438611, + "grad_norm": 2.2616260051727295, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6903709173202515, + "num_tokens": 338291666.0, + "step": 13561 + }, + { + "epoch": 1.489347682846475, + "grad_norm": 2.155081272125244, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7078476548194885, + "num_tokens": 338318327.0, + "step": 13562 + }, + { + "epoch": 1.4894575005490884, + "grad_norm": 2.44909405708313, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.705224871635437, + "num_tokens": 338342552.0, + "step": 13563 + }, + { + "epoch": 1.4895673182517022, + "grad_norm": 2.6024584770202637, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7190427780151367, + "num_tokens": 338364926.0, + "step": 13564 + }, + { + "epoch": 1.4896771359543157, + "grad_norm": 2.14269757270813, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.6947886943817139, + "num_tokens": 338393053.0, + "step": 13565 + }, + { + "epoch": 1.4897869536569295, + "grad_norm": 2.482475757598877, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7274680137634277, + "num_tokens": 338414634.0, + "step": 13566 + }, + { + "epoch": 1.4898967713595432, + "grad_norm": 2.2566401958465576, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7213677167892456, + "num_tokens": 338439425.0, + "step": 13567 + }, + { + "epoch": 1.4900065890621568, + "grad_norm": 2.016481399536133, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7310240268707275, + "num_tokens": 338470839.0, + "step": 13568 + }, + { + "epoch": 1.4901164067647705, + "grad_norm": 2.174044609069824, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7312055826187134, + "num_tokens": 338499152.0, + "step": 13569 + }, + { + "epoch": 1.490226224467384, + "grad_norm": 2.318279266357422, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7258008718490601, + "num_tokens": 338523557.0, + "step": 13570 + }, + { + "epoch": 1.4903360421699978, + "grad_norm": 2.2759666442871094, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7175226211547852, + "num_tokens": 338549084.0, + "step": 13571 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 2.640187978744507, + "learning_rate": 1e-06, + "loss": 0.7556, + "mean_token_accuracy": 0.754287600517273, + "num_tokens": 338567976.0, + "step": 13572 + }, + { + "epoch": 1.4905556775752251, + "grad_norm": 1.9418933391571045, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7222778797149658, + "num_tokens": 338599288.0, + "step": 13573 + }, + { + "epoch": 1.4906654952778389, + "grad_norm": 2.3904881477355957, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7155246138572693, + "num_tokens": 338623310.0, + "step": 13574 + }, + { + "epoch": 1.4907753129804524, + "grad_norm": 2.646275281906128, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7096444964408875, + "num_tokens": 338642769.0, + "step": 13575 + }, + { + "epoch": 1.4908851306830662, + "grad_norm": 2.22809100151062, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7150592803955078, + "num_tokens": 338667469.0, + "step": 13576 + }, + { + "epoch": 1.4909949483856797, + "grad_norm": 2.3472132682800293, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.7066997289657593, + "num_tokens": 338692423.0, + "step": 13577 + }, + { + "epoch": 1.4911047660882935, + "grad_norm": 2.0838637351989746, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.708742618560791, + "num_tokens": 338724928.0, + "step": 13578 + }, + { + "epoch": 1.491214583790907, + "grad_norm": 2.084268569946289, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7198987007141113, + "num_tokens": 338753777.0, + "step": 13579 + }, + { + "epoch": 1.4913244014935207, + "grad_norm": 2.309582471847534, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7012407183647156, + "num_tokens": 338777583.0, + "step": 13580 + }, + { + "epoch": 1.4914342191961345, + "grad_norm": 2.7584755420684814, + "learning_rate": 1e-06, + "loss": 0.7433, + "mean_token_accuracy": 0.7576992511749268, + "num_tokens": 338794538.0, + "step": 13581 + }, + { + "epoch": 1.491544036898748, + "grad_norm": 2.446047306060791, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7394915819168091, + "num_tokens": 338816752.0, + "step": 13582 + }, + { + "epoch": 1.4916538546013618, + "grad_norm": 2.298511266708374, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7205939292907715, + "num_tokens": 338841107.0, + "step": 13583 + }, + { + "epoch": 1.4917636723039753, + "grad_norm": 2.4631710052490234, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7381219267845154, + "num_tokens": 338863208.0, + "step": 13584 + }, + { + "epoch": 1.491873490006589, + "grad_norm": 2.4328300952911377, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6947008371353149, + "num_tokens": 338886347.0, + "step": 13585 + }, + { + "epoch": 1.4919833077092028, + "grad_norm": 2.393893003463745, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7430824637413025, + "num_tokens": 338909164.0, + "step": 13586 + }, + { + "epoch": 1.4920931254118164, + "grad_norm": 2.4286422729492188, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7216833829879761, + "num_tokens": 338932778.0, + "step": 13587 + }, + { + "epoch": 1.49220294311443, + "grad_norm": 2.3619866371154785, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7233194708824158, + "num_tokens": 338956858.0, + "step": 13588 + }, + { + "epoch": 1.4923127608170437, + "grad_norm": 2.239907741546631, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7098661065101624, + "num_tokens": 338983645.0, + "step": 13589 + }, + { + "epoch": 1.4924225785196574, + "grad_norm": 2.3105123043060303, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7284556031227112, + "num_tokens": 339008761.0, + "step": 13590 + }, + { + "epoch": 1.492532396222271, + "grad_norm": 2.064486026763916, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7227528095245361, + "num_tokens": 339037398.0, + "step": 13591 + }, + { + "epoch": 1.4926422139248847, + "grad_norm": 2.6010890007019043, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7412335872650146, + "num_tokens": 339057375.0, + "step": 13592 + }, + { + "epoch": 1.4927520316274983, + "grad_norm": 2.4708058834075928, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7365725040435791, + "num_tokens": 339078859.0, + "step": 13593 + }, + { + "epoch": 1.492861849330112, + "grad_norm": 2.125981569290161, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7548509836196899, + "num_tokens": 339104466.0, + "step": 13594 + }, + { + "epoch": 1.4929716670327258, + "grad_norm": 2.541877508163452, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7340061068534851, + "num_tokens": 339128219.0, + "step": 13595 + }, + { + "epoch": 1.4930814847353393, + "grad_norm": 2.275590658187866, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7159155607223511, + "num_tokens": 339153437.0, + "step": 13596 + }, + { + "epoch": 1.493191302437953, + "grad_norm": 2.3489699363708496, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7462900876998901, + "num_tokens": 339175872.0, + "step": 13597 + }, + { + "epoch": 1.4933011201405666, + "grad_norm": 2.2808167934417725, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7393108606338501, + "num_tokens": 339199381.0, + "step": 13598 + }, + { + "epoch": 1.4934109378431804, + "grad_norm": 2.261817693710327, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7050477862358093, + "num_tokens": 339227189.0, + "step": 13599 + }, + { + "epoch": 1.493520755545794, + "grad_norm": 2.161320686340332, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6940368413925171, + "num_tokens": 339255313.0, + "step": 13600 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 2.5716607570648193, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7350142002105713, + "num_tokens": 339275009.0, + "step": 13601 + }, + { + "epoch": 1.4937403909510212, + "grad_norm": 2.060920238494873, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7425038814544678, + "num_tokens": 339302968.0, + "step": 13602 + }, + { + "epoch": 1.493850208653635, + "grad_norm": 2.0685489177703857, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7091646194458008, + "num_tokens": 339333725.0, + "step": 13603 + }, + { + "epoch": 1.4939600263562487, + "grad_norm": 2.4600930213928223, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7316916584968567, + "num_tokens": 339353681.0, + "step": 13604 + }, + { + "epoch": 1.4940698440588622, + "grad_norm": 2.186131000518799, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7210222482681274, + "num_tokens": 339379529.0, + "step": 13605 + }, + { + "epoch": 1.494179661761476, + "grad_norm": 2.2055554389953613, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7562602758407593, + "num_tokens": 339404050.0, + "step": 13606 + }, + { + "epoch": 1.4942894794640895, + "grad_norm": 2.3060431480407715, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7308846712112427, + "num_tokens": 339428324.0, + "step": 13607 + }, + { + "epoch": 1.4943992971667033, + "grad_norm": 2.308368444442749, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7194184064865112, + "num_tokens": 339455498.0, + "step": 13608 + }, + { + "epoch": 1.494509114869317, + "grad_norm": 2.2656409740448, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7253522276878357, + "num_tokens": 339482971.0, + "step": 13609 + }, + { + "epoch": 1.4946189325719306, + "grad_norm": 2.3188328742980957, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7113743424415588, + "num_tokens": 339507627.0, + "step": 13610 + }, + { + "epoch": 1.4947287502745443, + "grad_norm": 2.4916861057281494, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7129253149032593, + "num_tokens": 339528123.0, + "step": 13611 + }, + { + "epoch": 1.4948385679771579, + "grad_norm": 2.082458257675171, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6913890838623047, + "num_tokens": 339557649.0, + "step": 13612 + }, + { + "epoch": 1.4949483856797716, + "grad_norm": 2.10579514503479, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7034252285957336, + "num_tokens": 339587524.0, + "step": 13613 + }, + { + "epoch": 1.4950582033823854, + "grad_norm": 2.2650763988494873, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7183042168617249, + "num_tokens": 339612414.0, + "step": 13614 + }, + { + "epoch": 1.495168021084999, + "grad_norm": 2.1925318241119385, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6977968811988831, + "num_tokens": 339640533.0, + "step": 13615 + }, + { + "epoch": 1.4952778387876124, + "grad_norm": 2.4508345127105713, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7340595722198486, + "num_tokens": 339663143.0, + "step": 13616 + }, + { + "epoch": 1.4953876564902262, + "grad_norm": 2.2840514183044434, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.700320303440094, + "num_tokens": 339688303.0, + "step": 13617 + }, + { + "epoch": 1.49549747419284, + "grad_norm": 2.346116542816162, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7404847741127014, + "num_tokens": 339712459.0, + "step": 13618 + }, + { + "epoch": 1.4956072918954535, + "grad_norm": 2.0822184085845947, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7027020454406738, + "num_tokens": 339741000.0, + "step": 13619 + }, + { + "epoch": 1.4957171095980673, + "grad_norm": 2.2747552394866943, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7012130618095398, + "num_tokens": 339767939.0, + "step": 13620 + }, + { + "epoch": 1.4958269273006808, + "grad_norm": 2.4393227100372314, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7189753651618958, + "num_tokens": 339792348.0, + "step": 13621 + }, + { + "epoch": 1.4959367450032945, + "grad_norm": 2.302245616912842, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7406795620918274, + "num_tokens": 339816733.0, + "step": 13622 + }, + { + "epoch": 1.4960465627059083, + "grad_norm": 2.1516692638397217, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7295236587524414, + "num_tokens": 339843517.0, + "step": 13623 + }, + { + "epoch": 1.4961563804085218, + "grad_norm": 2.3141586780548096, + "learning_rate": 1e-06, + "loss": 0.8103, + "mean_token_accuracy": 0.7440947890281677, + "num_tokens": 339867486.0, + "step": 13624 + }, + { + "epoch": 1.4962661981111356, + "grad_norm": 2.0251858234405518, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6862820386886597, + "num_tokens": 339899989.0, + "step": 13625 + }, + { + "epoch": 1.4963760158137491, + "grad_norm": 2.050152063369751, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7130715847015381, + "num_tokens": 339929623.0, + "step": 13626 + }, + { + "epoch": 1.4964858335163629, + "grad_norm": 2.1374635696411133, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7093342542648315, + "num_tokens": 339959629.0, + "step": 13627 + }, + { + "epoch": 1.4965956512189764, + "grad_norm": 2.461888074874878, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7285994291305542, + "num_tokens": 339981868.0, + "step": 13628 + }, + { + "epoch": 1.4967054689215902, + "grad_norm": 2.5885732173919678, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7219586968421936, + "num_tokens": 340003240.0, + "step": 13629 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 2.838770627975464, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7265158295631409, + "num_tokens": 340020886.0, + "step": 13630 + }, + { + "epoch": 1.4969251043268175, + "grad_norm": 2.282109022140503, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6903601884841919, + "num_tokens": 340044648.0, + "step": 13631 + }, + { + "epoch": 1.4970349220294312, + "grad_norm": 2.3555586338043213, + "learning_rate": 1e-06, + "loss": 0.753, + "mean_token_accuracy": 0.7560296058654785, + "num_tokens": 340065974.0, + "step": 13632 + }, + { + "epoch": 1.4971447397320448, + "grad_norm": 2.5821781158447266, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.720931887626648, + "num_tokens": 340086836.0, + "step": 13633 + }, + { + "epoch": 1.4972545574346585, + "grad_norm": 2.8513355255126953, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7462440133094788, + "num_tokens": 340104708.0, + "step": 13634 + }, + { + "epoch": 1.497364375137272, + "grad_norm": 2.3068625926971436, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7330073118209839, + "num_tokens": 340129742.0, + "step": 13635 + }, + { + "epoch": 1.4974741928398858, + "grad_norm": 2.3909242153167725, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7356135845184326, + "num_tokens": 340152553.0, + "step": 13636 + }, + { + "epoch": 1.4975840105424996, + "grad_norm": 2.4725801944732666, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7261564135551453, + "num_tokens": 340174006.0, + "step": 13637 + }, + { + "epoch": 1.497693828245113, + "grad_norm": 2.177685260772705, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7170765995979309, + "num_tokens": 340202049.0, + "step": 13638 + }, + { + "epoch": 1.4978036459477266, + "grad_norm": 2.101762294769287, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7017263770103455, + "num_tokens": 340230549.0, + "step": 13639 + }, + { + "epoch": 1.4979134636503404, + "grad_norm": 2.2514379024505615, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.737617552280426, + "num_tokens": 340257643.0, + "step": 13640 + }, + { + "epoch": 1.4980232813529542, + "grad_norm": 2.0304617881774902, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7130386233329773, + "num_tokens": 340289271.0, + "step": 13641 + }, + { + "epoch": 1.4981330990555677, + "grad_norm": 2.7702364921569824, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7090952396392822, + "num_tokens": 340308075.0, + "step": 13642 + }, + { + "epoch": 1.4982429167581814, + "grad_norm": 2.6935715675354004, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7384682297706604, + "num_tokens": 340327112.0, + "step": 13643 + }, + { + "epoch": 1.498352734460795, + "grad_norm": 2.3560678958892822, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.718531608581543, + "num_tokens": 340348816.0, + "step": 13644 + }, + { + "epoch": 1.4984625521634087, + "grad_norm": 2.1498565673828125, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.734715461730957, + "num_tokens": 340375169.0, + "step": 13645 + }, + { + "epoch": 1.4985723698660225, + "grad_norm": 2.4937968254089355, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.737114667892456, + "num_tokens": 340394510.0, + "step": 13646 + }, + { + "epoch": 1.498682187568636, + "grad_norm": 2.2132411003112793, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.734870970249176, + "num_tokens": 340419199.0, + "step": 13647 + }, + { + "epoch": 1.4987920052712498, + "grad_norm": 2.235914468765259, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7139543294906616, + "num_tokens": 340446449.0, + "step": 13648 + }, + { + "epoch": 1.4989018229738633, + "grad_norm": 2.791354179382324, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7173852920532227, + "num_tokens": 340464272.0, + "step": 13649 + }, + { + "epoch": 1.499011640676477, + "grad_norm": 2.520247220993042, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7260071039199829, + "num_tokens": 340484878.0, + "step": 13650 + }, + { + "epoch": 1.4991214583790908, + "grad_norm": 2.1416759490966797, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7247483134269714, + "num_tokens": 340511897.0, + "step": 13651 + }, + { + "epoch": 1.4992312760817044, + "grad_norm": 2.3234758377075195, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7208242416381836, + "num_tokens": 340538261.0, + "step": 13652 + }, + { + "epoch": 1.499341093784318, + "grad_norm": 2.6151576042175293, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7173722982406616, + "num_tokens": 340560667.0, + "step": 13653 + }, + { + "epoch": 1.4994509114869317, + "grad_norm": 2.03141450881958, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7121305465698242, + "num_tokens": 340588465.0, + "step": 13654 + }, + { + "epoch": 1.4995607291895454, + "grad_norm": 2.442152261734009, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7297234535217285, + "num_tokens": 340610187.0, + "step": 13655 + }, + { + "epoch": 1.499670546892159, + "grad_norm": 2.307955503463745, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7304928302764893, + "num_tokens": 340634264.0, + "step": 13656 + }, + { + "epoch": 1.4997803645947727, + "grad_norm": 2.2287189960479736, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7182590365409851, + "num_tokens": 340659630.0, + "step": 13657 + }, + { + "epoch": 1.4998901822973862, + "grad_norm": 2.281099319458008, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7209036350250244, + "num_tokens": 340684265.0, + "step": 13658 + }, + { + "epoch": 1.5, + "grad_norm": 2.3114423751831055, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7175517678260803, + "num_tokens": 340709328.0, + "step": 13659 + }, + { + "epoch": 1.5001098177026138, + "grad_norm": 2.4143075942993164, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7067242860794067, + "num_tokens": 340732903.0, + "step": 13660 + }, + { + "epoch": 1.5002196354052273, + "grad_norm": 2.6864230632781982, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7306645512580872, + "num_tokens": 340752091.0, + "step": 13661 + }, + { + "epoch": 1.5003294531078408, + "grad_norm": 1.8768548965454102, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.730411171913147, + "num_tokens": 340785002.0, + "step": 13662 + }, + { + "epoch": 1.5004392708104546, + "grad_norm": 2.2589550018310547, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7253626585006714, + "num_tokens": 340809501.0, + "step": 13663 + }, + { + "epoch": 1.5005490885130683, + "grad_norm": 2.1008498668670654, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7098897695541382, + "num_tokens": 340837565.0, + "step": 13664 + }, + { + "epoch": 1.500658906215682, + "grad_norm": 2.283189535140991, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.711299479007721, + "num_tokens": 340863549.0, + "step": 13665 + }, + { + "epoch": 1.5007687239182956, + "grad_norm": 2.0921072959899902, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7126363515853882, + "num_tokens": 340893219.0, + "step": 13666 + }, + { + "epoch": 1.5008785416209092, + "grad_norm": 2.2771809101104736, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6966568231582642, + "num_tokens": 340920604.0, + "step": 13667 + }, + { + "epoch": 1.500988359323523, + "grad_norm": 1.9715856313705444, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.711701512336731, + "num_tokens": 340953358.0, + "step": 13668 + }, + { + "epoch": 1.5010981770261367, + "grad_norm": 2.518751621246338, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7264367341995239, + "num_tokens": 340973690.0, + "step": 13669 + }, + { + "epoch": 1.5012079947287504, + "grad_norm": 2.30938720703125, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7535626888275146, + "num_tokens": 340997921.0, + "step": 13670 + }, + { + "epoch": 1.501317812431364, + "grad_norm": 2.445067882537842, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7341066598892212, + "num_tokens": 341019113.0, + "step": 13671 + }, + { + "epoch": 1.5014276301339775, + "grad_norm": 2.305485963821411, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7278759479522705, + "num_tokens": 341045428.0, + "step": 13672 + }, + { + "epoch": 1.5015374478365913, + "grad_norm": 2.4785640239715576, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.706285834312439, + "num_tokens": 341069118.0, + "step": 13673 + }, + { + "epoch": 1.501647265539205, + "grad_norm": 2.10880184173584, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7212104201316833, + "num_tokens": 341097011.0, + "step": 13674 + }, + { + "epoch": 1.5017570832418186, + "grad_norm": 2.8250386714935303, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7399928569793701, + "num_tokens": 341114213.0, + "step": 13675 + }, + { + "epoch": 1.501866900944432, + "grad_norm": 2.2178428173065186, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7248976230621338, + "num_tokens": 341139107.0, + "step": 13676 + }, + { + "epoch": 1.5019767186470458, + "grad_norm": 2.429701089859009, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.720453143119812, + "num_tokens": 341161097.0, + "step": 13677 + }, + { + "epoch": 1.5020865363496596, + "grad_norm": 2.754631280899048, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.745101809501648, + "num_tokens": 341178072.0, + "step": 13678 + }, + { + "epoch": 1.5021963540522734, + "grad_norm": 2.4051358699798584, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7190120220184326, + "num_tokens": 341202588.0, + "step": 13679 + }, + { + "epoch": 1.502306171754887, + "grad_norm": 2.1741445064544678, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6931015849113464, + "num_tokens": 341232049.0, + "step": 13680 + }, + { + "epoch": 1.5024159894575004, + "grad_norm": 2.7819907665252686, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7237369418144226, + "num_tokens": 341249590.0, + "step": 13681 + }, + { + "epoch": 1.5025258071601142, + "grad_norm": 2.008751153945923, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7359938025474548, + "num_tokens": 341280029.0, + "step": 13682 + }, + { + "epoch": 1.502635624862728, + "grad_norm": 2.8241612911224365, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7623721361160278, + "num_tokens": 341296793.0, + "step": 13683 + }, + { + "epoch": 1.5027454425653415, + "grad_norm": 2.307904005050659, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7216060757637024, + "num_tokens": 341320455.0, + "step": 13684 + }, + { + "epoch": 1.5028552602679552, + "grad_norm": 2.658024787902832, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7141809463500977, + "num_tokens": 341341062.0, + "step": 13685 + }, + { + "epoch": 1.5029650779705688, + "grad_norm": 2.1365902423858643, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7433955669403076, + "num_tokens": 341368430.0, + "step": 13686 + }, + { + "epoch": 1.5030748956731825, + "grad_norm": 2.3913700580596924, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7186084389686584, + "num_tokens": 341392207.0, + "step": 13687 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 2.711545944213867, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.718429446220398, + "num_tokens": 341412196.0, + "step": 13688 + }, + { + "epoch": 1.5032945310784098, + "grad_norm": 2.6229007244110107, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7362556457519531, + "num_tokens": 341432838.0, + "step": 13689 + }, + { + "epoch": 1.5034043487810234, + "grad_norm": 2.502718687057495, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7287352085113525, + "num_tokens": 341453298.0, + "step": 13690 + }, + { + "epoch": 1.5035141664836371, + "grad_norm": 2.3823812007904053, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7432599067687988, + "num_tokens": 341475227.0, + "step": 13691 + }, + { + "epoch": 1.5036239841862509, + "grad_norm": 1.998650312423706, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7205884456634521, + "num_tokens": 341506114.0, + "step": 13692 + }, + { + "epoch": 1.5037338018888646, + "grad_norm": 2.0785484313964844, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7072230577468872, + "num_tokens": 341537107.0, + "step": 13693 + }, + { + "epoch": 1.5038436195914782, + "grad_norm": 2.36118745803833, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7127199172973633, + "num_tokens": 341560696.0, + "step": 13694 + }, + { + "epoch": 1.5039534372940917, + "grad_norm": 2.3095197677612305, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.730507493019104, + "num_tokens": 341585114.0, + "step": 13695 + }, + { + "epoch": 1.5040632549967055, + "grad_norm": 2.094740152359009, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7098820209503174, + "num_tokens": 341613919.0, + "step": 13696 + }, + { + "epoch": 1.5041730726993192, + "grad_norm": 2.4002764225006104, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7309496402740479, + "num_tokens": 341638429.0, + "step": 13697 + }, + { + "epoch": 1.5042828904019327, + "grad_norm": 2.1008048057556152, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7086619734764099, + "num_tokens": 341668516.0, + "step": 13698 + }, + { + "epoch": 1.5043927081045465, + "grad_norm": 2.235682725906372, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7417765259742737, + "num_tokens": 341694328.0, + "step": 13699 + }, + { + "epoch": 1.50450252580716, + "grad_norm": 2.184058427810669, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7125501036643982, + "num_tokens": 341720941.0, + "step": 13700 + }, + { + "epoch": 1.5046123435097738, + "grad_norm": 1.9886471033096313, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7056445479393005, + "num_tokens": 341751813.0, + "step": 13701 + }, + { + "epoch": 1.5047221612123876, + "grad_norm": 2.3777904510498047, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7209080457687378, + "num_tokens": 341776253.0, + "step": 13702 + }, + { + "epoch": 1.504831978915001, + "grad_norm": 2.120053291320801, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.725296139717102, + "num_tokens": 341803413.0, + "step": 13703 + }, + { + "epoch": 1.5049417966176146, + "grad_norm": 2.4360644817352295, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7414121627807617, + "num_tokens": 341823537.0, + "step": 13704 + }, + { + "epoch": 1.5050516143202284, + "grad_norm": 2.581223726272583, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7282407879829407, + "num_tokens": 341843053.0, + "step": 13705 + }, + { + "epoch": 1.5051614320228421, + "grad_norm": 2.5379207134246826, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7445593476295471, + "num_tokens": 341864272.0, + "step": 13706 + }, + { + "epoch": 1.505271249725456, + "grad_norm": 2.4733002185821533, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7228407859802246, + "num_tokens": 341886635.0, + "step": 13707 + }, + { + "epoch": 1.5053810674280694, + "grad_norm": 2.426687240600586, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7435429692268372, + "num_tokens": 341908232.0, + "step": 13708 + }, + { + "epoch": 1.505490885130683, + "grad_norm": 2.466057062149048, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7216427326202393, + "num_tokens": 341931098.0, + "step": 13709 + }, + { + "epoch": 1.5056007028332967, + "grad_norm": 2.5854880809783936, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7238633036613464, + "num_tokens": 341951702.0, + "step": 13710 + }, + { + "epoch": 1.5057105205359105, + "grad_norm": 2.449096918106079, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7321875095367432, + "num_tokens": 341972332.0, + "step": 13711 + }, + { + "epoch": 1.505820338238524, + "grad_norm": 2.065371036529541, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7355290651321411, + "num_tokens": 341999872.0, + "step": 13712 + }, + { + "epoch": 1.5059301559411375, + "grad_norm": 2.3556792736053467, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7313510775566101, + "num_tokens": 342023834.0, + "step": 13713 + }, + { + "epoch": 1.5060399736437513, + "grad_norm": 2.2326111793518066, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7072210311889648, + "num_tokens": 342050231.0, + "step": 13714 + }, + { + "epoch": 1.506149791346365, + "grad_norm": 2.1285040378570557, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7316997647285461, + "num_tokens": 342075911.0, + "step": 13715 + }, + { + "epoch": 1.5062596090489788, + "grad_norm": 2.432116985321045, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7393374443054199, + "num_tokens": 342098780.0, + "step": 13716 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 2.553891181945801, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7208282351493835, + "num_tokens": 342120202.0, + "step": 13717 + }, + { + "epoch": 1.506479244454206, + "grad_norm": 1.9801446199417114, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7377607226371765, + "num_tokens": 342149115.0, + "step": 13718 + }, + { + "epoch": 1.5065890621568196, + "grad_norm": 2.1314785480499268, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7146666646003723, + "num_tokens": 342175317.0, + "step": 13719 + }, + { + "epoch": 1.5066988798594334, + "grad_norm": 2.2380242347717285, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.71458500623703, + "num_tokens": 342200462.0, + "step": 13720 + }, + { + "epoch": 1.5068086975620472, + "grad_norm": 2.313577890396118, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7245626449584961, + "num_tokens": 342224141.0, + "step": 13721 + }, + { + "epoch": 1.5069185152646607, + "grad_norm": 1.9234076738357544, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7066905498504639, + "num_tokens": 342256109.0, + "step": 13722 + }, + { + "epoch": 1.5070283329672742, + "grad_norm": 2.7874903678894043, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.724891185760498, + "num_tokens": 342274927.0, + "step": 13723 + }, + { + "epoch": 1.507138150669888, + "grad_norm": 2.3089287281036377, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7129712104797363, + "num_tokens": 342301289.0, + "step": 13724 + }, + { + "epoch": 1.5072479683725017, + "grad_norm": 2.087794065475464, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7150075435638428, + "num_tokens": 342328645.0, + "step": 13725 + }, + { + "epoch": 1.5073577860751153, + "grad_norm": 2.2030882835388184, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7245005965232849, + "num_tokens": 342356130.0, + "step": 13726 + }, + { + "epoch": 1.5074676037777288, + "grad_norm": 2.339160203933716, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7116947770118713, + "num_tokens": 342381860.0, + "step": 13727 + }, + { + "epoch": 1.5075774214803426, + "grad_norm": 2.1301255226135254, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7197630405426025, + "num_tokens": 342410133.0, + "step": 13728 + }, + { + "epoch": 1.5076872391829563, + "grad_norm": 2.2227628231048584, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7252038717269897, + "num_tokens": 342434125.0, + "step": 13729 + }, + { + "epoch": 1.50779705688557, + "grad_norm": 2.141608715057373, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7272095680236816, + "num_tokens": 342462611.0, + "step": 13730 + }, + { + "epoch": 1.5079068745881836, + "grad_norm": 2.0417418479919434, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7472267150878906, + "num_tokens": 342492850.0, + "step": 13731 + }, + { + "epoch": 1.5080166922907972, + "grad_norm": 2.0565545558929443, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7124651670455933, + "num_tokens": 342522601.0, + "step": 13732 + }, + { + "epoch": 1.508126509993411, + "grad_norm": 2.566537618637085, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7283664345741272, + "num_tokens": 342543293.0, + "step": 13733 + }, + { + "epoch": 1.5082363276960247, + "grad_norm": 2.121257781982422, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7094775438308716, + "num_tokens": 342570945.0, + "step": 13734 + }, + { + "epoch": 1.5083461453986382, + "grad_norm": 2.3465819358825684, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7197988629341125, + "num_tokens": 342593667.0, + "step": 13735 + }, + { + "epoch": 1.508455963101252, + "grad_norm": 2.2634336948394775, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7095558047294617, + "num_tokens": 342620810.0, + "step": 13736 + }, + { + "epoch": 1.5085657808038655, + "grad_norm": 2.425765037536621, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7071249485015869, + "num_tokens": 342645144.0, + "step": 13737 + }, + { + "epoch": 1.5086755985064793, + "grad_norm": 2.207782506942749, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7299529314041138, + "num_tokens": 342670489.0, + "step": 13738 + }, + { + "epoch": 1.508785416209093, + "grad_norm": 2.1371095180511475, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.716456413269043, + "num_tokens": 342700409.0, + "step": 13739 + }, + { + "epoch": 1.5088952339117065, + "grad_norm": 1.9701766967773438, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6965720653533936, + "num_tokens": 342737897.0, + "step": 13740 + }, + { + "epoch": 1.50900505161432, + "grad_norm": 2.522149085998535, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7283746004104614, + "num_tokens": 342759806.0, + "step": 13741 + }, + { + "epoch": 1.5091148693169338, + "grad_norm": 2.3699326515197754, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7091323733329773, + "num_tokens": 342784422.0, + "step": 13742 + }, + { + "epoch": 1.5092246870195476, + "grad_norm": 2.1944420337677, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7141008377075195, + "num_tokens": 342811403.0, + "step": 13743 + }, + { + "epoch": 1.5093345047221614, + "grad_norm": 2.321632146835327, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7013415098190308, + "num_tokens": 342836775.0, + "step": 13744 + }, + { + "epoch": 1.5094443224247749, + "grad_norm": 2.318955659866333, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7209274768829346, + "num_tokens": 342861400.0, + "step": 13745 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 2.259080648422241, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7236751317977905, + "num_tokens": 342886592.0, + "step": 13746 + }, + { + "epoch": 1.5096639578300022, + "grad_norm": 2.110255718231201, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7293614745140076, + "num_tokens": 342913139.0, + "step": 13747 + }, + { + "epoch": 1.509773775532616, + "grad_norm": 2.103924512863159, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7198472023010254, + "num_tokens": 342943045.0, + "step": 13748 + }, + { + "epoch": 1.5098835932352295, + "grad_norm": 2.2594664096832275, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7231001853942871, + "num_tokens": 342968164.0, + "step": 13749 + }, + { + "epoch": 1.5099934109378432, + "grad_norm": 2.3225347995758057, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7416001558303833, + "num_tokens": 342991804.0, + "step": 13750 + }, + { + "epoch": 1.5101032286404568, + "grad_norm": 2.2156167030334473, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7173420190811157, + "num_tokens": 343019603.0, + "step": 13751 + }, + { + "epoch": 1.5102130463430705, + "grad_norm": 2.1031336784362793, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7273918390274048, + "num_tokens": 343048990.0, + "step": 13752 + }, + { + "epoch": 1.5103228640456843, + "grad_norm": 2.300489902496338, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7447043657302856, + "num_tokens": 343072319.0, + "step": 13753 + }, + { + "epoch": 1.5104326817482978, + "grad_norm": 2.0851824283599854, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7043857574462891, + "num_tokens": 343102991.0, + "step": 13754 + }, + { + "epoch": 1.5105424994509113, + "grad_norm": 2.135122060775757, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7241619229316711, + "num_tokens": 343127994.0, + "step": 13755 + }, + { + "epoch": 1.510652317153525, + "grad_norm": 2.5529537200927734, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7408748865127563, + "num_tokens": 343148742.0, + "step": 13756 + }, + { + "epoch": 1.5107621348561389, + "grad_norm": 2.4453516006469727, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7120945453643799, + "num_tokens": 343169532.0, + "step": 13757 + }, + { + "epoch": 1.5108719525587526, + "grad_norm": 2.1103827953338623, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7035346031188965, + "num_tokens": 343196849.0, + "step": 13758 + }, + { + "epoch": 1.5109817702613662, + "grad_norm": 2.5377357006073, + "learning_rate": 1e-06, + "loss": 0.7354, + "mean_token_accuracy": 0.7622137069702148, + "num_tokens": 343217187.0, + "step": 13759 + }, + { + "epoch": 1.5110915879639797, + "grad_norm": 2.2966527938842773, + "learning_rate": 1e-06, + "loss": 1.0404, + "mean_token_accuracy": 0.6951928734779358, + "num_tokens": 343243650.0, + "step": 13760 + }, + { + "epoch": 1.5112014056665934, + "grad_norm": 2.426544189453125, + "learning_rate": 1e-06, + "loss": 0.7731, + "mean_token_accuracy": 0.7480807900428772, + "num_tokens": 343264793.0, + "step": 13761 + }, + { + "epoch": 1.5113112233692072, + "grad_norm": 2.4314539432525635, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.735002338886261, + "num_tokens": 343288275.0, + "step": 13762 + }, + { + "epoch": 1.5114210410718207, + "grad_norm": 2.1568989753723145, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.692049503326416, + "num_tokens": 343317531.0, + "step": 13763 + }, + { + "epoch": 1.5115308587744345, + "grad_norm": 2.263258457183838, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7204668521881104, + "num_tokens": 343343388.0, + "step": 13764 + }, + { + "epoch": 1.511640676477048, + "grad_norm": 2.1828114986419678, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7250529527664185, + "num_tokens": 343370169.0, + "step": 13765 + }, + { + "epoch": 1.5117504941796618, + "grad_norm": 2.2007949352264404, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7390697598457336, + "num_tokens": 343395088.0, + "step": 13766 + }, + { + "epoch": 1.5118603118822755, + "grad_norm": 2.351588249206543, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7240536212921143, + "num_tokens": 343418988.0, + "step": 13767 + }, + { + "epoch": 1.511970129584889, + "grad_norm": 1.9965665340423584, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7257208824157715, + "num_tokens": 343449225.0, + "step": 13768 + }, + { + "epoch": 1.5120799472875026, + "grad_norm": 2.598932981491089, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7152918577194214, + "num_tokens": 343470021.0, + "step": 13769 + }, + { + "epoch": 1.5121897649901164, + "grad_norm": 2.4313831329345703, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7272989749908447, + "num_tokens": 343492796.0, + "step": 13770 + }, + { + "epoch": 1.5122995826927301, + "grad_norm": 1.977754831314087, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7158514261245728, + "num_tokens": 343525636.0, + "step": 13771 + }, + { + "epoch": 1.5124094003953439, + "grad_norm": 2.5092999935150146, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7219583988189697, + "num_tokens": 343547356.0, + "step": 13772 + }, + { + "epoch": 1.5125192180979574, + "grad_norm": 2.1264867782592773, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7032904028892517, + "num_tokens": 343575107.0, + "step": 13773 + }, + { + "epoch": 1.512629035800571, + "grad_norm": 2.193319320678711, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7124260663986206, + "num_tokens": 343603676.0, + "step": 13774 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 2.6039061546325684, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7309209704399109, + "num_tokens": 343623746.0, + "step": 13775 + }, + { + "epoch": 1.5128486712057985, + "grad_norm": 2.4180033206939697, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7210961580276489, + "num_tokens": 343654603.0, + "step": 13776 + }, + { + "epoch": 1.512958488908412, + "grad_norm": 2.124680280685425, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7238979339599609, + "num_tokens": 343681929.0, + "step": 13777 + }, + { + "epoch": 1.5130683066110255, + "grad_norm": 2.4869279861450195, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7143039107322693, + "num_tokens": 343704717.0, + "step": 13778 + }, + { + "epoch": 1.5131781243136393, + "grad_norm": 2.5016517639160156, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7100652456283569, + "num_tokens": 343726716.0, + "step": 13779 + }, + { + "epoch": 1.513287942016253, + "grad_norm": 2.4619193077087402, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7329906225204468, + "num_tokens": 343749257.0, + "step": 13780 + }, + { + "epoch": 1.5133977597188668, + "grad_norm": 2.505971670150757, + "learning_rate": 1e-06, + "loss": 0.7852, + "mean_token_accuracy": 0.7489873170852661, + "num_tokens": 343770987.0, + "step": 13781 + }, + { + "epoch": 1.5135075774214803, + "grad_norm": 2.3567698001861572, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7229998111724854, + "num_tokens": 343794807.0, + "step": 13782 + }, + { + "epoch": 1.5136173951240939, + "grad_norm": 2.0891873836517334, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7318850755691528, + "num_tokens": 343824064.0, + "step": 13783 + }, + { + "epoch": 1.5137272128267076, + "grad_norm": 2.550692081451416, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7281396389007568, + "num_tokens": 343844669.0, + "step": 13784 + }, + { + "epoch": 1.5138370305293214, + "grad_norm": 1.9617451429367065, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.6968238949775696, + "num_tokens": 343875404.0, + "step": 13785 + }, + { + "epoch": 1.5139468482319351, + "grad_norm": 2.3290228843688965, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.740516185760498, + "num_tokens": 343900541.0, + "step": 13786 + }, + { + "epoch": 1.5140566659345487, + "grad_norm": 2.535080909729004, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7273507118225098, + "num_tokens": 343921107.0, + "step": 13787 + }, + { + "epoch": 1.5141664836371622, + "grad_norm": 2.4270236492156982, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7042645812034607, + "num_tokens": 343945135.0, + "step": 13788 + }, + { + "epoch": 1.514276301339776, + "grad_norm": 2.2437007427215576, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.724092960357666, + "num_tokens": 343971723.0, + "step": 13789 + }, + { + "epoch": 1.5143861190423897, + "grad_norm": 1.9703726768493652, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7275696992874146, + "num_tokens": 344001533.0, + "step": 13790 + }, + { + "epoch": 1.5144959367450033, + "grad_norm": 2.296210289001465, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7072005271911621, + "num_tokens": 344025937.0, + "step": 13791 + }, + { + "epoch": 1.5146057544476168, + "grad_norm": 2.317793130874634, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.712102472782135, + "num_tokens": 344049514.0, + "step": 13792 + }, + { + "epoch": 1.5147155721502306, + "grad_norm": 2.27889347076416, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7117474675178528, + "num_tokens": 344075057.0, + "step": 13793 + }, + { + "epoch": 1.5148253898528443, + "grad_norm": 1.9808632135391235, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7050072550773621, + "num_tokens": 344109178.0, + "step": 13794 + }, + { + "epoch": 1.514935207555458, + "grad_norm": 2.335475206375122, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7260544300079346, + "num_tokens": 344132550.0, + "step": 13795 + }, + { + "epoch": 1.5150450252580716, + "grad_norm": 2.3631057739257812, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7422026991844177, + "num_tokens": 344156339.0, + "step": 13796 + }, + { + "epoch": 1.5151548429606851, + "grad_norm": 2.240421772003174, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7050693035125732, + "num_tokens": 344185406.0, + "step": 13797 + }, + { + "epoch": 1.515264660663299, + "grad_norm": 2.2759506702423096, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7226763963699341, + "num_tokens": 344208831.0, + "step": 13798 + }, + { + "epoch": 1.5153744783659127, + "grad_norm": 2.5966596603393555, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7453058362007141, + "num_tokens": 344227677.0, + "step": 13799 + }, + { + "epoch": 1.5154842960685262, + "grad_norm": 2.225835084915161, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7215614914894104, + "num_tokens": 344252739.0, + "step": 13800 + }, + { + "epoch": 1.51559411377114, + "grad_norm": 2.3709051609039307, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.730226457118988, + "num_tokens": 344274625.0, + "step": 13801 + }, + { + "epoch": 1.5157039314737535, + "grad_norm": 2.2554807662963867, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7131965756416321, + "num_tokens": 344301760.0, + "step": 13802 + }, + { + "epoch": 1.5158137491763672, + "grad_norm": 2.216953754425049, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7201508283615112, + "num_tokens": 344327674.0, + "step": 13803 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 2.100311517715454, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7101095914840698, + "num_tokens": 344356482.0, + "step": 13804 + }, + { + "epoch": 1.5160333845815945, + "grad_norm": 2.0546562671661377, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7147393226623535, + "num_tokens": 344386112.0, + "step": 13805 + }, + { + "epoch": 1.516143202284208, + "grad_norm": 2.585451126098633, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7341441512107849, + "num_tokens": 344407219.0, + "step": 13806 + }, + { + "epoch": 1.5162530199868218, + "grad_norm": 2.2417430877685547, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7182881832122803, + "num_tokens": 344432998.0, + "step": 13807 + }, + { + "epoch": 1.5163628376894356, + "grad_norm": 2.3365273475646973, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.725075364112854, + "num_tokens": 344455943.0, + "step": 13808 + }, + { + "epoch": 1.5164726553920493, + "grad_norm": 2.5083789825439453, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7180300354957581, + "num_tokens": 344478030.0, + "step": 13809 + }, + { + "epoch": 1.5165824730946629, + "grad_norm": 3.076493263244629, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.742973268032074, + "num_tokens": 344495495.0, + "step": 13810 + }, + { + "epoch": 1.5166922907972764, + "grad_norm": 2.417229413986206, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7205524444580078, + "num_tokens": 344517559.0, + "step": 13811 + }, + { + "epoch": 1.5168021084998902, + "grad_norm": 2.3204450607299805, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7369260787963867, + "num_tokens": 344541128.0, + "step": 13812 + }, + { + "epoch": 1.516911926202504, + "grad_norm": 2.134002685546875, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7045851349830627, + "num_tokens": 344568818.0, + "step": 13813 + }, + { + "epoch": 1.5170217439051175, + "grad_norm": 2.194288730621338, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7318979501724243, + "num_tokens": 344596138.0, + "step": 13814 + }, + { + "epoch": 1.5171315616077312, + "grad_norm": 2.2498340606689453, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7079781293869019, + "num_tokens": 344621317.0, + "step": 13815 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 2.3191874027252197, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7283333539962769, + "num_tokens": 344645624.0, + "step": 13816 + }, + { + "epoch": 1.5173511970129585, + "grad_norm": 2.226317882537842, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7143863439559937, + "num_tokens": 344674196.0, + "step": 13817 + }, + { + "epoch": 1.5174610147155723, + "grad_norm": 2.21821665763855, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7140581607818604, + "num_tokens": 344701185.0, + "step": 13818 + }, + { + "epoch": 1.5175708324181858, + "grad_norm": 2.3092665672302246, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7204749584197998, + "num_tokens": 344725390.0, + "step": 13819 + }, + { + "epoch": 1.5176806501207993, + "grad_norm": 2.465787887573242, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.744963526725769, + "num_tokens": 344746108.0, + "step": 13820 + }, + { + "epoch": 1.517790467823413, + "grad_norm": 2.203179121017456, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7367687821388245, + "num_tokens": 344772046.0, + "step": 13821 + }, + { + "epoch": 1.5179002855260268, + "grad_norm": 2.181725025177002, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7270835638046265, + "num_tokens": 344799094.0, + "step": 13822 + }, + { + "epoch": 1.5180101032286406, + "grad_norm": 2.1313765048980713, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7222045660018921, + "num_tokens": 344827289.0, + "step": 13823 + }, + { + "epoch": 1.5181199209312541, + "grad_norm": 2.772859811782837, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7482873201370239, + "num_tokens": 344845712.0, + "step": 13824 + }, + { + "epoch": 1.5182297386338677, + "grad_norm": 2.8262569904327393, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7229431867599487, + "num_tokens": 344864253.0, + "step": 13825 + }, + { + "epoch": 1.5183395563364814, + "grad_norm": 2.2203071117401123, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7224931716918945, + "num_tokens": 344890811.0, + "step": 13826 + }, + { + "epoch": 1.5184493740390952, + "grad_norm": 2.378469705581665, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.748382568359375, + "num_tokens": 344913126.0, + "step": 13827 + }, + { + "epoch": 1.5185591917417087, + "grad_norm": 2.1946215629577637, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7220876216888428, + "num_tokens": 344938707.0, + "step": 13828 + }, + { + "epoch": 1.5186690094443223, + "grad_norm": 2.4852967262268066, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7017215490341187, + "num_tokens": 344960854.0, + "step": 13829 + }, + { + "epoch": 1.518778827146936, + "grad_norm": 2.5469629764556885, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7463337779045105, + "num_tokens": 344980238.0, + "step": 13830 + }, + { + "epoch": 1.5188886448495498, + "grad_norm": 2.3072762489318848, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7406951189041138, + "num_tokens": 345002744.0, + "step": 13831 + }, + { + "epoch": 1.5189984625521635, + "grad_norm": 2.165217876434326, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7411423921585083, + "num_tokens": 345028600.0, + "step": 13832 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 2.6569089889526367, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7210196256637573, + "num_tokens": 345049941.0, + "step": 13833 + }, + { + "epoch": 1.5192180979573906, + "grad_norm": 2.3643486499786377, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7130844593048096, + "num_tokens": 345075173.0, + "step": 13834 + }, + { + "epoch": 1.5193279156600044, + "grad_norm": 2.4379665851593018, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.719404399394989, + "num_tokens": 345097391.0, + "step": 13835 + }, + { + "epoch": 1.5194377333626181, + "grad_norm": 2.259112596511841, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7349316477775574, + "num_tokens": 345120345.0, + "step": 13836 + }, + { + "epoch": 1.5195475510652319, + "grad_norm": 2.368450880050659, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7187502384185791, + "num_tokens": 345142813.0, + "step": 13837 + }, + { + "epoch": 1.5196573687678454, + "grad_norm": 2.3613388538360596, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7474132776260376, + "num_tokens": 345164285.0, + "step": 13838 + }, + { + "epoch": 1.519767186470459, + "grad_norm": 2.3018076419830322, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7157185077667236, + "num_tokens": 345190629.0, + "step": 13839 + }, + { + "epoch": 1.5198770041730727, + "grad_norm": 2.381434202194214, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7206369638442993, + "num_tokens": 345212110.0, + "step": 13840 + }, + { + "epoch": 1.5199868218756865, + "grad_norm": 2.0434789657592773, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7228751182556152, + "num_tokens": 345241871.0, + "step": 13841 + }, + { + "epoch": 1.5200966395783, + "grad_norm": 2.21702241897583, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6903154850006104, + "num_tokens": 345268886.0, + "step": 13842 + }, + { + "epoch": 1.5202064572809135, + "grad_norm": 2.2242064476013184, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.700916588306427, + "num_tokens": 345295182.0, + "step": 13843 + }, + { + "epoch": 1.5203162749835273, + "grad_norm": 2.1921093463897705, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7110016345977783, + "num_tokens": 345323228.0, + "step": 13844 + }, + { + "epoch": 1.520426092686141, + "grad_norm": 2.373685359954834, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7447330355644226, + "num_tokens": 345345392.0, + "step": 13845 + }, + { + "epoch": 1.5205359103887548, + "grad_norm": 2.330982208251953, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7093489170074463, + "num_tokens": 345371917.0, + "step": 13846 + }, + { + "epoch": 1.5206457280913683, + "grad_norm": 2.221669912338257, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7177403569221497, + "num_tokens": 345397429.0, + "step": 13847 + }, + { + "epoch": 1.5207555457939819, + "grad_norm": 2.0536770820617676, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7144046425819397, + "num_tokens": 345427048.0, + "step": 13848 + }, + { + "epoch": 1.5208653634965956, + "grad_norm": 2.460188627243042, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.716760516166687, + "num_tokens": 345450459.0, + "step": 13849 + }, + { + "epoch": 1.5209751811992094, + "grad_norm": 2.4853501319885254, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.727597713470459, + "num_tokens": 345471412.0, + "step": 13850 + }, + { + "epoch": 1.5210849989018231, + "grad_norm": 2.067243814468384, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6948132514953613, + "num_tokens": 345503579.0, + "step": 13851 + }, + { + "epoch": 1.5211948166044367, + "grad_norm": 2.5545291900634766, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.724366307258606, + "num_tokens": 345525384.0, + "step": 13852 + }, + { + "epoch": 1.5213046343070502, + "grad_norm": 2.4805753231048584, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7261710166931152, + "num_tokens": 345546418.0, + "step": 13853 + }, + { + "epoch": 1.521414452009664, + "grad_norm": 2.494371175765991, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6968188285827637, + "num_tokens": 345569051.0, + "step": 13854 + }, + { + "epoch": 1.5215242697122777, + "grad_norm": 2.036882162094116, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7314519286155701, + "num_tokens": 345598708.0, + "step": 13855 + }, + { + "epoch": 1.5216340874148913, + "grad_norm": 2.877725839614868, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7382644414901733, + "num_tokens": 345614439.0, + "step": 13856 + }, + { + "epoch": 1.5217439051175048, + "grad_norm": 2.8188014030456543, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7273162603378296, + "num_tokens": 345632584.0, + "step": 13857 + }, + { + "epoch": 1.5218537228201185, + "grad_norm": 2.4577741622924805, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7231777906417847, + "num_tokens": 345655583.0, + "step": 13858 + }, + { + "epoch": 1.5219635405227323, + "grad_norm": 2.245368003845215, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7153675556182861, + "num_tokens": 345681095.0, + "step": 13859 + }, + { + "epoch": 1.522073358225346, + "grad_norm": 2.2019219398498535, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7059338092803955, + "num_tokens": 345707541.0, + "step": 13860 + }, + { + "epoch": 1.5221831759279596, + "grad_norm": 2.3607144355773926, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7056528329849243, + "num_tokens": 345732510.0, + "step": 13861 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 2.2829127311706543, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7500987648963928, + "num_tokens": 345754111.0, + "step": 13862 + }, + { + "epoch": 1.522402811333187, + "grad_norm": 2.4577796459198, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7119117379188538, + "num_tokens": 345775716.0, + "step": 13863 + }, + { + "epoch": 1.5225126290358006, + "grad_norm": 1.973191738128662, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7136044502258301, + "num_tokens": 345806562.0, + "step": 13864 + }, + { + "epoch": 1.5226224467384142, + "grad_norm": 2.0329084396362305, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.708510160446167, + "num_tokens": 345835855.0, + "step": 13865 + }, + { + "epoch": 1.522732264441028, + "grad_norm": 2.244213819503784, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7105541229248047, + "num_tokens": 345860183.0, + "step": 13866 + }, + { + "epoch": 1.5228420821436415, + "grad_norm": 2.280766010284424, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7068941593170166, + "num_tokens": 345884808.0, + "step": 13867 + }, + { + "epoch": 1.5229518998462552, + "grad_norm": 2.1810808181762695, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7114287614822388, + "num_tokens": 345910607.0, + "step": 13868 + }, + { + "epoch": 1.523061717548869, + "grad_norm": 2.3649632930755615, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7378474473953247, + "num_tokens": 345932454.0, + "step": 13869 + }, + { + "epoch": 1.5231715352514825, + "grad_norm": 2.5328307151794434, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7352315187454224, + "num_tokens": 345953560.0, + "step": 13870 + }, + { + "epoch": 1.523281352954096, + "grad_norm": 2.541024684906006, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7305102944374084, + "num_tokens": 345974854.0, + "step": 13871 + }, + { + "epoch": 1.5233911706567098, + "grad_norm": 2.387216329574585, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7307657599449158, + "num_tokens": 345996136.0, + "step": 13872 + }, + { + "epoch": 1.5235009883593236, + "grad_norm": 2.2748265266418457, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7115670442581177, + "num_tokens": 346022747.0, + "step": 13873 + }, + { + "epoch": 1.5236108060619373, + "grad_norm": 2.3396971225738525, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7355311512947083, + "num_tokens": 346044936.0, + "step": 13874 + }, + { + "epoch": 1.5237206237645509, + "grad_norm": 2.460484743118286, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7273485064506531, + "num_tokens": 346066771.0, + "step": 13875 + }, + { + "epoch": 1.5238304414671644, + "grad_norm": 2.344910144805908, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7071393132209778, + "num_tokens": 346095509.0, + "step": 13876 + }, + { + "epoch": 1.5239402591697782, + "grad_norm": 2.586162805557251, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7260050773620605, + "num_tokens": 346115180.0, + "step": 13877 + }, + { + "epoch": 1.524050076872392, + "grad_norm": 2.433178663253784, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7055596113204956, + "num_tokens": 346138014.0, + "step": 13878 + }, + { + "epoch": 1.5241598945750054, + "grad_norm": 2.0879311561584473, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.698165774345398, + "num_tokens": 346168865.0, + "step": 13879 + }, + { + "epoch": 1.5242697122776192, + "grad_norm": 2.292646646499634, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7012748718261719, + "num_tokens": 346191942.0, + "step": 13880 + }, + { + "epoch": 1.5243795299802327, + "grad_norm": 2.297532558441162, + "learning_rate": 1e-06, + "loss": 0.7235, + "mean_token_accuracy": 0.7601388096809387, + "num_tokens": 346213017.0, + "step": 13881 + }, + { + "epoch": 1.5244893476828465, + "grad_norm": 2.3198208808898926, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7158524990081787, + "num_tokens": 346236912.0, + "step": 13882 + }, + { + "epoch": 1.5245991653854603, + "grad_norm": 2.4446940422058105, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7018013000488281, + "num_tokens": 346259156.0, + "step": 13883 + }, + { + "epoch": 1.5247089830880738, + "grad_norm": 2.4525997638702393, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7343726754188538, + "num_tokens": 346279087.0, + "step": 13884 + }, + { + "epoch": 1.5248188007906873, + "grad_norm": 2.3106138706207275, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.727634072303772, + "num_tokens": 346303740.0, + "step": 13885 + }, + { + "epoch": 1.524928618493301, + "grad_norm": 2.4254257678985596, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7510016560554504, + "num_tokens": 346324653.0, + "step": 13886 + }, + { + "epoch": 1.5250384361959148, + "grad_norm": 2.4354214668273926, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7240353226661682, + "num_tokens": 346345370.0, + "step": 13887 + }, + { + "epoch": 1.5251482538985286, + "grad_norm": 2.304609775543213, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7159184217453003, + "num_tokens": 346369778.0, + "step": 13888 + }, + { + "epoch": 1.5252580716011421, + "grad_norm": 2.298184871673584, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7288656234741211, + "num_tokens": 346394214.0, + "step": 13889 + }, + { + "epoch": 1.5253678893037557, + "grad_norm": 2.2216198444366455, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7161351442337036, + "num_tokens": 346420576.0, + "step": 13890 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 2.33119797706604, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6996753215789795, + "num_tokens": 346444889.0, + "step": 13891 + }, + { + "epoch": 1.5255875247089832, + "grad_norm": 2.3810105323791504, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7206175327301025, + "num_tokens": 346468356.0, + "step": 13892 + }, + { + "epoch": 1.5256973424115967, + "grad_norm": 2.492727756500244, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7259690761566162, + "num_tokens": 346490301.0, + "step": 13893 + }, + { + "epoch": 1.5258071601142102, + "grad_norm": 2.2351551055908203, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7028571367263794, + "num_tokens": 346516324.0, + "step": 13894 + }, + { + "epoch": 1.525916977816824, + "grad_norm": 2.1619255542755127, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7137187719345093, + "num_tokens": 346543810.0, + "step": 13895 + }, + { + "epoch": 1.5260267955194378, + "grad_norm": 2.0702037811279297, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7076470851898193, + "num_tokens": 346572194.0, + "step": 13896 + }, + { + "epoch": 1.5261366132220515, + "grad_norm": 2.4465277194976807, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7310194969177246, + "num_tokens": 346593541.0, + "step": 13897 + }, + { + "epoch": 1.526246430924665, + "grad_norm": 2.5472400188446045, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7264888882637024, + "num_tokens": 346614053.0, + "step": 13898 + }, + { + "epoch": 1.5263562486272786, + "grad_norm": 2.338127851486206, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7254047393798828, + "num_tokens": 346636931.0, + "step": 13899 + }, + { + "epoch": 1.5264660663298923, + "grad_norm": 2.5278453826904297, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.731742799282074, + "num_tokens": 346657533.0, + "step": 13900 + }, + { + "epoch": 1.526575884032506, + "grad_norm": 2.3066251277923584, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7133594155311584, + "num_tokens": 346681924.0, + "step": 13901 + }, + { + "epoch": 1.5266857017351199, + "grad_norm": 2.550854206085205, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7452096939086914, + "num_tokens": 346701719.0, + "step": 13902 + }, + { + "epoch": 1.5267955194377334, + "grad_norm": 2.3182497024536133, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7146271467208862, + "num_tokens": 346726392.0, + "step": 13903 + }, + { + "epoch": 1.526905337140347, + "grad_norm": 2.215064287185669, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7132527828216553, + "num_tokens": 346751790.0, + "step": 13904 + }, + { + "epoch": 1.5270151548429607, + "grad_norm": 2.568314790725708, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7182193994522095, + "num_tokens": 346773079.0, + "step": 13905 + }, + { + "epoch": 1.5271249725455744, + "grad_norm": 2.4076712131500244, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7189270853996277, + "num_tokens": 346795686.0, + "step": 13906 + }, + { + "epoch": 1.527234790248188, + "grad_norm": 2.0652811527252197, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7299442291259766, + "num_tokens": 346825065.0, + "step": 13907 + }, + { + "epoch": 1.5273446079508015, + "grad_norm": 2.506326198577881, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7275998592376709, + "num_tokens": 346847372.0, + "step": 13908 + }, + { + "epoch": 1.5274544256534153, + "grad_norm": 2.3081214427948, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7435760498046875, + "num_tokens": 346871030.0, + "step": 13909 + }, + { + "epoch": 1.527564243356029, + "grad_norm": 2.8080625534057617, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.738970160484314, + "num_tokens": 346888157.0, + "step": 13910 + }, + { + "epoch": 1.5276740610586428, + "grad_norm": 2.4213757514953613, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7357213497161865, + "num_tokens": 346910335.0, + "step": 13911 + }, + { + "epoch": 1.5277838787612563, + "grad_norm": 2.363086223602295, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7179402112960815, + "num_tokens": 346933492.0, + "step": 13912 + }, + { + "epoch": 1.5278936964638699, + "grad_norm": 2.2184362411499023, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7138643264770508, + "num_tokens": 346960046.0, + "step": 13913 + }, + { + "epoch": 1.5280035141664836, + "grad_norm": 2.5635781288146973, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7315605878829956, + "num_tokens": 346980560.0, + "step": 13914 + }, + { + "epoch": 1.5281133318690974, + "grad_norm": 2.3104846477508545, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7273753881454468, + "num_tokens": 347005028.0, + "step": 13915 + }, + { + "epoch": 1.5282231495717111, + "grad_norm": 2.334505081176758, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7093566060066223, + "num_tokens": 347030902.0, + "step": 13916 + }, + { + "epoch": 1.5283329672743247, + "grad_norm": 2.131002187728882, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.710481584072113, + "num_tokens": 347059922.0, + "step": 13917 + }, + { + "epoch": 1.5284427849769382, + "grad_norm": 2.3625705242156982, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7303522825241089, + "num_tokens": 347083490.0, + "step": 13918 + }, + { + "epoch": 1.528552602679552, + "grad_norm": 2.5136585235595703, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7502002716064453, + "num_tokens": 347105228.0, + "step": 13919 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 2.4536795616149902, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7364435195922852, + "num_tokens": 347126792.0, + "step": 13920 + }, + { + "epoch": 1.5287722380847792, + "grad_norm": 1.818463683128357, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7042297124862671, + "num_tokens": 347164590.0, + "step": 13921 + }, + { + "epoch": 1.5288820557873928, + "grad_norm": 2.451859474182129, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.730114758014679, + "num_tokens": 347185945.0, + "step": 13922 + }, + { + "epoch": 1.5289918734900065, + "grad_norm": 2.35282826423645, + "learning_rate": 1e-06, + "loss": 0.7663, + "mean_token_accuracy": 0.7530291080474854, + "num_tokens": 347209316.0, + "step": 13923 + }, + { + "epoch": 1.5291016911926203, + "grad_norm": 2.188344717025757, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7003153562545776, + "num_tokens": 347237611.0, + "step": 13924 + }, + { + "epoch": 1.529211508895234, + "grad_norm": 2.8109729290008545, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.754164457321167, + "num_tokens": 347254605.0, + "step": 13925 + }, + { + "epoch": 1.5293213265978476, + "grad_norm": 2.1630589962005615, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7495118975639343, + "num_tokens": 347278941.0, + "step": 13926 + }, + { + "epoch": 1.5294311443004611, + "grad_norm": 2.285733699798584, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7243578433990479, + "num_tokens": 347303536.0, + "step": 13927 + }, + { + "epoch": 1.5295409620030749, + "grad_norm": 2.092564821243286, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7146490216255188, + "num_tokens": 347332334.0, + "step": 13928 + }, + { + "epoch": 1.5296507797056886, + "grad_norm": 2.4135215282440186, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.733035683631897, + "num_tokens": 347355935.0, + "step": 13929 + }, + { + "epoch": 1.5297605974083022, + "grad_norm": 2.1260781288146973, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7103228569030762, + "num_tokens": 347384403.0, + "step": 13930 + }, + { + "epoch": 1.529870415110916, + "grad_norm": 2.1469547748565674, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7345150709152222, + "num_tokens": 347411372.0, + "step": 13931 + }, + { + "epoch": 1.5299802328135295, + "grad_norm": 2.369487762451172, + "learning_rate": 1e-06, + "loss": 0.8097, + "mean_token_accuracy": 0.7406346201896667, + "num_tokens": 347433005.0, + "step": 13932 + }, + { + "epoch": 1.5300900505161432, + "grad_norm": 2.4571871757507324, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7343632578849792, + "num_tokens": 347453377.0, + "step": 13933 + }, + { + "epoch": 1.530199868218757, + "grad_norm": 2.497921943664551, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7367134094238281, + "num_tokens": 347475924.0, + "step": 13934 + }, + { + "epoch": 1.5303096859213705, + "grad_norm": 2.6459827423095703, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7296762466430664, + "num_tokens": 347497648.0, + "step": 13935 + }, + { + "epoch": 1.530419503623984, + "grad_norm": 2.2811081409454346, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7078763842582703, + "num_tokens": 347525022.0, + "step": 13936 + }, + { + "epoch": 1.5305293213265978, + "grad_norm": 2.2660510540008545, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7095892429351807, + "num_tokens": 347549660.0, + "step": 13937 + }, + { + "epoch": 1.5306391390292116, + "grad_norm": 2.189561367034912, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7388103604316711, + "num_tokens": 347573505.0, + "step": 13938 + }, + { + "epoch": 1.5307489567318253, + "grad_norm": 2.251220941543579, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7240941524505615, + "num_tokens": 347598127.0, + "step": 13939 + }, + { + "epoch": 1.5308587744344389, + "grad_norm": 1.9744926691055298, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7155860662460327, + "num_tokens": 347629208.0, + "step": 13940 + }, + { + "epoch": 1.5309685921370524, + "grad_norm": 2.1760153770446777, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6805442571640015, + "num_tokens": 347660241.0, + "step": 13941 + }, + { + "epoch": 1.5310784098396661, + "grad_norm": 2.493107557296753, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7154357433319092, + "num_tokens": 347681792.0, + "step": 13942 + }, + { + "epoch": 1.53118822754228, + "grad_norm": 2.186709403991699, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.729709804058075, + "num_tokens": 347708114.0, + "step": 13943 + }, + { + "epoch": 1.5312980452448934, + "grad_norm": 2.0939760208129883, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7054258584976196, + "num_tokens": 347737618.0, + "step": 13944 + }, + { + "epoch": 1.5314078629475072, + "grad_norm": 2.1479122638702393, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.715093731880188, + "num_tokens": 347766320.0, + "step": 13945 + }, + { + "epoch": 1.5315176806501207, + "grad_norm": 2.417255163192749, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7154104709625244, + "num_tokens": 347788140.0, + "step": 13946 + }, + { + "epoch": 1.5316274983527345, + "grad_norm": 2.0435447692871094, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7310189008712769, + "num_tokens": 347818333.0, + "step": 13947 + }, + { + "epoch": 1.5317373160553482, + "grad_norm": 2.0744388103485107, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.727154016494751, + "num_tokens": 347850260.0, + "step": 13948 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 2.250865936279297, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7088953852653503, + "num_tokens": 347875708.0, + "step": 13949 + }, + { + "epoch": 1.5319569514605753, + "grad_norm": 2.3467836380004883, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7152556777000427, + "num_tokens": 347900201.0, + "step": 13950 + }, + { + "epoch": 1.532066769163189, + "grad_norm": 2.342071056365967, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7185883522033691, + "num_tokens": 347923780.0, + "step": 13951 + }, + { + "epoch": 1.5321765868658028, + "grad_norm": 2.066598415374756, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6903097629547119, + "num_tokens": 347956772.0, + "step": 13952 + }, + { + "epoch": 1.5322864045684166, + "grad_norm": 2.319354772567749, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7102781534194946, + "num_tokens": 347983789.0, + "step": 13953 + }, + { + "epoch": 1.5323962222710301, + "grad_norm": 2.313246726989746, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7453913688659668, + "num_tokens": 348008018.0, + "step": 13954 + }, + { + "epoch": 1.5325060399736437, + "grad_norm": 2.513166666030884, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.711594820022583, + "num_tokens": 348031934.0, + "step": 13955 + }, + { + "epoch": 1.5326158576762574, + "grad_norm": 2.295938491821289, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7196377515792847, + "num_tokens": 348056580.0, + "step": 13956 + }, + { + "epoch": 1.5327256753788712, + "grad_norm": 2.160524845123291, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7122558355331421, + "num_tokens": 348084485.0, + "step": 13957 + }, + { + "epoch": 1.5328354930814847, + "grad_norm": 2.0552284717559814, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7189551591873169, + "num_tokens": 348113070.0, + "step": 13958 + }, + { + "epoch": 1.5329453107840982, + "grad_norm": 2.337195634841919, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7246823310852051, + "num_tokens": 348136616.0, + "step": 13959 + }, + { + "epoch": 1.533055128486712, + "grad_norm": 2.131218910217285, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7264277935028076, + "num_tokens": 348164485.0, + "step": 13960 + }, + { + "epoch": 1.5331649461893258, + "grad_norm": 2.2016830444335938, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7191077470779419, + "num_tokens": 348191270.0, + "step": 13961 + }, + { + "epoch": 1.5332747638919395, + "grad_norm": 2.1176130771636963, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6973696947097778, + "num_tokens": 348219079.0, + "step": 13962 + }, + { + "epoch": 1.533384581594553, + "grad_norm": 2.5427892208099365, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7359046936035156, + "num_tokens": 348240594.0, + "step": 13963 + }, + { + "epoch": 1.5334943992971666, + "grad_norm": 2.1273458003997803, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6972337961196899, + "num_tokens": 348272159.0, + "step": 13964 + }, + { + "epoch": 1.5336042169997803, + "grad_norm": 2.571073055267334, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7112678289413452, + "num_tokens": 348291922.0, + "step": 13965 + }, + { + "epoch": 1.533714034702394, + "grad_norm": 2.442434549331665, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7156510353088379, + "num_tokens": 348315618.0, + "step": 13966 + }, + { + "epoch": 1.5338238524050078, + "grad_norm": 2.409684181213379, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.6941431164741516, + "num_tokens": 348340942.0, + "step": 13967 + }, + { + "epoch": 1.5339336701076214, + "grad_norm": 2.656459331512451, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7458715438842773, + "num_tokens": 348359329.0, + "step": 13968 + }, + { + "epoch": 1.534043487810235, + "grad_norm": 2.447458028793335, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7286766171455383, + "num_tokens": 348379670.0, + "step": 13969 + }, + { + "epoch": 1.5341533055128487, + "grad_norm": 2.062285900115967, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7016125917434692, + "num_tokens": 348409560.0, + "step": 13970 + }, + { + "epoch": 1.5342631232154624, + "grad_norm": 2.7804133892059326, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7267634868621826, + "num_tokens": 348428450.0, + "step": 13971 + }, + { + "epoch": 1.534372940918076, + "grad_norm": 2.0453147888183594, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.69643235206604, + "num_tokens": 348460547.0, + "step": 13972 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 2.8081297874450684, + "learning_rate": 1e-06, + "loss": 0.7421, + "mean_token_accuracy": 0.7588571906089783, + "num_tokens": 348478211.0, + "step": 13973 + }, + { + "epoch": 1.5345925763233033, + "grad_norm": 2.004624605178833, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7092947959899902, + "num_tokens": 348508758.0, + "step": 13974 + }, + { + "epoch": 1.534702394025917, + "grad_norm": 2.731693983078003, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7354686260223389, + "num_tokens": 348526639.0, + "step": 13975 + }, + { + "epoch": 1.5348122117285308, + "grad_norm": 2.414149522781372, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7200394868850708, + "num_tokens": 348548316.0, + "step": 13976 + }, + { + "epoch": 1.5349220294311443, + "grad_norm": 2.217618465423584, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6865631341934204, + "num_tokens": 348575581.0, + "step": 13977 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 2.319197416305542, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7278621196746826, + "num_tokens": 348599210.0, + "step": 13978 + }, + { + "epoch": 1.5351416648363716, + "grad_norm": 2.334542751312256, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7145456075668335, + "num_tokens": 348624981.0, + "step": 13979 + }, + { + "epoch": 1.5352514825389854, + "grad_norm": 2.475497245788574, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7204326391220093, + "num_tokens": 348647696.0, + "step": 13980 + }, + { + "epoch": 1.535361300241599, + "grad_norm": 2.7195615768432617, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7434278130531311, + "num_tokens": 348667932.0, + "step": 13981 + }, + { + "epoch": 1.5354711179442126, + "grad_norm": 2.249856948852539, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7203772664070129, + "num_tokens": 348693169.0, + "step": 13982 + }, + { + "epoch": 1.5355809356468262, + "grad_norm": 2.390467643737793, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7098389863967896, + "num_tokens": 348718420.0, + "step": 13983 + }, + { + "epoch": 1.53569075334944, + "grad_norm": 2.199784278869629, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7081995010375977, + "num_tokens": 348745893.0, + "step": 13984 + }, + { + "epoch": 1.5358005710520537, + "grad_norm": 2.469912052154541, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.724844753742218, + "num_tokens": 348767362.0, + "step": 13985 + }, + { + "epoch": 1.5359103887546672, + "grad_norm": 2.332029104232788, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7009235620498657, + "num_tokens": 348792178.0, + "step": 13986 + }, + { + "epoch": 1.5360202064572808, + "grad_norm": 2.211681365966797, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.709871768951416, + "num_tokens": 348817130.0, + "step": 13987 + }, + { + "epoch": 1.5361300241598945, + "grad_norm": 2.461369037628174, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7354493141174316, + "num_tokens": 348838992.0, + "step": 13988 + }, + { + "epoch": 1.5362398418625083, + "grad_norm": 2.0336380004882812, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7303107976913452, + "num_tokens": 348867667.0, + "step": 13989 + }, + { + "epoch": 1.536349659565122, + "grad_norm": 2.2978310585021973, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7111008763313293, + "num_tokens": 348893620.0, + "step": 13990 + }, + { + "epoch": 1.5364594772677356, + "grad_norm": 2.20509934425354, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7144773602485657, + "num_tokens": 348920531.0, + "step": 13991 + }, + { + "epoch": 1.536569294970349, + "grad_norm": 2.225066900253296, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6972235441207886, + "num_tokens": 348946190.0, + "step": 13992 + }, + { + "epoch": 1.5366791126729629, + "grad_norm": 2.4367048740386963, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7114933729171753, + "num_tokens": 348969390.0, + "step": 13993 + }, + { + "epoch": 1.5367889303755766, + "grad_norm": 2.4725232124328613, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7294259071350098, + "num_tokens": 348991265.0, + "step": 13994 + }, + { + "epoch": 1.5368987480781902, + "grad_norm": 2.403594493865967, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7143518924713135, + "num_tokens": 349013909.0, + "step": 13995 + }, + { + "epoch": 1.537008565780804, + "grad_norm": 2.4816787242889404, + "learning_rate": 1e-06, + "loss": 0.7848, + "mean_token_accuracy": 0.7486060857772827, + "num_tokens": 349033031.0, + "step": 13996 + }, + { + "epoch": 1.5371183834834174, + "grad_norm": 2.386683225631714, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.754393458366394, + "num_tokens": 349054449.0, + "step": 13997 + }, + { + "epoch": 1.5372282011860312, + "grad_norm": 1.9686399698257446, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7452190518379211, + "num_tokens": 349086533.0, + "step": 13998 + }, + { + "epoch": 1.537338018888645, + "grad_norm": 2.280575752258301, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7255435585975647, + "num_tokens": 349111664.0, + "step": 13999 + }, + { + "epoch": 1.5374478365912585, + "grad_norm": 2.2861952781677246, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7046763896942139, + "num_tokens": 349135336.0, + "step": 14000 + }, + { + "epoch": 1.537557654293872, + "grad_norm": 2.6379168033599854, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7196696996688843, + "num_tokens": 349155006.0, + "step": 14001 + }, + { + "epoch": 1.5376674719964858, + "grad_norm": 1.9578769207000732, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7157224416732788, + "num_tokens": 349185512.0, + "step": 14002 + }, + { + "epoch": 1.5377772896990995, + "grad_norm": 2.245401620864868, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7199809551239014, + "num_tokens": 349211445.0, + "step": 14003 + }, + { + "epoch": 1.5378871074017133, + "grad_norm": 2.555516481399536, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7098024487495422, + "num_tokens": 349232813.0, + "step": 14004 + }, + { + "epoch": 1.5379969251043268, + "grad_norm": 2.316467046737671, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7306344509124756, + "num_tokens": 349257334.0, + "step": 14005 + }, + { + "epoch": 1.5381067428069404, + "grad_norm": 2.1954498291015625, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7020224332809448, + "num_tokens": 349285319.0, + "step": 14006 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 2.094938278198242, + "learning_rate": 1e-06, + "loss": 0.7974, + "mean_token_accuracy": 0.7478368878364563, + "num_tokens": 349310833.0, + "step": 14007 + }, + { + "epoch": 1.5383263782121679, + "grad_norm": 2.420145034790039, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7267241477966309, + "num_tokens": 349333119.0, + "step": 14008 + }, + { + "epoch": 1.5384361959147814, + "grad_norm": 2.1281657218933105, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7385697364807129, + "num_tokens": 349361377.0, + "step": 14009 + }, + { + "epoch": 1.538546013617395, + "grad_norm": 1.9598253965377808, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6883152723312378, + "num_tokens": 349394329.0, + "step": 14010 + }, + { + "epoch": 1.5386558313200087, + "grad_norm": 2.2175114154815674, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7295776009559631, + "num_tokens": 349418916.0, + "step": 14011 + }, + { + "epoch": 1.5387656490226225, + "grad_norm": 2.4720818996429443, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7464070916175842, + "num_tokens": 349439363.0, + "step": 14012 + }, + { + "epoch": 1.5388754667252362, + "grad_norm": 2.328336000442505, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7056571245193481, + "num_tokens": 349463533.0, + "step": 14013 + }, + { + "epoch": 1.5389852844278498, + "grad_norm": 2.059511661529541, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7131877541542053, + "num_tokens": 349493212.0, + "step": 14014 + }, + { + "epoch": 1.5390951021304633, + "grad_norm": 2.519987106323242, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7520878911018372, + "num_tokens": 349515076.0, + "step": 14015 + }, + { + "epoch": 1.539204919833077, + "grad_norm": 2.5258448123931885, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7466726899147034, + "num_tokens": 349536950.0, + "step": 14016 + }, + { + "epoch": 1.5393147375356908, + "grad_norm": 2.3119454383850098, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7252815961837769, + "num_tokens": 349560248.0, + "step": 14017 + }, + { + "epoch": 1.5394245552383046, + "grad_norm": 2.1966140270233154, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7281696200370789, + "num_tokens": 349586161.0, + "step": 14018 + }, + { + "epoch": 1.539534372940918, + "grad_norm": 2.601181983947754, + "learning_rate": 1e-06, + "loss": 0.7967, + "mean_token_accuracy": 0.7418571710586548, + "num_tokens": 349607123.0, + "step": 14019 + }, + { + "epoch": 1.5396441906435316, + "grad_norm": 2.2291057109832764, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7192325592041016, + "num_tokens": 349634178.0, + "step": 14020 + }, + { + "epoch": 1.5397540083461454, + "grad_norm": 2.4354987144470215, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6980193257331848, + "num_tokens": 349657557.0, + "step": 14021 + }, + { + "epoch": 1.5398638260487592, + "grad_norm": 2.1353766918182373, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7133175134658813, + "num_tokens": 349683780.0, + "step": 14022 + }, + { + "epoch": 1.5399736437513727, + "grad_norm": 2.1574835777282715, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7226458191871643, + "num_tokens": 349711687.0, + "step": 14023 + }, + { + "epoch": 1.5400834614539862, + "grad_norm": 2.6997663974761963, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7428046464920044, + "num_tokens": 349730214.0, + "step": 14024 + }, + { + "epoch": 1.5401932791566, + "grad_norm": 2.3676140308380127, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7298851609230042, + "num_tokens": 349752283.0, + "step": 14025 + }, + { + "epoch": 1.5403030968592137, + "grad_norm": 1.811564326286316, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6845957040786743, + "num_tokens": 349787824.0, + "step": 14026 + }, + { + "epoch": 1.5404129145618275, + "grad_norm": 2.0751888751983643, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7137113809585571, + "num_tokens": 349818337.0, + "step": 14027 + }, + { + "epoch": 1.540522732264441, + "grad_norm": 2.438912868499756, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7224715948104858, + "num_tokens": 349839739.0, + "step": 14028 + }, + { + "epoch": 1.5406325499670546, + "grad_norm": 1.8991172313690186, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.715394914150238, + "num_tokens": 349873725.0, + "step": 14029 + }, + { + "epoch": 1.5407423676696683, + "grad_norm": 2.38461971282959, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.6990995407104492, + "num_tokens": 349895783.0, + "step": 14030 + }, + { + "epoch": 1.540852185372282, + "grad_norm": 2.029332399368286, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7080841064453125, + "num_tokens": 349925843.0, + "step": 14031 + }, + { + "epoch": 1.5409620030748958, + "grad_norm": 2.3513195514678955, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7307794690132141, + "num_tokens": 349949514.0, + "step": 14032 + }, + { + "epoch": 1.5410718207775094, + "grad_norm": 2.3931546211242676, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7295958995819092, + "num_tokens": 349972063.0, + "step": 14033 + }, + { + "epoch": 1.541181638480123, + "grad_norm": 2.4413464069366455, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7198513746261597, + "num_tokens": 349994896.0, + "step": 14034 + }, + { + "epoch": 1.5412914561827367, + "grad_norm": 2.3314101696014404, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7060784101486206, + "num_tokens": 350020150.0, + "step": 14035 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 2.3702809810638428, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.725368082523346, + "num_tokens": 350043553.0, + "step": 14036 + }, + { + "epoch": 1.541511091587964, + "grad_norm": 1.975479006767273, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7044973969459534, + "num_tokens": 350076229.0, + "step": 14037 + }, + { + "epoch": 1.5416209092905775, + "grad_norm": 2.63081431388855, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7520906925201416, + "num_tokens": 350094612.0, + "step": 14038 + }, + { + "epoch": 1.5417307269931912, + "grad_norm": 2.5548508167266846, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.714288055896759, + "num_tokens": 350114573.0, + "step": 14039 + }, + { + "epoch": 1.541840544695805, + "grad_norm": 2.781897783279419, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7205300331115723, + "num_tokens": 350133155.0, + "step": 14040 + }, + { + "epoch": 1.5419503623984188, + "grad_norm": 2.0737831592559814, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.729423999786377, + "num_tokens": 350162246.0, + "step": 14041 + }, + { + "epoch": 1.5420601801010323, + "grad_norm": 2.7225027084350586, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.721674382686615, + "num_tokens": 350183159.0, + "step": 14042 + }, + { + "epoch": 1.5421699978036458, + "grad_norm": 2.345041036605835, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.731823742389679, + "num_tokens": 350206164.0, + "step": 14043 + }, + { + "epoch": 1.5422798155062596, + "grad_norm": 2.069347620010376, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7056713104248047, + "num_tokens": 350235838.0, + "step": 14044 + }, + { + "epoch": 1.5423896332088733, + "grad_norm": 2.527377128601074, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7079442143440247, + "num_tokens": 350257453.0, + "step": 14045 + }, + { + "epoch": 1.5424994509114869, + "grad_norm": 2.3709304332733154, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6997300386428833, + "num_tokens": 350282456.0, + "step": 14046 + }, + { + "epoch": 1.5426092686141006, + "grad_norm": 2.119568347930908, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7147024869918823, + "num_tokens": 350310163.0, + "step": 14047 + }, + { + "epoch": 1.5427190863167142, + "grad_norm": 2.4769952297210693, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7132627964019775, + "num_tokens": 350332847.0, + "step": 14048 + }, + { + "epoch": 1.542828904019328, + "grad_norm": 2.2446541786193848, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.706203818321228, + "num_tokens": 350359431.0, + "step": 14049 + }, + { + "epoch": 1.5429387217219417, + "grad_norm": 2.425898790359497, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7357640266418457, + "num_tokens": 350383342.0, + "step": 14050 + }, + { + "epoch": 1.5430485394245552, + "grad_norm": 2.0260987281799316, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6833368539810181, + "num_tokens": 350416563.0, + "step": 14051 + }, + { + "epoch": 1.5431583571271688, + "grad_norm": 2.28550124168396, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7027590274810791, + "num_tokens": 350442761.0, + "step": 14052 + }, + { + "epoch": 1.5432681748297825, + "grad_norm": 2.2158403396606445, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.694147527217865, + "num_tokens": 350470626.0, + "step": 14053 + }, + { + "epoch": 1.5433779925323963, + "grad_norm": 2.227329969406128, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.705601692199707, + "num_tokens": 350496230.0, + "step": 14054 + }, + { + "epoch": 1.54348781023501, + "grad_norm": 2.453118324279785, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7427220344543457, + "num_tokens": 350516892.0, + "step": 14055 + }, + { + "epoch": 1.5435976279376236, + "grad_norm": 2.2093777656555176, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7062949538230896, + "num_tokens": 350542966.0, + "step": 14056 + }, + { + "epoch": 1.543707445640237, + "grad_norm": 2.238988161087036, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7142024040222168, + "num_tokens": 350571105.0, + "step": 14057 + }, + { + "epoch": 1.5438172633428509, + "grad_norm": 1.8921358585357666, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7016434669494629, + "num_tokens": 350605019.0, + "step": 14058 + }, + { + "epoch": 1.5439270810454646, + "grad_norm": 2.8242719173431396, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7235145568847656, + "num_tokens": 350623897.0, + "step": 14059 + }, + { + "epoch": 1.5440368987480781, + "grad_norm": 2.467024326324463, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7199281454086304, + "num_tokens": 350645338.0, + "step": 14060 + }, + { + "epoch": 1.544146716450692, + "grad_norm": 2.160202980041504, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7088428735733032, + "num_tokens": 350671782.0, + "step": 14061 + }, + { + "epoch": 1.5442565341533054, + "grad_norm": 2.435537576675415, + "learning_rate": 1e-06, + "loss": 0.7242, + "mean_token_accuracy": 0.7659301161766052, + "num_tokens": 350692142.0, + "step": 14062 + }, + { + "epoch": 1.5443663518559192, + "grad_norm": 2.3179972171783447, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7457308769226074, + "num_tokens": 350715636.0, + "step": 14063 + }, + { + "epoch": 1.544476169558533, + "grad_norm": 2.21591854095459, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7184819579124451, + "num_tokens": 350742528.0, + "step": 14064 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 2.315349817276001, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7419108152389526, + "num_tokens": 350765593.0, + "step": 14065 + }, + { + "epoch": 1.54469580496376, + "grad_norm": 2.5466091632843018, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7223727107048035, + "num_tokens": 350785534.0, + "step": 14066 + }, + { + "epoch": 1.5448056226663738, + "grad_norm": 2.2073731422424316, + "learning_rate": 1e-06, + "loss": 0.754, + "mean_token_accuracy": 0.7564221620559692, + "num_tokens": 350808704.0, + "step": 14067 + }, + { + "epoch": 1.5449154403689875, + "grad_norm": 2.32588529586792, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.727232813835144, + "num_tokens": 350833093.0, + "step": 14068 + }, + { + "epoch": 1.5450252580716013, + "grad_norm": 2.3873627185821533, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7037222385406494, + "num_tokens": 350856766.0, + "step": 14069 + }, + { + "epoch": 1.5451350757742148, + "grad_norm": 2.053741216659546, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.692827582359314, + "num_tokens": 350887503.0, + "step": 14070 + }, + { + "epoch": 1.5452448934768284, + "grad_norm": 2.4355087280273438, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7316597700119019, + "num_tokens": 350909136.0, + "step": 14071 + }, + { + "epoch": 1.5453547111794421, + "grad_norm": 1.9726375341415405, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7116219997406006, + "num_tokens": 350942329.0, + "step": 14072 + }, + { + "epoch": 1.5454645288820559, + "grad_norm": 2.231985569000244, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6994023323059082, + "num_tokens": 350970914.0, + "step": 14073 + }, + { + "epoch": 1.5455743465846694, + "grad_norm": 2.3872196674346924, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7303992509841919, + "num_tokens": 350991890.0, + "step": 14074 + }, + { + "epoch": 1.545684164287283, + "grad_norm": 2.4940314292907715, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7365857362747192, + "num_tokens": 351012282.0, + "step": 14075 + }, + { + "epoch": 1.5457939819898967, + "grad_norm": 2.1242830753326416, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7231625318527222, + "num_tokens": 351039930.0, + "step": 14076 + }, + { + "epoch": 1.5459037996925105, + "grad_norm": 2.401134490966797, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.7063845992088318, + "num_tokens": 351064142.0, + "step": 14077 + }, + { + "epoch": 1.5460136173951242, + "grad_norm": 2.518681287765503, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7209727168083191, + "num_tokens": 351085623.0, + "step": 14078 + }, + { + "epoch": 1.5461234350977378, + "grad_norm": 2.163775682449341, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6955939531326294, + "num_tokens": 351116575.0, + "step": 14079 + }, + { + "epoch": 1.5462332528003513, + "grad_norm": 2.4599995613098145, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7324557900428772, + "num_tokens": 351139129.0, + "step": 14080 + }, + { + "epoch": 1.546343070502965, + "grad_norm": 2.263965606689453, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7278279066085815, + "num_tokens": 351165719.0, + "step": 14081 + }, + { + "epoch": 1.5464528882055788, + "grad_norm": 2.4886696338653564, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7159802317619324, + "num_tokens": 351186989.0, + "step": 14082 + }, + { + "epoch": 1.5465627059081926, + "grad_norm": 2.412876605987549, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7019041776657104, + "num_tokens": 351211882.0, + "step": 14083 + }, + { + "epoch": 1.546672523610806, + "grad_norm": 2.5294642448425293, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7215603590011597, + "num_tokens": 351234513.0, + "step": 14084 + }, + { + "epoch": 1.5467823413134196, + "grad_norm": 2.0848281383514404, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7202078104019165, + "num_tokens": 351261807.0, + "step": 14085 + }, + { + "epoch": 1.5468921590160334, + "grad_norm": 2.228746175765991, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7141098976135254, + "num_tokens": 351286390.0, + "step": 14086 + }, + { + "epoch": 1.5470019767186471, + "grad_norm": 2.2186150550842285, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7255423665046692, + "num_tokens": 351312644.0, + "step": 14087 + }, + { + "epoch": 1.5471117944212607, + "grad_norm": 2.4975781440734863, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7228046655654907, + "num_tokens": 351335460.0, + "step": 14088 + }, + { + "epoch": 1.5472216121238742, + "grad_norm": 2.9314396381378174, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7301097512245178, + "num_tokens": 351354757.0, + "step": 14089 + }, + { + "epoch": 1.547331429826488, + "grad_norm": 2.4831063747406006, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7347451448440552, + "num_tokens": 351375800.0, + "step": 14090 + }, + { + "epoch": 1.5474412475291017, + "grad_norm": 2.230274200439453, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.715518593788147, + "num_tokens": 351402220.0, + "step": 14091 + }, + { + "epoch": 1.5475510652317155, + "grad_norm": 2.7803046703338623, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7458629012107849, + "num_tokens": 351420693.0, + "step": 14092 + }, + { + "epoch": 1.547660882934329, + "grad_norm": 2.4336602687835693, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7336181998252869, + "num_tokens": 351443009.0, + "step": 14093 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 2.4000329971313477, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.726081371307373, + "num_tokens": 351465285.0, + "step": 14094 + }, + { + "epoch": 1.5478805183395563, + "grad_norm": 2.327876329421997, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7248175740242004, + "num_tokens": 351487864.0, + "step": 14095 + }, + { + "epoch": 1.54799033604217, + "grad_norm": 2.315577745437622, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7247571349143982, + "num_tokens": 351513658.0, + "step": 14096 + }, + { + "epoch": 1.5481001537447838, + "grad_norm": 2.06679630279541, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7318962216377258, + "num_tokens": 351541395.0, + "step": 14097 + }, + { + "epoch": 1.5482099714473974, + "grad_norm": 2.4740400314331055, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7152188420295715, + "num_tokens": 351563855.0, + "step": 14098 + }, + { + "epoch": 1.548319789150011, + "grad_norm": 2.117135524749756, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7244006991386414, + "num_tokens": 351592289.0, + "step": 14099 + }, + { + "epoch": 1.5484296068526247, + "grad_norm": 1.9757604598999023, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7049305438995361, + "num_tokens": 351624561.0, + "step": 14100 + }, + { + "epoch": 1.5485394245552384, + "grad_norm": 2.372068166732788, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7273790836334229, + "num_tokens": 351647352.0, + "step": 14101 + }, + { + "epoch": 1.548649242257852, + "grad_norm": 2.152064561843872, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6968549489974976, + "num_tokens": 351676041.0, + "step": 14102 + }, + { + "epoch": 1.5487590599604655, + "grad_norm": 2.2136728763580322, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7341939806938171, + "num_tokens": 351700787.0, + "step": 14103 + }, + { + "epoch": 1.5488688776630792, + "grad_norm": 2.166753053665161, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7187252044677734, + "num_tokens": 351727971.0, + "step": 14104 + }, + { + "epoch": 1.548978695365693, + "grad_norm": 2.0458648204803467, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7001177668571472, + "num_tokens": 351758687.0, + "step": 14105 + }, + { + "epoch": 1.5490885130683067, + "grad_norm": 2.178429126739502, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6982595324516296, + "num_tokens": 351786456.0, + "step": 14106 + }, + { + "epoch": 1.5491983307709203, + "grad_norm": 2.3002288341522217, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7227221727371216, + "num_tokens": 351809384.0, + "step": 14107 + }, + { + "epoch": 1.5493081484735338, + "grad_norm": 2.491276741027832, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7239981889724731, + "num_tokens": 351831805.0, + "step": 14108 + }, + { + "epoch": 1.5494179661761476, + "grad_norm": 2.390484094619751, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7183050513267517, + "num_tokens": 351852200.0, + "step": 14109 + }, + { + "epoch": 1.5495277838787613, + "grad_norm": 2.268702983856201, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7238086462020874, + "num_tokens": 351879468.0, + "step": 14110 + }, + { + "epoch": 1.5496376015813749, + "grad_norm": 2.666166305541992, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7272005081176758, + "num_tokens": 351899896.0, + "step": 14111 + }, + { + "epoch": 1.5497474192839886, + "grad_norm": 2.2167298793792725, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.693801999092102, + "num_tokens": 351928173.0, + "step": 14112 + }, + { + "epoch": 1.5498572369866022, + "grad_norm": 1.9689399003982544, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7242966890335083, + "num_tokens": 351960913.0, + "step": 14113 + }, + { + "epoch": 1.549967054689216, + "grad_norm": 2.5921883583068848, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.70225590467453, + "num_tokens": 351981667.0, + "step": 14114 + }, + { + "epoch": 1.5500768723918297, + "grad_norm": 2.423187732696533, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7297959327697754, + "num_tokens": 352003392.0, + "step": 14115 + }, + { + "epoch": 1.5501866900944432, + "grad_norm": 2.2396469116210938, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7255571484565735, + "num_tokens": 352029350.0, + "step": 14116 + }, + { + "epoch": 1.5502965077970567, + "grad_norm": 2.6927008628845215, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7273374795913696, + "num_tokens": 352049019.0, + "step": 14117 + }, + { + "epoch": 1.5504063254996705, + "grad_norm": 2.2856953144073486, + "learning_rate": 1e-06, + "loss": 0.7615, + "mean_token_accuracy": 0.7602147459983826, + "num_tokens": 352071634.0, + "step": 14118 + }, + { + "epoch": 1.5505161432022843, + "grad_norm": 2.5585806369781494, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7359501123428345, + "num_tokens": 352090691.0, + "step": 14119 + }, + { + "epoch": 1.550625960904898, + "grad_norm": 2.2732062339782715, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7360315918922424, + "num_tokens": 352115097.0, + "step": 14120 + }, + { + "epoch": 1.5507357786075116, + "grad_norm": 2.490086555480957, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7237789630889893, + "num_tokens": 352137325.0, + "step": 14121 + }, + { + "epoch": 1.550845596310125, + "grad_norm": 2.014789342880249, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7235254645347595, + "num_tokens": 352168596.0, + "step": 14122 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 2.1720447540283203, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7083688974380493, + "num_tokens": 352195993.0, + "step": 14123 + }, + { + "epoch": 1.5510652317153526, + "grad_norm": 2.321882724761963, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7210938930511475, + "num_tokens": 352220680.0, + "step": 14124 + }, + { + "epoch": 1.5511750494179661, + "grad_norm": 2.13179612159729, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.727855920791626, + "num_tokens": 352247976.0, + "step": 14125 + }, + { + "epoch": 1.55128486712058, + "grad_norm": 2.1930124759674072, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7060009241104126, + "num_tokens": 352275169.0, + "step": 14126 + }, + { + "epoch": 1.5513946848231934, + "grad_norm": 2.3337507247924805, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7543485760688782, + "num_tokens": 352296958.0, + "step": 14127 + }, + { + "epoch": 1.5515045025258072, + "grad_norm": 2.2533867359161377, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7431192398071289, + "num_tokens": 352321356.0, + "step": 14128 + }, + { + "epoch": 1.551614320228421, + "grad_norm": 2.329462766647339, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7124179601669312, + "num_tokens": 352345817.0, + "step": 14129 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 2.3908345699310303, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7407429218292236, + "num_tokens": 352369107.0, + "step": 14130 + }, + { + "epoch": 1.551833955633648, + "grad_norm": 2.097231388092041, + "learning_rate": 1e-06, + "loss": 0.8717, + "mean_token_accuracy": 0.7252856492996216, + "num_tokens": 352395821.0, + "step": 14131 + }, + { + "epoch": 1.5519437733362618, + "grad_norm": 2.326343297958374, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7216849327087402, + "num_tokens": 352423027.0, + "step": 14132 + }, + { + "epoch": 1.5520535910388755, + "grad_norm": 2.29184889793396, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7287279367446899, + "num_tokens": 352447629.0, + "step": 14133 + }, + { + "epoch": 1.5521634087414893, + "grad_norm": 2.7431211471557617, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7436163425445557, + "num_tokens": 352466971.0, + "step": 14134 + }, + { + "epoch": 1.5522732264441028, + "grad_norm": 2.261922597885132, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7383393049240112, + "num_tokens": 352493277.0, + "step": 14135 + }, + { + "epoch": 1.5523830441467164, + "grad_norm": 2.196593761444092, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7207561731338501, + "num_tokens": 352519517.0, + "step": 14136 + }, + { + "epoch": 1.55249286184933, + "grad_norm": 2.0029945373535156, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7393920421600342, + "num_tokens": 352548571.0, + "step": 14137 + }, + { + "epoch": 1.5526026795519439, + "grad_norm": 2.4024910926818848, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.697201669216156, + "num_tokens": 352572722.0, + "step": 14138 + }, + { + "epoch": 1.5527124972545574, + "grad_norm": 2.311168909072876, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7104887962341309, + "num_tokens": 352597243.0, + "step": 14139 + }, + { + "epoch": 1.552822314957171, + "grad_norm": 2.4435925483703613, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7232507467269897, + "num_tokens": 352621529.0, + "step": 14140 + }, + { + "epoch": 1.5529321326597847, + "grad_norm": 2.310035467147827, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7160026431083679, + "num_tokens": 352644809.0, + "step": 14141 + }, + { + "epoch": 1.5530419503623984, + "grad_norm": 2.0450804233551025, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7055217623710632, + "num_tokens": 352677274.0, + "step": 14142 + }, + { + "epoch": 1.5531517680650122, + "grad_norm": 2.6388015747070312, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7290346026420593, + "num_tokens": 352698367.0, + "step": 14143 + }, + { + "epoch": 1.5532615857676257, + "grad_norm": 2.547558069229126, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.721753716468811, + "num_tokens": 352718942.0, + "step": 14144 + }, + { + "epoch": 1.5533714034702393, + "grad_norm": 2.143427848815918, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7130792737007141, + "num_tokens": 352744832.0, + "step": 14145 + }, + { + "epoch": 1.553481221172853, + "grad_norm": 2.1529698371887207, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7314548492431641, + "num_tokens": 352770903.0, + "step": 14146 + }, + { + "epoch": 1.5535910388754668, + "grad_norm": 2.336423635482788, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7101857662200928, + "num_tokens": 352795999.0, + "step": 14147 + }, + { + "epoch": 1.5537008565780805, + "grad_norm": 2.521177053451538, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7387584447860718, + "num_tokens": 352818018.0, + "step": 14148 + }, + { + "epoch": 1.553810674280694, + "grad_norm": 2.3044590950012207, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7257764339447021, + "num_tokens": 352840990.0, + "step": 14149 + }, + { + "epoch": 1.5539204919833076, + "grad_norm": 2.289278984069824, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7039070725440979, + "num_tokens": 352864113.0, + "step": 14150 + }, + { + "epoch": 1.5540303096859214, + "grad_norm": 2.413541078567505, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7479907274246216, + "num_tokens": 352885514.0, + "step": 14151 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 2.3486735820770264, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7231859564781189, + "num_tokens": 352909799.0, + "step": 14152 + }, + { + "epoch": 1.5542499450911487, + "grad_norm": 2.0692121982574463, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7212050557136536, + "num_tokens": 352938935.0, + "step": 14153 + }, + { + "epoch": 1.5543597627937622, + "grad_norm": 2.772430419921875, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7278118133544922, + "num_tokens": 352958770.0, + "step": 14154 + }, + { + "epoch": 1.554469580496376, + "grad_norm": 2.2441248893737793, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7325119972229004, + "num_tokens": 352986055.0, + "step": 14155 + }, + { + "epoch": 1.5545793981989897, + "grad_norm": 2.1082282066345215, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7034049034118652, + "num_tokens": 353017729.0, + "step": 14156 + }, + { + "epoch": 1.5546892159016035, + "grad_norm": 2.2579996585845947, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7290568351745605, + "num_tokens": 353046043.0, + "step": 14157 + }, + { + "epoch": 1.554799033604217, + "grad_norm": 2.0667526721954346, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7088610529899597, + "num_tokens": 353076471.0, + "step": 14158 + }, + { + "epoch": 1.5549088513068305, + "grad_norm": 2.012786865234375, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7066482305526733, + "num_tokens": 353106373.0, + "step": 14159 + }, + { + "epoch": 1.5550186690094443, + "grad_norm": 2.186392068862915, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7003619074821472, + "num_tokens": 353136405.0, + "step": 14160 + }, + { + "epoch": 1.555128486712058, + "grad_norm": 2.407515287399292, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7172712683677673, + "num_tokens": 353160555.0, + "step": 14161 + }, + { + "epoch": 1.5552383044146716, + "grad_norm": 2.50740385055542, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6964538097381592, + "num_tokens": 353183463.0, + "step": 14162 + }, + { + "epoch": 1.5553481221172853, + "grad_norm": 2.2915823459625244, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6999731063842773, + "num_tokens": 353206608.0, + "step": 14163 + }, + { + "epoch": 1.5554579398198989, + "grad_norm": 2.6626474857330322, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7330831289291382, + "num_tokens": 353226315.0, + "step": 14164 + }, + { + "epoch": 1.5555677575225126, + "grad_norm": 2.2984278202056885, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7224481105804443, + "num_tokens": 353250065.0, + "step": 14165 + }, + { + "epoch": 1.5556775752251264, + "grad_norm": 2.4602413177490234, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7076879739761353, + "num_tokens": 353272206.0, + "step": 14166 + }, + { + "epoch": 1.55578739292774, + "grad_norm": 2.251750946044922, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7199888229370117, + "num_tokens": 353298467.0, + "step": 14167 + }, + { + "epoch": 1.5558972106303535, + "grad_norm": 2.268983840942383, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.708301305770874, + "num_tokens": 353322966.0, + "step": 14168 + }, + { + "epoch": 1.5560070283329672, + "grad_norm": 2.257373094558716, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7220438718795776, + "num_tokens": 353347463.0, + "step": 14169 + }, + { + "epoch": 1.556116846035581, + "grad_norm": 2.1576743125915527, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7185487747192383, + "num_tokens": 353375490.0, + "step": 14170 + }, + { + "epoch": 1.5562266637381947, + "grad_norm": 2.245487928390503, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7162806987762451, + "num_tokens": 353400979.0, + "step": 14171 + }, + { + "epoch": 1.5563364814408083, + "grad_norm": 2.5770812034606934, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7409806847572327, + "num_tokens": 353420424.0, + "step": 14172 + }, + { + "epoch": 1.5564462991434218, + "grad_norm": 2.3801167011260986, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7140467762947083, + "num_tokens": 353444823.0, + "step": 14173 + }, + { + "epoch": 1.5565561168460356, + "grad_norm": 2.4748828411102295, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7249323129653931, + "num_tokens": 353467087.0, + "step": 14174 + }, + { + "epoch": 1.5566659345486493, + "grad_norm": 2.280749797821045, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7027162313461304, + "num_tokens": 353493494.0, + "step": 14175 + }, + { + "epoch": 1.5567757522512629, + "grad_norm": 2.7690083980560303, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7072038054466248, + "num_tokens": 353512349.0, + "step": 14176 + }, + { + "epoch": 1.5568855699538766, + "grad_norm": 2.5384342670440674, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7345177531242371, + "num_tokens": 353533653.0, + "step": 14177 + }, + { + "epoch": 1.5569953876564901, + "grad_norm": 2.528012990951538, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7228567600250244, + "num_tokens": 353553560.0, + "step": 14178 + }, + { + "epoch": 1.557105205359104, + "grad_norm": 2.3016061782836914, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7211838364601135, + "num_tokens": 353578164.0, + "step": 14179 + }, + { + "epoch": 1.5572150230617177, + "grad_norm": 2.148704767227173, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7074247598648071, + "num_tokens": 353603313.0, + "step": 14180 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 2.2761082649230957, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7425723075866699, + "num_tokens": 353626150.0, + "step": 14181 + }, + { + "epoch": 1.5574346584669447, + "grad_norm": 2.291062355041504, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7262905240058899, + "num_tokens": 353650030.0, + "step": 14182 + }, + { + "epoch": 1.5575444761695585, + "grad_norm": 2.395339250564575, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.7489508390426636, + "num_tokens": 353670239.0, + "step": 14183 + }, + { + "epoch": 1.5576542938721722, + "grad_norm": 2.487614870071411, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7212063074111938, + "num_tokens": 353692493.0, + "step": 14184 + }, + { + "epoch": 1.557764111574786, + "grad_norm": 2.3791699409484863, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.730920672416687, + "num_tokens": 353715751.0, + "step": 14185 + }, + { + "epoch": 1.5578739292773995, + "grad_norm": 2.2924768924713135, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7228514552116394, + "num_tokens": 353739588.0, + "step": 14186 + }, + { + "epoch": 1.557983746980013, + "grad_norm": 2.388239622116089, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7065250873565674, + "num_tokens": 353764205.0, + "step": 14187 + }, + { + "epoch": 1.5580935646826268, + "grad_norm": 2.182626724243164, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7287845611572266, + "num_tokens": 353793245.0, + "step": 14188 + }, + { + "epoch": 1.5582033823852406, + "grad_norm": 1.9254752397537231, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.70477294921875, + "num_tokens": 353825341.0, + "step": 14189 + }, + { + "epoch": 1.5583132000878541, + "grad_norm": 2.341792106628418, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6982380151748657, + "num_tokens": 353851732.0, + "step": 14190 + }, + { + "epoch": 1.5584230177904677, + "grad_norm": 2.2661399841308594, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7229210138320923, + "num_tokens": 353878761.0, + "step": 14191 + }, + { + "epoch": 1.5585328354930814, + "grad_norm": 2.1626949310302734, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7133476138114929, + "num_tokens": 353907876.0, + "step": 14192 + }, + { + "epoch": 1.5586426531956952, + "grad_norm": 2.36488676071167, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7376132011413574, + "num_tokens": 353930224.0, + "step": 14193 + }, + { + "epoch": 1.558752470898309, + "grad_norm": 2.1344714164733887, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.733467698097229, + "num_tokens": 353955169.0, + "step": 14194 + }, + { + "epoch": 1.5588622886009225, + "grad_norm": 2.3857059478759766, + "learning_rate": 1e-06, + "loss": 0.7607, + "mean_token_accuracy": 0.752282440662384, + "num_tokens": 353978456.0, + "step": 14195 + }, + { + "epoch": 1.558972106303536, + "grad_norm": 2.8557310104370117, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7442026734352112, + "num_tokens": 353996885.0, + "step": 14196 + }, + { + "epoch": 1.5590819240061498, + "grad_norm": 2.4222962856292725, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7111507058143616, + "num_tokens": 354022418.0, + "step": 14197 + }, + { + "epoch": 1.5591917417087635, + "grad_norm": 2.1977171897888184, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7352461218833923, + "num_tokens": 354049185.0, + "step": 14198 + }, + { + "epoch": 1.5593015594113773, + "grad_norm": 2.183713436126709, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7323490381240845, + "num_tokens": 354074295.0, + "step": 14199 + }, + { + "epoch": 1.5594113771139908, + "grad_norm": 2.0538384914398193, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7185913324356079, + "num_tokens": 354103102.0, + "step": 14200 + }, + { + "epoch": 1.5595211948166043, + "grad_norm": 2.4037580490112305, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7084441184997559, + "num_tokens": 354128098.0, + "step": 14201 + }, + { + "epoch": 1.559631012519218, + "grad_norm": 2.1236965656280518, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7221184968948364, + "num_tokens": 354154983.0, + "step": 14202 + }, + { + "epoch": 1.5597408302218319, + "grad_norm": 2.5267350673675537, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7494348287582397, + "num_tokens": 354176489.0, + "step": 14203 + }, + { + "epoch": 1.5598506479244454, + "grad_norm": 2.354851484298706, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7123172879219055, + "num_tokens": 354201453.0, + "step": 14204 + }, + { + "epoch": 1.559960465627059, + "grad_norm": 2.153491258621216, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7239077091217041, + "num_tokens": 354228323.0, + "step": 14205 + }, + { + "epoch": 1.5600702833296727, + "grad_norm": 2.3628883361816406, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6960815191268921, + "num_tokens": 354252915.0, + "step": 14206 + }, + { + "epoch": 1.5601801010322864, + "grad_norm": 2.2964529991149902, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7104654908180237, + "num_tokens": 354276700.0, + "step": 14207 + }, + { + "epoch": 1.5602899187349002, + "grad_norm": 2.268275260925293, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7361427545547485, + "num_tokens": 354301142.0, + "step": 14208 + }, + { + "epoch": 1.5603997364375137, + "grad_norm": 2.01823353767395, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7050195336341858, + "num_tokens": 354332353.0, + "step": 14209 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 2.2175960540771484, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.720698356628418, + "num_tokens": 354356472.0, + "step": 14210 + }, + { + "epoch": 1.560619371842741, + "grad_norm": 2.554736375808716, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7319904565811157, + "num_tokens": 354374829.0, + "step": 14211 + }, + { + "epoch": 1.5607291895453548, + "grad_norm": 2.1510508060455322, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7281378507614136, + "num_tokens": 354401571.0, + "step": 14212 + }, + { + "epoch": 1.5608390072479685, + "grad_norm": 2.2466890811920166, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7272618412971497, + "num_tokens": 354428024.0, + "step": 14213 + }, + { + "epoch": 1.560948824950582, + "grad_norm": 2.162214994430542, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.723042368888855, + "num_tokens": 354454182.0, + "step": 14214 + }, + { + "epoch": 1.5610586426531956, + "grad_norm": 2.2354183197021484, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7217103242874146, + "num_tokens": 354480933.0, + "step": 14215 + }, + { + "epoch": 1.5611684603558094, + "grad_norm": 2.1290969848632812, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7049899101257324, + "num_tokens": 354510901.0, + "step": 14216 + }, + { + "epoch": 1.5612782780584231, + "grad_norm": 2.3829307556152344, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7190810441970825, + "num_tokens": 354533430.0, + "step": 14217 + }, + { + "epoch": 1.5613880957610367, + "grad_norm": 1.9750096797943115, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7273930907249451, + "num_tokens": 354565396.0, + "step": 14218 + }, + { + "epoch": 1.5614979134636502, + "grad_norm": 2.502781391143799, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7362752556800842, + "num_tokens": 354587298.0, + "step": 14219 + }, + { + "epoch": 1.561607731166264, + "grad_norm": 2.6578943729400635, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.724535346031189, + "num_tokens": 354607065.0, + "step": 14220 + }, + { + "epoch": 1.5617175488688777, + "grad_norm": 2.806419849395752, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7306575775146484, + "num_tokens": 354628699.0, + "step": 14221 + }, + { + "epoch": 1.5618273665714915, + "grad_norm": 2.3912720680236816, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7228528261184692, + "num_tokens": 354653909.0, + "step": 14222 + }, + { + "epoch": 1.561937184274105, + "grad_norm": 2.398314952850342, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7106524705886841, + "num_tokens": 354678693.0, + "step": 14223 + }, + { + "epoch": 1.5620470019767185, + "grad_norm": 2.1845593452453613, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7335996627807617, + "num_tokens": 354705690.0, + "step": 14224 + }, + { + "epoch": 1.5621568196793323, + "grad_norm": 2.536684036254883, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7130711078643799, + "num_tokens": 354725531.0, + "step": 14225 + }, + { + "epoch": 1.562266637381946, + "grad_norm": 2.3348662853240967, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7047604322433472, + "num_tokens": 354749617.0, + "step": 14226 + }, + { + "epoch": 1.5623764550845596, + "grad_norm": 2.276186943054199, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7248235940933228, + "num_tokens": 354776502.0, + "step": 14227 + }, + { + "epoch": 1.5624862727871733, + "grad_norm": 2.550828218460083, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7244444489479065, + "num_tokens": 354796325.0, + "step": 14228 + }, + { + "epoch": 1.5625960904897869, + "grad_norm": 2.2829418182373047, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7198629379272461, + "num_tokens": 354822192.0, + "step": 14229 + }, + { + "epoch": 1.5627059081924006, + "grad_norm": 2.1918342113494873, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6959978342056274, + "num_tokens": 354849882.0, + "step": 14230 + }, + { + "epoch": 1.5628157258950144, + "grad_norm": 2.3601865768432617, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7222521305084229, + "num_tokens": 354875305.0, + "step": 14231 + }, + { + "epoch": 1.562925543597628, + "grad_norm": 2.324601650238037, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7297679781913757, + "num_tokens": 354898337.0, + "step": 14232 + }, + { + "epoch": 1.5630353613002415, + "grad_norm": 2.3185436725616455, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7213397026062012, + "num_tokens": 354921520.0, + "step": 14233 + }, + { + "epoch": 1.5631451790028552, + "grad_norm": 2.3028249740600586, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7065272331237793, + "num_tokens": 354945779.0, + "step": 14234 + }, + { + "epoch": 1.563254996705469, + "grad_norm": 2.6438562870025635, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7456756234169006, + "num_tokens": 354964432.0, + "step": 14235 + }, + { + "epoch": 1.5633648144080827, + "grad_norm": 2.044079542160034, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7160913944244385, + "num_tokens": 354994576.0, + "step": 14236 + }, + { + "epoch": 1.5634746321106963, + "grad_norm": 2.4514193534851074, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.739786684513092, + "num_tokens": 355014957.0, + "step": 14237 + }, + { + "epoch": 1.5635844498133098, + "grad_norm": 2.169797420501709, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7282370328903198, + "num_tokens": 355042510.0, + "step": 14238 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 2.3108410835266113, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7004023790359497, + "num_tokens": 355069600.0, + "step": 14239 + }, + { + "epoch": 1.5638040852185373, + "grad_norm": 2.4319510459899902, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7356382608413696, + "num_tokens": 355090330.0, + "step": 14240 + }, + { + "epoch": 1.5639139029211508, + "grad_norm": 2.1658222675323486, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7273526191711426, + "num_tokens": 355116798.0, + "step": 14241 + }, + { + "epoch": 1.5640237206237646, + "grad_norm": 2.0665647983551025, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.705547571182251, + "num_tokens": 355147485.0, + "step": 14242 + }, + { + "epoch": 1.5641335383263781, + "grad_norm": 2.5370209217071533, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7296491265296936, + "num_tokens": 355169817.0, + "step": 14243 + }, + { + "epoch": 1.564243356028992, + "grad_norm": 2.0382773876190186, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.700018048286438, + "num_tokens": 355200131.0, + "step": 14244 + }, + { + "epoch": 1.5643531737316057, + "grad_norm": 2.0195770263671875, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7109239101409912, + "num_tokens": 355230489.0, + "step": 14245 + }, + { + "epoch": 1.5644629914342192, + "grad_norm": 2.074140787124634, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7191943526268005, + "num_tokens": 355259235.0, + "step": 14246 + }, + { + "epoch": 1.5645728091368327, + "grad_norm": 2.1920511722564697, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7432057857513428, + "num_tokens": 355284402.0, + "step": 14247 + }, + { + "epoch": 1.5646826268394465, + "grad_norm": 2.379004955291748, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7127548456192017, + "num_tokens": 355309188.0, + "step": 14248 + }, + { + "epoch": 1.5647924445420602, + "grad_norm": 2.241359233856201, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7284096479415894, + "num_tokens": 355334691.0, + "step": 14249 + }, + { + "epoch": 1.564902262244674, + "grad_norm": 2.1283085346221924, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.716674268245697, + "num_tokens": 355360942.0, + "step": 14250 + }, + { + "epoch": 1.5650120799472875, + "grad_norm": 2.5590124130249023, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7258777618408203, + "num_tokens": 355380876.0, + "step": 14251 + }, + { + "epoch": 1.565121897649901, + "grad_norm": 2.380697250366211, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7077895402908325, + "num_tokens": 355404198.0, + "step": 14252 + }, + { + "epoch": 1.5652317153525148, + "grad_norm": 2.138049602508545, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7217339277267456, + "num_tokens": 355432013.0, + "step": 14253 + }, + { + "epoch": 1.5653415330551286, + "grad_norm": 2.136009693145752, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7103123068809509, + "num_tokens": 355459857.0, + "step": 14254 + }, + { + "epoch": 1.565451350757742, + "grad_norm": 2.7762982845306396, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.745668888092041, + "num_tokens": 355477369.0, + "step": 14255 + }, + { + "epoch": 1.5655611684603556, + "grad_norm": 2.6033802032470703, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.6999701261520386, + "num_tokens": 355498958.0, + "step": 14256 + }, + { + "epoch": 1.5656709861629694, + "grad_norm": 2.039447784423828, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7240680456161499, + "num_tokens": 355532575.0, + "step": 14257 + }, + { + "epoch": 1.5657808038655832, + "grad_norm": 2.5076961517333984, + "learning_rate": 1e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7442591190338135, + "num_tokens": 355554820.0, + "step": 14258 + }, + { + "epoch": 1.565890621568197, + "grad_norm": 2.115097999572754, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7143694162368774, + "num_tokens": 355586377.0, + "step": 14259 + }, + { + "epoch": 1.5660004392708105, + "grad_norm": 2.2347209453582764, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7358391284942627, + "num_tokens": 355608694.0, + "step": 14260 + }, + { + "epoch": 1.566110256973424, + "grad_norm": 2.1347732543945312, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.711799144744873, + "num_tokens": 355638047.0, + "step": 14261 + }, + { + "epoch": 1.5662200746760377, + "grad_norm": 2.1095311641693115, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7034088373184204, + "num_tokens": 355667282.0, + "step": 14262 + }, + { + "epoch": 1.5663298923786515, + "grad_norm": 2.368244171142578, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7105401754379272, + "num_tokens": 355690458.0, + "step": 14263 + }, + { + "epoch": 1.5664397100812653, + "grad_norm": 2.3557987213134766, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7172192931175232, + "num_tokens": 355713430.0, + "step": 14264 + }, + { + "epoch": 1.5665495277838788, + "grad_norm": 2.2124884128570557, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7128032445907593, + "num_tokens": 355741611.0, + "step": 14265 + }, + { + "epoch": 1.5666593454864923, + "grad_norm": 2.1733896732330322, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.710186243057251, + "num_tokens": 355769814.0, + "step": 14266 + }, + { + "epoch": 1.566769163189106, + "grad_norm": 1.9327627420425415, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7004438638687134, + "num_tokens": 355804013.0, + "step": 14267 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 2.240833044052124, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7187854051589966, + "num_tokens": 355829591.0, + "step": 14268 + }, + { + "epoch": 1.5669887985943334, + "grad_norm": 2.2479658126831055, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7083827257156372, + "num_tokens": 355854325.0, + "step": 14269 + }, + { + "epoch": 1.567098616296947, + "grad_norm": 2.2611746788024902, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7287003993988037, + "num_tokens": 355879436.0, + "step": 14270 + }, + { + "epoch": 1.5672084339995607, + "grad_norm": 2.094242811203003, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7266074419021606, + "num_tokens": 355910072.0, + "step": 14271 + }, + { + "epoch": 1.5673182517021744, + "grad_norm": 2.256847858428955, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7222105264663696, + "num_tokens": 355933476.0, + "step": 14272 + }, + { + "epoch": 1.5674280694047882, + "grad_norm": 2.3333547115325928, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7272007465362549, + "num_tokens": 355958276.0, + "step": 14273 + }, + { + "epoch": 1.5675378871074017, + "grad_norm": 2.2874486446380615, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7163463830947876, + "num_tokens": 355983202.0, + "step": 14274 + }, + { + "epoch": 1.5676477048100153, + "grad_norm": 2.371910333633423, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.716643214225769, + "num_tokens": 356008633.0, + "step": 14275 + }, + { + "epoch": 1.567757522512629, + "grad_norm": 2.4466559886932373, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7370113134384155, + "num_tokens": 356030105.0, + "step": 14276 + }, + { + "epoch": 1.5678673402152428, + "grad_norm": 2.238424062728882, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7343604564666748, + "num_tokens": 356055378.0, + "step": 14277 + }, + { + "epoch": 1.5679771579178565, + "grad_norm": 2.020174264907837, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7133412957191467, + "num_tokens": 356085908.0, + "step": 14278 + }, + { + "epoch": 1.56808697562047, + "grad_norm": 2.414130687713623, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7258946895599365, + "num_tokens": 356109919.0, + "step": 14279 + }, + { + "epoch": 1.5681967933230836, + "grad_norm": 2.1496737003326416, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.686187744140625, + "num_tokens": 356140387.0, + "step": 14280 + }, + { + "epoch": 1.5683066110256974, + "grad_norm": 2.2561140060424805, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7353217601776123, + "num_tokens": 356165548.0, + "step": 14281 + }, + { + "epoch": 1.568416428728311, + "grad_norm": 2.32360577583313, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7210425734519958, + "num_tokens": 356191899.0, + "step": 14282 + }, + { + "epoch": 1.5685262464309246, + "grad_norm": 1.9497157335281372, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7225455045700073, + "num_tokens": 356223194.0, + "step": 14283 + }, + { + "epoch": 1.5686360641335382, + "grad_norm": 2.1051621437072754, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7137187123298645, + "num_tokens": 356250328.0, + "step": 14284 + }, + { + "epoch": 1.568745881836152, + "grad_norm": 2.5247714519500732, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7407650351524353, + "num_tokens": 356270857.0, + "step": 14285 + }, + { + "epoch": 1.5688556995387657, + "grad_norm": 2.266812324523926, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7250866889953613, + "num_tokens": 356293748.0, + "step": 14286 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 2.365382671356201, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7529248595237732, + "num_tokens": 356315430.0, + "step": 14287 + }, + { + "epoch": 1.569075334943993, + "grad_norm": 2.331878900527954, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.731011152267456, + "num_tokens": 356338579.0, + "step": 14288 + }, + { + "epoch": 1.5691851526466065, + "grad_norm": 2.100454330444336, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7116910219192505, + "num_tokens": 356366393.0, + "step": 14289 + }, + { + "epoch": 1.5692949703492203, + "grad_norm": 2.1987555027008057, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7314302921295166, + "num_tokens": 356390944.0, + "step": 14290 + }, + { + "epoch": 1.569404788051834, + "grad_norm": 1.9970653057098389, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.718596339225769, + "num_tokens": 356422230.0, + "step": 14291 + }, + { + "epoch": 1.5695146057544476, + "grad_norm": 2.190821647644043, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7298015356063843, + "num_tokens": 356447958.0, + "step": 14292 + }, + { + "epoch": 1.5696244234570613, + "grad_norm": 2.4100801944732666, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7368420362472534, + "num_tokens": 356471214.0, + "step": 14293 + }, + { + "epoch": 1.5697342411596749, + "grad_norm": 2.0765271186828613, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7324774265289307, + "num_tokens": 356500155.0, + "step": 14294 + }, + { + "epoch": 1.5698440588622886, + "grad_norm": 2.4752495288848877, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7140806317329407, + "num_tokens": 356520891.0, + "step": 14295 + }, + { + "epoch": 1.5699538765649024, + "grad_norm": 2.380570411682129, + "learning_rate": 1e-06, + "loss": 0.8138, + "mean_token_accuracy": 0.7470154762268066, + "num_tokens": 356542783.0, + "step": 14296 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 2.0628762245178223, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.6997923254966736, + "num_tokens": 356573963.0, + "step": 14297 + }, + { + "epoch": 1.5701735119701294, + "grad_norm": 2.3435447216033936, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7360799312591553, + "num_tokens": 356596420.0, + "step": 14298 + }, + { + "epoch": 1.5702833296727432, + "grad_norm": 2.1445798873901367, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7068472504615784, + "num_tokens": 356624764.0, + "step": 14299 + }, + { + "epoch": 1.570393147375357, + "grad_norm": 2.2249464988708496, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7071348428726196, + "num_tokens": 356653699.0, + "step": 14300 + }, + { + "epoch": 1.5705029650779707, + "grad_norm": 2.2011876106262207, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6954226493835449, + "num_tokens": 356681013.0, + "step": 14301 + }, + { + "epoch": 1.5706127827805842, + "grad_norm": 2.451369524002075, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7105072736740112, + "num_tokens": 356703733.0, + "step": 14302 + }, + { + "epoch": 1.5707226004831978, + "grad_norm": 2.37947940826416, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7417018413543701, + "num_tokens": 356725882.0, + "step": 14303 + }, + { + "epoch": 1.5708324181858115, + "grad_norm": 2.0329041481018066, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7093442678451538, + "num_tokens": 356755906.0, + "step": 14304 + }, + { + "epoch": 1.5709422358884253, + "grad_norm": 2.253844738006592, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7414961457252502, + "num_tokens": 356778809.0, + "step": 14305 + }, + { + "epoch": 1.5710520535910388, + "grad_norm": 2.2213351726531982, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.703681230545044, + "num_tokens": 356805985.0, + "step": 14306 + }, + { + "epoch": 1.5711618712936526, + "grad_norm": 2.3108608722686768, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7022243142127991, + "num_tokens": 356831204.0, + "step": 14307 + }, + { + "epoch": 1.5712716889962661, + "grad_norm": 2.13260555267334, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7437617778778076, + "num_tokens": 356855646.0, + "step": 14308 + }, + { + "epoch": 1.5713815066988799, + "grad_norm": 2.301408290863037, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7409665584564209, + "num_tokens": 356878455.0, + "step": 14309 + }, + { + "epoch": 1.5714913244014936, + "grad_norm": 2.231626272201538, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7335134744644165, + "num_tokens": 356905335.0, + "step": 14310 + }, + { + "epoch": 1.5716011421041072, + "grad_norm": 2.0112249851226807, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7161688804626465, + "num_tokens": 356935587.0, + "step": 14311 + }, + { + "epoch": 1.5717109598067207, + "grad_norm": 2.13957142829895, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7148773670196533, + "num_tokens": 356962502.0, + "step": 14312 + }, + { + "epoch": 1.5718207775093345, + "grad_norm": 2.3796732425689697, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.720630407333374, + "num_tokens": 356984916.0, + "step": 14313 + }, + { + "epoch": 1.5719305952119482, + "grad_norm": 2.675882577896118, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7476387023925781, + "num_tokens": 357003465.0, + "step": 14314 + }, + { + "epoch": 1.572040412914562, + "grad_norm": 2.2957212924957275, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7458033561706543, + "num_tokens": 357026804.0, + "step": 14315 + }, + { + "epoch": 1.5721502306171755, + "grad_norm": 2.092745065689087, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7160030603408813, + "num_tokens": 357057506.0, + "step": 14316 + }, + { + "epoch": 1.572260048319789, + "grad_norm": 2.282099723815918, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6946579813957214, + "num_tokens": 357083367.0, + "step": 14317 + }, + { + "epoch": 1.5723698660224028, + "grad_norm": 2.2147490978240967, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7151687145233154, + "num_tokens": 357109156.0, + "step": 14318 + }, + { + "epoch": 1.5724796837250166, + "grad_norm": 2.169417381286621, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7072964906692505, + "num_tokens": 357137137.0, + "step": 14319 + }, + { + "epoch": 1.57258950142763, + "grad_norm": 2.2265782356262207, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7215296030044556, + "num_tokens": 357163252.0, + "step": 14320 + }, + { + "epoch": 1.5726993191302436, + "grad_norm": 2.065239906311035, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.701137125492096, + "num_tokens": 357194256.0, + "step": 14321 + }, + { + "epoch": 1.5728091368328574, + "grad_norm": 2.496652126312256, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7212494015693665, + "num_tokens": 357215585.0, + "step": 14322 + }, + { + "epoch": 1.5729189545354711, + "grad_norm": 2.108663320541382, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7156739234924316, + "num_tokens": 357244327.0, + "step": 14323 + }, + { + "epoch": 1.573028772238085, + "grad_norm": 2.1888530254364014, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7153321504592896, + "num_tokens": 357271062.0, + "step": 14324 + }, + { + "epoch": 1.5731385899406984, + "grad_norm": 2.474604845046997, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.724269688129425, + "num_tokens": 357292985.0, + "step": 14325 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 2.1761457920074463, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7223151922225952, + "num_tokens": 357318569.0, + "step": 14326 + }, + { + "epoch": 1.5733582253459257, + "grad_norm": 2.104194164276123, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7019209861755371, + "num_tokens": 357350916.0, + "step": 14327 + }, + { + "epoch": 1.5734680430485395, + "grad_norm": 2.508666515350342, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7409205436706543, + "num_tokens": 357371795.0, + "step": 14328 + }, + { + "epoch": 1.5735778607511532, + "grad_norm": 2.3156416416168213, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.6977477669715881, + "num_tokens": 357396203.0, + "step": 14329 + }, + { + "epoch": 1.5736876784537668, + "grad_norm": 2.172046661376953, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7055724859237671, + "num_tokens": 357422849.0, + "step": 14330 + }, + { + "epoch": 1.5737974961563803, + "grad_norm": 2.6082186698913574, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7418569326400757, + "num_tokens": 357441157.0, + "step": 14331 + }, + { + "epoch": 1.573907313858994, + "grad_norm": 2.1693592071533203, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7192637920379639, + "num_tokens": 357466185.0, + "step": 14332 + }, + { + "epoch": 1.5740171315616078, + "grad_norm": 2.3540055751800537, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7154219150543213, + "num_tokens": 357489949.0, + "step": 14333 + }, + { + "epoch": 1.5741269492642214, + "grad_norm": 2.2306745052337646, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6855362057685852, + "num_tokens": 357517381.0, + "step": 14334 + }, + { + "epoch": 1.574236766966835, + "grad_norm": 2.156219720840454, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7012972831726074, + "num_tokens": 357543606.0, + "step": 14335 + }, + { + "epoch": 1.5743465846694487, + "grad_norm": 2.1807303428649902, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7220118045806885, + "num_tokens": 357569544.0, + "step": 14336 + }, + { + "epoch": 1.5744564023720624, + "grad_norm": 2.3186423778533936, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7020564079284668, + "num_tokens": 357593826.0, + "step": 14337 + }, + { + "epoch": 1.5745662200746762, + "grad_norm": 2.6024699211120605, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7277524471282959, + "num_tokens": 357613525.0, + "step": 14338 + }, + { + "epoch": 1.5746760377772897, + "grad_norm": 2.2115864753723145, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7233747839927673, + "num_tokens": 357638109.0, + "step": 14339 + }, + { + "epoch": 1.5747858554799032, + "grad_norm": 2.1171200275421143, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7056868076324463, + "num_tokens": 357665402.0, + "step": 14340 + }, + { + "epoch": 1.574895673182517, + "grad_norm": 2.1260428428649902, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7312808632850647, + "num_tokens": 357692749.0, + "step": 14341 + }, + { + "epoch": 1.5750054908851308, + "grad_norm": 2.482473373413086, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7407082319259644, + "num_tokens": 357715104.0, + "step": 14342 + }, + { + "epoch": 1.5751153085877443, + "grad_norm": 2.5080361366271973, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.720150351524353, + "num_tokens": 357735962.0, + "step": 14343 + }, + { + "epoch": 1.575225126290358, + "grad_norm": 2.0665786266326904, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6923993825912476, + "num_tokens": 357767288.0, + "step": 14344 + }, + { + "epoch": 1.5753349439929716, + "grad_norm": 2.1377928256988525, + "learning_rate": 1e-06, + "loss": 0.8649, + "mean_token_accuracy": 0.7312807440757751, + "num_tokens": 357794331.0, + "step": 14345 + }, + { + "epoch": 1.5754447616955853, + "grad_norm": 2.5012145042419434, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7332198023796082, + "num_tokens": 357814194.0, + "step": 14346 + }, + { + "epoch": 1.575554579398199, + "grad_norm": 2.2088184356689453, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7009634971618652, + "num_tokens": 357841187.0, + "step": 14347 + }, + { + "epoch": 1.5756643971008126, + "grad_norm": 2.4336180686950684, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7302045822143555, + "num_tokens": 357863895.0, + "step": 14348 + }, + { + "epoch": 1.5757742148034262, + "grad_norm": 2.1055426597595215, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6999067068099976, + "num_tokens": 357898406.0, + "step": 14349 + }, + { + "epoch": 1.57588403250604, + "grad_norm": 2.421532154083252, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7258237600326538, + "num_tokens": 357920469.0, + "step": 14350 + }, + { + "epoch": 1.5759938502086537, + "grad_norm": 2.33837628364563, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7357772588729858, + "num_tokens": 357943103.0, + "step": 14351 + }, + { + "epoch": 1.5761036679112674, + "grad_norm": 2.236706495285034, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7032177448272705, + "num_tokens": 357970667.0, + "step": 14352 + }, + { + "epoch": 1.576213485613881, + "grad_norm": 2.6092193126678467, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.723141074180603, + "num_tokens": 357990637.0, + "step": 14353 + }, + { + "epoch": 1.5763233033164945, + "grad_norm": 2.27691388130188, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7212178111076355, + "num_tokens": 358014769.0, + "step": 14354 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 2.381539821624756, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7418577671051025, + "num_tokens": 358036462.0, + "step": 14355 + }, + { + "epoch": 1.576542938721722, + "grad_norm": 2.3053295612335205, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6924158334732056, + "num_tokens": 358063272.0, + "step": 14356 + }, + { + "epoch": 1.5766527564243356, + "grad_norm": 2.305387020111084, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7433285713195801, + "num_tokens": 358086158.0, + "step": 14357 + }, + { + "epoch": 1.5767625741269493, + "grad_norm": 2.202352523803711, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.74195796251297, + "num_tokens": 358112818.0, + "step": 14358 + }, + { + "epoch": 1.5768723918295628, + "grad_norm": 2.063427686691284, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.729351818561554, + "num_tokens": 358141940.0, + "step": 14359 + }, + { + "epoch": 1.5769822095321766, + "grad_norm": 2.3094899654388428, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7032934427261353, + "num_tokens": 358166362.0, + "step": 14360 + }, + { + "epoch": 1.5770920272347904, + "grad_norm": 2.2215707302093506, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7107007503509521, + "num_tokens": 358193249.0, + "step": 14361 + }, + { + "epoch": 1.577201844937404, + "grad_norm": 2.3238561153411865, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7218422293663025, + "num_tokens": 358217922.0, + "step": 14362 + }, + { + "epoch": 1.5773116626400174, + "grad_norm": 2.0032341480255127, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7026106715202332, + "num_tokens": 358249824.0, + "step": 14363 + }, + { + "epoch": 1.5774214803426312, + "grad_norm": 2.3087408542633057, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7378329038619995, + "num_tokens": 358275371.0, + "step": 14364 + }, + { + "epoch": 1.577531298045245, + "grad_norm": 2.119297504425049, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.697802722454071, + "num_tokens": 358303632.0, + "step": 14365 + }, + { + "epoch": 1.5776411157478587, + "grad_norm": 1.9591540098190308, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7002086043357849, + "num_tokens": 358335778.0, + "step": 14366 + }, + { + "epoch": 1.5777509334504722, + "grad_norm": 2.24418044090271, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7126688957214355, + "num_tokens": 358361829.0, + "step": 14367 + }, + { + "epoch": 1.5778607511530858, + "grad_norm": 2.4858195781707764, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7315309643745422, + "num_tokens": 358382223.0, + "step": 14368 + }, + { + "epoch": 1.5779705688556995, + "grad_norm": 2.01926589012146, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7181868553161621, + "num_tokens": 358412162.0, + "step": 14369 + }, + { + "epoch": 1.5780803865583133, + "grad_norm": 2.317701816558838, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7086063027381897, + "num_tokens": 358436706.0, + "step": 14370 + }, + { + "epoch": 1.5781902042609268, + "grad_norm": 2.198725700378418, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7101576328277588, + "num_tokens": 358463915.0, + "step": 14371 + }, + { + "epoch": 1.5783000219635406, + "grad_norm": 2.301224708557129, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7360955476760864, + "num_tokens": 358488153.0, + "step": 14372 + }, + { + "epoch": 1.5784098396661541, + "grad_norm": 2.5321569442749023, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7297481298446655, + "num_tokens": 358507732.0, + "step": 14373 + }, + { + "epoch": 1.5785196573687679, + "grad_norm": 2.201246500015259, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7082146406173706, + "num_tokens": 358534000.0, + "step": 14374 + }, + { + "epoch": 1.5786294750713816, + "grad_norm": 2.563786268234253, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7453635931015015, + "num_tokens": 358553537.0, + "step": 14375 + }, + { + "epoch": 1.5787392927739952, + "grad_norm": 2.1916093826293945, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7475501298904419, + "num_tokens": 358579502.0, + "step": 14376 + }, + { + "epoch": 1.5788491104766087, + "grad_norm": 2.2889931201934814, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7179642915725708, + "num_tokens": 358602993.0, + "step": 14377 + }, + { + "epoch": 1.5789589281792225, + "grad_norm": 2.171839714050293, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7028110027313232, + "num_tokens": 358629242.0, + "step": 14378 + }, + { + "epoch": 1.5790687458818362, + "grad_norm": 2.1411690711975098, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7121227383613586, + "num_tokens": 358657392.0, + "step": 14379 + }, + { + "epoch": 1.57917856358445, + "grad_norm": 2.2664003372192383, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7490640878677368, + "num_tokens": 358679641.0, + "step": 14380 + }, + { + "epoch": 1.5792883812870635, + "grad_norm": 2.184011697769165, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7121129035949707, + "num_tokens": 358705102.0, + "step": 14381 + }, + { + "epoch": 1.579398198989677, + "grad_norm": 2.3819739818573, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7257872819900513, + "num_tokens": 358726770.0, + "step": 14382 + }, + { + "epoch": 1.5795080166922908, + "grad_norm": 2.262010097503662, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7123404145240784, + "num_tokens": 358751829.0, + "step": 14383 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 2.080012321472168, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7242722511291504, + "num_tokens": 358778952.0, + "step": 14384 + }, + { + "epoch": 1.579727652097518, + "grad_norm": 2.1983959674835205, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7191814184188843, + "num_tokens": 358804631.0, + "step": 14385 + }, + { + "epoch": 1.5798374698001316, + "grad_norm": 2.4172427654266357, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6799052953720093, + "num_tokens": 358830056.0, + "step": 14386 + }, + { + "epoch": 1.5799472875027454, + "grad_norm": 2.241624355316162, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7140700817108154, + "num_tokens": 358855156.0, + "step": 14387 + }, + { + "epoch": 1.5800571052053591, + "grad_norm": 2.9117980003356934, + "learning_rate": 1e-06, + "loss": 0.7709, + "mean_token_accuracy": 0.7573969960212708, + "num_tokens": 358871063.0, + "step": 14388 + }, + { + "epoch": 1.580166922907973, + "grad_norm": 2.117617607116699, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7318943738937378, + "num_tokens": 358895982.0, + "step": 14389 + }, + { + "epoch": 1.5802767406105864, + "grad_norm": 2.1452395915985107, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7018582820892334, + "num_tokens": 358923900.0, + "step": 14390 + }, + { + "epoch": 1.5803865583132, + "grad_norm": 2.1600401401519775, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7057510018348694, + "num_tokens": 358952258.0, + "step": 14391 + }, + { + "epoch": 1.5804963760158137, + "grad_norm": 2.142613410949707, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6906901597976685, + "num_tokens": 358982443.0, + "step": 14392 + }, + { + "epoch": 1.5806061937184275, + "grad_norm": 2.192319631576538, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7393295168876648, + "num_tokens": 359008219.0, + "step": 14393 + }, + { + "epoch": 1.5807160114210412, + "grad_norm": 2.7381558418273926, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7201457023620605, + "num_tokens": 359027993.0, + "step": 14394 + }, + { + "epoch": 1.5808258291236548, + "grad_norm": 2.8000195026397705, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.737930178642273, + "num_tokens": 359046650.0, + "step": 14395 + }, + { + "epoch": 1.5809356468262683, + "grad_norm": 2.3364720344543457, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7125338315963745, + "num_tokens": 359070668.0, + "step": 14396 + }, + { + "epoch": 1.581045464528882, + "grad_norm": 2.468392848968506, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7268494367599487, + "num_tokens": 359094584.0, + "step": 14397 + }, + { + "epoch": 1.5811552822314958, + "grad_norm": 2.3081750869750977, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7209184169769287, + "num_tokens": 359120065.0, + "step": 14398 + }, + { + "epoch": 1.5812650999341094, + "grad_norm": 2.193455457687378, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7106131315231323, + "num_tokens": 359144658.0, + "step": 14399 + }, + { + "epoch": 1.5813749176367229, + "grad_norm": 2.575922966003418, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7309406399726868, + "num_tokens": 359163652.0, + "step": 14400 + }, + { + "epoch": 1.5814847353393366, + "grad_norm": 2.204988479614258, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7318695783615112, + "num_tokens": 359188269.0, + "step": 14401 + }, + { + "epoch": 1.5815945530419504, + "grad_norm": 2.3275997638702393, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7258829474449158, + "num_tokens": 359213663.0, + "step": 14402 + }, + { + "epoch": 1.5817043707445642, + "grad_norm": 2.634970188140869, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7208138108253479, + "num_tokens": 359234365.0, + "step": 14403 + }, + { + "epoch": 1.5818141884471777, + "grad_norm": 2.0510315895080566, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7231060862541199, + "num_tokens": 359263287.0, + "step": 14404 + }, + { + "epoch": 1.5819240061497912, + "grad_norm": 2.1201956272125244, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6975246667861938, + "num_tokens": 359292199.0, + "step": 14405 + }, + { + "epoch": 1.582033823852405, + "grad_norm": 2.259087562561035, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7271466255187988, + "num_tokens": 359317185.0, + "step": 14406 + }, + { + "epoch": 1.5821436415550187, + "grad_norm": 2.0636847019195557, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7313604950904846, + "num_tokens": 359344443.0, + "step": 14407 + }, + { + "epoch": 1.5822534592576323, + "grad_norm": 2.060901641845703, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7477149367332458, + "num_tokens": 359372158.0, + "step": 14408 + }, + { + "epoch": 1.582363276960246, + "grad_norm": 2.0837953090667725, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7393616437911987, + "num_tokens": 359401290.0, + "step": 14409 + }, + { + "epoch": 1.5824730946628596, + "grad_norm": 2.17560076713562, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7124620676040649, + "num_tokens": 359429803.0, + "step": 14410 + }, + { + "epoch": 1.5825829123654733, + "grad_norm": 2.2758424282073975, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7192261219024658, + "num_tokens": 359454296.0, + "step": 14411 + }, + { + "epoch": 1.582692730068087, + "grad_norm": 2.181360960006714, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7542715668678284, + "num_tokens": 359480417.0, + "step": 14412 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 2.547532320022583, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7293103933334351, + "num_tokens": 359500208.0, + "step": 14413 + }, + { + "epoch": 1.5829123654733142, + "grad_norm": 2.1689388751983643, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7329961657524109, + "num_tokens": 359525964.0, + "step": 14414 + }, + { + "epoch": 1.583022183175928, + "grad_norm": 2.1659767627716064, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7125213742256165, + "num_tokens": 359553908.0, + "step": 14415 + }, + { + "epoch": 1.5831320008785417, + "grad_norm": 2.580850124359131, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7229959964752197, + "num_tokens": 359574735.0, + "step": 14416 + }, + { + "epoch": 1.5832418185811554, + "grad_norm": 2.3145081996917725, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7165704965591431, + "num_tokens": 359598820.0, + "step": 14417 + }, + { + "epoch": 1.583351636283769, + "grad_norm": 2.19576096534729, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7112581729888916, + "num_tokens": 359624896.0, + "step": 14418 + }, + { + "epoch": 1.5834614539863825, + "grad_norm": 2.6323843002319336, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7164326906204224, + "num_tokens": 359645550.0, + "step": 14419 + }, + { + "epoch": 1.5835712716889963, + "grad_norm": 2.3363921642303467, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7058109045028687, + "num_tokens": 359671587.0, + "step": 14420 + }, + { + "epoch": 1.58368108939161, + "grad_norm": 2.594531297683716, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7464720010757446, + "num_tokens": 359690419.0, + "step": 14421 + }, + { + "epoch": 1.5837909070942235, + "grad_norm": 2.450298547744751, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7357301712036133, + "num_tokens": 359711844.0, + "step": 14422 + }, + { + "epoch": 1.5839007247968373, + "grad_norm": 2.645606517791748, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7286936044692993, + "num_tokens": 359731291.0, + "step": 14423 + }, + { + "epoch": 1.5840105424994508, + "grad_norm": 2.4538917541503906, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.737795352935791, + "num_tokens": 359751532.0, + "step": 14424 + }, + { + "epoch": 1.5841203602020646, + "grad_norm": 2.3458549976348877, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.720695972442627, + "num_tokens": 359773692.0, + "step": 14425 + }, + { + "epoch": 1.5842301779046783, + "grad_norm": 2.2329976558685303, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7206451296806335, + "num_tokens": 359799330.0, + "step": 14426 + }, + { + "epoch": 1.5843399956072919, + "grad_norm": 2.0465190410614014, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.716227114200592, + "num_tokens": 359829378.0, + "step": 14427 + }, + { + "epoch": 1.5844498133099054, + "grad_norm": 2.8854331970214844, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7359998226165771, + "num_tokens": 359846651.0, + "step": 14428 + }, + { + "epoch": 1.5845596310125192, + "grad_norm": 1.9654194116592407, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7130073308944702, + "num_tokens": 359877216.0, + "step": 14429 + }, + { + "epoch": 1.584669448715133, + "grad_norm": 2.3235697746276855, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7259132266044617, + "num_tokens": 359900765.0, + "step": 14430 + }, + { + "epoch": 1.5847792664177467, + "grad_norm": 2.543748140335083, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6902973055839539, + "num_tokens": 359922534.0, + "step": 14431 + }, + { + "epoch": 1.5848890841203602, + "grad_norm": 2.2800936698913574, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.6972523927688599, + "num_tokens": 359952616.0, + "step": 14432 + }, + { + "epoch": 1.5849989018229738, + "grad_norm": 2.284590005874634, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7213131785392761, + "num_tokens": 359977548.0, + "step": 14433 + }, + { + "epoch": 1.5851087195255875, + "grad_norm": 2.7311699390411377, + "learning_rate": 1e-06, + "loss": 0.7454, + "mean_token_accuracy": 0.7576882243156433, + "num_tokens": 359995119.0, + "step": 14434 + }, + { + "epoch": 1.5852185372282013, + "grad_norm": 2.1307992935180664, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7352790236473083, + "num_tokens": 360020311.0, + "step": 14435 + }, + { + "epoch": 1.5853283549308148, + "grad_norm": 2.396164655685425, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7232873439788818, + "num_tokens": 360043027.0, + "step": 14436 + }, + { + "epoch": 1.5854381726334283, + "grad_norm": 2.740375518798828, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7382564544677734, + "num_tokens": 360062637.0, + "step": 14437 + }, + { + "epoch": 1.585547990336042, + "grad_norm": 2.227126121520996, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7255783081054688, + "num_tokens": 360090847.0, + "step": 14438 + }, + { + "epoch": 1.5856578080386559, + "grad_norm": 2.1330528259277344, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7378705739974976, + "num_tokens": 360117677.0, + "step": 14439 + }, + { + "epoch": 1.5857676257412696, + "grad_norm": 2.3590683937072754, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7120254039764404, + "num_tokens": 360143401.0, + "step": 14440 + }, + { + "epoch": 1.5858774434438832, + "grad_norm": 2.372840404510498, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7188484072685242, + "num_tokens": 360167558.0, + "step": 14441 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 2.2186291217803955, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7111714482307434, + "num_tokens": 360193784.0, + "step": 14442 + }, + { + "epoch": 1.5860970788491104, + "grad_norm": 2.2340214252471924, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7210471630096436, + "num_tokens": 360218660.0, + "step": 14443 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 2.3876709938049316, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7103183269500732, + "num_tokens": 360241283.0, + "step": 14444 + }, + { + "epoch": 1.586316714254338, + "grad_norm": 2.3333542346954346, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7228038311004639, + "num_tokens": 360264625.0, + "step": 14445 + }, + { + "epoch": 1.5864265319569515, + "grad_norm": 2.044740676879883, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7227952480316162, + "num_tokens": 360294538.0, + "step": 14446 + }, + { + "epoch": 1.586536349659565, + "grad_norm": 2.3391714096069336, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7186792492866516, + "num_tokens": 360319603.0, + "step": 14447 + }, + { + "epoch": 1.5866461673621788, + "grad_norm": 2.4952073097229004, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7252005338668823, + "num_tokens": 360341422.0, + "step": 14448 + }, + { + "epoch": 1.5867559850647925, + "grad_norm": 2.2745096683502197, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.723186194896698, + "num_tokens": 360365472.0, + "step": 14449 + }, + { + "epoch": 1.586865802767406, + "grad_norm": 2.326209783554077, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7505941390991211, + "num_tokens": 360387770.0, + "step": 14450 + }, + { + "epoch": 1.5869756204700196, + "grad_norm": 2.3596839904785156, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7411720752716064, + "num_tokens": 360410605.0, + "step": 14451 + }, + { + "epoch": 1.5870854381726334, + "grad_norm": 2.653576135635376, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7379845380783081, + "num_tokens": 360429528.0, + "step": 14452 + }, + { + "epoch": 1.5871952558752471, + "grad_norm": 2.4414658546447754, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7151978611946106, + "num_tokens": 360452266.0, + "step": 14453 + }, + { + "epoch": 1.5873050735778609, + "grad_norm": 2.3994407653808594, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7168600559234619, + "num_tokens": 360477331.0, + "step": 14454 + }, + { + "epoch": 1.5874148912804744, + "grad_norm": 2.28814435005188, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7354608774185181, + "num_tokens": 360500698.0, + "step": 14455 + }, + { + "epoch": 1.587524708983088, + "grad_norm": 2.1831250190734863, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7072649002075195, + "num_tokens": 360527161.0, + "step": 14456 + }, + { + "epoch": 1.5876345266857017, + "grad_norm": 2.226304769515991, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7392629981040955, + "num_tokens": 360553422.0, + "step": 14457 + }, + { + "epoch": 1.5877443443883155, + "grad_norm": 2.240919828414917, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7029454708099365, + "num_tokens": 360579518.0, + "step": 14458 + }, + { + "epoch": 1.5878541620909292, + "grad_norm": 2.1560890674591064, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.7005406618118286, + "num_tokens": 360608418.0, + "step": 14459 + }, + { + "epoch": 1.5879639797935428, + "grad_norm": 2.3008787631988525, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7210269570350647, + "num_tokens": 360634146.0, + "step": 14460 + }, + { + "epoch": 1.5880737974961563, + "grad_norm": 2.1734366416931152, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7103395462036133, + "num_tokens": 360660152.0, + "step": 14461 + }, + { + "epoch": 1.58818361519877, + "grad_norm": 2.2402100563049316, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6935096979141235, + "num_tokens": 360686907.0, + "step": 14462 + }, + { + "epoch": 1.5882934329013838, + "grad_norm": 2.2016098499298096, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7200659513473511, + "num_tokens": 360713830.0, + "step": 14463 + }, + { + "epoch": 1.5884032506039973, + "grad_norm": 2.656161069869995, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7345649003982544, + "num_tokens": 360733699.0, + "step": 14464 + }, + { + "epoch": 1.5885130683066109, + "grad_norm": 2.255362033843994, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7502410411834717, + "num_tokens": 360758487.0, + "step": 14465 + }, + { + "epoch": 1.5886228860092246, + "grad_norm": 2.2520790100097656, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7103445529937744, + "num_tokens": 360784707.0, + "step": 14466 + }, + { + "epoch": 1.5887327037118384, + "grad_norm": 2.1158287525177, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7120270729064941, + "num_tokens": 360812466.0, + "step": 14467 + }, + { + "epoch": 1.5888425214144521, + "grad_norm": 2.2818212509155273, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7434945106506348, + "num_tokens": 360835759.0, + "step": 14468 + }, + { + "epoch": 1.5889523391170657, + "grad_norm": 2.362055540084839, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7187198400497437, + "num_tokens": 360857356.0, + "step": 14469 + }, + { + "epoch": 1.5890621568196792, + "grad_norm": 2.1639010906219482, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7238820791244507, + "num_tokens": 360884524.0, + "step": 14470 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 1.9611588716506958, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7175204157829285, + "num_tokens": 360916836.0, + "step": 14471 + }, + { + "epoch": 1.5892817922249067, + "grad_norm": 2.31430983543396, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7362961173057556, + "num_tokens": 360939099.0, + "step": 14472 + }, + { + "epoch": 1.5893916099275203, + "grad_norm": 2.515739917755127, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7313054800033569, + "num_tokens": 360957933.0, + "step": 14473 + }, + { + "epoch": 1.589501427630134, + "grad_norm": 1.9953384399414062, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7170037031173706, + "num_tokens": 360987781.0, + "step": 14474 + }, + { + "epoch": 1.5896112453327476, + "grad_norm": 2.203022003173828, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7300280332565308, + "num_tokens": 361012971.0, + "step": 14475 + }, + { + "epoch": 1.5897210630353613, + "grad_norm": 2.2069802284240723, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.711514949798584, + "num_tokens": 361040812.0, + "step": 14476 + }, + { + "epoch": 1.589830880737975, + "grad_norm": 1.9315766096115112, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.72597336769104, + "num_tokens": 361074952.0, + "step": 14477 + }, + { + "epoch": 1.5899406984405886, + "grad_norm": 2.0731711387634277, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7205125689506531, + "num_tokens": 361103842.0, + "step": 14478 + }, + { + "epoch": 1.5900505161432021, + "grad_norm": 2.0853068828582764, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7242101430892944, + "num_tokens": 361131392.0, + "step": 14479 + }, + { + "epoch": 1.590160333845816, + "grad_norm": 2.3958418369293213, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7070934176445007, + "num_tokens": 361156674.0, + "step": 14480 + }, + { + "epoch": 1.5902701515484297, + "grad_norm": 2.3702898025512695, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7223175764083862, + "num_tokens": 361179739.0, + "step": 14481 + }, + { + "epoch": 1.5903799692510434, + "grad_norm": 2.3938190937042236, + "learning_rate": 1e-06, + "loss": 0.7868, + "mean_token_accuracy": 0.7541955709457397, + "num_tokens": 361201479.0, + "step": 14482 + }, + { + "epoch": 1.590489786953657, + "grad_norm": 2.353705644607544, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.7444626092910767, + "num_tokens": 361222830.0, + "step": 14483 + }, + { + "epoch": 1.5905996046562705, + "grad_norm": 2.0567517280578613, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7379387021064758, + "num_tokens": 361250981.0, + "step": 14484 + }, + { + "epoch": 1.5907094223588842, + "grad_norm": 2.3540432453155518, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6989137530326843, + "num_tokens": 361275675.0, + "step": 14485 + }, + { + "epoch": 1.590819240061498, + "grad_norm": 2.101503849029541, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7203644514083862, + "num_tokens": 361303874.0, + "step": 14486 + }, + { + "epoch": 1.5909290577641115, + "grad_norm": 2.3469045162200928, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7323506474494934, + "num_tokens": 361327846.0, + "step": 14487 + }, + { + "epoch": 1.5910388754667253, + "grad_norm": 2.1966426372528076, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7207542061805725, + "num_tokens": 361353443.0, + "step": 14488 + }, + { + "epoch": 1.5911486931693388, + "grad_norm": 2.6068854331970215, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7128880023956299, + "num_tokens": 361374398.0, + "step": 14489 + }, + { + "epoch": 1.5912585108719526, + "grad_norm": 2.047555923461914, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.730231523513794, + "num_tokens": 361402820.0, + "step": 14490 + }, + { + "epoch": 1.5913683285745663, + "grad_norm": 1.9392751455307007, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.686204195022583, + "num_tokens": 361437118.0, + "step": 14491 + }, + { + "epoch": 1.5914781462771799, + "grad_norm": 1.9323443174362183, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7005183696746826, + "num_tokens": 361470337.0, + "step": 14492 + }, + { + "epoch": 1.5915879639797934, + "grad_norm": 2.0236055850982666, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7025237083435059, + "num_tokens": 361502149.0, + "step": 14493 + }, + { + "epoch": 1.5916977816824072, + "grad_norm": 2.5228664875030518, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7210125923156738, + "num_tokens": 361522649.0, + "step": 14494 + }, + { + "epoch": 1.591807599385021, + "grad_norm": 2.425049304962158, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7331891059875488, + "num_tokens": 361545428.0, + "step": 14495 + }, + { + "epoch": 1.5919174170876347, + "grad_norm": 2.2074639797210693, + "learning_rate": 1e-06, + "loss": 0.751, + "mean_token_accuracy": 0.7658998966217041, + "num_tokens": 361570734.0, + "step": 14496 + }, + { + "epoch": 1.5920272347902482, + "grad_norm": 1.8265471458435059, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7405673265457153, + "num_tokens": 361603219.0, + "step": 14497 + }, + { + "epoch": 1.5921370524928617, + "grad_norm": 1.9234787225723267, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7014015913009644, + "num_tokens": 361637555.0, + "step": 14498 + }, + { + "epoch": 1.5922468701954755, + "grad_norm": 2.0581884384155273, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7049272060394287, + "num_tokens": 361668719.0, + "step": 14499 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 2.4643990993499756, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7412098050117493, + "num_tokens": 361690609.0, + "step": 14500 + }, + { + "epoch": 1.5924665056007028, + "grad_norm": 2.702080011367798, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.7587171792984009, + "num_tokens": 361708611.0, + "step": 14501 + }, + { + "epoch": 1.5925763233033163, + "grad_norm": 2.0040533542633057, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7276623249053955, + "num_tokens": 361738949.0, + "step": 14502 + }, + { + "epoch": 1.59268614100593, + "grad_norm": 2.384725332260132, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7112983465194702, + "num_tokens": 361765659.0, + "step": 14503 + }, + { + "epoch": 1.5927959587085438, + "grad_norm": 2.407017946243286, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6937959790229797, + "num_tokens": 361789518.0, + "step": 14504 + }, + { + "epoch": 1.5929057764111576, + "grad_norm": 2.540088415145874, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7171557545661926, + "num_tokens": 361809399.0, + "step": 14505 + }, + { + "epoch": 1.5930155941137711, + "grad_norm": 2.5279366970062256, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7215291261672974, + "num_tokens": 361831475.0, + "step": 14506 + }, + { + "epoch": 1.5931254118163847, + "grad_norm": 2.127258777618408, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7304291725158691, + "num_tokens": 361859038.0, + "step": 14507 + }, + { + "epoch": 1.5932352295189984, + "grad_norm": 2.038435935974121, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7138490080833435, + "num_tokens": 361888365.0, + "step": 14508 + }, + { + "epoch": 1.5933450472216122, + "grad_norm": 2.4830169677734375, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7244940400123596, + "num_tokens": 361909010.0, + "step": 14509 + }, + { + "epoch": 1.593454864924226, + "grad_norm": 2.271345615386963, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7155736684799194, + "num_tokens": 361933648.0, + "step": 14510 + }, + { + "epoch": 1.5935646826268395, + "grad_norm": 2.234126567840576, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7144838571548462, + "num_tokens": 361957858.0, + "step": 14511 + }, + { + "epoch": 1.593674500329453, + "grad_norm": 2.1862316131591797, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7273702621459961, + "num_tokens": 361983902.0, + "step": 14512 + }, + { + "epoch": 1.5937843180320668, + "grad_norm": 2.4140987396240234, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7087934613227844, + "num_tokens": 362006316.0, + "step": 14513 + }, + { + "epoch": 1.5938941357346805, + "grad_norm": 2.3409364223480225, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7060267925262451, + "num_tokens": 362031502.0, + "step": 14514 + }, + { + "epoch": 1.594003953437294, + "grad_norm": 2.2433464527130127, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7159273624420166, + "num_tokens": 362056822.0, + "step": 14515 + }, + { + "epoch": 1.5941137711399076, + "grad_norm": 2.6062979698181152, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7355457544326782, + "num_tokens": 362076085.0, + "step": 14516 + }, + { + "epoch": 1.5942235888425214, + "grad_norm": 2.357491970062256, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7096571326255798, + "num_tokens": 362100255.0, + "step": 14517 + }, + { + "epoch": 1.5943334065451351, + "grad_norm": 2.072934627532959, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6957293152809143, + "num_tokens": 362131483.0, + "step": 14518 + }, + { + "epoch": 1.5944432242477489, + "grad_norm": 2.24361252784729, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7382712364196777, + "num_tokens": 362155869.0, + "step": 14519 + }, + { + "epoch": 1.5945530419503624, + "grad_norm": 2.570016860961914, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7365843653678894, + "num_tokens": 362175525.0, + "step": 14520 + }, + { + "epoch": 1.594662859652976, + "grad_norm": 2.403731107711792, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7296500205993652, + "num_tokens": 362198657.0, + "step": 14521 + }, + { + "epoch": 1.5947726773555897, + "grad_norm": 2.3707683086395264, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7112571001052856, + "num_tokens": 362222316.0, + "step": 14522 + }, + { + "epoch": 1.5948824950582035, + "grad_norm": 2.170656442642212, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7003452777862549, + "num_tokens": 362250717.0, + "step": 14523 + }, + { + "epoch": 1.5949923127608172, + "grad_norm": 2.0971615314483643, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7415949106216431, + "num_tokens": 362279364.0, + "step": 14524 + }, + { + "epoch": 1.5951021304634307, + "grad_norm": 2.423961877822876, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7335957288742065, + "num_tokens": 362299946.0, + "step": 14525 + }, + { + "epoch": 1.5952119481660443, + "grad_norm": 2.2594308853149414, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7013497948646545, + "num_tokens": 362324967.0, + "step": 14526 + }, + { + "epoch": 1.595321765868658, + "grad_norm": 2.4505748748779297, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7235026359558105, + "num_tokens": 362347285.0, + "step": 14527 + }, + { + "epoch": 1.5954315835712718, + "grad_norm": 2.1689791679382324, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7267182469367981, + "num_tokens": 362373758.0, + "step": 14528 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 2.6637301445007324, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.72184157371521, + "num_tokens": 362395368.0, + "step": 14529 + }, + { + "epoch": 1.5956512189764989, + "grad_norm": 2.119757890701294, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7189810276031494, + "num_tokens": 362424044.0, + "step": 14530 + }, + { + "epoch": 1.5957610366791126, + "grad_norm": 2.44736909866333, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7279784083366394, + "num_tokens": 362446998.0, + "step": 14531 + }, + { + "epoch": 1.5958708543817264, + "grad_norm": 2.5871057510375977, + "learning_rate": 1e-06, + "loss": 0.7973, + "mean_token_accuracy": 0.7552258968353271, + "num_tokens": 362465664.0, + "step": 14532 + }, + { + "epoch": 1.5959806720843401, + "grad_norm": 2.280897855758667, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7457233667373657, + "num_tokens": 362489522.0, + "step": 14533 + }, + { + "epoch": 1.5960904897869537, + "grad_norm": 2.4930098056793213, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7377328872680664, + "num_tokens": 362510220.0, + "step": 14534 + }, + { + "epoch": 1.5962003074895672, + "grad_norm": 2.4983339309692383, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7068225145339966, + "num_tokens": 362532941.0, + "step": 14535 + }, + { + "epoch": 1.596310125192181, + "grad_norm": 2.2718353271484375, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7164047360420227, + "num_tokens": 362557650.0, + "step": 14536 + }, + { + "epoch": 1.5964199428947947, + "grad_norm": 2.3177459239959717, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7022131681442261, + "num_tokens": 362584260.0, + "step": 14537 + }, + { + "epoch": 1.5965297605974083, + "grad_norm": 2.212820053100586, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7383551597595215, + "num_tokens": 362609998.0, + "step": 14538 + }, + { + "epoch": 1.596639578300022, + "grad_norm": 2.1289455890655518, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7273424863815308, + "num_tokens": 362637142.0, + "step": 14539 + }, + { + "epoch": 1.5967493960026355, + "grad_norm": 2.1896042823791504, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7134535312652588, + "num_tokens": 362664613.0, + "step": 14540 + }, + { + "epoch": 1.5968592137052493, + "grad_norm": 2.3399386405944824, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7172880172729492, + "num_tokens": 362687766.0, + "step": 14541 + }, + { + "epoch": 1.596969031407863, + "grad_norm": 2.5298521518707275, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7228105068206787, + "num_tokens": 362708517.0, + "step": 14542 + }, + { + "epoch": 1.5970788491104766, + "grad_norm": 2.447500705718994, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7324950098991394, + "num_tokens": 362729500.0, + "step": 14543 + }, + { + "epoch": 1.5971886668130901, + "grad_norm": 2.4042367935180664, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.723931074142456, + "num_tokens": 362750926.0, + "step": 14544 + }, + { + "epoch": 1.5972984845157039, + "grad_norm": 2.5455243587493896, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7299256324768066, + "num_tokens": 362771652.0, + "step": 14545 + }, + { + "epoch": 1.5974083022183176, + "grad_norm": 2.1533751487731934, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7030746936798096, + "num_tokens": 362799528.0, + "step": 14546 + }, + { + "epoch": 1.5975181199209314, + "grad_norm": 2.5043845176696777, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7364693880081177, + "num_tokens": 362820720.0, + "step": 14547 + }, + { + "epoch": 1.597627937623545, + "grad_norm": 2.1297335624694824, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7392739057540894, + "num_tokens": 362849198.0, + "step": 14548 + }, + { + "epoch": 1.5977377553261585, + "grad_norm": 2.190537929534912, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7322384119033813, + "num_tokens": 362875090.0, + "step": 14549 + }, + { + "epoch": 1.5978475730287722, + "grad_norm": 2.258002281188965, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7163828015327454, + "num_tokens": 362901546.0, + "step": 14550 + }, + { + "epoch": 1.597957390731386, + "grad_norm": 2.3247504234313965, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7403807640075684, + "num_tokens": 362924077.0, + "step": 14551 + }, + { + "epoch": 1.5980672084339995, + "grad_norm": 2.3092639446258545, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7200101017951965, + "num_tokens": 362949021.0, + "step": 14552 + }, + { + "epoch": 1.5981770261366133, + "grad_norm": 2.2576355934143066, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7187579870223999, + "num_tokens": 362973853.0, + "step": 14553 + }, + { + "epoch": 1.5982868438392268, + "grad_norm": 2.2640576362609863, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7403887510299683, + "num_tokens": 362998404.0, + "step": 14554 + }, + { + "epoch": 1.5983966615418406, + "grad_norm": 2.780914306640625, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7466749548912048, + "num_tokens": 363016603.0, + "step": 14555 + }, + { + "epoch": 1.5985064792444543, + "grad_norm": 2.6946048736572266, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7197738289833069, + "num_tokens": 363036302.0, + "step": 14556 + }, + { + "epoch": 1.5986162969470679, + "grad_norm": 2.1922595500946045, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7090750336647034, + "num_tokens": 363065942.0, + "step": 14557 + }, + { + "epoch": 1.5987261146496814, + "grad_norm": 2.7353007793426514, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7198483347892761, + "num_tokens": 363083230.0, + "step": 14558 + }, + { + "epoch": 1.5988359323522952, + "grad_norm": 2.2004077434539795, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7261468768119812, + "num_tokens": 363109519.0, + "step": 14559 + }, + { + "epoch": 1.598945750054909, + "grad_norm": 2.52583384513855, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7204235792160034, + "num_tokens": 363131370.0, + "step": 14560 + }, + { + "epoch": 1.5990555677575227, + "grad_norm": 2.3000030517578125, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7185340523719788, + "num_tokens": 363157436.0, + "step": 14561 + }, + { + "epoch": 1.5991653854601362, + "grad_norm": 2.3588712215423584, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7177243232727051, + "num_tokens": 363183213.0, + "step": 14562 + }, + { + "epoch": 1.5992752031627497, + "grad_norm": 2.4702367782592773, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7243237495422363, + "num_tokens": 363204824.0, + "step": 14563 + }, + { + "epoch": 1.5993850208653635, + "grad_norm": 2.278085231781006, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7100803852081299, + "num_tokens": 363230725.0, + "step": 14564 + }, + { + "epoch": 1.5994948385679773, + "grad_norm": 2.3129007816314697, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7216999530792236, + "num_tokens": 363257527.0, + "step": 14565 + }, + { + "epoch": 1.5996046562705908, + "grad_norm": 2.1550962924957275, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7302403450012207, + "num_tokens": 363282669.0, + "step": 14566 + }, + { + "epoch": 1.5997144739732043, + "grad_norm": 2.227304220199585, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7113233208656311, + "num_tokens": 363309680.0, + "step": 14567 + }, + { + "epoch": 1.599824291675818, + "grad_norm": 2.268634557723999, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7101939916610718, + "num_tokens": 363335344.0, + "step": 14568 + }, + { + "epoch": 1.5999341093784318, + "grad_norm": 2.1581828594207764, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7337870597839355, + "num_tokens": 363360931.0, + "step": 14569 + }, + { + "epoch": 1.6000439270810456, + "grad_norm": 2.2344353199005127, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.71168053150177, + "num_tokens": 363387406.0, + "step": 14570 + }, + { + "epoch": 1.6001537447836591, + "grad_norm": 2.23093581199646, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7205857634544373, + "num_tokens": 363412335.0, + "step": 14571 + }, + { + "epoch": 1.6002635624862727, + "grad_norm": 2.085325002670288, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.729348361492157, + "num_tokens": 363440242.0, + "step": 14572 + }, + { + "epoch": 1.6003733801888864, + "grad_norm": 2.409271001815796, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.699862003326416, + "num_tokens": 363465579.0, + "step": 14573 + }, + { + "epoch": 1.6004831978915002, + "grad_norm": 2.184601306915283, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7394931316375732, + "num_tokens": 363492434.0, + "step": 14574 + }, + { + "epoch": 1.600593015594114, + "grad_norm": 2.6006860733032227, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7296335101127625, + "num_tokens": 363512781.0, + "step": 14575 + }, + { + "epoch": 1.6007028332967275, + "grad_norm": 2.151432991027832, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7085784673690796, + "num_tokens": 363539134.0, + "step": 14576 + }, + { + "epoch": 1.600812650999341, + "grad_norm": 2.351463556289673, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7034685611724854, + "num_tokens": 363562378.0, + "step": 14577 + }, + { + "epoch": 1.6009224687019548, + "grad_norm": 2.740823984146118, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7508413791656494, + "num_tokens": 363579266.0, + "step": 14578 + }, + { + "epoch": 1.6010322864045685, + "grad_norm": 2.6941323280334473, + "learning_rate": 1e-06, + "loss": 0.8201, + "mean_token_accuracy": 0.7370047569274902, + "num_tokens": 363598574.0, + "step": 14579 + }, + { + "epoch": 1.601142104107182, + "grad_norm": 2.1174871921539307, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7356935739517212, + "num_tokens": 363627948.0, + "step": 14580 + }, + { + "epoch": 1.6012519218097956, + "grad_norm": 1.9789433479309082, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7274720668792725, + "num_tokens": 363659371.0, + "step": 14581 + }, + { + "epoch": 1.6013617395124093, + "grad_norm": 2.2465100288391113, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7087799906730652, + "num_tokens": 363688184.0, + "step": 14582 + }, + { + "epoch": 1.601471557215023, + "grad_norm": 2.593842029571533, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7218673229217529, + "num_tokens": 363709149.0, + "step": 14583 + }, + { + "epoch": 1.6015813749176369, + "grad_norm": 2.2328109741210938, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6924375295639038, + "num_tokens": 363738674.0, + "step": 14584 + }, + { + "epoch": 1.6016911926202504, + "grad_norm": 2.293611526489258, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.729902982711792, + "num_tokens": 363762450.0, + "step": 14585 + }, + { + "epoch": 1.601801010322864, + "grad_norm": 2.3034846782684326, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7160894870758057, + "num_tokens": 363788511.0, + "step": 14586 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 2.219053030014038, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6953756809234619, + "num_tokens": 363814513.0, + "step": 14587 + }, + { + "epoch": 1.6020206457280914, + "grad_norm": 2.504269599914551, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7131602764129639, + "num_tokens": 363835724.0, + "step": 14588 + }, + { + "epoch": 1.602130463430705, + "grad_norm": 2.3419229984283447, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6931298971176147, + "num_tokens": 363860150.0, + "step": 14589 + }, + { + "epoch": 1.6022402811333187, + "grad_norm": 2.2116611003875732, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7249178886413574, + "num_tokens": 363884136.0, + "step": 14590 + }, + { + "epoch": 1.6023500988359323, + "grad_norm": 2.033173084259033, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6979293823242188, + "num_tokens": 363915220.0, + "step": 14591 + }, + { + "epoch": 1.602459916538546, + "grad_norm": 2.749816656112671, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7304405570030212, + "num_tokens": 363934232.0, + "step": 14592 + }, + { + "epoch": 1.6025697342411598, + "grad_norm": 2.1847267150878906, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7102065682411194, + "num_tokens": 363963023.0, + "step": 14593 + }, + { + "epoch": 1.6026795519437733, + "grad_norm": 2.100559949874878, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7215802669525146, + "num_tokens": 363991626.0, + "step": 14594 + }, + { + "epoch": 1.6027893696463869, + "grad_norm": 2.2681729793548584, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6992583870887756, + "num_tokens": 364019541.0, + "step": 14595 + }, + { + "epoch": 1.6028991873490006, + "grad_norm": 2.0875089168548584, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7301144599914551, + "num_tokens": 364046745.0, + "step": 14596 + }, + { + "epoch": 1.6030090050516144, + "grad_norm": 2.0606913566589355, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7263076901435852, + "num_tokens": 364075391.0, + "step": 14597 + }, + { + "epoch": 1.6031188227542281, + "grad_norm": 2.1485211849212646, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7132258415222168, + "num_tokens": 364100071.0, + "step": 14598 + }, + { + "epoch": 1.6032286404568417, + "grad_norm": 2.211851119995117, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7166315317153931, + "num_tokens": 364126688.0, + "step": 14599 + }, + { + "epoch": 1.6033384581594552, + "grad_norm": 2.1796956062316895, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.704891562461853, + "num_tokens": 364155607.0, + "step": 14600 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 2.0715911388397217, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7238489389419556, + "num_tokens": 364182787.0, + "step": 14601 + }, + { + "epoch": 1.6035580935646827, + "grad_norm": 2.3629558086395264, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7561761140823364, + "num_tokens": 364206320.0, + "step": 14602 + }, + { + "epoch": 1.6036679112672962, + "grad_norm": 2.1373226642608643, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7251753807067871, + "num_tokens": 364232871.0, + "step": 14603 + }, + { + "epoch": 1.60377772896991, + "grad_norm": 2.355714797973633, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7371716499328613, + "num_tokens": 364254910.0, + "step": 14604 + }, + { + "epoch": 1.6038875466725235, + "grad_norm": 2.4726250171661377, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7492589354515076, + "num_tokens": 364277456.0, + "step": 14605 + }, + { + "epoch": 1.6039973643751373, + "grad_norm": 2.235503911972046, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7300349473953247, + "num_tokens": 364301827.0, + "step": 14606 + }, + { + "epoch": 1.604107182077751, + "grad_norm": 2.2306766510009766, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7158163785934448, + "num_tokens": 364328691.0, + "step": 14607 + }, + { + "epoch": 1.6042169997803646, + "grad_norm": 2.4282889366149902, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7466396689414978, + "num_tokens": 364350931.0, + "step": 14608 + }, + { + "epoch": 1.6043268174829781, + "grad_norm": 2.775179386138916, + "learning_rate": 1e-06, + "loss": 0.7411, + "mean_token_accuracy": 0.764178991317749, + "num_tokens": 364367843.0, + "step": 14609 + }, + { + "epoch": 1.6044366351855919, + "grad_norm": 1.9955536127090454, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7281699180603027, + "num_tokens": 364400076.0, + "step": 14610 + }, + { + "epoch": 1.6045464528882056, + "grad_norm": 2.137422800064087, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7024202346801758, + "num_tokens": 364429633.0, + "step": 14611 + }, + { + "epoch": 1.6046562705908194, + "grad_norm": 2.10740065574646, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7039852142333984, + "num_tokens": 364460621.0, + "step": 14612 + }, + { + "epoch": 1.604766088293433, + "grad_norm": 2.259146213531494, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7334139347076416, + "num_tokens": 364485207.0, + "step": 14613 + }, + { + "epoch": 1.6048759059960465, + "grad_norm": 2.2546427249908447, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7288572192192078, + "num_tokens": 364509079.0, + "step": 14614 + }, + { + "epoch": 1.6049857236986602, + "grad_norm": 2.141969919204712, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7115134000778198, + "num_tokens": 364537030.0, + "step": 14615 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 2.4653871059417725, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7316524386405945, + "num_tokens": 364558207.0, + "step": 14616 + }, + { + "epoch": 1.6052053591038875, + "grad_norm": 2.1902406215667725, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7098331451416016, + "num_tokens": 364584257.0, + "step": 14617 + }, + { + "epoch": 1.605315176806501, + "grad_norm": 1.9736090898513794, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7168478965759277, + "num_tokens": 364615019.0, + "step": 14618 + }, + { + "epoch": 1.6054249945091148, + "grad_norm": 2.6675214767456055, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7319324612617493, + "num_tokens": 364633559.0, + "step": 14619 + }, + { + "epoch": 1.6055348122117286, + "grad_norm": 2.154569149017334, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.705827534198761, + "num_tokens": 364660054.0, + "step": 14620 + }, + { + "epoch": 1.6056446299143423, + "grad_norm": 2.1466355323791504, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7154650688171387, + "num_tokens": 364687085.0, + "step": 14621 + }, + { + "epoch": 1.6057544476169558, + "grad_norm": 2.1425647735595703, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7203782200813293, + "num_tokens": 364715386.0, + "step": 14622 + }, + { + "epoch": 1.6058642653195694, + "grad_norm": 2.097576379776001, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.751891016960144, + "num_tokens": 364742054.0, + "step": 14623 + }, + { + "epoch": 1.6059740830221831, + "grad_norm": 2.151594400405884, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7056373357772827, + "num_tokens": 364769124.0, + "step": 14624 + }, + { + "epoch": 1.606083900724797, + "grad_norm": 2.700068950653076, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7362554669380188, + "num_tokens": 364788197.0, + "step": 14625 + }, + { + "epoch": 1.6061937184274107, + "grad_norm": 2.5927977561950684, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7328951954841614, + "num_tokens": 364807583.0, + "step": 14626 + }, + { + "epoch": 1.6063035361300242, + "grad_norm": 2.54744815826416, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7480001449584961, + "num_tokens": 364827393.0, + "step": 14627 + }, + { + "epoch": 1.6064133538326377, + "grad_norm": 2.1342265605926514, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7134822607040405, + "num_tokens": 364854983.0, + "step": 14628 + }, + { + "epoch": 1.6065231715352515, + "grad_norm": 2.3816447257995605, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7203594446182251, + "num_tokens": 364878711.0, + "step": 14629 + }, + { + "epoch": 1.6066329892378652, + "grad_norm": 2.302251100540161, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7103957533836365, + "num_tokens": 364905160.0, + "step": 14630 + }, + { + "epoch": 1.6067428069404788, + "grad_norm": 2.161358118057251, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6975446939468384, + "num_tokens": 364935409.0, + "step": 14631 + }, + { + "epoch": 1.6068526246430923, + "grad_norm": 2.2306735515594482, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7186146378517151, + "num_tokens": 364960403.0, + "step": 14632 + }, + { + "epoch": 1.606962442345706, + "grad_norm": 2.4141228199005127, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7115394473075867, + "num_tokens": 364983353.0, + "step": 14633 + }, + { + "epoch": 1.6070722600483198, + "grad_norm": 2.3296711444854736, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7139072418212891, + "num_tokens": 365007981.0, + "step": 14634 + }, + { + "epoch": 1.6071820777509336, + "grad_norm": 2.040614366531372, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7172414064407349, + "num_tokens": 365039547.0, + "step": 14635 + }, + { + "epoch": 1.6072918954535471, + "grad_norm": 2.2929184436798096, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7094370126724243, + "num_tokens": 365065695.0, + "step": 14636 + }, + { + "epoch": 1.6074017131561606, + "grad_norm": 2.2061235904693604, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7243683338165283, + "num_tokens": 365091441.0, + "step": 14637 + }, + { + "epoch": 1.6075115308587744, + "grad_norm": 2.1003308296203613, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7279126644134521, + "num_tokens": 365120763.0, + "step": 14638 + }, + { + "epoch": 1.6076213485613882, + "grad_norm": 2.0775201320648193, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7128863334655762, + "num_tokens": 365150241.0, + "step": 14639 + }, + { + "epoch": 1.607731166264002, + "grad_norm": 2.3834924697875977, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7121384143829346, + "num_tokens": 365174479.0, + "step": 14640 + }, + { + "epoch": 1.6078409839666155, + "grad_norm": 2.4917900562286377, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7358202934265137, + "num_tokens": 365195681.0, + "step": 14641 + }, + { + "epoch": 1.607950801669229, + "grad_norm": 1.9254919290542603, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7291635274887085, + "num_tokens": 365227457.0, + "step": 14642 + }, + { + "epoch": 1.6080606193718427, + "grad_norm": 1.9982202053070068, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7173160314559937, + "num_tokens": 365259346.0, + "step": 14643 + }, + { + "epoch": 1.6081704370744565, + "grad_norm": 2.2634925842285156, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7215174436569214, + "num_tokens": 365285624.0, + "step": 14644 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 2.557565450668335, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7101428508758545, + "num_tokens": 365307628.0, + "step": 14645 + }, + { + "epoch": 1.6083900724796836, + "grad_norm": 2.347785472869873, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7375097274780273, + "num_tokens": 365330556.0, + "step": 14646 + }, + { + "epoch": 1.6084998901822973, + "grad_norm": 2.372786283493042, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7279474139213562, + "num_tokens": 365354867.0, + "step": 14647 + }, + { + "epoch": 1.608609707884911, + "grad_norm": 2.3120760917663574, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7336610555648804, + "num_tokens": 365378760.0, + "step": 14648 + }, + { + "epoch": 1.6087195255875248, + "grad_norm": 2.1897828578948975, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7201606035232544, + "num_tokens": 365404825.0, + "step": 14649 + }, + { + "epoch": 1.6088293432901384, + "grad_norm": 2.1772797107696533, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7154353857040405, + "num_tokens": 365431138.0, + "step": 14650 + }, + { + "epoch": 1.608939160992752, + "grad_norm": 2.5280959606170654, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7283851504325867, + "num_tokens": 365450961.0, + "step": 14651 + }, + { + "epoch": 1.6090489786953657, + "grad_norm": 2.394468307495117, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7237443327903748, + "num_tokens": 365474851.0, + "step": 14652 + }, + { + "epoch": 1.6091587963979794, + "grad_norm": 2.718238115310669, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.716436505317688, + "num_tokens": 365494516.0, + "step": 14653 + }, + { + "epoch": 1.609268614100593, + "grad_norm": 2.356098175048828, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7136393189430237, + "num_tokens": 365517969.0, + "step": 14654 + }, + { + "epoch": 1.6093784318032067, + "grad_norm": 2.218226671218872, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6957156658172607, + "num_tokens": 365545877.0, + "step": 14655 + }, + { + "epoch": 1.6094882495058203, + "grad_norm": 2.240183115005493, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.713664174079895, + "num_tokens": 365571691.0, + "step": 14656 + }, + { + "epoch": 1.609598067208434, + "grad_norm": 2.207972526550293, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7075814008712769, + "num_tokens": 365599313.0, + "step": 14657 + }, + { + "epoch": 1.6097078849110478, + "grad_norm": 2.6461188793182373, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7403056621551514, + "num_tokens": 365618258.0, + "step": 14658 + }, + { + "epoch": 1.6098177026136613, + "grad_norm": 2.333872079849243, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7240456342697144, + "num_tokens": 365641324.0, + "step": 14659 + }, + { + "epoch": 1.6099275203162748, + "grad_norm": 2.274702310562134, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7175845503807068, + "num_tokens": 365665864.0, + "step": 14660 + }, + { + "epoch": 1.6100373380188886, + "grad_norm": 2.0219433307647705, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7308930158615112, + "num_tokens": 365696465.0, + "step": 14661 + }, + { + "epoch": 1.6101471557215024, + "grad_norm": 2.329456329345703, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7482708096504211, + "num_tokens": 365722567.0, + "step": 14662 + }, + { + "epoch": 1.6102569734241161, + "grad_norm": 2.1110923290252686, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7059247493743896, + "num_tokens": 365753399.0, + "step": 14663 + }, + { + "epoch": 1.6103667911267296, + "grad_norm": 2.148325204849243, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7039403915405273, + "num_tokens": 365782664.0, + "step": 14664 + }, + { + "epoch": 1.6104766088293432, + "grad_norm": 2.7000832557678223, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7255816459655762, + "num_tokens": 365802313.0, + "step": 14665 + }, + { + "epoch": 1.610586426531957, + "grad_norm": 2.761401414871216, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7166206240653992, + "num_tokens": 365820621.0, + "step": 14666 + }, + { + "epoch": 1.6106962442345707, + "grad_norm": 2.1360576152801514, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7426808476448059, + "num_tokens": 365848884.0, + "step": 14667 + }, + { + "epoch": 1.6108060619371842, + "grad_norm": 2.2828712463378906, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7215160131454468, + "num_tokens": 365875254.0, + "step": 14668 + }, + { + "epoch": 1.610915879639798, + "grad_norm": 2.1055257320404053, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7014849185943604, + "num_tokens": 365904441.0, + "step": 14669 + }, + { + "epoch": 1.6110256973424115, + "grad_norm": 2.2575109004974365, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7287915349006653, + "num_tokens": 365928835.0, + "step": 14670 + }, + { + "epoch": 1.6111355150450253, + "grad_norm": 2.281914472579956, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7093400359153748, + "num_tokens": 365955193.0, + "step": 14671 + }, + { + "epoch": 1.611245332747639, + "grad_norm": 2.0667333602905273, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7145673036575317, + "num_tokens": 365983647.0, + "step": 14672 + }, + { + "epoch": 1.6113551504502526, + "grad_norm": 2.328653573989868, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7080955505371094, + "num_tokens": 366009318.0, + "step": 14673 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 2.2862629890441895, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7210090160369873, + "num_tokens": 366034593.0, + "step": 14674 + }, + { + "epoch": 1.6115747858554799, + "grad_norm": 2.4490966796875, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7253867387771606, + "num_tokens": 366056898.0, + "step": 14675 + }, + { + "epoch": 1.6116846035580936, + "grad_norm": 2.4172251224517822, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7258797883987427, + "num_tokens": 366078654.0, + "step": 14676 + }, + { + "epoch": 1.6117944212607074, + "grad_norm": 1.9827220439910889, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7136834263801575, + "num_tokens": 366109346.0, + "step": 14677 + }, + { + "epoch": 1.611904238963321, + "grad_norm": 2.558751106262207, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.712584376335144, + "num_tokens": 366130208.0, + "step": 14678 + }, + { + "epoch": 1.6120140566659344, + "grad_norm": 2.266883373260498, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.729798436164856, + "num_tokens": 366154557.0, + "step": 14679 + }, + { + "epoch": 1.6121238743685482, + "grad_norm": 2.6161999702453613, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7429327964782715, + "num_tokens": 366173126.0, + "step": 14680 + }, + { + "epoch": 1.612233692071162, + "grad_norm": 2.5134999752044678, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.703127384185791, + "num_tokens": 366193835.0, + "step": 14681 + }, + { + "epoch": 1.6123435097737755, + "grad_norm": 2.636219024658203, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7374638915061951, + "num_tokens": 366212321.0, + "step": 14682 + }, + { + "epoch": 1.612453327476389, + "grad_norm": 2.207463026046753, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.731640636920929, + "num_tokens": 366241987.0, + "step": 14683 + }, + { + "epoch": 1.6125631451790028, + "grad_norm": 2.282891035079956, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7155493497848511, + "num_tokens": 366268310.0, + "step": 14684 + }, + { + "epoch": 1.6126729628816165, + "grad_norm": 2.2636194229125977, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7300350069999695, + "num_tokens": 366294396.0, + "step": 14685 + }, + { + "epoch": 1.6127827805842303, + "grad_norm": 2.3226048946380615, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7298324108123779, + "num_tokens": 366318996.0, + "step": 14686 + }, + { + "epoch": 1.6128925982868438, + "grad_norm": 2.105821132659912, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7167943716049194, + "num_tokens": 366348196.0, + "step": 14687 + }, + { + "epoch": 1.6130024159894574, + "grad_norm": 2.3640987873077393, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7204363346099854, + "num_tokens": 366371623.0, + "step": 14688 + }, + { + "epoch": 1.6131122336920711, + "grad_norm": 2.461437702178955, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7239549160003662, + "num_tokens": 366393350.0, + "step": 14689 + }, + { + "epoch": 1.6132220513946849, + "grad_norm": 2.617434501647949, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7176613807678223, + "num_tokens": 366412691.0, + "step": 14690 + }, + { + "epoch": 1.6133318690972986, + "grad_norm": 2.521584987640381, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7276675701141357, + "num_tokens": 366433732.0, + "step": 14691 + }, + { + "epoch": 1.6134416867999122, + "grad_norm": 2.2918248176574707, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7152036428451538, + "num_tokens": 366459268.0, + "step": 14692 + }, + { + "epoch": 1.6135515045025257, + "grad_norm": 2.299804210662842, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7192753553390503, + "num_tokens": 366483423.0, + "step": 14693 + }, + { + "epoch": 1.6136613222051395, + "grad_norm": 2.3070719242095947, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7167388796806335, + "num_tokens": 366508785.0, + "step": 14694 + }, + { + "epoch": 1.6137711399077532, + "grad_norm": 2.936115026473999, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7332553863525391, + "num_tokens": 366525493.0, + "step": 14695 + }, + { + "epoch": 1.6138809576103668, + "grad_norm": 2.4051074981689453, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7227789163589478, + "num_tokens": 366547367.0, + "step": 14696 + }, + { + "epoch": 1.6139907753129803, + "grad_norm": 2.184213876724243, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7211601734161377, + "num_tokens": 366573713.0, + "step": 14697 + }, + { + "epoch": 1.614100593015594, + "grad_norm": 2.2442235946655273, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7084755897521973, + "num_tokens": 366598464.0, + "step": 14698 + }, + { + "epoch": 1.6142104107182078, + "grad_norm": 2.7100889682769775, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7330238223075867, + "num_tokens": 366616132.0, + "step": 14699 + }, + { + "epoch": 1.6143202284208216, + "grad_norm": 2.270125150680542, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7203226089477539, + "num_tokens": 366643257.0, + "step": 14700 + }, + { + "epoch": 1.614430046123435, + "grad_norm": 2.29532790184021, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7098498344421387, + "num_tokens": 366669298.0, + "step": 14701 + }, + { + "epoch": 1.6145398638260486, + "grad_norm": 2.305934429168701, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7189839482307434, + "num_tokens": 366695019.0, + "step": 14702 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 2.315974235534668, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7188923358917236, + "num_tokens": 366719333.0, + "step": 14703 + }, + { + "epoch": 1.6147594992312762, + "grad_norm": 2.4486374855041504, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7225536108016968, + "num_tokens": 366739934.0, + "step": 14704 + }, + { + "epoch": 1.61486931693389, + "grad_norm": 2.3194730281829834, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7101244926452637, + "num_tokens": 366766696.0, + "step": 14705 + }, + { + "epoch": 1.6149791346365034, + "grad_norm": 2.1717395782470703, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7239037752151489, + "num_tokens": 366793141.0, + "step": 14706 + }, + { + "epoch": 1.615088952339117, + "grad_norm": 2.3873817920684814, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7234262228012085, + "num_tokens": 366815282.0, + "step": 14707 + }, + { + "epoch": 1.6151987700417307, + "grad_norm": 2.3375139236450195, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7111833095550537, + "num_tokens": 366839753.0, + "step": 14708 + }, + { + "epoch": 1.6153085877443445, + "grad_norm": 2.0863282680511475, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7368438839912415, + "num_tokens": 366868608.0, + "step": 14709 + }, + { + "epoch": 1.615418405446958, + "grad_norm": 1.9140489101409912, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7004616260528564, + "num_tokens": 366902171.0, + "step": 14710 + }, + { + "epoch": 1.6155282231495716, + "grad_norm": 2.011157274246216, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6938853859901428, + "num_tokens": 366933875.0, + "step": 14711 + }, + { + "epoch": 1.6156380408521853, + "grad_norm": 2.2015528678894043, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7182753682136536, + "num_tokens": 366959214.0, + "step": 14712 + }, + { + "epoch": 1.615747858554799, + "grad_norm": 2.1822104454040527, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7095360159873962, + "num_tokens": 366984376.0, + "step": 14713 + }, + { + "epoch": 1.6158576762574128, + "grad_norm": 2.6624419689178467, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7250047922134399, + "num_tokens": 367004125.0, + "step": 14714 + }, + { + "epoch": 1.6159674939600264, + "grad_norm": 2.8618359565734863, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7436938285827637, + "num_tokens": 367021428.0, + "step": 14715 + }, + { + "epoch": 1.61607731166264, + "grad_norm": 2.4158127307891846, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7149368524551392, + "num_tokens": 367043457.0, + "step": 14716 + }, + { + "epoch": 1.6161871293652537, + "grad_norm": 2.6239566802978516, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7255362272262573, + "num_tokens": 367062219.0, + "step": 14717 + }, + { + "epoch": 1.6162969470678674, + "grad_norm": 2.2646965980529785, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7390828132629395, + "num_tokens": 367085299.0, + "step": 14718 + }, + { + "epoch": 1.616406764770481, + "grad_norm": 2.147874355316162, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7147512435913086, + "num_tokens": 367113268.0, + "step": 14719 + }, + { + "epoch": 1.6165165824730947, + "grad_norm": 2.484283447265625, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7279914617538452, + "num_tokens": 367136320.0, + "step": 14720 + }, + { + "epoch": 1.6166264001757082, + "grad_norm": 2.439131021499634, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7115412950515747, + "num_tokens": 367159537.0, + "step": 14721 + }, + { + "epoch": 1.616736217878322, + "grad_norm": 2.4021143913269043, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7317255139350891, + "num_tokens": 367181181.0, + "step": 14722 + }, + { + "epoch": 1.6168460355809358, + "grad_norm": 2.155517101287842, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.730452835559845, + "num_tokens": 367209642.0, + "step": 14723 + }, + { + "epoch": 1.6169558532835493, + "grad_norm": 2.115647077560425, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7080296277999878, + "num_tokens": 367239794.0, + "step": 14724 + }, + { + "epoch": 1.6170656709861628, + "grad_norm": 2.2293825149536133, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6890006065368652, + "num_tokens": 367265383.0, + "step": 14725 + }, + { + "epoch": 1.6171754886887766, + "grad_norm": 2.000486135482788, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7273107767105103, + "num_tokens": 367294615.0, + "step": 14726 + }, + { + "epoch": 1.6172853063913903, + "grad_norm": 2.2983572483062744, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7180423140525818, + "num_tokens": 367319456.0, + "step": 14727 + }, + { + "epoch": 1.617395124094004, + "grad_norm": 2.4409239292144775, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7057417035102844, + "num_tokens": 367342437.0, + "step": 14728 + }, + { + "epoch": 1.6175049417966176, + "grad_norm": 2.083217144012451, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7205613851547241, + "num_tokens": 367372023.0, + "step": 14729 + }, + { + "epoch": 1.6176147594992312, + "grad_norm": 2.303410291671753, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7091078758239746, + "num_tokens": 367398648.0, + "step": 14730 + }, + { + "epoch": 1.617724577201845, + "grad_norm": 2.1091320514678955, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6987531185150146, + "num_tokens": 367427352.0, + "step": 14731 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 2.4335412979125977, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7349650263786316, + "num_tokens": 367449509.0, + "step": 14732 + }, + { + "epoch": 1.6179442126070722, + "grad_norm": 1.964250087738037, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.693012535572052, + "num_tokens": 367484868.0, + "step": 14733 + }, + { + "epoch": 1.618054030309686, + "grad_norm": 2.7251055240631104, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7192828059196472, + "num_tokens": 367505753.0, + "step": 14734 + }, + { + "epoch": 1.6181638480122995, + "grad_norm": 2.3029534816741943, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7246768474578857, + "num_tokens": 367529231.0, + "step": 14735 + }, + { + "epoch": 1.6182736657149133, + "grad_norm": 2.616873264312744, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7453132271766663, + "num_tokens": 367549137.0, + "step": 14736 + }, + { + "epoch": 1.618383483417527, + "grad_norm": 2.573594331741333, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.7185729742050171, + "num_tokens": 367574397.0, + "step": 14737 + }, + { + "epoch": 1.6184933011201406, + "grad_norm": 2.303919792175293, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7190151214599609, + "num_tokens": 367601082.0, + "step": 14738 + }, + { + "epoch": 1.618603118822754, + "grad_norm": 2.2727153301239014, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.6999083757400513, + "num_tokens": 367626252.0, + "step": 14739 + }, + { + "epoch": 1.6187129365253679, + "grad_norm": 2.4798405170440674, + "learning_rate": 1e-06, + "loss": 0.7603, + "mean_token_accuracy": 0.7547892332077026, + "num_tokens": 367646996.0, + "step": 14740 + }, + { + "epoch": 1.6188227542279816, + "grad_norm": 2.4100842475891113, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7276151776313782, + "num_tokens": 367670063.0, + "step": 14741 + }, + { + "epoch": 1.6189325719305954, + "grad_norm": 2.1189022064208984, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7170722484588623, + "num_tokens": 367699403.0, + "step": 14742 + }, + { + "epoch": 1.619042389633209, + "grad_norm": 2.172475814819336, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7218284606933594, + "num_tokens": 367727098.0, + "step": 14743 + }, + { + "epoch": 1.6191522073358224, + "grad_norm": 2.569087505340576, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7437129020690918, + "num_tokens": 367747368.0, + "step": 14744 + }, + { + "epoch": 1.6192620250384362, + "grad_norm": 2.2496225833892822, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7291766405105591, + "num_tokens": 367772982.0, + "step": 14745 + }, + { + "epoch": 1.61937184274105, + "grad_norm": 2.21136212348938, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7160140872001648, + "num_tokens": 367798399.0, + "step": 14746 + }, + { + "epoch": 1.6194816604436635, + "grad_norm": 2.4533979892730713, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7230360507965088, + "num_tokens": 367821089.0, + "step": 14747 + }, + { + "epoch": 1.619591478146277, + "grad_norm": 2.1110029220581055, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6965970396995544, + "num_tokens": 367849225.0, + "step": 14748 + }, + { + "epoch": 1.6197012958488908, + "grad_norm": 2.1634480953216553, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7338804006576538, + "num_tokens": 367876779.0, + "step": 14749 + }, + { + "epoch": 1.6198111135515045, + "grad_norm": 2.318845272064209, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7358782291412354, + "num_tokens": 367900690.0, + "step": 14750 + }, + { + "epoch": 1.6199209312541183, + "grad_norm": 2.2443861961364746, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7521542310714722, + "num_tokens": 367922717.0, + "step": 14751 + }, + { + "epoch": 1.6200307489567318, + "grad_norm": 2.135483980178833, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7309820652008057, + "num_tokens": 367949708.0, + "step": 14752 + }, + { + "epoch": 1.6201405666593454, + "grad_norm": 2.2984495162963867, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7356894612312317, + "num_tokens": 367973181.0, + "step": 14753 + }, + { + "epoch": 1.6202503843619591, + "grad_norm": 2.3765408992767334, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7464355230331421, + "num_tokens": 367993513.0, + "step": 14754 + }, + { + "epoch": 1.6203602020645729, + "grad_norm": 2.1818363666534424, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7247616052627563, + "num_tokens": 368022107.0, + "step": 14755 + }, + { + "epoch": 1.6204700197671866, + "grad_norm": 2.4940345287323, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7114975452423096, + "num_tokens": 368043938.0, + "step": 14756 + }, + { + "epoch": 1.6205798374698002, + "grad_norm": 2.2288310527801514, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7201045751571655, + "num_tokens": 368069540.0, + "step": 14757 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 2.123138189315796, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7139912843704224, + "num_tokens": 368099408.0, + "step": 14758 + }, + { + "epoch": 1.6207994728750275, + "grad_norm": 2.671212673187256, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7169870138168335, + "num_tokens": 368118526.0, + "step": 14759 + }, + { + "epoch": 1.6209092905776412, + "grad_norm": 2.1112680435180664, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7352369427680969, + "num_tokens": 368147571.0, + "step": 14760 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 2.41615891456604, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7213128805160522, + "num_tokens": 368171213.0, + "step": 14761 + }, + { + "epoch": 1.6211289259828683, + "grad_norm": 2.184588670730591, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7213366031646729, + "num_tokens": 368198604.0, + "step": 14762 + }, + { + "epoch": 1.621238743685482, + "grad_norm": 2.10310435295105, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7067997455596924, + "num_tokens": 368230333.0, + "step": 14763 + }, + { + "epoch": 1.6213485613880958, + "grad_norm": 2.294607162475586, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7316718101501465, + "num_tokens": 368254315.0, + "step": 14764 + }, + { + "epoch": 1.6214583790907096, + "grad_norm": 2.1172564029693604, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7222262620925903, + "num_tokens": 368281611.0, + "step": 14765 + }, + { + "epoch": 1.621568196793323, + "grad_norm": 2.5607094764709473, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7154823541641235, + "num_tokens": 368303121.0, + "step": 14766 + }, + { + "epoch": 1.6216780144959366, + "grad_norm": 2.40596342086792, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.71478670835495, + "num_tokens": 368326427.0, + "step": 14767 + }, + { + "epoch": 1.6217878321985504, + "grad_norm": 2.457324266433716, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7381527423858643, + "num_tokens": 368349637.0, + "step": 14768 + }, + { + "epoch": 1.6218976499011641, + "grad_norm": 2.212815999984741, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7147600054740906, + "num_tokens": 368375899.0, + "step": 14769 + }, + { + "epoch": 1.6220074676037777, + "grad_norm": 2.4399476051330566, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.72993403673172, + "num_tokens": 368399095.0, + "step": 14770 + }, + { + "epoch": 1.6221172853063914, + "grad_norm": 2.048138380050659, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6981961727142334, + "num_tokens": 368430078.0, + "step": 14771 + }, + { + "epoch": 1.622227103009005, + "grad_norm": 2.2230520248413086, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6914880275726318, + "num_tokens": 368459061.0, + "step": 14772 + }, + { + "epoch": 1.6223369207116187, + "grad_norm": 2.640373945236206, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7117417454719543, + "num_tokens": 368479724.0, + "step": 14773 + }, + { + "epoch": 1.6224467384142325, + "grad_norm": 2.1870617866516113, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.708369255065918, + "num_tokens": 368506122.0, + "step": 14774 + }, + { + "epoch": 1.622556556116846, + "grad_norm": 2.1107263565063477, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7388540506362915, + "num_tokens": 368532794.0, + "step": 14775 + }, + { + "epoch": 1.6226663738194596, + "grad_norm": 2.5532751083374023, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7237836122512817, + "num_tokens": 368553051.0, + "step": 14776 + }, + { + "epoch": 1.6227761915220733, + "grad_norm": 2.2727560997009277, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7315157055854797, + "num_tokens": 368577801.0, + "step": 14777 + }, + { + "epoch": 1.622886009224687, + "grad_norm": 2.2788867950439453, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.6993082761764526, + "num_tokens": 368604388.0, + "step": 14778 + }, + { + "epoch": 1.6229958269273008, + "grad_norm": 2.0321145057678223, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.715243399143219, + "num_tokens": 368634842.0, + "step": 14779 + }, + { + "epoch": 1.6231056446299144, + "grad_norm": 2.2266674041748047, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7183743715286255, + "num_tokens": 368661706.0, + "step": 14780 + }, + { + "epoch": 1.623215462332528, + "grad_norm": 2.4032342433929443, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7198613286018372, + "num_tokens": 368685247.0, + "step": 14781 + }, + { + "epoch": 1.6233252800351416, + "grad_norm": 2.097835063934326, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7278023958206177, + "num_tokens": 368713705.0, + "step": 14782 + }, + { + "epoch": 1.6234350977377554, + "grad_norm": 2.322904586791992, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7172004580497742, + "num_tokens": 368736072.0, + "step": 14783 + }, + { + "epoch": 1.623544915440369, + "grad_norm": 2.1624844074249268, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7287390232086182, + "num_tokens": 368763672.0, + "step": 14784 + }, + { + "epoch": 1.6236547331429827, + "grad_norm": 2.4916460514068604, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7263093590736389, + "num_tokens": 368784682.0, + "step": 14785 + }, + { + "epoch": 1.6237645508455962, + "grad_norm": 1.9780272245407104, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7158614993095398, + "num_tokens": 368815494.0, + "step": 14786 + }, + { + "epoch": 1.62387436854821, + "grad_norm": 2.0726661682128906, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7264878749847412, + "num_tokens": 368843390.0, + "step": 14787 + }, + { + "epoch": 1.6239841862508237, + "grad_norm": 2.3214385509490967, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.733159065246582, + "num_tokens": 368866525.0, + "step": 14788 + }, + { + "epoch": 1.6240940039534373, + "grad_norm": 2.1909193992614746, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7103978395462036, + "num_tokens": 368894000.0, + "step": 14789 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 2.278519630432129, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7093382477760315, + "num_tokens": 368921435.0, + "step": 14790 + }, + { + "epoch": 1.6243136393586646, + "grad_norm": 1.9401146173477173, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7000705003738403, + "num_tokens": 368955920.0, + "step": 14791 + }, + { + "epoch": 1.6244234570612783, + "grad_norm": 2.077075719833374, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7084980010986328, + "num_tokens": 368986375.0, + "step": 14792 + }, + { + "epoch": 1.624533274763892, + "grad_norm": 2.0507819652557373, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7134268283843994, + "num_tokens": 369015263.0, + "step": 14793 + }, + { + "epoch": 1.6246430924665056, + "grad_norm": 2.311788558959961, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7376856803894043, + "num_tokens": 369038754.0, + "step": 14794 + }, + { + "epoch": 1.6247529101691192, + "grad_norm": 2.324148654937744, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7232348322868347, + "num_tokens": 369062803.0, + "step": 14795 + }, + { + "epoch": 1.624862727871733, + "grad_norm": 2.1345300674438477, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6968932747840881, + "num_tokens": 369092582.0, + "step": 14796 + }, + { + "epoch": 1.6249725455743467, + "grad_norm": 2.6011648178100586, + "learning_rate": 1e-06, + "loss": 0.7896, + "mean_token_accuracy": 0.7538940906524658, + "num_tokens": 369110824.0, + "step": 14797 + }, + { + "epoch": 1.6250823632769602, + "grad_norm": 2.251107692718506, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7056288719177246, + "num_tokens": 369136594.0, + "step": 14798 + }, + { + "epoch": 1.6251921809795737, + "grad_norm": 2.309177875518799, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7345985174179077, + "num_tokens": 369160253.0, + "step": 14799 + }, + { + "epoch": 1.6253019986821875, + "grad_norm": 2.4142534732818604, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7164633274078369, + "num_tokens": 369184217.0, + "step": 14800 + }, + { + "epoch": 1.6254118163848013, + "grad_norm": 2.0257182121276855, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7155946493148804, + "num_tokens": 369215734.0, + "step": 14801 + }, + { + "epoch": 1.625521634087415, + "grad_norm": 2.311894655227661, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7365455627441406, + "num_tokens": 369238655.0, + "step": 14802 + }, + { + "epoch": 1.6256314517900285, + "grad_norm": 2.3398995399475098, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7119529247283936, + "num_tokens": 369263189.0, + "step": 14803 + }, + { + "epoch": 1.625741269492642, + "grad_norm": 2.3497042655944824, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7308882474899292, + "num_tokens": 369285788.0, + "step": 14804 + }, + { + "epoch": 1.6258510871952558, + "grad_norm": 2.193080186843872, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7448489665985107, + "num_tokens": 369311029.0, + "step": 14805 + }, + { + "epoch": 1.6259609048978696, + "grad_norm": 1.9851102828979492, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7055904865264893, + "num_tokens": 369342438.0, + "step": 14806 + }, + { + "epoch": 1.6260707226004834, + "grad_norm": 2.1037678718566895, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7236875295639038, + "num_tokens": 369370355.0, + "step": 14807 + }, + { + "epoch": 1.6261805403030969, + "grad_norm": 2.3183352947235107, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7223873138427734, + "num_tokens": 369396132.0, + "step": 14808 + }, + { + "epoch": 1.6262903580057104, + "grad_norm": 2.1937382221221924, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7171647548675537, + "num_tokens": 369422416.0, + "step": 14809 + }, + { + "epoch": 1.6264001757083242, + "grad_norm": 2.264763832092285, + "learning_rate": 1e-06, + "loss": 0.7897, + "mean_token_accuracy": 0.7474083304405212, + "num_tokens": 369445133.0, + "step": 14810 + }, + { + "epoch": 1.626509993410938, + "grad_norm": 2.6610169410705566, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7228974103927612, + "num_tokens": 369464134.0, + "step": 14811 + }, + { + "epoch": 1.6266198111135515, + "grad_norm": 2.455425977706909, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7152279615402222, + "num_tokens": 369486029.0, + "step": 14812 + }, + { + "epoch": 1.626729628816165, + "grad_norm": 2.3681387901306152, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7388575077056885, + "num_tokens": 369510528.0, + "step": 14813 + }, + { + "epoch": 1.6268394465187788, + "grad_norm": 2.0932090282440186, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7216553688049316, + "num_tokens": 369541187.0, + "step": 14814 + }, + { + "epoch": 1.6269492642213925, + "grad_norm": 2.4854555130004883, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7107364535331726, + "num_tokens": 369563025.0, + "step": 14815 + }, + { + "epoch": 1.6270590819240063, + "grad_norm": 2.1462514400482178, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7247419357299805, + "num_tokens": 369589716.0, + "step": 14816 + }, + { + "epoch": 1.6271688996266198, + "grad_norm": 2.1430699825286865, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7192988395690918, + "num_tokens": 369619391.0, + "step": 14817 + }, + { + "epoch": 1.6272787173292333, + "grad_norm": 2.9860551357269287, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.747071385383606, + "num_tokens": 369634595.0, + "step": 14818 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 2.296682119369507, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7161192297935486, + "num_tokens": 369659492.0, + "step": 14819 + }, + { + "epoch": 1.6274983527344609, + "grad_norm": 2.02577543258667, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7182772159576416, + "num_tokens": 369688348.0, + "step": 14820 + }, + { + "epoch": 1.6276081704370746, + "grad_norm": 2.0706350803375244, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7213415503501892, + "num_tokens": 369716606.0, + "step": 14821 + }, + { + "epoch": 1.6277179881396882, + "grad_norm": 2.1742584705352783, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7556610703468323, + "num_tokens": 369741301.0, + "step": 14822 + }, + { + "epoch": 1.6278278058423017, + "grad_norm": 2.3955750465393066, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7385627031326294, + "num_tokens": 369764203.0, + "step": 14823 + }, + { + "epoch": 1.6279376235449154, + "grad_norm": 2.4182398319244385, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.732211709022522, + "num_tokens": 369788149.0, + "step": 14824 + }, + { + "epoch": 1.6280474412475292, + "grad_norm": 2.0661251544952393, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7214919328689575, + "num_tokens": 369817836.0, + "step": 14825 + }, + { + "epoch": 1.6281572589501427, + "grad_norm": 2.3320627212524414, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7032108902931213, + "num_tokens": 369843051.0, + "step": 14826 + }, + { + "epoch": 1.6282670766527563, + "grad_norm": 2.090501308441162, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7158572673797607, + "num_tokens": 369872789.0, + "step": 14827 + }, + { + "epoch": 1.62837689435537, + "grad_norm": 2.233461380004883, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7187209129333496, + "num_tokens": 369899592.0, + "step": 14828 + }, + { + "epoch": 1.6284867120579838, + "grad_norm": 2.2842094898223877, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7021752595901489, + "num_tokens": 369925886.0, + "step": 14829 + }, + { + "epoch": 1.6285965297605975, + "grad_norm": 2.093604326248169, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7030963897705078, + "num_tokens": 369954604.0, + "step": 14830 + }, + { + "epoch": 1.628706347463211, + "grad_norm": 2.153907537460327, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7212915420532227, + "num_tokens": 369982137.0, + "step": 14831 + }, + { + "epoch": 1.6288161651658246, + "grad_norm": 2.2089648246765137, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.709126353263855, + "num_tokens": 370007782.0, + "step": 14832 + }, + { + "epoch": 1.6289259828684384, + "grad_norm": 2.3771090507507324, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7267217636108398, + "num_tokens": 370031739.0, + "step": 14833 + }, + { + "epoch": 1.6290358005710521, + "grad_norm": 2.3536245822906494, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7149211764335632, + "num_tokens": 370054206.0, + "step": 14834 + }, + { + "epoch": 1.6291456182736657, + "grad_norm": 2.5719709396362305, + "learning_rate": 1e-06, + "loss": 0.7528, + "mean_token_accuracy": 0.7569687962532043, + "num_tokens": 370075506.0, + "step": 14835 + }, + { + "epoch": 1.6292554359762794, + "grad_norm": 2.286726713180542, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7107487320899963, + "num_tokens": 370100009.0, + "step": 14836 + }, + { + "epoch": 1.629365253678893, + "grad_norm": 1.9517171382904053, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6945415735244751, + "num_tokens": 370134268.0, + "step": 14837 + }, + { + "epoch": 1.6294750713815067, + "grad_norm": 2.755096197128296, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.73460453748703, + "num_tokens": 370151611.0, + "step": 14838 + }, + { + "epoch": 1.6295848890841205, + "grad_norm": 2.437551259994507, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.6966350674629211, + "num_tokens": 370176259.0, + "step": 14839 + }, + { + "epoch": 1.629694706786734, + "grad_norm": 2.210425615310669, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7114696502685547, + "num_tokens": 370202674.0, + "step": 14840 + }, + { + "epoch": 1.6298045244893475, + "grad_norm": 2.4676053524017334, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7399974465370178, + "num_tokens": 370223795.0, + "step": 14841 + }, + { + "epoch": 1.6299143421919613, + "grad_norm": 2.351930618286133, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7239862680435181, + "num_tokens": 370247333.0, + "step": 14842 + }, + { + "epoch": 1.630024159894575, + "grad_norm": 2.2430202960968018, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.6995779275894165, + "num_tokens": 370273434.0, + "step": 14843 + }, + { + "epoch": 1.6301339775971888, + "grad_norm": 2.26668119430542, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7230381369590759, + "num_tokens": 370298485.0, + "step": 14844 + }, + { + "epoch": 1.6302437952998023, + "grad_norm": 2.273073196411133, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7260470390319824, + "num_tokens": 370324142.0, + "step": 14845 + }, + { + "epoch": 1.6303536130024159, + "grad_norm": 2.2574710845947266, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7179217338562012, + "num_tokens": 370352661.0, + "step": 14846 + }, + { + "epoch": 1.6304634307050296, + "grad_norm": 2.2025909423828125, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.734111487865448, + "num_tokens": 370380092.0, + "step": 14847 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 2.27105712890625, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7106203436851501, + "num_tokens": 370405419.0, + "step": 14848 + }, + { + "epoch": 1.630683066110257, + "grad_norm": 2.6350932121276855, + "learning_rate": 1e-06, + "loss": 0.7856, + "mean_token_accuracy": 0.7522060871124268, + "num_tokens": 370424228.0, + "step": 14849 + }, + { + "epoch": 1.6307928838128707, + "grad_norm": 2.4164812564849854, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7293145656585693, + "num_tokens": 370449361.0, + "step": 14850 + }, + { + "epoch": 1.6309027015154842, + "grad_norm": 2.383309841156006, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.735761821269989, + "num_tokens": 370472704.0, + "step": 14851 + }, + { + "epoch": 1.631012519218098, + "grad_norm": 2.222470760345459, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7177083492279053, + "num_tokens": 370498502.0, + "step": 14852 + }, + { + "epoch": 1.6311223369207117, + "grad_norm": 2.4106268882751465, + "learning_rate": 1e-06, + "loss": 0.7764, + "mean_token_accuracy": 0.7512487173080444, + "num_tokens": 370518922.0, + "step": 14853 + }, + { + "epoch": 1.6312321546233253, + "grad_norm": 2.6093831062316895, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7239139080047607, + "num_tokens": 370538453.0, + "step": 14854 + }, + { + "epoch": 1.6313419723259388, + "grad_norm": 2.009857177734375, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7311117649078369, + "num_tokens": 370568441.0, + "step": 14855 + }, + { + "epoch": 1.6314517900285526, + "grad_norm": 2.0382728576660156, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.720778226852417, + "num_tokens": 370599302.0, + "step": 14856 + }, + { + "epoch": 1.6315616077311663, + "grad_norm": 2.108314037322998, + "learning_rate": 1e-06, + "loss": 0.7249, + "mean_token_accuracy": 0.7651939392089844, + "num_tokens": 370625708.0, + "step": 14857 + }, + { + "epoch": 1.63167142543378, + "grad_norm": 2.5190629959106445, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7060842514038086, + "num_tokens": 370647199.0, + "step": 14858 + }, + { + "epoch": 1.6317812431363936, + "grad_norm": 2.306607723236084, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7274253368377686, + "num_tokens": 370671470.0, + "step": 14859 + }, + { + "epoch": 1.6318910608390071, + "grad_norm": 2.3082082271575928, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7117474675178528, + "num_tokens": 370697777.0, + "step": 14860 + }, + { + "epoch": 1.632000878541621, + "grad_norm": 2.4901368618011475, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7147037982940674, + "num_tokens": 370719104.0, + "step": 14861 + }, + { + "epoch": 1.6321106962442347, + "grad_norm": 2.382174253463745, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7498723268508911, + "num_tokens": 370741319.0, + "step": 14862 + }, + { + "epoch": 1.6322205139468482, + "grad_norm": 2.0810601711273193, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7280325889587402, + "num_tokens": 370770662.0, + "step": 14863 + }, + { + "epoch": 1.6323303316494617, + "grad_norm": 2.145364999771118, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7132139801979065, + "num_tokens": 370797978.0, + "step": 14864 + }, + { + "epoch": 1.6324401493520755, + "grad_norm": 2.488802433013916, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.742310643196106, + "num_tokens": 370817776.0, + "step": 14865 + }, + { + "epoch": 1.6325499670546892, + "grad_norm": 2.3408079147338867, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6935484409332275, + "num_tokens": 370841161.0, + "step": 14866 + }, + { + "epoch": 1.632659784757303, + "grad_norm": 2.2761056423187256, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7259572744369507, + "num_tokens": 370866234.0, + "step": 14867 + }, + { + "epoch": 1.6327696024599165, + "grad_norm": 2.538193941116333, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7472264766693115, + "num_tokens": 370887229.0, + "step": 14868 + }, + { + "epoch": 1.63287942016253, + "grad_norm": 2.0570321083068848, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6988212466239929, + "num_tokens": 370917624.0, + "step": 14869 + }, + { + "epoch": 1.6329892378651438, + "grad_norm": 2.1448724269866943, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7276460528373718, + "num_tokens": 370943744.0, + "step": 14870 + }, + { + "epoch": 1.6330990555677576, + "grad_norm": 2.408355236053467, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.693088173866272, + "num_tokens": 370969463.0, + "step": 14871 + }, + { + "epoch": 1.6332088732703713, + "grad_norm": 2.52093505859375, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7415162324905396, + "num_tokens": 370990169.0, + "step": 14872 + }, + { + "epoch": 1.6333186909729849, + "grad_norm": 2.6038718223571777, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7120976448059082, + "num_tokens": 371012135.0, + "step": 14873 + }, + { + "epoch": 1.6334285086755984, + "grad_norm": 2.5110628604888916, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7355337738990784, + "num_tokens": 371031459.0, + "step": 14874 + }, + { + "epoch": 1.6335383263782122, + "grad_norm": 2.317343235015869, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7309499382972717, + "num_tokens": 371056466.0, + "step": 14875 + }, + { + "epoch": 1.633648144080826, + "grad_norm": 2.18917179107666, + "learning_rate": 1e-06, + "loss": 0.78, + "mean_token_accuracy": 0.7557976245880127, + "num_tokens": 371081246.0, + "step": 14876 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 2.026642322540283, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7081030607223511, + "num_tokens": 371113997.0, + "step": 14877 + }, + { + "epoch": 1.633867779486053, + "grad_norm": 2.6107585430145264, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7280833721160889, + "num_tokens": 371133915.0, + "step": 14878 + }, + { + "epoch": 1.6339775971886668, + "grad_norm": 2.442490816116333, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7239234447479248, + "num_tokens": 371157844.0, + "step": 14879 + }, + { + "epoch": 1.6340874148912805, + "grad_norm": 2.279409885406494, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7061785459518433, + "num_tokens": 371183692.0, + "step": 14880 + }, + { + "epoch": 1.6341972325938943, + "grad_norm": 2.638612985610962, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7074853777885437, + "num_tokens": 371205054.0, + "step": 14881 + }, + { + "epoch": 1.6343070502965078, + "grad_norm": 2.6037943363189697, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7311652898788452, + "num_tokens": 371226055.0, + "step": 14882 + }, + { + "epoch": 1.6344168679991213, + "grad_norm": 2.281721353530884, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7223221063613892, + "num_tokens": 371251936.0, + "step": 14883 + }, + { + "epoch": 1.634526685701735, + "grad_norm": 2.2186741828918457, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7247440814971924, + "num_tokens": 371278625.0, + "step": 14884 + }, + { + "epoch": 1.6346365034043489, + "grad_norm": 2.360337018966675, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7291122078895569, + "num_tokens": 371302641.0, + "step": 14885 + }, + { + "epoch": 1.6347463211069626, + "grad_norm": 2.022148370742798, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7313272953033447, + "num_tokens": 371333481.0, + "step": 14886 + }, + { + "epoch": 1.6348561388095761, + "grad_norm": 2.348362684249878, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7279981374740601, + "num_tokens": 371355938.0, + "step": 14887 + }, + { + "epoch": 1.6349659565121897, + "grad_norm": 2.4022774696350098, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7256126403808594, + "num_tokens": 371380230.0, + "step": 14888 + }, + { + "epoch": 1.6350757742148034, + "grad_norm": 2.8721096515655518, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7436292171478271, + "num_tokens": 371397322.0, + "step": 14889 + }, + { + "epoch": 1.6351855919174172, + "grad_norm": 2.8663454055786133, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7484604716300964, + "num_tokens": 371413851.0, + "step": 14890 + }, + { + "epoch": 1.6352954096200307, + "grad_norm": 2.474137783050537, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7017631530761719, + "num_tokens": 371437379.0, + "step": 14891 + }, + { + "epoch": 1.6354052273226443, + "grad_norm": 2.4123172760009766, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7250326871871948, + "num_tokens": 371459201.0, + "step": 14892 + }, + { + "epoch": 1.635515045025258, + "grad_norm": 2.1726856231689453, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7095848321914673, + "num_tokens": 371488588.0, + "step": 14893 + }, + { + "epoch": 1.6356248627278718, + "grad_norm": 2.1913247108459473, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6884820461273193, + "num_tokens": 371516040.0, + "step": 14894 + }, + { + "epoch": 1.6357346804304855, + "grad_norm": 2.4928693771362305, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7234731912612915, + "num_tokens": 371536819.0, + "step": 14895 + }, + { + "epoch": 1.635844498133099, + "grad_norm": 2.31400203704834, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7351316213607788, + "num_tokens": 371560112.0, + "step": 14896 + }, + { + "epoch": 1.6359543158357126, + "grad_norm": 2.083299160003662, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7118276953697205, + "num_tokens": 371588239.0, + "step": 14897 + }, + { + "epoch": 1.6360641335383264, + "grad_norm": 2.23730731010437, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7319376468658447, + "num_tokens": 371614470.0, + "step": 14898 + }, + { + "epoch": 1.6361739512409401, + "grad_norm": 2.305560827255249, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7086793184280396, + "num_tokens": 371639120.0, + "step": 14899 + }, + { + "epoch": 1.6362837689435537, + "grad_norm": 2.5531880855560303, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7227262258529663, + "num_tokens": 371660105.0, + "step": 14900 + }, + { + "epoch": 1.6363935866461674, + "grad_norm": 2.919722080230713, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7521839141845703, + "num_tokens": 371677075.0, + "step": 14901 + }, + { + "epoch": 1.636503404348781, + "grad_norm": 2.3925325870513916, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7446211576461792, + "num_tokens": 371700171.0, + "step": 14902 + }, + { + "epoch": 1.6366132220513947, + "grad_norm": 2.0523781776428223, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7224285006523132, + "num_tokens": 371728828.0, + "step": 14903 + }, + { + "epoch": 1.6367230397540085, + "grad_norm": 2.3502097129821777, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7263882756233215, + "num_tokens": 371752592.0, + "step": 14904 + }, + { + "epoch": 1.636832857456622, + "grad_norm": 2.2575786113739014, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7215933799743652, + "num_tokens": 371781578.0, + "step": 14905 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 2.0809030532836914, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7185331583023071, + "num_tokens": 371809253.0, + "step": 14906 + }, + { + "epoch": 1.6370524928618493, + "grad_norm": 2.7931623458862305, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7284570932388306, + "num_tokens": 371829169.0, + "step": 14907 + }, + { + "epoch": 1.637162310564463, + "grad_norm": 2.0151050090789795, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7300676107406616, + "num_tokens": 371857285.0, + "step": 14908 + }, + { + "epoch": 1.6372721282670768, + "grad_norm": 2.21421480178833, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7012509703636169, + "num_tokens": 371884901.0, + "step": 14909 + }, + { + "epoch": 1.6373819459696903, + "grad_norm": 2.2068064212799072, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7248556613922119, + "num_tokens": 371909322.0, + "step": 14910 + }, + { + "epoch": 1.6374917636723039, + "grad_norm": 2.273404121398926, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.6980125308036804, + "num_tokens": 371934484.0, + "step": 14911 + }, + { + "epoch": 1.6376015813749176, + "grad_norm": 2.3283064365386963, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7375637292861938, + "num_tokens": 371958123.0, + "step": 14912 + }, + { + "epoch": 1.6377113990775314, + "grad_norm": 2.3410372734069824, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7003129720687866, + "num_tokens": 371982262.0, + "step": 14913 + }, + { + "epoch": 1.637821216780145, + "grad_norm": 2.0788543224334717, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.734571635723114, + "num_tokens": 372011214.0, + "step": 14914 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 2.275123119354248, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.704543948173523, + "num_tokens": 372036490.0, + "step": 14915 + }, + { + "epoch": 1.6380408521853722, + "grad_norm": 2.554957389831543, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7115438580513, + "num_tokens": 372056529.0, + "step": 14916 + }, + { + "epoch": 1.638150669887986, + "grad_norm": 2.26112961769104, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7167109251022339, + "num_tokens": 372082166.0, + "step": 14917 + }, + { + "epoch": 1.6382604875905997, + "grad_norm": 2.717850923538208, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.721606969833374, + "num_tokens": 372100922.0, + "step": 14918 + }, + { + "epoch": 1.6383703052932133, + "grad_norm": 2.4714550971984863, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7415986061096191, + "num_tokens": 372121676.0, + "step": 14919 + }, + { + "epoch": 1.6384801229958268, + "grad_norm": 2.332627534866333, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7260937094688416, + "num_tokens": 372145887.0, + "step": 14920 + }, + { + "epoch": 1.6385899406984406, + "grad_norm": 2.2218635082244873, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7094880938529968, + "num_tokens": 372173703.0, + "step": 14921 + }, + { + "epoch": 1.6386997584010543, + "grad_norm": 2.2677366733551025, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7181798219680786, + "num_tokens": 372198379.0, + "step": 14922 + }, + { + "epoch": 1.638809576103668, + "grad_norm": 2.2686963081359863, + "learning_rate": 1e-06, + "loss": 0.7774, + "mean_token_accuracy": 0.7620900869369507, + "num_tokens": 372221908.0, + "step": 14923 + }, + { + "epoch": 1.6389193938062816, + "grad_norm": 2.0708353519439697, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7139135599136353, + "num_tokens": 372250824.0, + "step": 14924 + }, + { + "epoch": 1.6390292115088951, + "grad_norm": 2.3276262283325195, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.711031436920166, + "num_tokens": 372275625.0, + "step": 14925 + }, + { + "epoch": 1.639139029211509, + "grad_norm": 2.3409276008605957, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7181369662284851, + "num_tokens": 372299611.0, + "step": 14926 + }, + { + "epoch": 1.6392488469141226, + "grad_norm": 2.3725998401641846, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7480982542037964, + "num_tokens": 372319749.0, + "step": 14927 + }, + { + "epoch": 1.6393586646167362, + "grad_norm": 2.125251054763794, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7030296921730042, + "num_tokens": 372350400.0, + "step": 14928 + }, + { + "epoch": 1.6394684823193497, + "grad_norm": 2.0599138736724854, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7321852445602417, + "num_tokens": 372378577.0, + "step": 14929 + }, + { + "epoch": 1.6395783000219635, + "grad_norm": 2.0509445667266846, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7198721766471863, + "num_tokens": 372408194.0, + "step": 14930 + }, + { + "epoch": 1.6396881177245772, + "grad_norm": 1.9489189386367798, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7147563695907593, + "num_tokens": 372441381.0, + "step": 14931 + }, + { + "epoch": 1.639797935427191, + "grad_norm": 2.239490509033203, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7366123795509338, + "num_tokens": 372464625.0, + "step": 14932 + }, + { + "epoch": 1.6399077531298045, + "grad_norm": 2.0474486351013184, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7056981921195984, + "num_tokens": 372495174.0, + "step": 14933 + }, + { + "epoch": 1.640017570832418, + "grad_norm": 2.5441768169403076, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7135277390480042, + "num_tokens": 372516784.0, + "step": 14934 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 2.2589151859283447, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7446917295455933, + "num_tokens": 372541158.0, + "step": 14935 + }, + { + "epoch": 1.6402372062376456, + "grad_norm": 2.3981592655181885, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.725062370300293, + "num_tokens": 372563495.0, + "step": 14936 + }, + { + "epoch": 1.6403470239402593, + "grad_norm": 2.3037078380584717, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7374471426010132, + "num_tokens": 372587843.0, + "step": 14937 + }, + { + "epoch": 1.6404568416428729, + "grad_norm": 2.2133822441101074, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7160395979881287, + "num_tokens": 372613807.0, + "step": 14938 + }, + { + "epoch": 1.6405666593454864, + "grad_norm": 2.0773093700408936, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7135904431343079, + "num_tokens": 372644418.0, + "step": 14939 + }, + { + "epoch": 1.6406764770481002, + "grad_norm": 2.1787681579589844, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.714210033416748, + "num_tokens": 372670913.0, + "step": 14940 + }, + { + "epoch": 1.640786294750714, + "grad_norm": 2.337050199508667, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7159595489501953, + "num_tokens": 372695498.0, + "step": 14941 + }, + { + "epoch": 1.6408961124533274, + "grad_norm": 2.3980154991149902, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7342045903205872, + "num_tokens": 372717712.0, + "step": 14942 + }, + { + "epoch": 1.641005930155941, + "grad_norm": 2.2155745029449463, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7283409833908081, + "num_tokens": 372744050.0, + "step": 14943 + }, + { + "epoch": 1.6411157478585547, + "grad_norm": 2.3672378063201904, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7072417736053467, + "num_tokens": 372768261.0, + "step": 14944 + }, + { + "epoch": 1.6412255655611685, + "grad_norm": 2.289971351623535, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7129497528076172, + "num_tokens": 372794116.0, + "step": 14945 + }, + { + "epoch": 1.6413353832637823, + "grad_norm": 2.7142481803894043, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7346022129058838, + "num_tokens": 372813031.0, + "step": 14946 + }, + { + "epoch": 1.6414452009663958, + "grad_norm": 2.421459913253784, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.730616569519043, + "num_tokens": 372835654.0, + "step": 14947 + }, + { + "epoch": 1.6415550186690093, + "grad_norm": 1.9979588985443115, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7240197658538818, + "num_tokens": 372864792.0, + "step": 14948 + }, + { + "epoch": 1.641664836371623, + "grad_norm": 2.2491209506988525, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7158254981040955, + "num_tokens": 372891493.0, + "step": 14949 + }, + { + "epoch": 1.6417746540742368, + "grad_norm": 2.3832337856292725, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7118933796882629, + "num_tokens": 372914975.0, + "step": 14950 + }, + { + "epoch": 1.6418844717768504, + "grad_norm": 2.418635129928589, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.745943546295166, + "num_tokens": 372936047.0, + "step": 14951 + }, + { + "epoch": 1.6419942894794641, + "grad_norm": 1.9729413986206055, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7047107815742493, + "num_tokens": 372968859.0, + "step": 14952 + }, + { + "epoch": 1.6421041071820777, + "grad_norm": 2.269618034362793, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7130275964736938, + "num_tokens": 372993391.0, + "step": 14953 + }, + { + "epoch": 1.6422139248846914, + "grad_norm": 2.755859851837158, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.74521803855896, + "num_tokens": 373012905.0, + "step": 14954 + }, + { + "epoch": 1.6423237425873052, + "grad_norm": 2.2973599433898926, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.702102780342102, + "num_tokens": 373039144.0, + "step": 14955 + }, + { + "epoch": 1.6424335602899187, + "grad_norm": 2.2276992797851562, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7240448594093323, + "num_tokens": 373065745.0, + "step": 14956 + }, + { + "epoch": 1.6425433779925322, + "grad_norm": 2.8263652324676514, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.738828718662262, + "num_tokens": 373082363.0, + "step": 14957 + }, + { + "epoch": 1.642653195695146, + "grad_norm": 2.532796859741211, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7367985248565674, + "num_tokens": 373101768.0, + "step": 14958 + }, + { + "epoch": 1.6427630133977598, + "grad_norm": 2.280184745788574, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7316463589668274, + "num_tokens": 373127905.0, + "step": 14959 + }, + { + "epoch": 1.6428728311003735, + "grad_norm": 2.2474913597106934, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7119184732437134, + "num_tokens": 373156487.0, + "step": 14960 + }, + { + "epoch": 1.642982648802987, + "grad_norm": 2.6312689781188965, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7333616614341736, + "num_tokens": 373175374.0, + "step": 14961 + }, + { + "epoch": 1.6430924665056006, + "grad_norm": 2.2394115924835205, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7176609039306641, + "num_tokens": 373199760.0, + "step": 14962 + }, + { + "epoch": 1.6432022842082143, + "grad_norm": 2.2668392658233643, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7210960984230042, + "num_tokens": 373224971.0, + "step": 14963 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 2.1203041076660156, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.684963583946228, + "num_tokens": 373255231.0, + "step": 14964 + }, + { + "epoch": 1.6434219196134416, + "grad_norm": 2.3005521297454834, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7025337815284729, + "num_tokens": 373280947.0, + "step": 14965 + }, + { + "epoch": 1.6435317373160554, + "grad_norm": 2.372687578201294, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7044912576675415, + "num_tokens": 373303522.0, + "step": 14966 + }, + { + "epoch": 1.643641555018669, + "grad_norm": 2.2435059547424316, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.741124153137207, + "num_tokens": 373327324.0, + "step": 14967 + }, + { + "epoch": 1.6437513727212827, + "grad_norm": 2.529531955718994, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7353121042251587, + "num_tokens": 373347566.0, + "step": 14968 + }, + { + "epoch": 1.6438611904238964, + "grad_norm": 2.0268876552581787, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7202475666999817, + "num_tokens": 373380182.0, + "step": 14969 + }, + { + "epoch": 1.64397100812651, + "grad_norm": 2.290450096130371, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7284567356109619, + "num_tokens": 373403152.0, + "step": 14970 + }, + { + "epoch": 1.6440808258291235, + "grad_norm": 2.1979401111602783, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7100745439529419, + "num_tokens": 373431978.0, + "step": 14971 + }, + { + "epoch": 1.6441906435317373, + "grad_norm": 2.3907690048217773, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7304187417030334, + "num_tokens": 373455258.0, + "step": 14972 + }, + { + "epoch": 1.644300461234351, + "grad_norm": 2.5095744132995605, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7431598901748657, + "num_tokens": 373475588.0, + "step": 14973 + }, + { + "epoch": 1.6444102789369648, + "grad_norm": 2.1666769981384277, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.719864547252655, + "num_tokens": 373501464.0, + "step": 14974 + }, + { + "epoch": 1.6445200966395783, + "grad_norm": 2.255147933959961, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7407664060592651, + "num_tokens": 373527201.0, + "step": 14975 + }, + { + "epoch": 1.6446299143421919, + "grad_norm": 2.6247851848602295, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7218093872070312, + "num_tokens": 373546970.0, + "step": 14976 + }, + { + "epoch": 1.6447397320448056, + "grad_norm": 2.3027689456939697, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7078510522842407, + "num_tokens": 373572251.0, + "step": 14977 + }, + { + "epoch": 1.6448495497474194, + "grad_norm": 2.152858018875122, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7190747857093811, + "num_tokens": 373598610.0, + "step": 14978 + }, + { + "epoch": 1.644959367450033, + "grad_norm": 2.03589129447937, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7106804847717285, + "num_tokens": 373629893.0, + "step": 14979 + }, + { + "epoch": 1.6450691851526464, + "grad_norm": 2.307016611099243, + "learning_rate": 1e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7356675863265991, + "num_tokens": 373654285.0, + "step": 14980 + }, + { + "epoch": 1.6451790028552602, + "grad_norm": 2.59322452545166, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7109197974205017, + "num_tokens": 373675049.0, + "step": 14981 + }, + { + "epoch": 1.645288820557874, + "grad_norm": 2.219978094100952, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7223333120346069, + "num_tokens": 373701585.0, + "step": 14982 + }, + { + "epoch": 1.6453986382604877, + "grad_norm": 2.3111610412597656, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7213953137397766, + "num_tokens": 373727204.0, + "step": 14983 + }, + { + "epoch": 1.6455084559631012, + "grad_norm": 2.210982084274292, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.692627489566803, + "num_tokens": 373757076.0, + "step": 14984 + }, + { + "epoch": 1.6456182736657148, + "grad_norm": 2.362297534942627, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7394322156906128, + "num_tokens": 373780720.0, + "step": 14985 + }, + { + "epoch": 1.6457280913683285, + "grad_norm": 1.9473196268081665, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7021900415420532, + "num_tokens": 373813386.0, + "step": 14986 + }, + { + "epoch": 1.6458379090709423, + "grad_norm": 2.269519567489624, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6890634298324585, + "num_tokens": 373839356.0, + "step": 14987 + }, + { + "epoch": 1.645947726773556, + "grad_norm": 2.2513320446014404, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7207291126251221, + "num_tokens": 373864813.0, + "step": 14988 + }, + { + "epoch": 1.6460575444761696, + "grad_norm": 2.522254228591919, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7272771596908569, + "num_tokens": 373887199.0, + "step": 14989 + }, + { + "epoch": 1.6461673621787831, + "grad_norm": 2.485273838043213, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7101501822471619, + "num_tokens": 373909976.0, + "step": 14990 + }, + { + "epoch": 1.6462771798813969, + "grad_norm": 2.1916444301605225, + "learning_rate": 1e-06, + "loss": 0.7661, + "mean_token_accuracy": 0.7568877935409546, + "num_tokens": 373933797.0, + "step": 14991 + }, + { + "epoch": 1.6463869975840106, + "grad_norm": 2.1855506896972656, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7128375768661499, + "num_tokens": 373962848.0, + "step": 14992 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 2.449586868286133, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7410827875137329, + "num_tokens": 373983969.0, + "step": 14993 + }, + { + "epoch": 1.6466066329892377, + "grad_norm": 2.392770528793335, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7229496240615845, + "num_tokens": 374008906.0, + "step": 14994 + }, + { + "epoch": 1.6467164506918515, + "grad_norm": 2.122504234313965, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7211044430732727, + "num_tokens": 374034869.0, + "step": 14995 + }, + { + "epoch": 1.6468262683944652, + "grad_norm": 2.471950054168701, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7214000225067139, + "num_tokens": 374055317.0, + "step": 14996 + }, + { + "epoch": 1.646936086097079, + "grad_norm": 2.360832929611206, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7126518487930298, + "num_tokens": 374079372.0, + "step": 14997 + }, + { + "epoch": 1.6470459037996925, + "grad_norm": 2.2898685932159424, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7141386270523071, + "num_tokens": 374104103.0, + "step": 14998 + }, + { + "epoch": 1.647155721502306, + "grad_norm": 2.3102757930755615, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7269277572631836, + "num_tokens": 374129968.0, + "step": 14999 + }, + { + "epoch": 1.6472655392049198, + "grad_norm": 2.4270527362823486, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7116816639900208, + "num_tokens": 374153866.0, + "step": 15000 + }, + { + "epoch": 1.6473753569075336, + "grad_norm": 2.3152005672454834, + "learning_rate": 1e-06, + "loss": 0.7194, + "mean_token_accuracy": 0.766193687915802, + "num_tokens": 374176169.0, + "step": 15001 + }, + { + "epoch": 1.6474851746101473, + "grad_norm": 2.17868709564209, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.6965894103050232, + "num_tokens": 374201850.0, + "step": 15002 + }, + { + "epoch": 1.6475949923127609, + "grad_norm": 2.2765254974365234, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7107702493667603, + "num_tokens": 374226852.0, + "step": 15003 + }, + { + "epoch": 1.6477048100153744, + "grad_norm": 2.51461124420166, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7133814692497253, + "num_tokens": 374249569.0, + "step": 15004 + }, + { + "epoch": 1.6478146277179881, + "grad_norm": 2.366255283355713, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7492537498474121, + "num_tokens": 374273096.0, + "step": 15005 + }, + { + "epoch": 1.647924445420602, + "grad_norm": 2.5626003742218018, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7381418347358704, + "num_tokens": 374293043.0, + "step": 15006 + }, + { + "epoch": 1.6480342631232154, + "grad_norm": 2.2899670600891113, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7297357320785522, + "num_tokens": 374317152.0, + "step": 15007 + }, + { + "epoch": 1.648144080825829, + "grad_norm": 2.40307879447937, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7244541645050049, + "num_tokens": 374341402.0, + "step": 15008 + }, + { + "epoch": 1.6482538985284427, + "grad_norm": 2.175727367401123, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7219812273979187, + "num_tokens": 374369444.0, + "step": 15009 + }, + { + "epoch": 1.6483637162310565, + "grad_norm": 2.1055173873901367, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6918929815292358, + "num_tokens": 374399822.0, + "step": 15010 + }, + { + "epoch": 1.6484735339336702, + "grad_norm": 2.4595954418182373, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7089035511016846, + "num_tokens": 374422210.0, + "step": 15011 + }, + { + "epoch": 1.6485833516362838, + "grad_norm": 2.0238492488861084, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6919063329696655, + "num_tokens": 374455273.0, + "step": 15012 + }, + { + "epoch": 1.6486931693388973, + "grad_norm": 2.0271565914154053, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7238079905509949, + "num_tokens": 374485816.0, + "step": 15013 + }, + { + "epoch": 1.648802987041511, + "grad_norm": 2.3895936012268066, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7139795422554016, + "num_tokens": 374509331.0, + "step": 15014 + }, + { + "epoch": 1.6489128047441248, + "grad_norm": 2.251887321472168, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.734355628490448, + "num_tokens": 374535480.0, + "step": 15015 + }, + { + "epoch": 1.6490226224467384, + "grad_norm": 2.3341798782348633, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7285206317901611, + "num_tokens": 374560210.0, + "step": 15016 + }, + { + "epoch": 1.6491324401493521, + "grad_norm": 2.31606388092041, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7179053425788879, + "num_tokens": 374586656.0, + "step": 15017 + }, + { + "epoch": 1.6492422578519657, + "grad_norm": 2.211975574493408, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7544345855712891, + "num_tokens": 374611164.0, + "step": 15018 + }, + { + "epoch": 1.6493520755545794, + "grad_norm": 2.031757116317749, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7176965475082397, + "num_tokens": 374640290.0, + "step": 15019 + }, + { + "epoch": 1.6494618932571932, + "grad_norm": 2.2408621311187744, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.708306074142456, + "num_tokens": 374667560.0, + "step": 15020 + }, + { + "epoch": 1.6495717109598067, + "grad_norm": 2.0932154655456543, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7210678458213806, + "num_tokens": 374698011.0, + "step": 15021 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 1.9435245990753174, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7007274627685547, + "num_tokens": 374731337.0, + "step": 15022 + }, + { + "epoch": 1.649791346365034, + "grad_norm": 2.6007912158966064, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7265365123748779, + "num_tokens": 374752528.0, + "step": 15023 + }, + { + "epoch": 1.6499011640676478, + "grad_norm": 2.16452693939209, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.73250412940979, + "num_tokens": 374778677.0, + "step": 15024 + }, + { + "epoch": 1.6500109817702615, + "grad_norm": 2.2247164249420166, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7108272910118103, + "num_tokens": 374805028.0, + "step": 15025 + }, + { + "epoch": 1.650120799472875, + "grad_norm": 2.2408132553100586, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7229447364807129, + "num_tokens": 374832310.0, + "step": 15026 + }, + { + "epoch": 1.6502306171754886, + "grad_norm": 2.286634683609009, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7023879289627075, + "num_tokens": 374857803.0, + "step": 15027 + }, + { + "epoch": 1.6503404348781023, + "grad_norm": 2.1378846168518066, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7056931257247925, + "num_tokens": 374887577.0, + "step": 15028 + }, + { + "epoch": 1.650450252580716, + "grad_norm": 2.350558280944824, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7059533596038818, + "num_tokens": 374912778.0, + "step": 15029 + }, + { + "epoch": 1.6505600702833296, + "grad_norm": 2.2307071685791016, + "learning_rate": 1e-06, + "loss": 0.8063, + "mean_token_accuracy": 0.7521419525146484, + "num_tokens": 374937349.0, + "step": 15030 + }, + { + "epoch": 1.6506698879859434, + "grad_norm": 2.329519748687744, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7048875093460083, + "num_tokens": 374961576.0, + "step": 15031 + }, + { + "epoch": 1.650779705688557, + "grad_norm": 2.2084014415740967, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7169930338859558, + "num_tokens": 374988790.0, + "step": 15032 + }, + { + "epoch": 1.6508895233911707, + "grad_norm": 2.1493985652923584, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7200726270675659, + "num_tokens": 375014289.0, + "step": 15033 + }, + { + "epoch": 1.6509993410937844, + "grad_norm": 2.302338123321533, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7267829179763794, + "num_tokens": 375041964.0, + "step": 15034 + }, + { + "epoch": 1.651109158796398, + "grad_norm": 2.2674598693847656, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7330776453018188, + "num_tokens": 375066536.0, + "step": 15035 + }, + { + "epoch": 1.6512189764990115, + "grad_norm": 2.252156972885132, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7254749536514282, + "num_tokens": 375091838.0, + "step": 15036 + }, + { + "epoch": 1.6513287942016253, + "grad_norm": 2.294055223464966, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7372435331344604, + "num_tokens": 375114609.0, + "step": 15037 + }, + { + "epoch": 1.651438611904239, + "grad_norm": 2.429326295852661, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7288330793380737, + "num_tokens": 375136937.0, + "step": 15038 + }, + { + "epoch": 1.6515484296068528, + "grad_norm": 2.3704535961151123, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.6985981464385986, + "num_tokens": 375161728.0, + "step": 15039 + }, + { + "epoch": 1.6516582473094663, + "grad_norm": 2.336660385131836, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7287794947624207, + "num_tokens": 375185365.0, + "step": 15040 + }, + { + "epoch": 1.6517680650120798, + "grad_norm": 2.200573682785034, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7129926681518555, + "num_tokens": 375212633.0, + "step": 15041 + }, + { + "epoch": 1.6518778827146936, + "grad_norm": 2.0700933933258057, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7191614508628845, + "num_tokens": 375241727.0, + "step": 15042 + }, + { + "epoch": 1.6519877004173074, + "grad_norm": 2.163137674331665, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7032890915870667, + "num_tokens": 375267658.0, + "step": 15043 + }, + { + "epoch": 1.652097518119921, + "grad_norm": 2.1103148460388184, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7116377353668213, + "num_tokens": 375297981.0, + "step": 15044 + }, + { + "epoch": 1.6522073358225344, + "grad_norm": 2.4764442443847656, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7121155261993408, + "num_tokens": 375320220.0, + "step": 15045 + }, + { + "epoch": 1.6523171535251482, + "grad_norm": 2.518232822418213, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7166920304298401, + "num_tokens": 375341955.0, + "step": 15046 + }, + { + "epoch": 1.652426971227762, + "grad_norm": 2.3854575157165527, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7175493836402893, + "num_tokens": 375365210.0, + "step": 15047 + }, + { + "epoch": 1.6525367889303757, + "grad_norm": 2.1568548679351807, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7232926487922668, + "num_tokens": 375392925.0, + "step": 15048 + }, + { + "epoch": 1.6526466066329892, + "grad_norm": 2.1924517154693604, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7044274210929871, + "num_tokens": 375419792.0, + "step": 15049 + }, + { + "epoch": 1.6527564243356028, + "grad_norm": 2.2516632080078125, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7263914346694946, + "num_tokens": 375445284.0, + "step": 15050 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 2.6383605003356934, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7301672697067261, + "num_tokens": 375465114.0, + "step": 15051 + }, + { + "epoch": 1.6529760597408303, + "grad_norm": 2.4353528022766113, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7404727339744568, + "num_tokens": 375485253.0, + "step": 15052 + }, + { + "epoch": 1.653085877443444, + "grad_norm": 2.393202066421509, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.729288637638092, + "num_tokens": 375508568.0, + "step": 15053 + }, + { + "epoch": 1.6531956951460576, + "grad_norm": 2.422455072402954, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.710340142250061, + "num_tokens": 375530637.0, + "step": 15054 + }, + { + "epoch": 1.653305512848671, + "grad_norm": 2.672905683517456, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.735950767993927, + "num_tokens": 375549516.0, + "step": 15055 + }, + { + "epoch": 1.6534153305512849, + "grad_norm": 2.3398423194885254, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.6982592344284058, + "num_tokens": 375574134.0, + "step": 15056 + }, + { + "epoch": 1.6535251482538986, + "grad_norm": 2.305957555770874, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7532916069030762, + "num_tokens": 375596477.0, + "step": 15057 + }, + { + "epoch": 1.6536349659565122, + "grad_norm": 2.3818609714508057, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7256351709365845, + "num_tokens": 375619997.0, + "step": 15058 + }, + { + "epoch": 1.6537447836591257, + "grad_norm": 2.1426711082458496, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7163509726524353, + "num_tokens": 375645616.0, + "step": 15059 + }, + { + "epoch": 1.6538546013617395, + "grad_norm": 2.4622859954833984, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7355601787567139, + "num_tokens": 375666382.0, + "step": 15060 + }, + { + "epoch": 1.6539644190643532, + "grad_norm": 2.383924722671509, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7587641477584839, + "num_tokens": 375687340.0, + "step": 15061 + }, + { + "epoch": 1.654074236766967, + "grad_norm": 2.393197536468506, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7077853679656982, + "num_tokens": 375711128.0, + "step": 15062 + }, + { + "epoch": 1.6541840544695805, + "grad_norm": 2.1254024505615234, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7045794129371643, + "num_tokens": 375739543.0, + "step": 15063 + }, + { + "epoch": 1.654293872172194, + "grad_norm": 2.3075766563415527, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7227864265441895, + "num_tokens": 375764402.0, + "step": 15064 + }, + { + "epoch": 1.6544036898748078, + "grad_norm": 2.5023839473724365, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7362382411956787, + "num_tokens": 375786241.0, + "step": 15065 + }, + { + "epoch": 1.6545135075774215, + "grad_norm": 2.2921860218048096, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7276407480239868, + "num_tokens": 375811469.0, + "step": 15066 + }, + { + "epoch": 1.6546233252800353, + "grad_norm": 2.3255507946014404, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7254307270050049, + "num_tokens": 375837030.0, + "step": 15067 + }, + { + "epoch": 1.6547331429826488, + "grad_norm": 2.4465339183807373, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7375276684761047, + "num_tokens": 375860546.0, + "step": 15068 + }, + { + "epoch": 1.6548429606852624, + "grad_norm": 2.575212240219116, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7208044528961182, + "num_tokens": 375879957.0, + "step": 15069 + }, + { + "epoch": 1.6549527783878761, + "grad_norm": 2.3380651473999023, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.736876368522644, + "num_tokens": 375904671.0, + "step": 15070 + }, + { + "epoch": 1.65506259609049, + "grad_norm": 2.035221815109253, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7292534708976746, + "num_tokens": 375933637.0, + "step": 15071 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 2.146890640258789, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7203346490859985, + "num_tokens": 375960240.0, + "step": 15072 + }, + { + "epoch": 1.655282231495717, + "grad_norm": 2.672863721847534, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7286415100097656, + "num_tokens": 375978826.0, + "step": 15073 + }, + { + "epoch": 1.6553920491983307, + "grad_norm": 2.771481990814209, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7239025831222534, + "num_tokens": 375996036.0, + "step": 15074 + }, + { + "epoch": 1.6555018669009445, + "grad_norm": 1.9970982074737549, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7020196318626404, + "num_tokens": 376026299.0, + "step": 15075 + }, + { + "epoch": 1.6556116846035582, + "grad_norm": 2.003479242324829, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7242127656936646, + "num_tokens": 376056934.0, + "step": 15076 + }, + { + "epoch": 1.6557215023061718, + "grad_norm": 2.4210071563720703, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7272115349769592, + "num_tokens": 376080855.0, + "step": 15077 + }, + { + "epoch": 1.6558313200087853, + "grad_norm": 2.480652332305908, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7173579931259155, + "num_tokens": 376103089.0, + "step": 15078 + }, + { + "epoch": 1.655941137711399, + "grad_norm": 2.7350966930389404, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7321746945381165, + "num_tokens": 376123393.0, + "step": 15079 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 2.1706418991088867, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7269288301467896, + "num_tokens": 376152694.0, + "step": 15080 + }, + { + "epoch": 1.6561607731166264, + "grad_norm": 2.341629981994629, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7067509889602661, + "num_tokens": 376175864.0, + "step": 15081 + }, + { + "epoch": 1.65627059081924, + "grad_norm": 2.0642316341400146, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7030329704284668, + "num_tokens": 376206279.0, + "step": 15082 + }, + { + "epoch": 1.6563804085218536, + "grad_norm": 2.3354005813598633, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7223485112190247, + "num_tokens": 376229406.0, + "step": 15083 + }, + { + "epoch": 1.6564902262244674, + "grad_norm": 2.541090965270996, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7381982207298279, + "num_tokens": 376249019.0, + "step": 15084 + }, + { + "epoch": 1.6566000439270812, + "grad_norm": 2.234968900680542, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7324111461639404, + "num_tokens": 376275933.0, + "step": 15085 + }, + { + "epoch": 1.6567098616296947, + "grad_norm": 1.9976683855056763, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7213705778121948, + "num_tokens": 376304901.0, + "step": 15086 + }, + { + "epoch": 1.6568196793323082, + "grad_norm": 2.0207836627960205, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7220193147659302, + "num_tokens": 376335452.0, + "step": 15087 + }, + { + "epoch": 1.656929497034922, + "grad_norm": 2.445887804031372, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7444543242454529, + "num_tokens": 376356679.0, + "step": 15088 + }, + { + "epoch": 1.6570393147375357, + "grad_norm": 2.3191580772399902, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.746854841709137, + "num_tokens": 376381369.0, + "step": 15089 + }, + { + "epoch": 1.6571491324401495, + "grad_norm": 2.2050187587738037, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7218220233917236, + "num_tokens": 376408043.0, + "step": 15090 + }, + { + "epoch": 1.657258950142763, + "grad_norm": 1.9831182956695557, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7301758527755737, + "num_tokens": 376437481.0, + "step": 15091 + }, + { + "epoch": 1.6573687678453766, + "grad_norm": 2.19599986076355, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7044930458068848, + "num_tokens": 376465240.0, + "step": 15092 + }, + { + "epoch": 1.6574785855479903, + "grad_norm": 2.394684314727783, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.740793764591217, + "num_tokens": 376486632.0, + "step": 15093 + }, + { + "epoch": 1.657588403250604, + "grad_norm": 2.2567012310028076, + "learning_rate": 1e-06, + "loss": 0.8032, + "mean_token_accuracy": 0.7444603443145752, + "num_tokens": 376511242.0, + "step": 15094 + }, + { + "epoch": 1.6576982209532176, + "grad_norm": 2.5463919639587402, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7290936708450317, + "num_tokens": 376532990.0, + "step": 15095 + }, + { + "epoch": 1.6578080386558314, + "grad_norm": 2.291480779647827, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7121806144714355, + "num_tokens": 376560509.0, + "step": 15096 + }, + { + "epoch": 1.657917856358445, + "grad_norm": 2.4964611530303955, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7307315468788147, + "num_tokens": 376582427.0, + "step": 15097 + }, + { + "epoch": 1.6580276740610587, + "grad_norm": 2.176431179046631, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.714095950126648, + "num_tokens": 376612168.0, + "step": 15098 + }, + { + "epoch": 1.6581374917636724, + "grad_norm": 2.1526944637298584, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7273157238960266, + "num_tokens": 376638732.0, + "step": 15099 + }, + { + "epoch": 1.658247309466286, + "grad_norm": 2.3406870365142822, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.740448534488678, + "num_tokens": 376660334.0, + "step": 15100 + }, + { + "epoch": 1.6583571271688995, + "grad_norm": 2.731168270111084, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7411694526672363, + "num_tokens": 376678881.0, + "step": 15101 + }, + { + "epoch": 1.6584669448715132, + "grad_norm": 2.2160041332244873, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7307435274124146, + "num_tokens": 376702978.0, + "step": 15102 + }, + { + "epoch": 1.658576762574127, + "grad_norm": 2.1839542388916016, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7182926535606384, + "num_tokens": 376728590.0, + "step": 15103 + }, + { + "epoch": 1.6586865802767408, + "grad_norm": 2.5317654609680176, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.752861738204956, + "num_tokens": 376747771.0, + "step": 15104 + }, + { + "epoch": 1.6587963979793543, + "grad_norm": 2.2905821800231934, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7170025706291199, + "num_tokens": 376773950.0, + "step": 15105 + }, + { + "epoch": 1.6589062156819678, + "grad_norm": 2.3512418270111084, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.707490861415863, + "num_tokens": 376797208.0, + "step": 15106 + }, + { + "epoch": 1.6590160333845816, + "grad_norm": 2.0754754543304443, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.6960464715957642, + "num_tokens": 376826919.0, + "step": 15107 + }, + { + "epoch": 1.6591258510871953, + "grad_norm": 2.1837685108184814, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7248238325119019, + "num_tokens": 376852724.0, + "step": 15108 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 2.2411255836486816, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7577061653137207, + "num_tokens": 376875105.0, + "step": 15109 + }, + { + "epoch": 1.6593454864924224, + "grad_norm": 2.3747379779815674, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7221882343292236, + "num_tokens": 376896547.0, + "step": 15110 + }, + { + "epoch": 1.6594553041950362, + "grad_norm": 2.1014959812164307, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7028032541275024, + "num_tokens": 376923385.0, + "step": 15111 + }, + { + "epoch": 1.65956512189765, + "grad_norm": 2.0526623725891113, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6957277059555054, + "num_tokens": 376951703.0, + "step": 15112 + }, + { + "epoch": 1.6596749396002637, + "grad_norm": 2.3307814598083496, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7390396595001221, + "num_tokens": 376972483.0, + "step": 15113 + }, + { + "epoch": 1.6597847573028772, + "grad_norm": 2.3560726642608643, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7215291261672974, + "num_tokens": 376997040.0, + "step": 15114 + }, + { + "epoch": 1.6598945750054908, + "grad_norm": 2.2923121452331543, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7095213532447815, + "num_tokens": 377021634.0, + "step": 15115 + }, + { + "epoch": 1.6600043927081045, + "grad_norm": 2.228504180908203, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7148510217666626, + "num_tokens": 377049058.0, + "step": 15116 + }, + { + "epoch": 1.6601142104107183, + "grad_norm": 2.1529605388641357, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7254441976547241, + "num_tokens": 377076168.0, + "step": 15117 + }, + { + "epoch": 1.660224028113332, + "grad_norm": 1.9882763624191284, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7107659578323364, + "num_tokens": 377106179.0, + "step": 15118 + }, + { + "epoch": 1.6603338458159456, + "grad_norm": 2.234793186187744, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7167214155197144, + "num_tokens": 377133172.0, + "step": 15119 + }, + { + "epoch": 1.660443663518559, + "grad_norm": 2.571155071258545, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7315687537193298, + "num_tokens": 377153888.0, + "step": 15120 + }, + { + "epoch": 1.6605534812211729, + "grad_norm": 2.079301357269287, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.736361026763916, + "num_tokens": 377179338.0, + "step": 15121 + }, + { + "epoch": 1.6606632989237866, + "grad_norm": 2.124305486679077, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7507255673408508, + "num_tokens": 377203669.0, + "step": 15122 + }, + { + "epoch": 1.6607731166264001, + "grad_norm": 2.0739290714263916, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7234836220741272, + "num_tokens": 377234218.0, + "step": 15123 + }, + { + "epoch": 1.6608829343290137, + "grad_norm": 2.5779378414154053, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7322387099266052, + "num_tokens": 377255976.0, + "step": 15124 + }, + { + "epoch": 1.6609927520316274, + "grad_norm": 2.31463623046875, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7215549945831299, + "num_tokens": 377281397.0, + "step": 15125 + }, + { + "epoch": 1.6611025697342412, + "grad_norm": 2.0961246490478516, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.715044379234314, + "num_tokens": 377310098.0, + "step": 15126 + }, + { + "epoch": 1.661212387436855, + "grad_norm": 2.403028726577759, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7157741785049438, + "num_tokens": 377332293.0, + "step": 15127 + }, + { + "epoch": 1.6613222051394685, + "grad_norm": 2.0336990356445312, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7118493914604187, + "num_tokens": 377361477.0, + "step": 15128 + }, + { + "epoch": 1.661432022842082, + "grad_norm": 2.5331878662109375, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7367265820503235, + "num_tokens": 377382538.0, + "step": 15129 + }, + { + "epoch": 1.6615418405446958, + "grad_norm": 2.192342519760132, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7399520874023438, + "num_tokens": 377408502.0, + "step": 15130 + }, + { + "epoch": 1.6616516582473095, + "grad_norm": 2.118725299835205, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7018502950668335, + "num_tokens": 377438994.0, + "step": 15131 + }, + { + "epoch": 1.661761475949923, + "grad_norm": 2.0951831340789795, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7136325836181641, + "num_tokens": 377466103.0, + "step": 15132 + }, + { + "epoch": 1.6618712936525368, + "grad_norm": 2.202576160430908, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7252880930900574, + "num_tokens": 377492597.0, + "step": 15133 + }, + { + "epoch": 1.6619811113551504, + "grad_norm": 2.0643255710601807, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7243549823760986, + "num_tokens": 377521350.0, + "step": 15134 + }, + { + "epoch": 1.6620909290577641, + "grad_norm": 2.279629945755005, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7013792991638184, + "num_tokens": 377547323.0, + "step": 15135 + }, + { + "epoch": 1.6622007467603779, + "grad_norm": 2.1772420406341553, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7076653838157654, + "num_tokens": 377574286.0, + "step": 15136 + }, + { + "epoch": 1.6623105644629914, + "grad_norm": 2.354226589202881, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7252206206321716, + "num_tokens": 377596173.0, + "step": 15137 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 2.098132371902466, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7288714647293091, + "num_tokens": 377628091.0, + "step": 15138 + }, + { + "epoch": 1.6625301998682187, + "grad_norm": 2.306298017501831, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7192471027374268, + "num_tokens": 377651394.0, + "step": 15139 + }, + { + "epoch": 1.6626400175708325, + "grad_norm": 1.7706059217453003, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6923027634620667, + "num_tokens": 377689033.0, + "step": 15140 + }, + { + "epoch": 1.6627498352734462, + "grad_norm": 2.7005815505981445, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7459422945976257, + "num_tokens": 377706952.0, + "step": 15141 + }, + { + "epoch": 1.6628596529760598, + "grad_norm": 2.1124563217163086, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.735478937625885, + "num_tokens": 377732724.0, + "step": 15142 + }, + { + "epoch": 1.6629694706786733, + "grad_norm": 2.582737445831299, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7149026393890381, + "num_tokens": 377753769.0, + "step": 15143 + }, + { + "epoch": 1.663079288381287, + "grad_norm": 2.2795140743255615, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7349453568458557, + "num_tokens": 377777252.0, + "step": 15144 + }, + { + "epoch": 1.6631891060839008, + "grad_norm": 2.4641265869140625, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7395652532577515, + "num_tokens": 377801229.0, + "step": 15145 + }, + { + "epoch": 1.6632989237865143, + "grad_norm": 2.3746912479400635, + "learning_rate": 1e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7433414459228516, + "num_tokens": 377822682.0, + "step": 15146 + }, + { + "epoch": 1.663408741489128, + "grad_norm": 2.166969060897827, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7046332955360413, + "num_tokens": 377849691.0, + "step": 15147 + }, + { + "epoch": 1.6635185591917416, + "grad_norm": 2.322547674179077, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7390164136886597, + "num_tokens": 377872238.0, + "step": 15148 + }, + { + "epoch": 1.6636283768943554, + "grad_norm": 2.191384792327881, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.6948186159133911, + "num_tokens": 377900084.0, + "step": 15149 + }, + { + "epoch": 1.6637381945969691, + "grad_norm": 2.0658748149871826, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7334142923355103, + "num_tokens": 377927808.0, + "step": 15150 + }, + { + "epoch": 1.6638480122995827, + "grad_norm": 2.571986436843872, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7218304872512817, + "num_tokens": 377947468.0, + "step": 15151 + }, + { + "epoch": 1.6639578300021962, + "grad_norm": 2.3236401081085205, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7051383256912231, + "num_tokens": 377971835.0, + "step": 15152 + }, + { + "epoch": 1.66406764770481, + "grad_norm": 2.6530160903930664, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7314842939376831, + "num_tokens": 377991956.0, + "step": 15153 + }, + { + "epoch": 1.6641774654074237, + "grad_norm": 2.2949764728546143, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.7062547206878662, + "num_tokens": 378017529.0, + "step": 15154 + }, + { + "epoch": 1.6642872831100375, + "grad_norm": 2.0588557720184326, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7160377502441406, + "num_tokens": 378047301.0, + "step": 15155 + }, + { + "epoch": 1.664397100812651, + "grad_norm": 2.2521579265594482, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7161891460418701, + "num_tokens": 378073125.0, + "step": 15156 + }, + { + "epoch": 1.6645069185152646, + "grad_norm": 2.1520090103149414, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7451385259628296, + "num_tokens": 378101547.0, + "step": 15157 + }, + { + "epoch": 1.6646167362178783, + "grad_norm": 2.560187816619873, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7113526463508606, + "num_tokens": 378121522.0, + "step": 15158 + }, + { + "epoch": 1.664726553920492, + "grad_norm": 2.3564586639404297, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7089427709579468, + "num_tokens": 378147670.0, + "step": 15159 + }, + { + "epoch": 1.6648363716231056, + "grad_norm": 2.261767625808716, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7375299334526062, + "num_tokens": 378171342.0, + "step": 15160 + }, + { + "epoch": 1.6649461893257194, + "grad_norm": 2.2342612743377686, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.736741304397583, + "num_tokens": 378195797.0, + "step": 15161 + }, + { + "epoch": 1.665056007028333, + "grad_norm": 1.9735594987869263, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7059119939804077, + "num_tokens": 378230368.0, + "step": 15162 + }, + { + "epoch": 1.6651658247309467, + "grad_norm": 1.9448291063308716, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7113046050071716, + "num_tokens": 378263995.0, + "step": 15163 + }, + { + "epoch": 1.6652756424335604, + "grad_norm": 2.529592990875244, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6934792399406433, + "num_tokens": 378289937.0, + "step": 15164 + }, + { + "epoch": 1.665385460136174, + "grad_norm": 2.3663456439971924, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7182989120483398, + "num_tokens": 378312464.0, + "step": 15165 + }, + { + "epoch": 1.6654952778387875, + "grad_norm": 2.225724220275879, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7280778288841248, + "num_tokens": 378338636.0, + "step": 15166 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 2.411222457885742, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7263109683990479, + "num_tokens": 378360101.0, + "step": 15167 + }, + { + "epoch": 1.665714913244015, + "grad_norm": 2.1288671493530273, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7098739147186279, + "num_tokens": 378388962.0, + "step": 15168 + }, + { + "epoch": 1.6658247309466288, + "grad_norm": 2.3611533641815186, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7414178848266602, + "num_tokens": 378411901.0, + "step": 15169 + }, + { + "epoch": 1.6659345486492423, + "grad_norm": 2.087918758392334, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7092944979667664, + "num_tokens": 378439889.0, + "step": 15170 + }, + { + "epoch": 1.6660443663518558, + "grad_norm": 2.171577215194702, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7323819398880005, + "num_tokens": 378468216.0, + "step": 15171 + }, + { + "epoch": 1.6661541840544696, + "grad_norm": 2.1328988075256348, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6961580514907837, + "num_tokens": 378498689.0, + "step": 15172 + }, + { + "epoch": 1.6662640017570833, + "grad_norm": 2.166147232055664, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7248469591140747, + "num_tokens": 378525708.0, + "step": 15173 + }, + { + "epoch": 1.6663738194596969, + "grad_norm": 2.717252731323242, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7206881046295166, + "num_tokens": 378554572.0, + "step": 15174 + }, + { + "epoch": 1.6664836371623104, + "grad_norm": 2.1558001041412354, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7266883254051208, + "num_tokens": 378580236.0, + "step": 15175 + }, + { + "epoch": 1.6665934548649242, + "grad_norm": 2.023850202560425, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7343813180923462, + "num_tokens": 378611429.0, + "step": 15176 + }, + { + "epoch": 1.666703272567538, + "grad_norm": 2.3037798404693604, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7021703720092773, + "num_tokens": 378638574.0, + "step": 15177 + }, + { + "epoch": 1.6668130902701517, + "grad_norm": 2.0537402629852295, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7291363477706909, + "num_tokens": 378666382.0, + "step": 15178 + }, + { + "epoch": 1.6669229079727652, + "grad_norm": 2.234165906906128, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.722277820110321, + "num_tokens": 378691767.0, + "step": 15179 + }, + { + "epoch": 1.6670327256753787, + "grad_norm": 2.0075020790100098, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.6960540413856506, + "num_tokens": 378723863.0, + "step": 15180 + }, + { + "epoch": 1.6671425433779925, + "grad_norm": 2.4202027320861816, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7353649735450745, + "num_tokens": 378746155.0, + "step": 15181 + }, + { + "epoch": 1.6672523610806063, + "grad_norm": 2.3430662155151367, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7470612525939941, + "num_tokens": 378770426.0, + "step": 15182 + }, + { + "epoch": 1.66736217878322, + "grad_norm": 2.253408193588257, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7380412220954895, + "num_tokens": 378793308.0, + "step": 15183 + }, + { + "epoch": 1.6674719964858336, + "grad_norm": 2.4098970890045166, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7358192205429077, + "num_tokens": 378815266.0, + "step": 15184 + }, + { + "epoch": 1.667581814188447, + "grad_norm": 2.616189479827881, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7247670292854309, + "num_tokens": 378835295.0, + "step": 15185 + }, + { + "epoch": 1.6676916318910608, + "grad_norm": 2.1339271068573, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7061936855316162, + "num_tokens": 378863907.0, + "step": 15186 + }, + { + "epoch": 1.6678014495936746, + "grad_norm": 2.6366500854492188, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7296788692474365, + "num_tokens": 378883297.0, + "step": 15187 + }, + { + "epoch": 1.6679112672962881, + "grad_norm": 2.3873140811920166, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7073874473571777, + "num_tokens": 378907266.0, + "step": 15188 + }, + { + "epoch": 1.6680210849989017, + "grad_norm": 2.397980213165283, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7153416872024536, + "num_tokens": 378931112.0, + "step": 15189 + }, + { + "epoch": 1.6681309027015154, + "grad_norm": 2.472792148590088, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.713078498840332, + "num_tokens": 378953294.0, + "step": 15190 + }, + { + "epoch": 1.6682407204041292, + "grad_norm": 2.0491855144500732, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7431291341781616, + "num_tokens": 378982138.0, + "step": 15191 + }, + { + "epoch": 1.668350538106743, + "grad_norm": 2.1047816276550293, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7291558384895325, + "num_tokens": 379010575.0, + "step": 15192 + }, + { + "epoch": 1.6684603558093565, + "grad_norm": 2.193427801132202, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7104852199554443, + "num_tokens": 379037535.0, + "step": 15193 + }, + { + "epoch": 1.66857017351197, + "grad_norm": 2.4771037101745605, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.740767776966095, + "num_tokens": 379058224.0, + "step": 15194 + }, + { + "epoch": 1.6686799912145838, + "grad_norm": 2.599081039428711, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7351482510566711, + "num_tokens": 379076514.0, + "step": 15195 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 2.334962844848633, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7235958576202393, + "num_tokens": 379100664.0, + "step": 15196 + }, + { + "epoch": 1.668899626619811, + "grad_norm": 2.2571935653686523, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7280715703964233, + "num_tokens": 379125311.0, + "step": 15197 + }, + { + "epoch": 1.6690094443224248, + "grad_norm": 2.2912304401397705, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7132141590118408, + "num_tokens": 379151209.0, + "step": 15198 + }, + { + "epoch": 1.6691192620250384, + "grad_norm": 1.8905739784240723, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7390707731246948, + "num_tokens": 379184016.0, + "step": 15199 + }, + { + "epoch": 1.669229079727652, + "grad_norm": 2.2350847721099854, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7260608673095703, + "num_tokens": 379210186.0, + "step": 15200 + }, + { + "epoch": 1.6693388974302659, + "grad_norm": 2.2776575088500977, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7231482267379761, + "num_tokens": 379235927.0, + "step": 15201 + }, + { + "epoch": 1.6694487151328794, + "grad_norm": 2.3009443283081055, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7152349948883057, + "num_tokens": 379260730.0, + "step": 15202 + }, + { + "epoch": 1.669558532835493, + "grad_norm": 2.376903772354126, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7300246953964233, + "num_tokens": 379282531.0, + "step": 15203 + }, + { + "epoch": 1.6696683505381067, + "grad_norm": 2.3980836868286133, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.749677836894989, + "num_tokens": 379303406.0, + "step": 15204 + }, + { + "epoch": 1.6697781682407205, + "grad_norm": 2.590029001235962, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7146831154823303, + "num_tokens": 379325467.0, + "step": 15205 + }, + { + "epoch": 1.6698879859433342, + "grad_norm": 2.2849721908569336, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7197384238243103, + "num_tokens": 379351076.0, + "step": 15206 + }, + { + "epoch": 1.6699978036459477, + "grad_norm": 2.0513875484466553, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7085500955581665, + "num_tokens": 379380064.0, + "step": 15207 + }, + { + "epoch": 1.6701076213485613, + "grad_norm": 2.160351037979126, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.714895486831665, + "num_tokens": 379407863.0, + "step": 15208 + }, + { + "epoch": 1.670217439051175, + "grad_norm": 2.532151222229004, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7110180854797363, + "num_tokens": 379428521.0, + "step": 15209 + }, + { + "epoch": 1.6703272567537888, + "grad_norm": 2.546434164047241, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7124975919723511, + "num_tokens": 379449708.0, + "step": 15210 + }, + { + "epoch": 1.6704370744564023, + "grad_norm": 2.4595565795898438, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7138213515281677, + "num_tokens": 379472204.0, + "step": 15211 + }, + { + "epoch": 1.670546892159016, + "grad_norm": 2.456955909729004, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7298376560211182, + "num_tokens": 379493345.0, + "step": 15212 + }, + { + "epoch": 1.6706567098616296, + "grad_norm": 2.7652053833007812, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7352964878082275, + "num_tokens": 379510857.0, + "step": 15213 + }, + { + "epoch": 1.6707665275642434, + "grad_norm": 2.3926472663879395, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7285747528076172, + "num_tokens": 379531972.0, + "step": 15214 + }, + { + "epoch": 1.6708763452668571, + "grad_norm": 2.5645298957824707, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7165505886077881, + "num_tokens": 379553774.0, + "step": 15215 + }, + { + "epoch": 1.6709861629694707, + "grad_norm": 2.405900716781616, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7197260856628418, + "num_tokens": 379577884.0, + "step": 15216 + }, + { + "epoch": 1.6710959806720842, + "grad_norm": 2.1781394481658936, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7161307334899902, + "num_tokens": 379603706.0, + "step": 15217 + }, + { + "epoch": 1.671205798374698, + "grad_norm": 2.285993814468384, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7201045155525208, + "num_tokens": 379629206.0, + "step": 15218 + }, + { + "epoch": 1.6713156160773117, + "grad_norm": 2.201045274734497, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7366958856582642, + "num_tokens": 379654005.0, + "step": 15219 + }, + { + "epoch": 1.6714254337799255, + "grad_norm": 2.1411986351013184, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7123056054115295, + "num_tokens": 379683428.0, + "step": 15220 + }, + { + "epoch": 1.671535251482539, + "grad_norm": 2.072636365890503, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7277403473854065, + "num_tokens": 379710709.0, + "step": 15221 + }, + { + "epoch": 1.6716450691851525, + "grad_norm": 2.2339696884155273, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6971449851989746, + "num_tokens": 379737568.0, + "step": 15222 + }, + { + "epoch": 1.6717548868877663, + "grad_norm": 2.4932310581207275, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7173705101013184, + "num_tokens": 379758252.0, + "step": 15223 + }, + { + "epoch": 1.67186470459038, + "grad_norm": 2.267167806625366, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7245616316795349, + "num_tokens": 379785254.0, + "step": 15224 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 2.2247061729431152, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7146866321563721, + "num_tokens": 379811100.0, + "step": 15225 + }, + { + "epoch": 1.6720843399956071, + "grad_norm": 2.4439632892608643, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.712729811668396, + "num_tokens": 379832673.0, + "step": 15226 + }, + { + "epoch": 1.6721941576982209, + "grad_norm": 1.976524829864502, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.724317193031311, + "num_tokens": 379862678.0, + "step": 15227 + }, + { + "epoch": 1.6723039754008346, + "grad_norm": 2.651658535003662, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7454625964164734, + "num_tokens": 379880377.0, + "step": 15228 + }, + { + "epoch": 1.6724137931034484, + "grad_norm": 2.076826810836792, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7039070725440979, + "num_tokens": 379910062.0, + "step": 15229 + }, + { + "epoch": 1.672523610806062, + "grad_norm": 2.1889166831970215, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7212450504302979, + "num_tokens": 379937692.0, + "step": 15230 + }, + { + "epoch": 1.6726334285086755, + "grad_norm": 2.4825360774993896, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7353689074516296, + "num_tokens": 379958523.0, + "step": 15231 + }, + { + "epoch": 1.6727432462112892, + "grad_norm": 2.184837818145752, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7116218209266663, + "num_tokens": 379986013.0, + "step": 15232 + }, + { + "epoch": 1.672853063913903, + "grad_norm": 2.582566261291504, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.737708330154419, + "num_tokens": 380006761.0, + "step": 15233 + }, + { + "epoch": 1.6729628816165167, + "grad_norm": 2.906799793243408, + "learning_rate": 1e-06, + "loss": 0.8095, + "mean_token_accuracy": 0.7477667331695557, + "num_tokens": 380022233.0, + "step": 15234 + }, + { + "epoch": 1.6730726993191303, + "grad_norm": 2.6739659309387207, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7313554286956787, + "num_tokens": 380042661.0, + "step": 15235 + }, + { + "epoch": 1.6731825170217438, + "grad_norm": 2.293206214904785, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7356938123703003, + "num_tokens": 380066546.0, + "step": 15236 + }, + { + "epoch": 1.6732923347243576, + "grad_norm": 2.281578540802002, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7131184935569763, + "num_tokens": 380092493.0, + "step": 15237 + }, + { + "epoch": 1.6734021524269713, + "grad_norm": 2.5277798175811768, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7404953241348267, + "num_tokens": 380112114.0, + "step": 15238 + }, + { + "epoch": 1.6735119701295849, + "grad_norm": 2.852987766265869, + "learning_rate": 1e-06, + "loss": 0.791, + "mean_token_accuracy": 0.743028461933136, + "num_tokens": 380129647.0, + "step": 15239 + }, + { + "epoch": 1.6736217878321984, + "grad_norm": 2.279546022415161, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7286315560340881, + "num_tokens": 380153465.0, + "step": 15240 + }, + { + "epoch": 1.6737316055348122, + "grad_norm": 2.4321582317352295, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7380730509757996, + "num_tokens": 380175772.0, + "step": 15241 + }, + { + "epoch": 1.673841423237426, + "grad_norm": 2.3325774669647217, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7355184555053711, + "num_tokens": 380197727.0, + "step": 15242 + }, + { + "epoch": 1.6739512409400397, + "grad_norm": 2.2146177291870117, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7270643711090088, + "num_tokens": 380224822.0, + "step": 15243 + }, + { + "epoch": 1.6740610586426532, + "grad_norm": 2.4249157905578613, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7454115748405457, + "num_tokens": 380245856.0, + "step": 15244 + }, + { + "epoch": 1.6741708763452667, + "grad_norm": 2.2558603286743164, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.701491117477417, + "num_tokens": 380272764.0, + "step": 15245 + }, + { + "epoch": 1.6742806940478805, + "grad_norm": 2.4901063442230225, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7262431383132935, + "num_tokens": 380293690.0, + "step": 15246 + }, + { + "epoch": 1.6743905117504942, + "grad_norm": 2.0461223125457764, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7188749313354492, + "num_tokens": 380322495.0, + "step": 15247 + }, + { + "epoch": 1.674500329453108, + "grad_norm": 2.0679140090942383, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7306907176971436, + "num_tokens": 380353181.0, + "step": 15248 + }, + { + "epoch": 1.6746101471557215, + "grad_norm": 2.3217241764068604, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7128893733024597, + "num_tokens": 380376511.0, + "step": 15249 + }, + { + "epoch": 1.674719964858335, + "grad_norm": 2.2476398944854736, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7402082681655884, + "num_tokens": 380402965.0, + "step": 15250 + }, + { + "epoch": 1.6748297825609488, + "grad_norm": 1.963949203491211, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7216794490814209, + "num_tokens": 380434826.0, + "step": 15251 + }, + { + "epoch": 1.6749396002635626, + "grad_norm": 2.439241647720337, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7081449031829834, + "num_tokens": 380456180.0, + "step": 15252 + }, + { + "epoch": 1.6750494179661761, + "grad_norm": 2.510714292526245, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7196310758590698, + "num_tokens": 380477423.0, + "step": 15253 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 2.209418773651123, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.711127519607544, + "num_tokens": 380504201.0, + "step": 15254 + }, + { + "epoch": 1.6752690533714034, + "grad_norm": 2.7484676837921143, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.736173152923584, + "num_tokens": 380523237.0, + "step": 15255 + }, + { + "epoch": 1.6753788710740172, + "grad_norm": 2.323692798614502, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7274116277694702, + "num_tokens": 380546488.0, + "step": 15256 + }, + { + "epoch": 1.675488688776631, + "grad_norm": 2.440743923187256, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7136890888214111, + "num_tokens": 380570341.0, + "step": 15257 + }, + { + "epoch": 1.6755985064792445, + "grad_norm": 2.1428821086883545, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7069007158279419, + "num_tokens": 380597897.0, + "step": 15258 + }, + { + "epoch": 1.675708324181858, + "grad_norm": 2.15439510345459, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.718482255935669, + "num_tokens": 380625035.0, + "step": 15259 + }, + { + "epoch": 1.6758181418844718, + "grad_norm": 2.213301420211792, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7274167537689209, + "num_tokens": 380650847.0, + "step": 15260 + }, + { + "epoch": 1.6759279595870855, + "grad_norm": 2.3226242065429688, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7066461443901062, + "num_tokens": 380676342.0, + "step": 15261 + }, + { + "epoch": 1.676037777289699, + "grad_norm": 2.08601713180542, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7097995281219482, + "num_tokens": 380706078.0, + "step": 15262 + }, + { + "epoch": 1.6761475949923128, + "grad_norm": 2.07302188873291, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7133086323738098, + "num_tokens": 380738437.0, + "step": 15263 + }, + { + "epoch": 1.6762574126949263, + "grad_norm": 2.4375927448272705, + "learning_rate": 1e-06, + "loss": 0.7698, + "mean_token_accuracy": 0.7562352418899536, + "num_tokens": 380760764.0, + "step": 15264 + }, + { + "epoch": 1.67636723039754, + "grad_norm": 1.9978114366531372, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7123165130615234, + "num_tokens": 380791737.0, + "step": 15265 + }, + { + "epoch": 1.6764770481001539, + "grad_norm": 2.2219200134277344, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7466613054275513, + "num_tokens": 380815995.0, + "step": 15266 + }, + { + "epoch": 1.6765868658027674, + "grad_norm": 2.0544662475585938, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7125325202941895, + "num_tokens": 380845949.0, + "step": 15267 + }, + { + "epoch": 1.676696683505381, + "grad_norm": 2.2872154712677, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7320461869239807, + "num_tokens": 380871041.0, + "step": 15268 + }, + { + "epoch": 1.6768065012079947, + "grad_norm": 2.153331756591797, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7115817070007324, + "num_tokens": 380898504.0, + "step": 15269 + }, + { + "epoch": 1.6769163189106084, + "grad_norm": 2.0350089073181152, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7316791415214539, + "num_tokens": 380926380.0, + "step": 15270 + }, + { + "epoch": 1.6770261366132222, + "grad_norm": 2.257934808731079, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7302520871162415, + "num_tokens": 380952659.0, + "step": 15271 + }, + { + "epoch": 1.6771359543158357, + "grad_norm": 2.0125603675842285, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7094513177871704, + "num_tokens": 380984646.0, + "step": 15272 + }, + { + "epoch": 1.6772457720184493, + "grad_norm": 2.2603540420532227, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7198579907417297, + "num_tokens": 381010807.0, + "step": 15273 + }, + { + "epoch": 1.677355589721063, + "grad_norm": 2.136002779006958, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7087287902832031, + "num_tokens": 381040047.0, + "step": 15274 + }, + { + "epoch": 1.6774654074236768, + "grad_norm": 2.0316762924194336, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7080364227294922, + "num_tokens": 381070226.0, + "step": 15275 + }, + { + "epoch": 1.6775752251262903, + "grad_norm": 2.415039539337158, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7174952030181885, + "num_tokens": 381092545.0, + "step": 15276 + }, + { + "epoch": 1.677685042828904, + "grad_norm": 2.2031607627868652, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7175886631011963, + "num_tokens": 381120290.0, + "step": 15277 + }, + { + "epoch": 1.6777948605315176, + "grad_norm": 2.199880599975586, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7480273842811584, + "num_tokens": 381145249.0, + "step": 15278 + }, + { + "epoch": 1.6779046782341314, + "grad_norm": 2.492982864379883, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7199996113777161, + "num_tokens": 381166333.0, + "step": 15279 + }, + { + "epoch": 1.6780144959367451, + "grad_norm": 2.2718284130096436, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7147774696350098, + "num_tokens": 381191574.0, + "step": 15280 + }, + { + "epoch": 1.6781243136393587, + "grad_norm": 2.0623416900634766, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7044810652732849, + "num_tokens": 381222244.0, + "step": 15281 + }, + { + "epoch": 1.6782341313419722, + "grad_norm": 2.3607938289642334, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7321109771728516, + "num_tokens": 381244419.0, + "step": 15282 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 2.3228678703308105, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7335352897644043, + "num_tokens": 381268458.0, + "step": 15283 + }, + { + "epoch": 1.6784537667471997, + "grad_norm": 2.340932846069336, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7158061265945435, + "num_tokens": 381293020.0, + "step": 15284 + }, + { + "epoch": 1.6785635844498135, + "grad_norm": 2.4002606868743896, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7666011452674866, + "num_tokens": 381313771.0, + "step": 15285 + }, + { + "epoch": 1.678673402152427, + "grad_norm": 2.4841432571411133, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7442859411239624, + "num_tokens": 381334987.0, + "step": 15286 + }, + { + "epoch": 1.6787832198550405, + "grad_norm": 2.162473440170288, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7095120549201965, + "num_tokens": 381364560.0, + "step": 15287 + }, + { + "epoch": 1.6788930375576543, + "grad_norm": 2.6423652172088623, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7326858043670654, + "num_tokens": 381383127.0, + "step": 15288 + }, + { + "epoch": 1.679002855260268, + "grad_norm": 2.3648416996002197, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7050668001174927, + "num_tokens": 381407644.0, + "step": 15289 + }, + { + "epoch": 1.6791126729628816, + "grad_norm": 2.4115326404571533, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7312228083610535, + "num_tokens": 381429777.0, + "step": 15290 + }, + { + "epoch": 1.6792224906654951, + "grad_norm": 2.5257208347320557, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7150434255599976, + "num_tokens": 381451286.0, + "step": 15291 + }, + { + "epoch": 1.6793323083681089, + "grad_norm": 2.0429165363311768, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7434350848197937, + "num_tokens": 381479514.0, + "step": 15292 + }, + { + "epoch": 1.6794421260707226, + "grad_norm": 2.373469114303589, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7096084356307983, + "num_tokens": 381505500.0, + "step": 15293 + }, + { + "epoch": 1.6795519437733364, + "grad_norm": 2.2279410362243652, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7140665054321289, + "num_tokens": 381531818.0, + "step": 15294 + }, + { + "epoch": 1.67966176147595, + "grad_norm": 2.2574257850646973, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7150871753692627, + "num_tokens": 381557179.0, + "step": 15295 + }, + { + "epoch": 1.6797715791785635, + "grad_norm": 2.476386785507202, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7215883135795593, + "num_tokens": 381579025.0, + "step": 15296 + }, + { + "epoch": 1.6798813968811772, + "grad_norm": 2.4019691944122314, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7489815354347229, + "num_tokens": 381599909.0, + "step": 15297 + }, + { + "epoch": 1.679991214583791, + "grad_norm": 2.3696577548980713, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.711292028427124, + "num_tokens": 381623667.0, + "step": 15298 + }, + { + "epoch": 1.6801010322864047, + "grad_norm": 2.408374071121216, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7102794051170349, + "num_tokens": 381647445.0, + "step": 15299 + }, + { + "epoch": 1.6802108499890183, + "grad_norm": 2.1933538913726807, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7421454787254333, + "num_tokens": 381673057.0, + "step": 15300 + }, + { + "epoch": 1.6803206676916318, + "grad_norm": 2.373551607131958, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6989650726318359, + "num_tokens": 381698979.0, + "step": 15301 + }, + { + "epoch": 1.6804304853942456, + "grad_norm": 2.0161991119384766, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6886394619941711, + "num_tokens": 381734789.0, + "step": 15302 + }, + { + "epoch": 1.6805403030968593, + "grad_norm": 2.1770291328430176, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7203061580657959, + "num_tokens": 381762831.0, + "step": 15303 + }, + { + "epoch": 1.6806501207994728, + "grad_norm": 2.2280445098876953, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7327988743782043, + "num_tokens": 381791120.0, + "step": 15304 + }, + { + "epoch": 1.6807599385020864, + "grad_norm": 2.3562586307525635, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.714428186416626, + "num_tokens": 381813464.0, + "step": 15305 + }, + { + "epoch": 1.6808697562047001, + "grad_norm": 2.0899157524108887, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7003285884857178, + "num_tokens": 381843255.0, + "step": 15306 + }, + { + "epoch": 1.680979573907314, + "grad_norm": 1.9780555963516235, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7105602025985718, + "num_tokens": 381875539.0, + "step": 15307 + }, + { + "epoch": 1.6810893916099277, + "grad_norm": 2.4310615062713623, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7276986837387085, + "num_tokens": 381896164.0, + "step": 15308 + }, + { + "epoch": 1.6811992093125412, + "grad_norm": 2.6331515312194824, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7391833066940308, + "num_tokens": 381915803.0, + "step": 15309 + }, + { + "epoch": 1.6813090270151547, + "grad_norm": 2.1879632472991943, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.696442186832428, + "num_tokens": 381946225.0, + "step": 15310 + }, + { + "epoch": 1.6814188447177685, + "grad_norm": 2.1931495666503906, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7013115286827087, + "num_tokens": 381972502.0, + "step": 15311 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 2.1371705532073975, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7104430198669434, + "num_tokens": 382001985.0, + "step": 15312 + }, + { + "epoch": 1.681638480122996, + "grad_norm": 2.547994613647461, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7504812479019165, + "num_tokens": 382023296.0, + "step": 15313 + }, + { + "epoch": 1.6817482978256095, + "grad_norm": 2.810638427734375, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7272369861602783, + "num_tokens": 382040605.0, + "step": 15314 + }, + { + "epoch": 1.681858115528223, + "grad_norm": 2.606576442718506, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7334614396095276, + "num_tokens": 382059778.0, + "step": 15315 + }, + { + "epoch": 1.6819679332308368, + "grad_norm": 2.5531160831451416, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7308995723724365, + "num_tokens": 382082109.0, + "step": 15316 + }, + { + "epoch": 1.6820777509334506, + "grad_norm": 2.423779249191284, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7358893752098083, + "num_tokens": 382104769.0, + "step": 15317 + }, + { + "epoch": 1.6821875686360641, + "grad_norm": 2.0421829223632812, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7127577066421509, + "num_tokens": 382136681.0, + "step": 15318 + }, + { + "epoch": 1.6822973863386776, + "grad_norm": 2.2376344203948975, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6810100078582764, + "num_tokens": 382164840.0, + "step": 15319 + }, + { + "epoch": 1.6824072040412914, + "grad_norm": 2.586158514022827, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7088257074356079, + "num_tokens": 382187825.0, + "step": 15320 + }, + { + "epoch": 1.6825170217439052, + "grad_norm": 2.104924201965332, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.734195351600647, + "num_tokens": 382215299.0, + "step": 15321 + }, + { + "epoch": 1.682626839446519, + "grad_norm": 2.022939443588257, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7222838401794434, + "num_tokens": 382246162.0, + "step": 15322 + }, + { + "epoch": 1.6827366571491325, + "grad_norm": 2.517223358154297, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7360371351242065, + "num_tokens": 382266171.0, + "step": 15323 + }, + { + "epoch": 1.682846474851746, + "grad_norm": 2.156350612640381, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7114112973213196, + "num_tokens": 382292229.0, + "step": 15324 + }, + { + "epoch": 1.6829562925543597, + "grad_norm": 2.6830317974090576, + "learning_rate": 1e-06, + "loss": 0.7372, + "mean_token_accuracy": 0.769322395324707, + "num_tokens": 382311369.0, + "step": 15325 + }, + { + "epoch": 1.6830661102569735, + "grad_norm": 2.4023284912109375, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7315462231636047, + "num_tokens": 382333173.0, + "step": 15326 + }, + { + "epoch": 1.683175927959587, + "grad_norm": 2.330083131790161, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7056607007980347, + "num_tokens": 382361828.0, + "step": 15327 + }, + { + "epoch": 1.6832857456622008, + "grad_norm": 2.662335157394409, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.721347451210022, + "num_tokens": 382381508.0, + "step": 15328 + }, + { + "epoch": 1.6833955633648143, + "grad_norm": 2.5737364292144775, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.727814793586731, + "num_tokens": 382400627.0, + "step": 15329 + }, + { + "epoch": 1.683505381067428, + "grad_norm": 2.4085230827331543, + "learning_rate": 1e-06, + "loss": 0.7797, + "mean_token_accuracy": 0.7463129758834839, + "num_tokens": 382421222.0, + "step": 15330 + }, + { + "epoch": 1.6836151987700418, + "grad_norm": 2.2973811626434326, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7318171858787537, + "num_tokens": 382444029.0, + "step": 15331 + }, + { + "epoch": 1.6837250164726554, + "grad_norm": 2.060645341873169, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7271337509155273, + "num_tokens": 382472177.0, + "step": 15332 + }, + { + "epoch": 1.683834834175269, + "grad_norm": 2.24609112739563, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7089662551879883, + "num_tokens": 382497921.0, + "step": 15333 + }, + { + "epoch": 1.6839446518778827, + "grad_norm": 2.2524149417877197, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7320027351379395, + "num_tokens": 382520792.0, + "step": 15334 + }, + { + "epoch": 1.6840544695804964, + "grad_norm": 1.9081792831420898, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7284932136535645, + "num_tokens": 382554354.0, + "step": 15335 + }, + { + "epoch": 1.6841642872831102, + "grad_norm": 2.505995750427246, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7213919162750244, + "num_tokens": 382576967.0, + "step": 15336 + }, + { + "epoch": 1.6842741049857237, + "grad_norm": 2.0931873321533203, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7248618602752686, + "num_tokens": 382604546.0, + "step": 15337 + }, + { + "epoch": 1.6843839226883373, + "grad_norm": 2.0761520862579346, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7295528650283813, + "num_tokens": 382632546.0, + "step": 15338 + }, + { + "epoch": 1.684493740390951, + "grad_norm": 2.1837408542633057, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7097609043121338, + "num_tokens": 382660483.0, + "step": 15339 + }, + { + "epoch": 1.6846035580935648, + "grad_norm": 2.3512890338897705, + "learning_rate": 1e-06, + "loss": 0.7462, + "mean_token_accuracy": 0.7623181939125061, + "num_tokens": 382683288.0, + "step": 15340 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 2.422088861465454, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7532011270523071, + "num_tokens": 382704151.0, + "step": 15341 + }, + { + "epoch": 1.684823193498792, + "grad_norm": 2.3333842754364014, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7244886159896851, + "num_tokens": 382728928.0, + "step": 15342 + }, + { + "epoch": 1.6849330112014056, + "grad_norm": 2.1926474571228027, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7382879257202148, + "num_tokens": 382754528.0, + "step": 15343 + }, + { + "epoch": 1.6850428289040194, + "grad_norm": 2.6115546226501465, + "learning_rate": 1e-06, + "loss": 0.7485, + "mean_token_accuracy": 0.7517139315605164, + "num_tokens": 382772875.0, + "step": 15344 + }, + { + "epoch": 1.685152646606633, + "grad_norm": 2.354660987854004, + "learning_rate": 1e-06, + "loss": 0.7816, + "mean_token_accuracy": 0.7551808953285217, + "num_tokens": 382793726.0, + "step": 15345 + }, + { + "epoch": 1.6852624643092466, + "grad_norm": 2.1462345123291016, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7107704877853394, + "num_tokens": 382822521.0, + "step": 15346 + }, + { + "epoch": 1.6853722820118602, + "grad_norm": 2.1622655391693115, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7446973323822021, + "num_tokens": 382848317.0, + "step": 15347 + }, + { + "epoch": 1.685482099714474, + "grad_norm": 2.1816821098327637, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7318302989006042, + "num_tokens": 382874582.0, + "step": 15348 + }, + { + "epoch": 1.6855919174170877, + "grad_norm": 2.4587109088897705, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7034788131713867, + "num_tokens": 382897682.0, + "step": 15349 + }, + { + "epoch": 1.6857017351197014, + "grad_norm": 2.2506825923919678, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7408257126808167, + "num_tokens": 382923673.0, + "step": 15350 + }, + { + "epoch": 1.685811552822315, + "grad_norm": 2.121195077896118, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7133313417434692, + "num_tokens": 382951639.0, + "step": 15351 + }, + { + "epoch": 1.6859213705249285, + "grad_norm": 2.2774953842163086, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7165793180465698, + "num_tokens": 382976041.0, + "step": 15352 + }, + { + "epoch": 1.6860311882275423, + "grad_norm": 2.103069305419922, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.707358181476593, + "num_tokens": 383005120.0, + "step": 15353 + }, + { + "epoch": 1.686141005930156, + "grad_norm": 2.030597448348999, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7013546824455261, + "num_tokens": 383035540.0, + "step": 15354 + }, + { + "epoch": 1.6862508236327696, + "grad_norm": 2.212268829345703, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7202132940292358, + "num_tokens": 383063023.0, + "step": 15355 + }, + { + "epoch": 1.686360641335383, + "grad_norm": 2.4372243881225586, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7178797125816345, + "num_tokens": 383084168.0, + "step": 15356 + }, + { + "epoch": 1.6864704590379969, + "grad_norm": 2.8721656799316406, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7412326335906982, + "num_tokens": 383101272.0, + "step": 15357 + }, + { + "epoch": 1.6865802767406106, + "grad_norm": 2.1382038593292236, + "learning_rate": 1e-06, + "loss": 1.0475, + "mean_token_accuracy": 0.6817992329597473, + "num_tokens": 383131893.0, + "step": 15358 + }, + { + "epoch": 1.6866900944432244, + "grad_norm": 2.4063427448272705, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.732895016670227, + "num_tokens": 383153737.0, + "step": 15359 + }, + { + "epoch": 1.686799912145838, + "grad_norm": 2.8349955081939697, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7244794964790344, + "num_tokens": 383171859.0, + "step": 15360 + }, + { + "epoch": 1.6869097298484514, + "grad_norm": 2.3784244060516357, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7274269461631775, + "num_tokens": 383195544.0, + "step": 15361 + }, + { + "epoch": 1.6870195475510652, + "grad_norm": 2.182602882385254, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7136756181716919, + "num_tokens": 383222648.0, + "step": 15362 + }, + { + "epoch": 1.687129365253679, + "grad_norm": 2.3269972801208496, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7115746736526489, + "num_tokens": 383247426.0, + "step": 15363 + }, + { + "epoch": 1.6872391829562927, + "grad_norm": 2.2929623126983643, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7311789989471436, + "num_tokens": 383272061.0, + "step": 15364 + }, + { + "epoch": 1.6873490006589063, + "grad_norm": 2.7616844177246094, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.734163761138916, + "num_tokens": 383296496.0, + "step": 15365 + }, + { + "epoch": 1.6874588183615198, + "grad_norm": 2.0719404220581055, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.7033029794692993, + "num_tokens": 383328375.0, + "step": 15366 + }, + { + "epoch": 1.6875686360641335, + "grad_norm": 2.42228102684021, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6985791921615601, + "num_tokens": 383351852.0, + "step": 15367 + }, + { + "epoch": 1.6876784537667473, + "grad_norm": 1.9985841512680054, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7270982265472412, + "num_tokens": 383380756.0, + "step": 15368 + }, + { + "epoch": 1.6877882714693608, + "grad_norm": 2.3870198726654053, + "learning_rate": 1e-06, + "loss": 0.7289, + "mean_token_accuracy": 0.7607234120368958, + "num_tokens": 383402003.0, + "step": 15369 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 2.457787036895752, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7144137620925903, + "num_tokens": 383424619.0, + "step": 15370 + }, + { + "epoch": 1.6880079068745881, + "grad_norm": 2.3083574771881104, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7038414478302002, + "num_tokens": 383448743.0, + "step": 15371 + }, + { + "epoch": 1.6881177245772019, + "grad_norm": 2.6152162551879883, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7368486523628235, + "num_tokens": 383468449.0, + "step": 15372 + }, + { + "epoch": 1.6882275422798156, + "grad_norm": 1.9974545240402222, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7130563259124756, + "num_tokens": 383499335.0, + "step": 15373 + }, + { + "epoch": 1.6883373599824292, + "grad_norm": 2.0288310050964355, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7202255725860596, + "num_tokens": 383529769.0, + "step": 15374 + }, + { + "epoch": 1.6884471776850427, + "grad_norm": 2.210866928100586, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7042933106422424, + "num_tokens": 383555682.0, + "step": 15375 + }, + { + "epoch": 1.6885569953876565, + "grad_norm": 2.573045253753662, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7238986492156982, + "num_tokens": 383577444.0, + "step": 15376 + }, + { + "epoch": 1.6886668130902702, + "grad_norm": 2.2680442333221436, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.714560866355896, + "num_tokens": 383604035.0, + "step": 15377 + }, + { + "epoch": 1.6887766307928838, + "grad_norm": 2.09713077545166, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7030873894691467, + "num_tokens": 383635040.0, + "step": 15378 + }, + { + "epoch": 1.6888864484954975, + "grad_norm": 2.375230073928833, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7176921367645264, + "num_tokens": 383660280.0, + "step": 15379 + }, + { + "epoch": 1.688996266198111, + "grad_norm": 2.6244022846221924, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7351844310760498, + "num_tokens": 383680130.0, + "step": 15380 + }, + { + "epoch": 1.6891060839007248, + "grad_norm": 2.1526052951812744, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7228788137435913, + "num_tokens": 383708318.0, + "step": 15381 + }, + { + "epoch": 1.6892159016033386, + "grad_norm": 2.3934292793273926, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7109782099723816, + "num_tokens": 383733511.0, + "step": 15382 + }, + { + "epoch": 1.689325719305952, + "grad_norm": 1.8862642049789429, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7071331739425659, + "num_tokens": 383768457.0, + "step": 15383 + }, + { + "epoch": 1.6894355370085656, + "grad_norm": 2.1450719833374023, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7146639227867126, + "num_tokens": 383796051.0, + "step": 15384 + }, + { + "epoch": 1.6895453547111794, + "grad_norm": 2.307774782180786, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7058059573173523, + "num_tokens": 383822464.0, + "step": 15385 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 2.4033124446868896, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7064570784568787, + "num_tokens": 383845657.0, + "step": 15386 + }, + { + "epoch": 1.689764990116407, + "grad_norm": 2.2655067443847656, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7140231728553772, + "num_tokens": 383872480.0, + "step": 15387 + }, + { + "epoch": 1.6898748078190204, + "grad_norm": 2.537961959838867, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7129625082015991, + "num_tokens": 383893550.0, + "step": 15388 + }, + { + "epoch": 1.689984625521634, + "grad_norm": 2.2662193775177, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7349623441696167, + "num_tokens": 383917261.0, + "step": 15389 + }, + { + "epoch": 1.6900944432242477, + "grad_norm": 2.393289804458618, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7081303000450134, + "num_tokens": 383940703.0, + "step": 15390 + }, + { + "epoch": 1.6902042609268615, + "grad_norm": 2.2274811267852783, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7275588512420654, + "num_tokens": 383966808.0, + "step": 15391 + }, + { + "epoch": 1.690314078629475, + "grad_norm": 1.8778488636016846, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7256306409835815, + "num_tokens": 383999052.0, + "step": 15392 + }, + { + "epoch": 1.6904238963320888, + "grad_norm": 2.0836544036865234, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7165135145187378, + "num_tokens": 384029270.0, + "step": 15393 + }, + { + "epoch": 1.6905337140347023, + "grad_norm": 2.403583288192749, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7301591634750366, + "num_tokens": 384052144.0, + "step": 15394 + }, + { + "epoch": 1.690643531737316, + "grad_norm": 2.4784915447235107, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7121078968048096, + "num_tokens": 384074430.0, + "step": 15395 + }, + { + "epoch": 1.6907533494399298, + "grad_norm": 2.337204933166504, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7282936573028564, + "num_tokens": 384098384.0, + "step": 15396 + }, + { + "epoch": 1.6908631671425434, + "grad_norm": 2.199367046356201, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7029645442962646, + "num_tokens": 384123992.0, + "step": 15397 + }, + { + "epoch": 1.690972984845157, + "grad_norm": 2.056535482406616, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6947648525238037, + "num_tokens": 384158335.0, + "step": 15398 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 2.3666868209838867, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.720465898513794, + "num_tokens": 384182758.0, + "step": 15399 + }, + { + "epoch": 1.6911926202503844, + "grad_norm": 2.169429063796997, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.70762038230896, + "num_tokens": 384210023.0, + "step": 15400 + }, + { + "epoch": 1.6913024379529982, + "grad_norm": 2.239372968673706, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7130577564239502, + "num_tokens": 384236279.0, + "step": 15401 + }, + { + "epoch": 1.6914122556556117, + "grad_norm": 2.173009157180786, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7250010967254639, + "num_tokens": 384262488.0, + "step": 15402 + }, + { + "epoch": 1.6915220733582252, + "grad_norm": 2.2431492805480957, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7180123925209045, + "num_tokens": 384288829.0, + "step": 15403 + }, + { + "epoch": 1.691631891060839, + "grad_norm": 1.9386954307556152, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7019935846328735, + "num_tokens": 384323311.0, + "step": 15404 + }, + { + "epoch": 1.6917417087634528, + "grad_norm": 2.1465072631835938, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7193934917449951, + "num_tokens": 384350888.0, + "step": 15405 + }, + { + "epoch": 1.6918515264660663, + "grad_norm": 2.20511794090271, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7178390622138977, + "num_tokens": 384378301.0, + "step": 15406 + }, + { + "epoch": 1.6919613441686798, + "grad_norm": 1.8940403461456299, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.6992012858390808, + "num_tokens": 384412327.0, + "step": 15407 + }, + { + "epoch": 1.6920711618712936, + "grad_norm": 2.4185216426849365, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.727246880531311, + "num_tokens": 384434408.0, + "step": 15408 + }, + { + "epoch": 1.6921809795739073, + "grad_norm": 2.3381388187408447, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7154192924499512, + "num_tokens": 384462540.0, + "step": 15409 + }, + { + "epoch": 1.692290797276521, + "grad_norm": 2.0554802417755127, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7247530221939087, + "num_tokens": 384491316.0, + "step": 15410 + }, + { + "epoch": 1.6924006149791346, + "grad_norm": 1.9437135457992554, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.6995941400527954, + "num_tokens": 384525323.0, + "step": 15411 + }, + { + "epoch": 1.6925104326817482, + "grad_norm": 2.2718582153320312, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7270792126655579, + "num_tokens": 384548381.0, + "step": 15412 + }, + { + "epoch": 1.692620250384362, + "grad_norm": 2.235506534576416, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6995264291763306, + "num_tokens": 384575827.0, + "step": 15413 + }, + { + "epoch": 1.6927300680869757, + "grad_norm": 1.9216490983963013, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7443639636039734, + "num_tokens": 384606327.0, + "step": 15414 + }, + { + "epoch": 1.6928398857895894, + "grad_norm": 2.4181127548217773, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7337589859962463, + "num_tokens": 384628125.0, + "step": 15415 + }, + { + "epoch": 1.692949703492203, + "grad_norm": 2.106482744216919, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7230736613273621, + "num_tokens": 384654120.0, + "step": 15416 + }, + { + "epoch": 1.6930595211948165, + "grad_norm": 2.3767805099487305, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7338206171989441, + "num_tokens": 384675338.0, + "step": 15417 + }, + { + "epoch": 1.6931693388974303, + "grad_norm": 2.2020835876464844, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7281757593154907, + "num_tokens": 384700349.0, + "step": 15418 + }, + { + "epoch": 1.693279156600044, + "grad_norm": 2.21232271194458, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7363381385803223, + "num_tokens": 384725051.0, + "step": 15419 + }, + { + "epoch": 1.6933889743026576, + "grad_norm": 2.4636781215667725, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7352901697158813, + "num_tokens": 384745199.0, + "step": 15420 + }, + { + "epoch": 1.693498792005271, + "grad_norm": 2.888376474380493, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7458065152168274, + "num_tokens": 384762376.0, + "step": 15421 + }, + { + "epoch": 1.6936086097078848, + "grad_norm": 1.9852590560913086, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7098526358604431, + "num_tokens": 384794082.0, + "step": 15422 + }, + { + "epoch": 1.6937184274104986, + "grad_norm": 2.406794786453247, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7129354476928711, + "num_tokens": 384817264.0, + "step": 15423 + }, + { + "epoch": 1.6938282451131124, + "grad_norm": 2.2833914756774902, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6982083320617676, + "num_tokens": 384842063.0, + "step": 15424 + }, + { + "epoch": 1.693938062815726, + "grad_norm": 2.2585577964782715, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7126693725585938, + "num_tokens": 384869458.0, + "step": 15425 + }, + { + "epoch": 1.6940478805183394, + "grad_norm": 2.1922571659088135, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7046219110488892, + "num_tokens": 384895741.0, + "step": 15426 + }, + { + "epoch": 1.6941576982209532, + "grad_norm": 2.4295666217803955, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7428313493728638, + "num_tokens": 384917203.0, + "step": 15427 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 2.400503635406494, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7464247941970825, + "num_tokens": 384940411.0, + "step": 15428 + }, + { + "epoch": 1.6943773336261807, + "grad_norm": 2.4756431579589844, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7281025648117065, + "num_tokens": 384962869.0, + "step": 15429 + }, + { + "epoch": 1.6944871513287942, + "grad_norm": 2.17581844329834, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.717630922794342, + "num_tokens": 384989519.0, + "step": 15430 + }, + { + "epoch": 1.6945969690314078, + "grad_norm": 2.192749500274658, + "learning_rate": 1e-06, + "loss": 0.7683, + "mean_token_accuracy": 0.7517919540405273, + "num_tokens": 385015481.0, + "step": 15431 + }, + { + "epoch": 1.6947067867340215, + "grad_norm": 2.189436435699463, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7260348796844482, + "num_tokens": 385043097.0, + "step": 15432 + }, + { + "epoch": 1.6948166044366353, + "grad_norm": 2.1128413677215576, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7215538024902344, + "num_tokens": 385069563.0, + "step": 15433 + }, + { + "epoch": 1.6949264221392488, + "grad_norm": 2.2853896617889404, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7229738235473633, + "num_tokens": 385093357.0, + "step": 15434 + }, + { + "epoch": 1.6950362398418624, + "grad_norm": 2.0694408416748047, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6984421014785767, + "num_tokens": 385123312.0, + "step": 15435 + }, + { + "epoch": 1.6951460575444761, + "grad_norm": 2.437897205352783, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7187114953994751, + "num_tokens": 385145274.0, + "step": 15436 + }, + { + "epoch": 1.6952558752470899, + "grad_norm": 2.9017202854156494, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.740898609161377, + "num_tokens": 385162263.0, + "step": 15437 + }, + { + "epoch": 1.6953656929497036, + "grad_norm": 2.123631715774536, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7078076601028442, + "num_tokens": 385187149.0, + "step": 15438 + }, + { + "epoch": 1.6954755106523172, + "grad_norm": 2.0598373413085938, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7058796882629395, + "num_tokens": 385218414.0, + "step": 15439 + }, + { + "epoch": 1.6955853283549307, + "grad_norm": 2.1427814960479736, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7182464599609375, + "num_tokens": 385243982.0, + "step": 15440 + }, + { + "epoch": 1.6956951460575445, + "grad_norm": 2.2529914379119873, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7151740193367004, + "num_tokens": 385269290.0, + "step": 15441 + }, + { + "epoch": 1.6958049637601582, + "grad_norm": 2.6372690200805664, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7406965494155884, + "num_tokens": 385290108.0, + "step": 15442 + }, + { + "epoch": 1.6959147814627717, + "grad_norm": 2.2340586185455322, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7280245423316956, + "num_tokens": 385314781.0, + "step": 15443 + }, + { + "epoch": 1.6960245991653855, + "grad_norm": 2.402632474899292, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7218866348266602, + "num_tokens": 385337558.0, + "step": 15444 + }, + { + "epoch": 1.696134416867999, + "grad_norm": 2.556854009628296, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7218689322471619, + "num_tokens": 385358519.0, + "step": 15445 + }, + { + "epoch": 1.6962442345706128, + "grad_norm": 2.2610294818878174, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7586482763290405, + "num_tokens": 385382373.0, + "step": 15446 + }, + { + "epoch": 1.6963540522732266, + "grad_norm": 2.1141629219055176, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7207406163215637, + "num_tokens": 385407925.0, + "step": 15447 + }, + { + "epoch": 1.69646386997584, + "grad_norm": 2.2593564987182617, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.7384878396987915, + "num_tokens": 385431308.0, + "step": 15448 + }, + { + "epoch": 1.6965736876784536, + "grad_norm": 2.4437575340270996, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7208317518234253, + "num_tokens": 385452847.0, + "step": 15449 + }, + { + "epoch": 1.6966835053810674, + "grad_norm": 2.0504672527313232, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7362592816352844, + "num_tokens": 385482607.0, + "step": 15450 + }, + { + "epoch": 1.6967933230836811, + "grad_norm": 2.137585163116455, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7286773920059204, + "num_tokens": 385508290.0, + "step": 15451 + }, + { + "epoch": 1.696903140786295, + "grad_norm": 2.473485231399536, + "learning_rate": 1e-06, + "loss": 0.7696, + "mean_token_accuracy": 0.7564825415611267, + "num_tokens": 385528467.0, + "step": 15452 + }, + { + "epoch": 1.6970129584889084, + "grad_norm": 1.9833558797836304, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7040321826934814, + "num_tokens": 385559361.0, + "step": 15453 + }, + { + "epoch": 1.697122776191522, + "grad_norm": 2.3335418701171875, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7277898788452148, + "num_tokens": 385584059.0, + "step": 15454 + }, + { + "epoch": 1.6972325938941357, + "grad_norm": 2.2750940322875977, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.725985050201416, + "num_tokens": 385610540.0, + "step": 15455 + }, + { + "epoch": 1.6973424115967495, + "grad_norm": 2.4519495964050293, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7206777334213257, + "num_tokens": 385632957.0, + "step": 15456 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 2.620712995529175, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7387892603874207, + "num_tokens": 385652831.0, + "step": 15457 + }, + { + "epoch": 1.6975620470019768, + "grad_norm": 2.0814430713653564, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7090347409248352, + "num_tokens": 385681549.0, + "step": 15458 + }, + { + "epoch": 1.6976718647045903, + "grad_norm": 2.421736717224121, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7038517594337463, + "num_tokens": 385704171.0, + "step": 15459 + }, + { + "epoch": 1.697781682407204, + "grad_norm": 2.4245800971984863, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7383615374565125, + "num_tokens": 385726171.0, + "step": 15460 + }, + { + "epoch": 1.6978915001098178, + "grad_norm": 2.507506847381592, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7317973375320435, + "num_tokens": 385747663.0, + "step": 15461 + }, + { + "epoch": 1.6980013178124314, + "grad_norm": 2.3114206790924072, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7147501707077026, + "num_tokens": 385771778.0, + "step": 15462 + }, + { + "epoch": 1.698111135515045, + "grad_norm": 2.5699946880340576, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7237814664840698, + "num_tokens": 385794678.0, + "step": 15463 + }, + { + "epoch": 1.6982209532176586, + "grad_norm": 1.9957958459854126, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7298489212989807, + "num_tokens": 385823104.0, + "step": 15464 + }, + { + "epoch": 1.6983307709202724, + "grad_norm": 2.3190317153930664, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7223268747329712, + "num_tokens": 385845649.0, + "step": 15465 + }, + { + "epoch": 1.6984405886228862, + "grad_norm": 2.258431911468506, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7140469551086426, + "num_tokens": 385870668.0, + "step": 15466 + }, + { + "epoch": 1.6985504063254997, + "grad_norm": 2.2995524406433105, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7090986371040344, + "num_tokens": 385895666.0, + "step": 15467 + }, + { + "epoch": 1.6986602240281132, + "grad_norm": 2.275327444076538, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7261170148849487, + "num_tokens": 385922432.0, + "step": 15468 + }, + { + "epoch": 1.698770041730727, + "grad_norm": 1.9375418424606323, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7194833755493164, + "num_tokens": 385954921.0, + "step": 15469 + }, + { + "epoch": 1.6988798594333407, + "grad_norm": 1.9475125074386597, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7082807421684265, + "num_tokens": 385985190.0, + "step": 15470 + }, + { + "epoch": 1.6989896771359543, + "grad_norm": 2.41275691986084, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7112565040588379, + "num_tokens": 386008262.0, + "step": 15471 + }, + { + "epoch": 1.6990994948385678, + "grad_norm": 2.1026673316955566, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7049896717071533, + "num_tokens": 386035462.0, + "step": 15472 + }, + { + "epoch": 1.6992093125411816, + "grad_norm": 2.108389139175415, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7079156041145325, + "num_tokens": 386062651.0, + "step": 15473 + }, + { + "epoch": 1.6993191302437953, + "grad_norm": 2.8032119274139404, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.716404378414154, + "num_tokens": 386079938.0, + "step": 15474 + }, + { + "epoch": 1.699428947946409, + "grad_norm": 2.4115662574768066, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7457822561264038, + "num_tokens": 386100898.0, + "step": 15475 + }, + { + "epoch": 1.6995387656490226, + "grad_norm": 2.2005114555358887, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7369258403778076, + "num_tokens": 386124552.0, + "step": 15476 + }, + { + "epoch": 1.6996485833516362, + "grad_norm": 2.3704113960266113, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7054063677787781, + "num_tokens": 386149144.0, + "step": 15477 + }, + { + "epoch": 1.69975840105425, + "grad_norm": 2.3763842582702637, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7216348648071289, + "num_tokens": 386170537.0, + "step": 15478 + }, + { + "epoch": 1.6998682187568637, + "grad_norm": 2.5128724575042725, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7392433285713196, + "num_tokens": 386190772.0, + "step": 15479 + }, + { + "epoch": 1.6999780364594774, + "grad_norm": 2.469639539718628, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7164608240127563, + "num_tokens": 386213101.0, + "step": 15480 + }, + { + "epoch": 1.700087854162091, + "grad_norm": 2.7379496097564697, + "learning_rate": 1e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7487000226974487, + "num_tokens": 386230616.0, + "step": 15481 + }, + { + "epoch": 1.7001976718647045, + "grad_norm": 2.1932010650634766, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7157981395721436, + "num_tokens": 386257334.0, + "step": 15482 + }, + { + "epoch": 1.7003074895673183, + "grad_norm": 2.4353530406951904, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7414379715919495, + "num_tokens": 386278225.0, + "step": 15483 + }, + { + "epoch": 1.700417307269932, + "grad_norm": 2.203118085861206, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.713096022605896, + "num_tokens": 386304428.0, + "step": 15484 + }, + { + "epoch": 1.7005271249725455, + "grad_norm": 1.9599779844284058, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7194170951843262, + "num_tokens": 386334900.0, + "step": 15485 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 1.9261512756347656, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.704227089881897, + "num_tokens": 386368418.0, + "step": 15486 + }, + { + "epoch": 1.7007467603777728, + "grad_norm": 2.2579569816589355, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7238372564315796, + "num_tokens": 386393639.0, + "step": 15487 + }, + { + "epoch": 1.7008565780803866, + "grad_norm": 2.424835443496704, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7258837223052979, + "num_tokens": 386416373.0, + "step": 15488 + }, + { + "epoch": 1.7009663957830004, + "grad_norm": 2.3811933994293213, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7291114926338196, + "num_tokens": 386439317.0, + "step": 15489 + }, + { + "epoch": 1.7010762134856139, + "grad_norm": 2.4310386180877686, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7312290668487549, + "num_tokens": 386462438.0, + "step": 15490 + }, + { + "epoch": 1.7011860311882274, + "grad_norm": 2.20169997215271, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6967433094978333, + "num_tokens": 386490456.0, + "step": 15491 + }, + { + "epoch": 1.7012958488908412, + "grad_norm": 2.1781795024871826, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7256793975830078, + "num_tokens": 386514730.0, + "step": 15492 + }, + { + "epoch": 1.701405666593455, + "grad_norm": 2.17250657081604, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.736668586730957, + "num_tokens": 386539249.0, + "step": 15493 + }, + { + "epoch": 1.7015154842960687, + "grad_norm": 2.297435760498047, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7286486625671387, + "num_tokens": 386565456.0, + "step": 15494 + }, + { + "epoch": 1.7016253019986822, + "grad_norm": 2.1487057209014893, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7105457782745361, + "num_tokens": 386592970.0, + "step": 15495 + }, + { + "epoch": 1.7017351197012958, + "grad_norm": 2.311788320541382, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7124700546264648, + "num_tokens": 386617701.0, + "step": 15496 + }, + { + "epoch": 1.7018449374039095, + "grad_norm": 2.373701572418213, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7403740882873535, + "num_tokens": 386638948.0, + "step": 15497 + }, + { + "epoch": 1.7019547551065233, + "grad_norm": 2.161332368850708, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7185437679290771, + "num_tokens": 386664772.0, + "step": 15498 + }, + { + "epoch": 1.7020645728091368, + "grad_norm": 2.288205146789551, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.749549150466919, + "num_tokens": 386688443.0, + "step": 15499 + }, + { + "epoch": 1.7021743905117503, + "grad_norm": 2.0761795043945312, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.698050320148468, + "num_tokens": 386715777.0, + "step": 15500 + }, + { + "epoch": 1.702284208214364, + "grad_norm": 2.2457356452941895, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7167304158210754, + "num_tokens": 386740958.0, + "step": 15501 + }, + { + "epoch": 1.7023940259169779, + "grad_norm": 2.5225768089294434, + "learning_rate": 1e-06, + "loss": 0.8002, + "mean_token_accuracy": 0.7446123361587524, + "num_tokens": 386760579.0, + "step": 15502 + }, + { + "epoch": 1.7025038436195916, + "grad_norm": 2.0074687004089355, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6886147260665894, + "num_tokens": 386791953.0, + "step": 15503 + }, + { + "epoch": 1.7026136613222052, + "grad_norm": 2.362332820892334, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7136058211326599, + "num_tokens": 386817264.0, + "step": 15504 + }, + { + "epoch": 1.7027234790248187, + "grad_norm": 2.7148756980895996, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7285020351409912, + "num_tokens": 386836664.0, + "step": 15505 + }, + { + "epoch": 1.7028332967274324, + "grad_norm": 2.6995394229888916, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7264500856399536, + "num_tokens": 386857105.0, + "step": 15506 + }, + { + "epoch": 1.7029431144300462, + "grad_norm": 2.0894012451171875, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7126349806785583, + "num_tokens": 386886162.0, + "step": 15507 + }, + { + "epoch": 1.7030529321326597, + "grad_norm": 2.288837194442749, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7152206897735596, + "num_tokens": 386912736.0, + "step": 15508 + }, + { + "epoch": 1.7031627498352735, + "grad_norm": 2.4714746475219727, + "learning_rate": 1e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7461543083190918, + "num_tokens": 386934973.0, + "step": 15509 + }, + { + "epoch": 1.703272567537887, + "grad_norm": 2.294142484664917, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7413268685340881, + "num_tokens": 386958320.0, + "step": 15510 + }, + { + "epoch": 1.7033823852405008, + "grad_norm": 1.8898621797561646, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7162908911705017, + "num_tokens": 386992812.0, + "step": 15511 + }, + { + "epoch": 1.7034922029431145, + "grad_norm": 2.5253093242645264, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7362799644470215, + "num_tokens": 387015155.0, + "step": 15512 + }, + { + "epoch": 1.703602020645728, + "grad_norm": 2.7159464359283447, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7437020540237427, + "num_tokens": 387032401.0, + "step": 15513 + }, + { + "epoch": 1.7037118383483416, + "grad_norm": 2.3669791221618652, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.723953902721405, + "num_tokens": 387057613.0, + "step": 15514 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 2.2581000328063965, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7335023880004883, + "num_tokens": 387081316.0, + "step": 15515 + }, + { + "epoch": 1.7039314737535691, + "grad_norm": 2.2119102478027344, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7058535218238831, + "num_tokens": 387106816.0, + "step": 15516 + }, + { + "epoch": 1.7040412914561829, + "grad_norm": 2.391019105911255, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.714988648891449, + "num_tokens": 387127859.0, + "step": 15517 + }, + { + "epoch": 1.7041511091587964, + "grad_norm": 2.3022971153259277, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7149076461791992, + "num_tokens": 387152651.0, + "step": 15518 + }, + { + "epoch": 1.70426092686141, + "grad_norm": 2.7250185012817383, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7383396625518799, + "num_tokens": 387170766.0, + "step": 15519 + }, + { + "epoch": 1.7043707445640237, + "grad_norm": 2.2515034675598145, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7127374410629272, + "num_tokens": 387197014.0, + "step": 15520 + }, + { + "epoch": 1.7044805622666375, + "grad_norm": 2.2959389686584473, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.718157172203064, + "num_tokens": 387220803.0, + "step": 15521 + }, + { + "epoch": 1.704590379969251, + "grad_norm": 2.173463821411133, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7247207164764404, + "num_tokens": 387245832.0, + "step": 15522 + }, + { + "epoch": 1.7047001976718648, + "grad_norm": 2.13955020904541, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7107741832733154, + "num_tokens": 387275182.0, + "step": 15523 + }, + { + "epoch": 1.7048100153744783, + "grad_norm": 2.4067277908325195, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7113855481147766, + "num_tokens": 387299852.0, + "step": 15524 + }, + { + "epoch": 1.704919833077092, + "grad_norm": 2.1447770595550537, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7306793332099915, + "num_tokens": 387329019.0, + "step": 15525 + }, + { + "epoch": 1.7050296507797058, + "grad_norm": 2.5069363117218018, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7241566181182861, + "num_tokens": 387349766.0, + "step": 15526 + }, + { + "epoch": 1.7051394684823193, + "grad_norm": 2.3432159423828125, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.718487560749054, + "num_tokens": 387374169.0, + "step": 15527 + }, + { + "epoch": 1.7052492861849329, + "grad_norm": 2.3129634857177734, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7307885885238647, + "num_tokens": 387396648.0, + "step": 15528 + }, + { + "epoch": 1.7053591038875466, + "grad_norm": 2.4206197261810303, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7131751179695129, + "num_tokens": 387418036.0, + "step": 15529 + }, + { + "epoch": 1.7054689215901604, + "grad_norm": 2.218419313430786, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7174516320228577, + "num_tokens": 387444427.0, + "step": 15530 + }, + { + "epoch": 1.7055787392927741, + "grad_norm": 2.4504523277282715, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7290956377983093, + "num_tokens": 387466769.0, + "step": 15531 + }, + { + "epoch": 1.7056885569953877, + "grad_norm": 1.9652314186096191, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7102840542793274, + "num_tokens": 387500573.0, + "step": 15532 + }, + { + "epoch": 1.7057983746980012, + "grad_norm": 2.3088502883911133, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7180009484291077, + "num_tokens": 387525234.0, + "step": 15533 + }, + { + "epoch": 1.705908192400615, + "grad_norm": 2.2158725261688232, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7140181064605713, + "num_tokens": 387551745.0, + "step": 15534 + }, + { + "epoch": 1.7060180101032287, + "grad_norm": 2.3753697872161865, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7320016622543335, + "num_tokens": 387576378.0, + "step": 15535 + }, + { + "epoch": 1.7061278278058423, + "grad_norm": 2.0849945545196533, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6918273568153381, + "num_tokens": 387608350.0, + "step": 15536 + }, + { + "epoch": 1.7062376455084558, + "grad_norm": 2.609052896499634, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.733864426612854, + "num_tokens": 387627095.0, + "step": 15537 + }, + { + "epoch": 1.7063474632110696, + "grad_norm": 2.3795018196105957, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7222909331321716, + "num_tokens": 387650643.0, + "step": 15538 + }, + { + "epoch": 1.7064572809136833, + "grad_norm": 2.3671019077301025, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7220433354377747, + "num_tokens": 387673175.0, + "step": 15539 + }, + { + "epoch": 1.706567098616297, + "grad_norm": 2.585803508758545, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7186689376831055, + "num_tokens": 387693301.0, + "step": 15540 + }, + { + "epoch": 1.7066769163189106, + "grad_norm": 2.0468974113464355, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7345575094223022, + "num_tokens": 387724912.0, + "step": 15541 + }, + { + "epoch": 1.7067867340215241, + "grad_norm": 2.1093032360076904, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7220733761787415, + "num_tokens": 387754844.0, + "step": 15542 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 2.4618237018585205, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7329812049865723, + "num_tokens": 387776157.0, + "step": 15543 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 2.3075485229492188, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7088212370872498, + "num_tokens": 387800725.0, + "step": 15544 + }, + { + "epoch": 1.7071161871293654, + "grad_norm": 2.5702321529388428, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.717461109161377, + "num_tokens": 387821913.0, + "step": 15545 + }, + { + "epoch": 1.707226004831979, + "grad_norm": 2.4307305812835693, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7273958325386047, + "num_tokens": 387843804.0, + "step": 15546 + }, + { + "epoch": 1.7073358225345925, + "grad_norm": 2.029829740524292, + "learning_rate": 1e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7424944639205933, + "num_tokens": 387869656.0, + "step": 15547 + }, + { + "epoch": 1.7074456402372062, + "grad_norm": 2.3851354122161865, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7291954755783081, + "num_tokens": 387893241.0, + "step": 15548 + }, + { + "epoch": 1.70755545793982, + "grad_norm": 2.5411689281463623, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7019242644309998, + "num_tokens": 387915429.0, + "step": 15549 + }, + { + "epoch": 1.7076652756424335, + "grad_norm": 2.4209799766540527, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7363574504852295, + "num_tokens": 387934775.0, + "step": 15550 + }, + { + "epoch": 1.707775093345047, + "grad_norm": 2.691553831100464, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.732680082321167, + "num_tokens": 387953814.0, + "step": 15551 + }, + { + "epoch": 1.7078849110476608, + "grad_norm": 2.347304582595825, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7194316983222961, + "num_tokens": 387976270.0, + "step": 15552 + }, + { + "epoch": 1.7079947287502746, + "grad_norm": 2.3769867420196533, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7217271327972412, + "num_tokens": 387999328.0, + "step": 15553 + }, + { + "epoch": 1.7081045464528883, + "grad_norm": 2.47045636177063, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7353121042251587, + "num_tokens": 388021435.0, + "step": 15554 + }, + { + "epoch": 1.7082143641555019, + "grad_norm": 2.3764586448669434, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.7404762506484985, + "num_tokens": 388046536.0, + "step": 15555 + }, + { + "epoch": 1.7083241818581154, + "grad_norm": 2.4253480434417725, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7269834876060486, + "num_tokens": 388067966.0, + "step": 15556 + }, + { + "epoch": 1.7084339995607292, + "grad_norm": 2.5680480003356934, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7476036548614502, + "num_tokens": 388086205.0, + "step": 15557 + }, + { + "epoch": 1.708543817263343, + "grad_norm": 2.248013496398926, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7029999494552612, + "num_tokens": 388111129.0, + "step": 15558 + }, + { + "epoch": 1.7086536349659565, + "grad_norm": 2.286067485809326, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7182546257972717, + "num_tokens": 388137688.0, + "step": 15559 + }, + { + "epoch": 1.7087634526685702, + "grad_norm": 2.3779022693634033, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7349892854690552, + "num_tokens": 388161044.0, + "step": 15560 + }, + { + "epoch": 1.7088732703711838, + "grad_norm": 2.3593173027038574, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7253361940383911, + "num_tokens": 388184294.0, + "step": 15561 + }, + { + "epoch": 1.7089830880737975, + "grad_norm": 2.4675753116607666, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7184192538261414, + "num_tokens": 388206365.0, + "step": 15562 + }, + { + "epoch": 1.7090929057764113, + "grad_norm": 2.149197578430176, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7149887084960938, + "num_tokens": 388231754.0, + "step": 15563 + }, + { + "epoch": 1.7092027234790248, + "grad_norm": 2.0337960720062256, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7299998998641968, + "num_tokens": 388259551.0, + "step": 15564 + }, + { + "epoch": 1.7093125411816383, + "grad_norm": 2.177154064178467, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7414209842681885, + "num_tokens": 388284627.0, + "step": 15565 + }, + { + "epoch": 1.709422358884252, + "grad_norm": 2.3566200733184814, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7297708988189697, + "num_tokens": 388308864.0, + "step": 15566 + }, + { + "epoch": 1.7095321765868658, + "grad_norm": 1.984897494316101, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7141246199607849, + "num_tokens": 388340420.0, + "step": 15567 + }, + { + "epoch": 1.7096419942894796, + "grad_norm": 2.3187692165374756, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6990740299224854, + "num_tokens": 388365942.0, + "step": 15568 + }, + { + "epoch": 1.7097518119920931, + "grad_norm": 2.195777654647827, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6875455379486084, + "num_tokens": 388392372.0, + "step": 15569 + }, + { + "epoch": 1.7098616296947067, + "grad_norm": 2.4780197143554688, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7333694100379944, + "num_tokens": 388415015.0, + "step": 15570 + }, + { + "epoch": 1.7099714473973204, + "grad_norm": 2.7996914386749268, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7370733022689819, + "num_tokens": 388432258.0, + "step": 15571 + }, + { + "epoch": 1.7100812650999342, + "grad_norm": 2.1281728744506836, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7041076421737671, + "num_tokens": 388460468.0, + "step": 15572 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 2.5290467739105225, + "learning_rate": 1e-06, + "loss": 0.8081, + "mean_token_accuracy": 0.7458320260047913, + "num_tokens": 388479667.0, + "step": 15573 + }, + { + "epoch": 1.7103009005051615, + "grad_norm": 2.3180735111236572, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7180004119873047, + "num_tokens": 388504781.0, + "step": 15574 + }, + { + "epoch": 1.710410718207775, + "grad_norm": 2.8482675552368164, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7245347499847412, + "num_tokens": 388522444.0, + "step": 15575 + }, + { + "epoch": 1.7105205359103888, + "grad_norm": 2.494414806365967, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7461144924163818, + "num_tokens": 388544649.0, + "step": 15576 + }, + { + "epoch": 1.7106303536130025, + "grad_norm": 2.1726458072662354, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7261680960655212, + "num_tokens": 388572005.0, + "step": 15577 + }, + { + "epoch": 1.710740171315616, + "grad_norm": 2.0816168785095215, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7240182161331177, + "num_tokens": 388600732.0, + "step": 15578 + }, + { + "epoch": 1.7108499890182296, + "grad_norm": 2.1098010540008545, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7361562252044678, + "num_tokens": 388627454.0, + "step": 15579 + }, + { + "epoch": 1.7109598067208434, + "grad_norm": 2.2822163105010986, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.725310206413269, + "num_tokens": 388653384.0, + "step": 15580 + }, + { + "epoch": 1.7110696244234571, + "grad_norm": 2.55009388923645, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7291405200958252, + "num_tokens": 388675743.0, + "step": 15581 + }, + { + "epoch": 1.7111794421260709, + "grad_norm": 2.474327802658081, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7318816781044006, + "num_tokens": 388697699.0, + "step": 15582 + }, + { + "epoch": 1.7112892598286844, + "grad_norm": 2.2000627517700195, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7291719317436218, + "num_tokens": 388723183.0, + "step": 15583 + }, + { + "epoch": 1.711399077531298, + "grad_norm": 2.230105400085449, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.723179817199707, + "num_tokens": 388749102.0, + "step": 15584 + }, + { + "epoch": 1.7115088952339117, + "grad_norm": 2.3518974781036377, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7112014889717102, + "num_tokens": 388775872.0, + "step": 15585 + }, + { + "epoch": 1.7116187129365255, + "grad_norm": 2.0094985961914062, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7167787551879883, + "num_tokens": 388805443.0, + "step": 15586 + }, + { + "epoch": 1.711728530639139, + "grad_norm": 2.2139229774475098, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7376294136047363, + "num_tokens": 388830670.0, + "step": 15587 + }, + { + "epoch": 1.7118383483417525, + "grad_norm": 2.091883420944214, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7550363540649414, + "num_tokens": 388856190.0, + "step": 15588 + }, + { + "epoch": 1.7119481660443663, + "grad_norm": 2.380890130996704, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6873095035552979, + "num_tokens": 388882127.0, + "step": 15589 + }, + { + "epoch": 1.71205798374698, + "grad_norm": 2.104728937149048, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7227210998535156, + "num_tokens": 388912939.0, + "step": 15590 + }, + { + "epoch": 1.7121678014495938, + "grad_norm": 2.2790629863739014, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7495549917221069, + "num_tokens": 388937066.0, + "step": 15591 + }, + { + "epoch": 1.7122776191522073, + "grad_norm": 2.1065611839294434, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7169978618621826, + "num_tokens": 388963731.0, + "step": 15592 + }, + { + "epoch": 1.7123874368548209, + "grad_norm": 2.5820069313049316, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7185264825820923, + "num_tokens": 388984861.0, + "step": 15593 + }, + { + "epoch": 1.7124972545574346, + "grad_norm": 2.2661283016204834, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.735812783241272, + "num_tokens": 389008124.0, + "step": 15594 + }, + { + "epoch": 1.7126070722600484, + "grad_norm": 2.362999439239502, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7354731559753418, + "num_tokens": 389029082.0, + "step": 15595 + }, + { + "epoch": 1.7127168899626621, + "grad_norm": 2.2079780101776123, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7197977304458618, + "num_tokens": 389054249.0, + "step": 15596 + }, + { + "epoch": 1.7128267076652757, + "grad_norm": 2.1176693439483643, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7274779081344604, + "num_tokens": 389082017.0, + "step": 15597 + }, + { + "epoch": 1.7129365253678892, + "grad_norm": 2.3199665546417236, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7054488062858582, + "num_tokens": 389110850.0, + "step": 15598 + }, + { + "epoch": 1.713046343070503, + "grad_norm": 2.3764891624450684, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7010905742645264, + "num_tokens": 389134674.0, + "step": 15599 + }, + { + "epoch": 1.7131561607731167, + "grad_norm": 2.1627588272094727, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7372767925262451, + "num_tokens": 389162659.0, + "step": 15600 + }, + { + "epoch": 1.7132659784757303, + "grad_norm": 2.481982946395874, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7052617073059082, + "num_tokens": 389186094.0, + "step": 15601 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 2.260806083679199, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7194586992263794, + "num_tokens": 389212338.0, + "step": 15602 + }, + { + "epoch": 1.7134856138809575, + "grad_norm": 2.2151565551757812, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7155454158782959, + "num_tokens": 389238467.0, + "step": 15603 + }, + { + "epoch": 1.7135954315835713, + "grad_norm": 2.1166601181030273, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7258289456367493, + "num_tokens": 389269966.0, + "step": 15604 + }, + { + "epoch": 1.713705249286185, + "grad_norm": 1.8037515878677368, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7004106044769287, + "num_tokens": 389309040.0, + "step": 15605 + }, + { + "epoch": 1.7138150669887986, + "grad_norm": 2.2929303646087646, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7146930694580078, + "num_tokens": 389334709.0, + "step": 15606 + }, + { + "epoch": 1.7139248846914121, + "grad_norm": 2.1434805393218994, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7264835834503174, + "num_tokens": 389361301.0, + "step": 15607 + }, + { + "epoch": 1.7140347023940259, + "grad_norm": 2.0943686962127686, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7293937802314758, + "num_tokens": 389387188.0, + "step": 15608 + }, + { + "epoch": 1.7141445200966396, + "grad_norm": 2.26186466217041, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6929681301116943, + "num_tokens": 389413316.0, + "step": 15609 + }, + { + "epoch": 1.7142543377992534, + "grad_norm": 2.380699396133423, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.710268497467041, + "num_tokens": 389437999.0, + "step": 15610 + }, + { + "epoch": 1.714364155501867, + "grad_norm": 2.374354839324951, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7229858636856079, + "num_tokens": 389460729.0, + "step": 15611 + }, + { + "epoch": 1.7144739732044805, + "grad_norm": 2.4961652755737305, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6963925361633301, + "num_tokens": 389482628.0, + "step": 15612 + }, + { + "epoch": 1.7145837909070942, + "grad_norm": 2.2263619899749756, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7119477391242981, + "num_tokens": 389509349.0, + "step": 15613 + }, + { + "epoch": 1.714693608609708, + "grad_norm": 2.3371646404266357, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7125381231307983, + "num_tokens": 389535693.0, + "step": 15614 + }, + { + "epoch": 1.7148034263123215, + "grad_norm": 2.267918825149536, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7157485485076904, + "num_tokens": 389563398.0, + "step": 15615 + }, + { + "epoch": 1.714913244014935, + "grad_norm": 2.3763318061828613, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7423597574234009, + "num_tokens": 389584856.0, + "step": 15616 + }, + { + "epoch": 1.7150230617175488, + "grad_norm": 2.1162760257720947, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7168121933937073, + "num_tokens": 389614424.0, + "step": 15617 + }, + { + "epoch": 1.7151328794201626, + "grad_norm": 2.070023536682129, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7011944055557251, + "num_tokens": 389643779.0, + "step": 15618 + }, + { + "epoch": 1.7152426971227763, + "grad_norm": 2.375575304031372, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7357578277587891, + "num_tokens": 389666685.0, + "step": 15619 + }, + { + "epoch": 1.7153525148253899, + "grad_norm": 2.253120183944702, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7371355295181274, + "num_tokens": 389691126.0, + "step": 15620 + }, + { + "epoch": 1.7154623325280034, + "grad_norm": 2.491811513900757, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7236816883087158, + "num_tokens": 389712135.0, + "step": 15621 + }, + { + "epoch": 1.7155721502306172, + "grad_norm": 1.994096040725708, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7052319049835205, + "num_tokens": 389745125.0, + "step": 15622 + }, + { + "epoch": 1.715681967933231, + "grad_norm": 2.152184009552002, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.729064404964447, + "num_tokens": 389771563.0, + "step": 15623 + }, + { + "epoch": 1.7157917856358444, + "grad_norm": 2.1419835090637207, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7246408462524414, + "num_tokens": 389798846.0, + "step": 15624 + }, + { + "epoch": 1.7159016033384582, + "grad_norm": 2.0050113201141357, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7335842251777649, + "num_tokens": 389826708.0, + "step": 15625 + }, + { + "epoch": 1.7160114210410717, + "grad_norm": 2.272130250930786, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7118615508079529, + "num_tokens": 389851205.0, + "step": 15626 + }, + { + "epoch": 1.7161212387436855, + "grad_norm": 2.702380418777466, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7260396480560303, + "num_tokens": 389871246.0, + "step": 15627 + }, + { + "epoch": 1.7162310564462993, + "grad_norm": 2.5944042205810547, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7184882164001465, + "num_tokens": 389893357.0, + "step": 15628 + }, + { + "epoch": 1.7163408741489128, + "grad_norm": 2.5120859146118164, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7309027910232544, + "num_tokens": 389915753.0, + "step": 15629 + }, + { + "epoch": 1.7164506918515263, + "grad_norm": 2.262754201889038, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7139856815338135, + "num_tokens": 389942687.0, + "step": 15630 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 2.612297773361206, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7326550483703613, + "num_tokens": 389963237.0, + "step": 15631 + }, + { + "epoch": 1.7166703272567538, + "grad_norm": 2.109130620956421, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7080460786819458, + "num_tokens": 389989158.0, + "step": 15632 + }, + { + "epoch": 1.7167801449593676, + "grad_norm": 2.1918253898620605, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7037670612335205, + "num_tokens": 390016948.0, + "step": 15633 + }, + { + "epoch": 1.7168899626619811, + "grad_norm": 2.3503503799438477, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7313768863677979, + "num_tokens": 390039430.0, + "step": 15634 + }, + { + "epoch": 1.7169997803645947, + "grad_norm": 2.363987684249878, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7160533666610718, + "num_tokens": 390062554.0, + "step": 15635 + }, + { + "epoch": 1.7171095980672084, + "grad_norm": 2.104034423828125, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.700094997882843, + "num_tokens": 390090550.0, + "step": 15636 + }, + { + "epoch": 1.7172194157698222, + "grad_norm": 2.161190986633301, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7217360734939575, + "num_tokens": 390117651.0, + "step": 15637 + }, + { + "epoch": 1.7173292334724357, + "grad_norm": 2.7350716590881348, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7407086491584778, + "num_tokens": 390136190.0, + "step": 15638 + }, + { + "epoch": 1.7174390511750495, + "grad_norm": 2.194917917251587, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7001059055328369, + "num_tokens": 390162501.0, + "step": 15639 + }, + { + "epoch": 1.717548868877663, + "grad_norm": 2.015671730041504, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6893489360809326, + "num_tokens": 390192391.0, + "step": 15640 + }, + { + "epoch": 1.7176586865802768, + "grad_norm": 2.363495111465454, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.730130672454834, + "num_tokens": 390216115.0, + "step": 15641 + }, + { + "epoch": 1.7177685042828905, + "grad_norm": 2.1350257396698, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7236512899398804, + "num_tokens": 390244671.0, + "step": 15642 + }, + { + "epoch": 1.717878321985504, + "grad_norm": 2.2263643741607666, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7278554439544678, + "num_tokens": 390271259.0, + "step": 15643 + }, + { + "epoch": 1.7179881396881176, + "grad_norm": 2.5917835235595703, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7205884456634521, + "num_tokens": 390291729.0, + "step": 15644 + }, + { + "epoch": 1.7180979573907313, + "grad_norm": 2.013462781906128, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.6993733644485474, + "num_tokens": 390322419.0, + "step": 15645 + }, + { + "epoch": 1.718207775093345, + "grad_norm": 1.9065401554107666, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7176032662391663, + "num_tokens": 390356031.0, + "step": 15646 + }, + { + "epoch": 1.7183175927959589, + "grad_norm": 2.646866798400879, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7369346618652344, + "num_tokens": 390375847.0, + "step": 15647 + }, + { + "epoch": 1.7184274104985724, + "grad_norm": 2.4456658363342285, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.712434709072113, + "num_tokens": 390397505.0, + "step": 15648 + }, + { + "epoch": 1.718537228201186, + "grad_norm": 2.6314282417297363, + "learning_rate": 1e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7513440847396851, + "num_tokens": 390416437.0, + "step": 15649 + }, + { + "epoch": 1.7186470459037997, + "grad_norm": 2.5385501384735107, + "learning_rate": 1e-06, + "loss": 0.7694, + "mean_token_accuracy": 0.7578619122505188, + "num_tokens": 390436528.0, + "step": 15650 + }, + { + "epoch": 1.7187568636064134, + "grad_norm": 2.3326609134674072, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7118597626686096, + "num_tokens": 390461898.0, + "step": 15651 + }, + { + "epoch": 1.718866681309027, + "grad_norm": 2.3726513385772705, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7196911573410034, + "num_tokens": 390484262.0, + "step": 15652 + }, + { + "epoch": 1.7189764990116405, + "grad_norm": 2.297290802001953, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7043582201004028, + "num_tokens": 390508308.0, + "step": 15653 + }, + { + "epoch": 1.7190863167142543, + "grad_norm": 2.397390604019165, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.72138911485672, + "num_tokens": 390530148.0, + "step": 15654 + }, + { + "epoch": 1.719196134416868, + "grad_norm": 2.185657262802124, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7214676737785339, + "num_tokens": 390555280.0, + "step": 15655 + }, + { + "epoch": 1.7193059521194818, + "grad_norm": 2.150437593460083, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7015646696090698, + "num_tokens": 390583246.0, + "step": 15656 + }, + { + "epoch": 1.7194157698220953, + "grad_norm": 2.312415361404419, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7192211151123047, + "num_tokens": 390606865.0, + "step": 15657 + }, + { + "epoch": 1.7195255875247089, + "grad_norm": 2.3908350467681885, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7261501550674438, + "num_tokens": 390629648.0, + "step": 15658 + }, + { + "epoch": 1.7196354052273226, + "grad_norm": 2.18304443359375, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7116190195083618, + "num_tokens": 390658739.0, + "step": 15659 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 2.162778615951538, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7053262591362, + "num_tokens": 390686215.0, + "step": 15660 + }, + { + "epoch": 1.7198550406325501, + "grad_norm": 2.4191031455993652, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7221410274505615, + "num_tokens": 390709317.0, + "step": 15661 + }, + { + "epoch": 1.7199648583351637, + "grad_norm": 2.6932289600372314, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7344465255737305, + "num_tokens": 390728175.0, + "step": 15662 + }, + { + "epoch": 1.7200746760377772, + "grad_norm": 2.409918785095215, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7336949706077576, + "num_tokens": 390750783.0, + "step": 15663 + }, + { + "epoch": 1.720184493740391, + "grad_norm": 2.136162042617798, + "learning_rate": 1e-06, + "loss": 1.0808, + "mean_token_accuracy": 0.6938573122024536, + "num_tokens": 390781187.0, + "step": 15664 + }, + { + "epoch": 1.7202943114430047, + "grad_norm": 2.332286834716797, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7167748212814331, + "num_tokens": 390804511.0, + "step": 15665 + }, + { + "epoch": 1.7204041291456182, + "grad_norm": 2.308657169342041, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7035200595855713, + "num_tokens": 390829194.0, + "step": 15666 + }, + { + "epoch": 1.7205139468482318, + "grad_norm": 1.91303551197052, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6927664279937744, + "num_tokens": 390862953.0, + "step": 15667 + }, + { + "epoch": 1.7206237645508455, + "grad_norm": 2.612140655517578, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.726661205291748, + "num_tokens": 390884058.0, + "step": 15668 + }, + { + "epoch": 1.7207335822534593, + "grad_norm": 1.9775310754776, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7120790481567383, + "num_tokens": 390917736.0, + "step": 15669 + }, + { + "epoch": 1.720843399956073, + "grad_norm": 2.276747226715088, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7255621552467346, + "num_tokens": 390942071.0, + "step": 15670 + }, + { + "epoch": 1.7209532176586866, + "grad_norm": 2.2948741912841797, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7295467853546143, + "num_tokens": 390967525.0, + "step": 15671 + }, + { + "epoch": 1.7210630353613001, + "grad_norm": 1.9880167245864868, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7116076350212097, + "num_tokens": 391000039.0, + "step": 15672 + }, + { + "epoch": 1.7211728530639139, + "grad_norm": 2.264885902404785, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.716803789138794, + "num_tokens": 391026918.0, + "step": 15673 + }, + { + "epoch": 1.7212826707665276, + "grad_norm": 2.2506070137023926, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7078323364257812, + "num_tokens": 391053298.0, + "step": 15674 + }, + { + "epoch": 1.7213924884691414, + "grad_norm": 2.1635940074920654, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.6981720328330994, + "num_tokens": 391082761.0, + "step": 15675 + }, + { + "epoch": 1.721502306171755, + "grad_norm": 2.188394784927368, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7299607992172241, + "num_tokens": 391110735.0, + "step": 15676 + }, + { + "epoch": 1.7216121238743685, + "grad_norm": 2.216214179992676, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7447590231895447, + "num_tokens": 391136171.0, + "step": 15677 + }, + { + "epoch": 1.7217219415769822, + "grad_norm": 2.417483329772949, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7215829491615295, + "num_tokens": 391156864.0, + "step": 15678 + }, + { + "epoch": 1.721831759279596, + "grad_norm": 2.509094476699829, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7354059815406799, + "num_tokens": 391176340.0, + "step": 15679 + }, + { + "epoch": 1.7219415769822095, + "grad_norm": 2.1682236194610596, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.722976803779602, + "num_tokens": 391202275.0, + "step": 15680 + }, + { + "epoch": 1.722051394684823, + "grad_norm": 2.020153045654297, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7041287422180176, + "num_tokens": 391233954.0, + "step": 15681 + }, + { + "epoch": 1.7221612123874368, + "grad_norm": 2.4589860439300537, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7537354230880737, + "num_tokens": 391254010.0, + "step": 15682 + }, + { + "epoch": 1.7222710300900506, + "grad_norm": 2.091179370880127, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7130639553070068, + "num_tokens": 391282153.0, + "step": 15683 + }, + { + "epoch": 1.7223808477926643, + "grad_norm": 2.171328067779541, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7141850590705872, + "num_tokens": 391309008.0, + "step": 15684 + }, + { + "epoch": 1.7224906654952779, + "grad_norm": 2.391065835952759, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.713862955570221, + "num_tokens": 391331900.0, + "step": 15685 + }, + { + "epoch": 1.7226004831978914, + "grad_norm": 2.4967830181121826, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7006661891937256, + "num_tokens": 391354256.0, + "step": 15686 + }, + { + "epoch": 1.7227103009005051, + "grad_norm": 2.5924253463745117, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7341558337211609, + "num_tokens": 391375005.0, + "step": 15687 + }, + { + "epoch": 1.722820118603119, + "grad_norm": 2.17789888381958, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.714505672454834, + "num_tokens": 391400779.0, + "step": 15688 + }, + { + "epoch": 1.7229299363057324, + "grad_norm": 2.1262993812561035, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7150243520736694, + "num_tokens": 391428613.0, + "step": 15689 + }, + { + "epoch": 1.7230397540083462, + "grad_norm": 2.2308950424194336, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7174534797668457, + "num_tokens": 391455233.0, + "step": 15690 + }, + { + "epoch": 1.7231495717109597, + "grad_norm": 1.9434168338775635, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7192904949188232, + "num_tokens": 391485114.0, + "step": 15691 + }, + { + "epoch": 1.7232593894135735, + "grad_norm": 2.6492972373962402, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7384212017059326, + "num_tokens": 391506106.0, + "step": 15692 + }, + { + "epoch": 1.7233692071161872, + "grad_norm": 2.346698760986328, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7335928678512573, + "num_tokens": 391530278.0, + "step": 15693 + }, + { + "epoch": 1.7234790248188008, + "grad_norm": 2.3214361667633057, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7160313129425049, + "num_tokens": 391555766.0, + "step": 15694 + }, + { + "epoch": 1.7235888425214143, + "grad_norm": 1.9462043046951294, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7019586563110352, + "num_tokens": 391588011.0, + "step": 15695 + }, + { + "epoch": 1.723698660224028, + "grad_norm": 2.2122998237609863, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7213989496231079, + "num_tokens": 391613442.0, + "step": 15696 + }, + { + "epoch": 1.7238084779266418, + "grad_norm": 2.325854539871216, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7073163390159607, + "num_tokens": 391637632.0, + "step": 15697 + }, + { + "epoch": 1.7239182956292556, + "grad_norm": 2.312403678894043, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7246379852294922, + "num_tokens": 391662980.0, + "step": 15698 + }, + { + "epoch": 1.7240281133318691, + "grad_norm": 2.1232573986053467, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7309109568595886, + "num_tokens": 391689754.0, + "step": 15699 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 2.1795778274536133, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7170733213424683, + "num_tokens": 391716475.0, + "step": 15700 + }, + { + "epoch": 1.7242477487370964, + "grad_norm": 2.092017412185669, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7095813155174255, + "num_tokens": 391744730.0, + "step": 15701 + }, + { + "epoch": 1.7243575664397102, + "grad_norm": 2.0145111083984375, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7008427977561951, + "num_tokens": 391775141.0, + "step": 15702 + }, + { + "epoch": 1.7244673841423237, + "grad_norm": 2.6596553325653076, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7443794012069702, + "num_tokens": 391793774.0, + "step": 15703 + }, + { + "epoch": 1.7245772018449375, + "grad_norm": 2.566446304321289, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.735853374004364, + "num_tokens": 391813589.0, + "step": 15704 + }, + { + "epoch": 1.724687019547551, + "grad_norm": 2.509392499923706, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7390663623809814, + "num_tokens": 391833895.0, + "step": 15705 + }, + { + "epoch": 1.7247968372501647, + "grad_norm": 2.398660898208618, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7005504369735718, + "num_tokens": 391859264.0, + "step": 15706 + }, + { + "epoch": 1.7249066549527785, + "grad_norm": 2.690523862838745, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7218747138977051, + "num_tokens": 391878772.0, + "step": 15707 + }, + { + "epoch": 1.725016472655392, + "grad_norm": 2.2844858169555664, + "learning_rate": 1e-06, + "loss": 0.8061, + "mean_token_accuracy": 0.744948148727417, + "num_tokens": 391903425.0, + "step": 15708 + }, + { + "epoch": 1.7251262903580056, + "grad_norm": 2.435767412185669, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7372254729270935, + "num_tokens": 391926378.0, + "step": 15709 + }, + { + "epoch": 1.7252361080606193, + "grad_norm": 2.6197519302368164, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.741621196269989, + "num_tokens": 391944204.0, + "step": 15710 + }, + { + "epoch": 1.725345925763233, + "grad_norm": 2.330918550491333, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7353115081787109, + "num_tokens": 391968277.0, + "step": 15711 + }, + { + "epoch": 1.7254557434658468, + "grad_norm": 2.5132203102111816, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7444819211959839, + "num_tokens": 391988249.0, + "step": 15712 + }, + { + "epoch": 1.7255655611684604, + "grad_norm": 2.613518238067627, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7284965515136719, + "num_tokens": 392007638.0, + "step": 15713 + }, + { + "epoch": 1.725675378871074, + "grad_norm": 2.242910623550415, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7105149626731873, + "num_tokens": 392034082.0, + "step": 15714 + }, + { + "epoch": 1.7257851965736877, + "grad_norm": 2.609147071838379, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7415058612823486, + "num_tokens": 392053545.0, + "step": 15715 + }, + { + "epoch": 1.7258950142763014, + "grad_norm": 2.415947914123535, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7141277194023132, + "num_tokens": 392076959.0, + "step": 15716 + }, + { + "epoch": 1.726004831978915, + "grad_norm": 2.5976781845092773, + "learning_rate": 1e-06, + "loss": 0.7752, + "mean_token_accuracy": 0.7544945478439331, + "num_tokens": 392096315.0, + "step": 15717 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 2.368473529815674, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7492166757583618, + "num_tokens": 392118034.0, + "step": 15718 + }, + { + "epoch": 1.7262244673841423, + "grad_norm": 2.2529046535491943, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7198059558868408, + "num_tokens": 392141784.0, + "step": 15719 + }, + { + "epoch": 1.726334285086756, + "grad_norm": 2.1960744857788086, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7107742428779602, + "num_tokens": 392168189.0, + "step": 15720 + }, + { + "epoch": 1.7264441027893698, + "grad_norm": 2.0879523754119873, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7159435749053955, + "num_tokens": 392197448.0, + "step": 15721 + }, + { + "epoch": 1.7265539204919833, + "grad_norm": 2.5696606636047363, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7083876132965088, + "num_tokens": 392218851.0, + "step": 15722 + }, + { + "epoch": 1.7266637381945968, + "grad_norm": 2.197251796722412, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7116760015487671, + "num_tokens": 392247674.0, + "step": 15723 + }, + { + "epoch": 1.7267735558972106, + "grad_norm": 2.3859031200408936, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7396913766860962, + "num_tokens": 392269979.0, + "step": 15724 + }, + { + "epoch": 1.7268833735998244, + "grad_norm": 2.3719003200531006, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7125755548477173, + "num_tokens": 392294255.0, + "step": 15725 + }, + { + "epoch": 1.7269931913024381, + "grad_norm": 2.1436474323272705, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.727186918258667, + "num_tokens": 392319350.0, + "step": 15726 + }, + { + "epoch": 1.7271030090050516, + "grad_norm": 2.4810051918029785, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.734368085861206, + "num_tokens": 392340737.0, + "step": 15727 + }, + { + "epoch": 1.7272128267076652, + "grad_norm": 2.184777021408081, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7208033800125122, + "num_tokens": 392367298.0, + "step": 15728 + }, + { + "epoch": 1.727322644410279, + "grad_norm": 2.3248085975646973, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7203590273857117, + "num_tokens": 392391451.0, + "step": 15729 + }, + { + "epoch": 1.7274324621128927, + "grad_norm": 2.4210195541381836, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7203697562217712, + "num_tokens": 392415021.0, + "step": 15730 + }, + { + "epoch": 1.7275422798155062, + "grad_norm": 2.5662102699279785, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7156333923339844, + "num_tokens": 392435426.0, + "step": 15731 + }, + { + "epoch": 1.7276520975181198, + "grad_norm": 2.2327816486358643, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7018832564353943, + "num_tokens": 392463476.0, + "step": 15732 + }, + { + "epoch": 1.7277619152207335, + "grad_norm": 2.0480539798736572, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.694973886013031, + "num_tokens": 392493171.0, + "step": 15733 + }, + { + "epoch": 1.7278717329233473, + "grad_norm": 2.117178440093994, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7289406657218933, + "num_tokens": 392520607.0, + "step": 15734 + }, + { + "epoch": 1.727981550625961, + "grad_norm": 2.30599308013916, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7230145335197449, + "num_tokens": 392544468.0, + "step": 15735 + }, + { + "epoch": 1.7280913683285746, + "grad_norm": 2.1531119346618652, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7238982915878296, + "num_tokens": 392572227.0, + "step": 15736 + }, + { + "epoch": 1.728201186031188, + "grad_norm": 2.102804183959961, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6871637105941772, + "num_tokens": 392603231.0, + "step": 15737 + }, + { + "epoch": 1.7283110037338019, + "grad_norm": 2.122851848602295, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6992831230163574, + "num_tokens": 392633679.0, + "step": 15738 + }, + { + "epoch": 1.7284208214364156, + "grad_norm": 2.2019524574279785, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6950047016143799, + "num_tokens": 392660361.0, + "step": 15739 + }, + { + "epoch": 1.7285306391390292, + "grad_norm": 2.3007099628448486, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7121548056602478, + "num_tokens": 392684852.0, + "step": 15740 + }, + { + "epoch": 1.728640456841643, + "grad_norm": 2.1492867469787598, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7142630815505981, + "num_tokens": 392713120.0, + "step": 15741 + }, + { + "epoch": 1.7287502745442564, + "grad_norm": 2.4478919506073, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6956350803375244, + "num_tokens": 392736776.0, + "step": 15742 + }, + { + "epoch": 1.7288600922468702, + "grad_norm": 1.9073148965835571, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.6987708806991577, + "num_tokens": 392769839.0, + "step": 15743 + }, + { + "epoch": 1.728969909949484, + "grad_norm": 1.9812079668045044, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6955310106277466, + "num_tokens": 392803584.0, + "step": 15744 + }, + { + "epoch": 1.7290797276520975, + "grad_norm": 2.137575149536133, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7151685953140259, + "num_tokens": 392833237.0, + "step": 15745 + }, + { + "epoch": 1.729189545354711, + "grad_norm": 2.1963307857513428, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7214352488517761, + "num_tokens": 392859041.0, + "step": 15746 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 2.4153521060943604, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7438181638717651, + "num_tokens": 392880504.0, + "step": 15747 + }, + { + "epoch": 1.7294091807599385, + "grad_norm": 2.101870059967041, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7041481733322144, + "num_tokens": 392907750.0, + "step": 15748 + }, + { + "epoch": 1.7295189984625523, + "grad_norm": 1.9432194232940674, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7059857845306396, + "num_tokens": 392939432.0, + "step": 15749 + }, + { + "epoch": 1.7296288161651658, + "grad_norm": 2.152113676071167, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7181013822555542, + "num_tokens": 392966958.0, + "step": 15750 + }, + { + "epoch": 1.7297386338677794, + "grad_norm": 2.107172727584839, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7116286158561707, + "num_tokens": 392995937.0, + "step": 15751 + }, + { + "epoch": 1.7298484515703931, + "grad_norm": 2.0044827461242676, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.693152904510498, + "num_tokens": 393028788.0, + "step": 15752 + }, + { + "epoch": 1.7299582692730069, + "grad_norm": 2.1073639392852783, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7007228136062622, + "num_tokens": 393058283.0, + "step": 15753 + }, + { + "epoch": 1.7300680869756204, + "grad_norm": 2.1873812675476074, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7298038601875305, + "num_tokens": 393085166.0, + "step": 15754 + }, + { + "epoch": 1.7301779046782342, + "grad_norm": 1.9358985424041748, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6972646713256836, + "num_tokens": 393117351.0, + "step": 15755 + }, + { + "epoch": 1.7302877223808477, + "grad_norm": 2.2274487018585205, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7317293286323547, + "num_tokens": 393144014.0, + "step": 15756 + }, + { + "epoch": 1.7303975400834615, + "grad_norm": 2.444930076599121, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.73649001121521, + "num_tokens": 393165928.0, + "step": 15757 + }, + { + "epoch": 1.7305073577860752, + "grad_norm": 2.4682157039642334, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7425534725189209, + "num_tokens": 393188238.0, + "step": 15758 + }, + { + "epoch": 1.7306171754886888, + "grad_norm": 2.0654938220977783, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6933985948562622, + "num_tokens": 393219746.0, + "step": 15759 + }, + { + "epoch": 1.7307269931913023, + "grad_norm": 2.3515279293060303, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.732309103012085, + "num_tokens": 393242941.0, + "step": 15760 + }, + { + "epoch": 1.730836810893916, + "grad_norm": 2.2251241207122803, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7080892324447632, + "num_tokens": 393271030.0, + "step": 15761 + }, + { + "epoch": 1.7309466285965298, + "grad_norm": 2.2041783332824707, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7230000495910645, + "num_tokens": 393299072.0, + "step": 15762 + }, + { + "epoch": 1.7310564462991436, + "grad_norm": 2.838900089263916, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7445499897003174, + "num_tokens": 393316022.0, + "step": 15763 + }, + { + "epoch": 1.731166264001757, + "grad_norm": 2.2277255058288574, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7077574729919434, + "num_tokens": 393343080.0, + "step": 15764 + }, + { + "epoch": 1.7312760817043706, + "grad_norm": 2.07281756401062, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7010184526443481, + "num_tokens": 393373582.0, + "step": 15765 + }, + { + "epoch": 1.7313858994069844, + "grad_norm": 2.4202725887298584, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7441340684890747, + "num_tokens": 393396222.0, + "step": 15766 + }, + { + "epoch": 1.7314957171095982, + "grad_norm": 2.1122066974639893, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7370260953903198, + "num_tokens": 393422349.0, + "step": 15767 + }, + { + "epoch": 1.7316055348122117, + "grad_norm": 2.0516700744628906, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7160680890083313, + "num_tokens": 393456004.0, + "step": 15768 + }, + { + "epoch": 1.7317153525148252, + "grad_norm": 1.9603567123413086, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7029873132705688, + "num_tokens": 393490158.0, + "step": 15769 + }, + { + "epoch": 1.731825170217439, + "grad_norm": 2.49714994430542, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.710631251335144, + "num_tokens": 393512505.0, + "step": 15770 + }, + { + "epoch": 1.7319349879200527, + "grad_norm": 2.3126730918884277, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7341569662094116, + "num_tokens": 393536398.0, + "step": 15771 + }, + { + "epoch": 1.7320448056226665, + "grad_norm": 2.2952957153320312, + "learning_rate": 1e-06, + "loss": 0.7811, + "mean_token_accuracy": 0.7528190612792969, + "num_tokens": 393560865.0, + "step": 15772 + }, + { + "epoch": 1.73215462332528, + "grad_norm": 2.072995185852051, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7163116335868835, + "num_tokens": 393589628.0, + "step": 15773 + }, + { + "epoch": 1.7322644410278936, + "grad_norm": 2.105499029159546, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7274693846702576, + "num_tokens": 393620961.0, + "step": 15774 + }, + { + "epoch": 1.7323742587305073, + "grad_norm": 2.3188366889953613, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7047004699707031, + "num_tokens": 393645428.0, + "step": 15775 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 2.2389678955078125, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.743041455745697, + "num_tokens": 393670732.0, + "step": 15776 + }, + { + "epoch": 1.7325938941357348, + "grad_norm": 2.0318925380706787, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7312098741531372, + "num_tokens": 393699662.0, + "step": 15777 + }, + { + "epoch": 1.7327037118383484, + "grad_norm": 2.553384780883789, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7048642635345459, + "num_tokens": 393721214.0, + "step": 15778 + }, + { + "epoch": 1.732813529540962, + "grad_norm": 2.6336100101470947, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7237911224365234, + "num_tokens": 393740928.0, + "step": 15779 + }, + { + "epoch": 1.7329233472435757, + "grad_norm": 2.4181573390960693, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7126805782318115, + "num_tokens": 393763761.0, + "step": 15780 + }, + { + "epoch": 1.7330331649461894, + "grad_norm": 2.11146879196167, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7211756706237793, + "num_tokens": 393790406.0, + "step": 15781 + }, + { + "epoch": 1.733142982648803, + "grad_norm": 2.245380401611328, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7230989336967468, + "num_tokens": 393815889.0, + "step": 15782 + }, + { + "epoch": 1.7332528003514165, + "grad_norm": 2.39032244682312, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7402744293212891, + "num_tokens": 393838214.0, + "step": 15783 + }, + { + "epoch": 1.7333626180540302, + "grad_norm": 2.571246385574341, + "learning_rate": 1e-06, + "loss": 0.7953, + "mean_token_accuracy": 0.7460707426071167, + "num_tokens": 393857059.0, + "step": 15784 + }, + { + "epoch": 1.733472435756644, + "grad_norm": 2.1370832920074463, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7349580526351929, + "num_tokens": 393883204.0, + "step": 15785 + }, + { + "epoch": 1.7335822534592578, + "grad_norm": 2.2269487380981445, + "learning_rate": 1e-06, + "loss": 0.7479, + "mean_token_accuracy": 0.7577672004699707, + "num_tokens": 393906304.0, + "step": 15786 + }, + { + "epoch": 1.7336920711618713, + "grad_norm": 2.2943942546844482, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7119313478469849, + "num_tokens": 393930131.0, + "step": 15787 + }, + { + "epoch": 1.7338018888644848, + "grad_norm": 2.0227744579315186, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6964125037193298, + "num_tokens": 393961722.0, + "step": 15788 + }, + { + "epoch": 1.7339117065670986, + "grad_norm": 2.4121358394622803, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7311108112335205, + "num_tokens": 393983549.0, + "step": 15789 + }, + { + "epoch": 1.7340215242697123, + "grad_norm": 2.133404016494751, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7034289836883545, + "num_tokens": 394013055.0, + "step": 15790 + }, + { + "epoch": 1.734131341972326, + "grad_norm": 2.3935976028442383, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7166531085968018, + "num_tokens": 394035369.0, + "step": 15791 + }, + { + "epoch": 1.7342411596749396, + "grad_norm": 2.2606592178344727, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7133724689483643, + "num_tokens": 394059748.0, + "step": 15792 + }, + { + "epoch": 1.7343509773775532, + "grad_norm": 2.1848862171173096, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7257282733917236, + "num_tokens": 394087339.0, + "step": 15793 + }, + { + "epoch": 1.734460795080167, + "grad_norm": 2.4679782390594482, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7221641540527344, + "num_tokens": 394110412.0, + "step": 15794 + }, + { + "epoch": 1.7345706127827807, + "grad_norm": 2.021347761154175, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7005919218063354, + "num_tokens": 394141858.0, + "step": 15795 + }, + { + "epoch": 1.7346804304853942, + "grad_norm": 2.294099807739258, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7199286222457886, + "num_tokens": 394167767.0, + "step": 15796 + }, + { + "epoch": 1.7347902481880078, + "grad_norm": 2.6416079998016357, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.717458963394165, + "num_tokens": 394187286.0, + "step": 15797 + }, + { + "epoch": 1.7349000658906215, + "grad_norm": 2.186361312866211, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7318940162658691, + "num_tokens": 394214254.0, + "step": 15798 + }, + { + "epoch": 1.7350098835932353, + "grad_norm": 2.6858444213867188, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.722692608833313, + "num_tokens": 394233894.0, + "step": 15799 + }, + { + "epoch": 1.735119701295849, + "grad_norm": 2.4560186862945557, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 0.745112955570221, + "num_tokens": 394254194.0, + "step": 15800 + }, + { + "epoch": 1.7352295189984626, + "grad_norm": 2.0011825561523438, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.731702446937561, + "num_tokens": 394283439.0, + "step": 15801 + }, + { + "epoch": 1.735339336701076, + "grad_norm": 2.095410108566284, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.725415825843811, + "num_tokens": 394311961.0, + "step": 15802 + }, + { + "epoch": 1.7354491544036899, + "grad_norm": 2.23740291595459, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7343554496765137, + "num_tokens": 394338734.0, + "step": 15803 + }, + { + "epoch": 1.7355589721063036, + "grad_norm": 2.226595163345337, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7275301814079285, + "num_tokens": 394363939.0, + "step": 15804 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 2.156294584274292, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7031724452972412, + "num_tokens": 394391503.0, + "step": 15805 + }, + { + "epoch": 1.735778607511531, + "grad_norm": 2.3873140811920166, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7147130370140076, + "num_tokens": 394414947.0, + "step": 15806 + }, + { + "epoch": 1.7358884252141444, + "grad_norm": 2.7863779067993164, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7442965507507324, + "num_tokens": 394431666.0, + "step": 15807 + }, + { + "epoch": 1.7359982429167582, + "grad_norm": 2.4629361629486084, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7321188449859619, + "num_tokens": 394454011.0, + "step": 15808 + }, + { + "epoch": 1.736108060619372, + "grad_norm": 2.275712251663208, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7187713980674744, + "num_tokens": 394478749.0, + "step": 15809 + }, + { + "epoch": 1.7362178783219855, + "grad_norm": 2.073916435241699, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7234992980957031, + "num_tokens": 394506812.0, + "step": 15810 + }, + { + "epoch": 1.736327696024599, + "grad_norm": 2.0415196418762207, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7229921817779541, + "num_tokens": 394538322.0, + "step": 15811 + }, + { + "epoch": 1.7364375137272128, + "grad_norm": 2.1130621433258057, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7186350226402283, + "num_tokens": 394566999.0, + "step": 15812 + }, + { + "epoch": 1.7365473314298265, + "grad_norm": 2.107722520828247, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7126931548118591, + "num_tokens": 394596054.0, + "step": 15813 + }, + { + "epoch": 1.7366571491324403, + "grad_norm": 2.05753493309021, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.697951078414917, + "num_tokens": 394625975.0, + "step": 15814 + }, + { + "epoch": 1.7367669668350538, + "grad_norm": 2.055781602859497, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.726201593875885, + "num_tokens": 394654564.0, + "step": 15815 + }, + { + "epoch": 1.7368767845376674, + "grad_norm": 2.382112503051758, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7174140214920044, + "num_tokens": 394678606.0, + "step": 15816 + }, + { + "epoch": 1.7369866022402811, + "grad_norm": 2.3540921211242676, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7261376976966858, + "num_tokens": 394701658.0, + "step": 15817 + }, + { + "epoch": 1.7370964199428949, + "grad_norm": 2.48419189453125, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7093261480331421, + "num_tokens": 394722568.0, + "step": 15818 + }, + { + "epoch": 1.7372062376455084, + "grad_norm": 2.5583322048187256, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7307151556015015, + "num_tokens": 394742254.0, + "step": 15819 + }, + { + "epoch": 1.7373160553481222, + "grad_norm": 2.0468790531158447, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6967957019805908, + "num_tokens": 394775223.0, + "step": 15820 + }, + { + "epoch": 1.7374258730507357, + "grad_norm": 2.4714653491973877, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7153307795524597, + "num_tokens": 394796330.0, + "step": 15821 + }, + { + "epoch": 1.7375356907533495, + "grad_norm": 2.1442806720733643, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.723854124546051, + "num_tokens": 394825849.0, + "step": 15822 + }, + { + "epoch": 1.7376455084559632, + "grad_norm": 2.2738282680511475, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7420593500137329, + "num_tokens": 394851125.0, + "step": 15823 + }, + { + "epoch": 1.7377553261585768, + "grad_norm": 2.4606103897094727, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7221235036849976, + "num_tokens": 394874464.0, + "step": 15824 + }, + { + "epoch": 1.7378651438611903, + "grad_norm": 2.5907399654388428, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7258756160736084, + "num_tokens": 394897650.0, + "step": 15825 + }, + { + "epoch": 1.737974961563804, + "grad_norm": 2.089186906814575, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7160784006118774, + "num_tokens": 394927797.0, + "step": 15826 + }, + { + "epoch": 1.7380847792664178, + "grad_norm": 2.2568647861480713, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7287889719009399, + "num_tokens": 394951630.0, + "step": 15827 + }, + { + "epoch": 1.7381945969690316, + "grad_norm": 2.6028003692626953, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7273472547531128, + "num_tokens": 394972900.0, + "step": 15828 + }, + { + "epoch": 1.738304414671645, + "grad_norm": 2.068754196166992, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7288817167282104, + "num_tokens": 395000098.0, + "step": 15829 + }, + { + "epoch": 1.7384142323742586, + "grad_norm": 2.1573848724365234, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.717667818069458, + "num_tokens": 395026833.0, + "step": 15830 + }, + { + "epoch": 1.7385240500768724, + "grad_norm": 2.265425205230713, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7238691449165344, + "num_tokens": 395053208.0, + "step": 15831 + }, + { + "epoch": 1.7386338677794861, + "grad_norm": 2.2553417682647705, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7154760360717773, + "num_tokens": 395079192.0, + "step": 15832 + }, + { + "epoch": 1.7387436854820997, + "grad_norm": 2.404785394668579, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7191219925880432, + "num_tokens": 395102243.0, + "step": 15833 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 2.5265424251556396, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7368113398551941, + "num_tokens": 395123560.0, + "step": 15834 + }, + { + "epoch": 1.738963320887327, + "grad_norm": 2.337913751602173, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7152999639511108, + "num_tokens": 395148098.0, + "step": 15835 + }, + { + "epoch": 1.7390731385899407, + "grad_norm": 2.390521764755249, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7016860246658325, + "num_tokens": 395172534.0, + "step": 15836 + }, + { + "epoch": 1.7391829562925545, + "grad_norm": 2.1741816997528076, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7095293998718262, + "num_tokens": 395200568.0, + "step": 15837 + }, + { + "epoch": 1.739292773995168, + "grad_norm": 2.3445661067962646, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7257435321807861, + "num_tokens": 395225693.0, + "step": 15838 + }, + { + "epoch": 1.7394025916977816, + "grad_norm": 2.428015947341919, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7229871153831482, + "num_tokens": 395248377.0, + "step": 15839 + }, + { + "epoch": 1.7395124094003953, + "grad_norm": 2.416337013244629, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7119581699371338, + "num_tokens": 395270060.0, + "step": 15840 + }, + { + "epoch": 1.739622227103009, + "grad_norm": 2.729790210723877, + "learning_rate": 1e-06, + "loss": 0.7834, + "mean_token_accuracy": 0.7463164329528809, + "num_tokens": 395289023.0, + "step": 15841 + }, + { + "epoch": 1.7397320448056228, + "grad_norm": 2.248896598815918, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7224299907684326, + "num_tokens": 395313251.0, + "step": 15842 + }, + { + "epoch": 1.7398418625082364, + "grad_norm": 2.1338512897491455, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7120522856712341, + "num_tokens": 395341514.0, + "step": 15843 + }, + { + "epoch": 1.73995168021085, + "grad_norm": 2.3987903594970703, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7203638553619385, + "num_tokens": 395366733.0, + "step": 15844 + }, + { + "epoch": 1.7400614979134637, + "grad_norm": 1.9109560251235962, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7118408679962158, + "num_tokens": 395398294.0, + "step": 15845 + }, + { + "epoch": 1.7401713156160774, + "grad_norm": 2.365649938583374, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7485802173614502, + "num_tokens": 395421140.0, + "step": 15846 + }, + { + "epoch": 1.740281133318691, + "grad_norm": 2.2636566162109375, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6930471658706665, + "num_tokens": 395446180.0, + "step": 15847 + }, + { + "epoch": 1.7403909510213045, + "grad_norm": 2.2839293479919434, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7198960781097412, + "num_tokens": 395471566.0, + "step": 15848 + }, + { + "epoch": 1.7405007687239182, + "grad_norm": 1.9949190616607666, + "learning_rate": 1e-06, + "loss": 0.7474, + "mean_token_accuracy": 0.7618284225463867, + "num_tokens": 395499250.0, + "step": 15849 + }, + { + "epoch": 1.740610586426532, + "grad_norm": 2.1763625144958496, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7257093191146851, + "num_tokens": 395523768.0, + "step": 15850 + }, + { + "epoch": 1.7407204041291457, + "grad_norm": 2.1600685119628906, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.719001054763794, + "num_tokens": 395552411.0, + "step": 15851 + }, + { + "epoch": 1.7408302218317593, + "grad_norm": 2.1771624088287354, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7112871408462524, + "num_tokens": 395578534.0, + "step": 15852 + }, + { + "epoch": 1.7409400395343728, + "grad_norm": 2.739621877670288, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7295736074447632, + "num_tokens": 395597392.0, + "step": 15853 + }, + { + "epoch": 1.7410498572369866, + "grad_norm": 2.083843946456909, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6976683735847473, + "num_tokens": 395625231.0, + "step": 15854 + }, + { + "epoch": 1.7411596749396003, + "grad_norm": 2.355012893676758, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7464442253112793, + "num_tokens": 395647664.0, + "step": 15855 + }, + { + "epoch": 1.741269492642214, + "grad_norm": 2.588913679122925, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7419440150260925, + "num_tokens": 395667530.0, + "step": 15856 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 2.186586380004883, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.71560138463974, + "num_tokens": 395693888.0, + "step": 15857 + }, + { + "epoch": 1.7414891280474412, + "grad_norm": 2.1313023567199707, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7043195962905884, + "num_tokens": 395722785.0, + "step": 15858 + }, + { + "epoch": 1.741598945750055, + "grad_norm": 2.488184690475464, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.726175844669342, + "num_tokens": 395744614.0, + "step": 15859 + }, + { + "epoch": 1.7417087634526687, + "grad_norm": 2.4356532096862793, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7546340227127075, + "num_tokens": 395765264.0, + "step": 15860 + }, + { + "epoch": 1.7418185811552822, + "grad_norm": 2.349184513092041, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7201316356658936, + "num_tokens": 395788232.0, + "step": 15861 + }, + { + "epoch": 1.7419283988578957, + "grad_norm": 2.970937967300415, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7370140552520752, + "num_tokens": 395804494.0, + "step": 15862 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 2.116018295288086, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7206721305847168, + "num_tokens": 395831549.0, + "step": 15863 + }, + { + "epoch": 1.7421480342631233, + "grad_norm": 2.0047683715820312, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7155129909515381, + "num_tokens": 395863499.0, + "step": 15864 + }, + { + "epoch": 1.742257851965737, + "grad_norm": 2.417207717895508, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7193940281867981, + "num_tokens": 395886142.0, + "step": 15865 + }, + { + "epoch": 1.7423676696683505, + "grad_norm": 2.248274564743042, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7167778015136719, + "num_tokens": 395911479.0, + "step": 15866 + }, + { + "epoch": 1.742477487370964, + "grad_norm": 1.978646993637085, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7139666080474854, + "num_tokens": 395943181.0, + "step": 15867 + }, + { + "epoch": 1.7425873050735778, + "grad_norm": 2.2331044673919678, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7272024750709534, + "num_tokens": 395967339.0, + "step": 15868 + }, + { + "epoch": 1.7426971227761916, + "grad_norm": 2.1574862003326416, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.675290048122406, + "num_tokens": 395998017.0, + "step": 15869 + }, + { + "epoch": 1.7428069404788051, + "grad_norm": 2.5366156101226807, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7305470705032349, + "num_tokens": 396019414.0, + "step": 15870 + }, + { + "epoch": 1.742916758181419, + "grad_norm": 2.0836658477783203, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7411532402038574, + "num_tokens": 396046908.0, + "step": 15871 + }, + { + "epoch": 1.7430265758840324, + "grad_norm": 2.2980237007141113, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7230250835418701, + "num_tokens": 396070726.0, + "step": 15872 + }, + { + "epoch": 1.7431363935866462, + "grad_norm": 2.5254604816436768, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7085778713226318, + "num_tokens": 396092858.0, + "step": 15873 + }, + { + "epoch": 1.74324621128926, + "grad_norm": 2.531224489212036, + "learning_rate": 1e-06, + "loss": 0.7993, + "mean_token_accuracy": 0.7461312413215637, + "num_tokens": 396112708.0, + "step": 15874 + }, + { + "epoch": 1.7433560289918735, + "grad_norm": 2.189610719680786, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7480919361114502, + "num_tokens": 396137662.0, + "step": 15875 + }, + { + "epoch": 1.743465846694487, + "grad_norm": 2.378422260284424, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7278706431388855, + "num_tokens": 396158971.0, + "step": 15876 + }, + { + "epoch": 1.7435756643971008, + "grad_norm": 2.1945905685424805, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7048310041427612, + "num_tokens": 396187518.0, + "step": 15877 + }, + { + "epoch": 1.7436854820997145, + "grad_norm": 2.4309048652648926, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7238869667053223, + "num_tokens": 396209596.0, + "step": 15878 + }, + { + "epoch": 1.7437952998023283, + "grad_norm": 2.3470945358276367, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7178221344947815, + "num_tokens": 396233460.0, + "step": 15879 + }, + { + "epoch": 1.7439051175049418, + "grad_norm": 2.2251474857330322, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7135593891143799, + "num_tokens": 396259954.0, + "step": 15880 + }, + { + "epoch": 1.7440149352075554, + "grad_norm": 2.0849621295928955, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7123304605484009, + "num_tokens": 396290037.0, + "step": 15881 + }, + { + "epoch": 1.744124752910169, + "grad_norm": 2.5480616092681885, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7242240905761719, + "num_tokens": 396311054.0, + "step": 15882 + }, + { + "epoch": 1.7442345706127829, + "grad_norm": 2.495769739151001, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7211825847625732, + "num_tokens": 396332718.0, + "step": 15883 + }, + { + "epoch": 1.7443443883153964, + "grad_norm": 2.126718521118164, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7090365886688232, + "num_tokens": 396360259.0, + "step": 15884 + }, + { + "epoch": 1.7444542060180102, + "grad_norm": 2.602691888809204, + "learning_rate": 1e-06, + "loss": 0.7722, + "mean_token_accuracy": 0.7439538240432739, + "num_tokens": 396378430.0, + "step": 15885 + }, + { + "epoch": 1.7445640237206237, + "grad_norm": 2.70406174659729, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7347341775894165, + "num_tokens": 396397265.0, + "step": 15886 + }, + { + "epoch": 1.7446738414232374, + "grad_norm": 2.3496878147125244, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7086557745933533, + "num_tokens": 396422192.0, + "step": 15887 + }, + { + "epoch": 1.7447836591258512, + "grad_norm": 2.168396472930908, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7329015135765076, + "num_tokens": 396450130.0, + "step": 15888 + }, + { + "epoch": 1.7448934768284647, + "grad_norm": 2.5786614418029785, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7343683242797852, + "num_tokens": 396470440.0, + "step": 15889 + }, + { + "epoch": 1.7450032945310783, + "grad_norm": 2.3215038776397705, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7443183660507202, + "num_tokens": 396494668.0, + "step": 15890 + }, + { + "epoch": 1.745113112233692, + "grad_norm": 2.265735387802124, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7173720598220825, + "num_tokens": 396521603.0, + "step": 15891 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 2.389143943786621, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7124278545379639, + "num_tokens": 396545728.0, + "step": 15892 + }, + { + "epoch": 1.7453327476389195, + "grad_norm": 2.078089714050293, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6983364820480347, + "num_tokens": 396576193.0, + "step": 15893 + }, + { + "epoch": 1.745442565341533, + "grad_norm": 2.3164031505584717, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7086632251739502, + "num_tokens": 396601794.0, + "step": 15894 + }, + { + "epoch": 1.7455523830441466, + "grad_norm": 2.1866536140441895, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7216616868972778, + "num_tokens": 396628281.0, + "step": 15895 + }, + { + "epoch": 1.7456622007467604, + "grad_norm": 2.164079427719116, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.705747127532959, + "num_tokens": 396656784.0, + "step": 15896 + }, + { + "epoch": 1.7457720184493741, + "grad_norm": 2.1349246501922607, + "learning_rate": 1e-06, + "loss": 0.7823, + "mean_token_accuracy": 0.7468650341033936, + "num_tokens": 396682089.0, + "step": 15897 + }, + { + "epoch": 1.7458818361519877, + "grad_norm": 2.268315553665161, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7050595283508301, + "num_tokens": 396705682.0, + "step": 15898 + }, + { + "epoch": 1.7459916538546012, + "grad_norm": 2.1480183601379395, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7122209072113037, + "num_tokens": 396735078.0, + "step": 15899 + }, + { + "epoch": 1.746101471557215, + "grad_norm": 2.113189220428467, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7060708403587341, + "num_tokens": 396767400.0, + "step": 15900 + }, + { + "epoch": 1.7462112892598287, + "grad_norm": 2.3266589641571045, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7328140139579773, + "num_tokens": 396790143.0, + "step": 15901 + }, + { + "epoch": 1.7463211069624425, + "grad_norm": 2.296424627304077, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7191871404647827, + "num_tokens": 396815169.0, + "step": 15902 + }, + { + "epoch": 1.746430924665056, + "grad_norm": 2.0719501972198486, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7089674472808838, + "num_tokens": 396844749.0, + "step": 15903 + }, + { + "epoch": 1.7465407423676695, + "grad_norm": 2.241121768951416, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7228344678878784, + "num_tokens": 396870912.0, + "step": 15904 + }, + { + "epoch": 1.7466505600702833, + "grad_norm": 2.6643683910369873, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7581373453140259, + "num_tokens": 396888432.0, + "step": 15905 + }, + { + "epoch": 1.746760377772897, + "grad_norm": 2.278984785079956, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7104470133781433, + "num_tokens": 396915645.0, + "step": 15906 + }, + { + "epoch": 1.7468701954755108, + "grad_norm": 2.4764273166656494, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7119014263153076, + "num_tokens": 396940534.0, + "step": 15907 + }, + { + "epoch": 1.7469800131781243, + "grad_norm": 2.3111889362335205, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7268413305282593, + "num_tokens": 396963510.0, + "step": 15908 + }, + { + "epoch": 1.7470898308807379, + "grad_norm": 2.697068214416504, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7325961589813232, + "num_tokens": 396981439.0, + "step": 15909 + }, + { + "epoch": 1.7471996485833516, + "grad_norm": 2.760962963104248, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7415162324905396, + "num_tokens": 396998241.0, + "step": 15910 + }, + { + "epoch": 1.7473094662859654, + "grad_norm": 1.9616471529006958, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7093455791473389, + "num_tokens": 397030743.0, + "step": 15911 + }, + { + "epoch": 1.747419283988579, + "grad_norm": 2.099956512451172, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7282866835594177, + "num_tokens": 397056427.0, + "step": 15912 + }, + { + "epoch": 1.7475291016911925, + "grad_norm": 2.2199971675872803, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7011529207229614, + "num_tokens": 397082524.0, + "step": 15913 + }, + { + "epoch": 1.7476389193938062, + "grad_norm": 2.2494256496429443, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6999334096908569, + "num_tokens": 397108220.0, + "step": 15914 + }, + { + "epoch": 1.74774873709642, + "grad_norm": 2.0763094425201416, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7051708698272705, + "num_tokens": 397139683.0, + "step": 15915 + }, + { + "epoch": 1.7478585547990337, + "grad_norm": 2.036222219467163, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7041968107223511, + "num_tokens": 397168592.0, + "step": 15916 + }, + { + "epoch": 1.7479683725016473, + "grad_norm": 2.1895039081573486, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7263350486755371, + "num_tokens": 397193194.0, + "step": 15917 + }, + { + "epoch": 1.7480781902042608, + "grad_norm": 2.2844722270965576, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7225857973098755, + "num_tokens": 397219217.0, + "step": 15918 + }, + { + "epoch": 1.7481880079068746, + "grad_norm": 2.0388898849487305, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7259879112243652, + "num_tokens": 397249322.0, + "step": 15919 + }, + { + "epoch": 1.7482978256094883, + "grad_norm": 2.1225225925445557, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7159172296524048, + "num_tokens": 397277377.0, + "step": 15920 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 2.386470079421997, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7355579733848572, + "num_tokens": 397301142.0, + "step": 15921 + }, + { + "epoch": 1.7485174610147156, + "grad_norm": 2.1866486072540283, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7280915379524231, + "num_tokens": 397325499.0, + "step": 15922 + }, + { + "epoch": 1.7486272787173291, + "grad_norm": 2.619143009185791, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7140251398086548, + "num_tokens": 397346262.0, + "step": 15923 + }, + { + "epoch": 1.748737096419943, + "grad_norm": 2.611558198928833, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7048850655555725, + "num_tokens": 397366464.0, + "step": 15924 + }, + { + "epoch": 1.7488469141225567, + "grad_norm": 2.2412993907928467, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6965275406837463, + "num_tokens": 397394712.0, + "step": 15925 + }, + { + "epoch": 1.7489567318251702, + "grad_norm": 2.1814393997192383, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7144235372543335, + "num_tokens": 397420686.0, + "step": 15926 + }, + { + "epoch": 1.7490665495277837, + "grad_norm": 2.523268938064575, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7241609692573547, + "num_tokens": 397442806.0, + "step": 15927 + }, + { + "epoch": 1.7491763672303975, + "grad_norm": 2.2685256004333496, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7129465937614441, + "num_tokens": 397466910.0, + "step": 15928 + }, + { + "epoch": 1.7492861849330112, + "grad_norm": 2.300243854522705, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7254461050033569, + "num_tokens": 397492446.0, + "step": 15929 + }, + { + "epoch": 1.749396002635625, + "grad_norm": 2.3058013916015625, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7129680514335632, + "num_tokens": 397517437.0, + "step": 15930 + }, + { + "epoch": 1.7495058203382385, + "grad_norm": 2.3877530097961426, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7174443006515503, + "num_tokens": 397541547.0, + "step": 15931 + }, + { + "epoch": 1.749615638040852, + "grad_norm": 2.2067151069641113, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7185159921646118, + "num_tokens": 397568541.0, + "step": 15932 + }, + { + "epoch": 1.7497254557434658, + "grad_norm": 2.18643856048584, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.7003287672996521, + "num_tokens": 397596582.0, + "step": 15933 + }, + { + "epoch": 1.7498352734460796, + "grad_norm": 2.102522850036621, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7054845094680786, + "num_tokens": 397626426.0, + "step": 15934 + }, + { + "epoch": 1.7499450911486931, + "grad_norm": 2.1121630668640137, + "learning_rate": 1e-06, + "loss": 0.7838, + "mean_token_accuracy": 0.7540084719657898, + "num_tokens": 397652878.0, + "step": 15935 + }, + { + "epoch": 1.7500549088513069, + "grad_norm": 2.1024394035339355, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7151539325714111, + "num_tokens": 397681425.0, + "step": 15936 + }, + { + "epoch": 1.7501647265539204, + "grad_norm": 2.509544610977173, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7248729467391968, + "num_tokens": 397702638.0, + "step": 15937 + }, + { + "epoch": 1.7502745442565342, + "grad_norm": 2.024782657623291, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7024806141853333, + "num_tokens": 397732832.0, + "step": 15938 + }, + { + "epoch": 1.750384361959148, + "grad_norm": 2.0365943908691406, + "learning_rate": 1e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7473167181015015, + "num_tokens": 397760202.0, + "step": 15939 + }, + { + "epoch": 1.7504941796617615, + "grad_norm": 2.2690203189849854, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7286819219589233, + "num_tokens": 397785207.0, + "step": 15940 + }, + { + "epoch": 1.750603997364375, + "grad_norm": 2.1989033222198486, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7254378199577332, + "num_tokens": 397811247.0, + "step": 15941 + }, + { + "epoch": 1.7507138150669888, + "grad_norm": 2.5028483867645264, + "learning_rate": 1e-06, + "loss": 0.7112, + "mean_token_accuracy": 0.7599083185195923, + "num_tokens": 397830206.0, + "step": 15942 + }, + { + "epoch": 1.7508236327696025, + "grad_norm": 2.350247859954834, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7046595215797424, + "num_tokens": 397853200.0, + "step": 15943 + }, + { + "epoch": 1.7509334504722163, + "grad_norm": 2.111750841140747, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7149465084075928, + "num_tokens": 397880767.0, + "step": 15944 + }, + { + "epoch": 1.7510432681748298, + "grad_norm": 1.9921094179153442, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7079740166664124, + "num_tokens": 397911005.0, + "step": 15945 + }, + { + "epoch": 1.7511530858774433, + "grad_norm": 2.528040647506714, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7291445732116699, + "num_tokens": 397931969.0, + "step": 15946 + }, + { + "epoch": 1.751262903580057, + "grad_norm": 2.4676856994628906, + "learning_rate": 1e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.7500967979431152, + "num_tokens": 397952940.0, + "step": 15947 + }, + { + "epoch": 1.7513727212826709, + "grad_norm": 2.0682170391082764, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.6986730098724365, + "num_tokens": 397981563.0, + "step": 15948 + }, + { + "epoch": 1.7514825389852844, + "grad_norm": 2.254218816757202, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7136093974113464, + "num_tokens": 398007698.0, + "step": 15949 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 2.309548854827881, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7027645111083984, + "num_tokens": 398034751.0, + "step": 15950 + }, + { + "epoch": 1.7517021743905117, + "grad_norm": 2.547081470489502, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7282670736312866, + "num_tokens": 398056550.0, + "step": 15951 + }, + { + "epoch": 1.7518119920931254, + "grad_norm": 2.2287991046905518, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7375048398971558, + "num_tokens": 398080722.0, + "step": 15952 + }, + { + "epoch": 1.7519218097957392, + "grad_norm": 2.2360026836395264, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7016235589981079, + "num_tokens": 398107797.0, + "step": 15953 + }, + { + "epoch": 1.7520316274983527, + "grad_norm": 2.4384469985961914, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7252292633056641, + "num_tokens": 398129778.0, + "step": 15954 + }, + { + "epoch": 1.7521414452009663, + "grad_norm": 2.3903894424438477, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7206698656082153, + "num_tokens": 398151920.0, + "step": 15955 + }, + { + "epoch": 1.75225126290358, + "grad_norm": 2.3131017684936523, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7250853180885315, + "num_tokens": 398176322.0, + "step": 15956 + }, + { + "epoch": 1.7523610806061938, + "grad_norm": 2.1218748092651367, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7297675013542175, + "num_tokens": 398203186.0, + "step": 15957 + }, + { + "epoch": 1.7524708983088075, + "grad_norm": 2.1827027797698975, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7291578650474548, + "num_tokens": 398228151.0, + "step": 15958 + }, + { + "epoch": 1.752580716011421, + "grad_norm": 2.0472559928894043, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.6867913007736206, + "num_tokens": 398260514.0, + "step": 15959 + }, + { + "epoch": 1.7526905337140346, + "grad_norm": 2.212622880935669, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7257218360900879, + "num_tokens": 398286708.0, + "step": 15960 + }, + { + "epoch": 1.7528003514166484, + "grad_norm": 2.420159101486206, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7253929376602173, + "num_tokens": 398307750.0, + "step": 15961 + }, + { + "epoch": 1.7529101691192621, + "grad_norm": 2.446823835372925, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7036933898925781, + "num_tokens": 398329915.0, + "step": 15962 + }, + { + "epoch": 1.7530199868218757, + "grad_norm": 1.9197052717208862, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6849051117897034, + "num_tokens": 398362185.0, + "step": 15963 + }, + { + "epoch": 1.7531298045244892, + "grad_norm": 2.07393479347229, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7037572264671326, + "num_tokens": 398393309.0, + "step": 15964 + }, + { + "epoch": 1.753239622227103, + "grad_norm": 2.516512393951416, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7178398370742798, + "num_tokens": 398414313.0, + "step": 15965 + }, + { + "epoch": 1.7533494399297167, + "grad_norm": 2.2521250247955322, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7181305289268494, + "num_tokens": 398439043.0, + "step": 15966 + }, + { + "epoch": 1.7534592576323305, + "grad_norm": 2.058541774749756, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7213418483734131, + "num_tokens": 398469579.0, + "step": 15967 + }, + { + "epoch": 1.753569075334944, + "grad_norm": 2.2745306491851807, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7201076745986938, + "num_tokens": 398495767.0, + "step": 15968 + }, + { + "epoch": 1.7536788930375575, + "grad_norm": 2.1647074222564697, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7095435261726379, + "num_tokens": 398523070.0, + "step": 15969 + }, + { + "epoch": 1.7537887107401713, + "grad_norm": 2.2480826377868652, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7243314981460571, + "num_tokens": 398548057.0, + "step": 15970 + }, + { + "epoch": 1.753898528442785, + "grad_norm": 2.332989454269409, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7365229725837708, + "num_tokens": 398572588.0, + "step": 15971 + }, + { + "epoch": 1.7540083461453988, + "grad_norm": 2.4407763481140137, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7193197011947632, + "num_tokens": 398595152.0, + "step": 15972 + }, + { + "epoch": 1.7541181638480123, + "grad_norm": 2.359443426132202, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7214188575744629, + "num_tokens": 398620414.0, + "step": 15973 + }, + { + "epoch": 1.7542279815506259, + "grad_norm": 2.529776096343994, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7131819128990173, + "num_tokens": 398642606.0, + "step": 15974 + }, + { + "epoch": 1.7543377992532396, + "grad_norm": 2.0205202102661133, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.737845778465271, + "num_tokens": 398671148.0, + "step": 15975 + }, + { + "epoch": 1.7544476169558534, + "grad_norm": 2.387550115585327, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7058526873588562, + "num_tokens": 398695363.0, + "step": 15976 + }, + { + "epoch": 1.754557434658467, + "grad_norm": 2.314281940460205, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7294884920120239, + "num_tokens": 398718645.0, + "step": 15977 + }, + { + "epoch": 1.7546672523610805, + "grad_norm": 1.9992530345916748, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7023911476135254, + "num_tokens": 398750077.0, + "step": 15978 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 2.1449222564697266, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.713120698928833, + "num_tokens": 398776943.0, + "step": 15979 + }, + { + "epoch": 1.754886887766308, + "grad_norm": 2.3869733810424805, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7333585619926453, + "num_tokens": 398797939.0, + "step": 15980 + }, + { + "epoch": 1.7549967054689217, + "grad_norm": 2.602940320968628, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7422758936882019, + "num_tokens": 398818838.0, + "step": 15981 + }, + { + "epoch": 1.7551065231715353, + "grad_norm": 2.5932066440582275, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7406971454620361, + "num_tokens": 398839945.0, + "step": 15982 + }, + { + "epoch": 1.7552163408741488, + "grad_norm": 2.479827404022217, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7370919585227966, + "num_tokens": 398861731.0, + "step": 15983 + }, + { + "epoch": 1.7553261585767626, + "grad_norm": 2.0029184818267822, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7346484661102295, + "num_tokens": 398890795.0, + "step": 15984 + }, + { + "epoch": 1.7554359762793763, + "grad_norm": 2.0838253498077393, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6973546743392944, + "num_tokens": 398922899.0, + "step": 15985 + }, + { + "epoch": 1.7555457939819898, + "grad_norm": 2.749528169631958, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7339714765548706, + "num_tokens": 398942163.0, + "step": 15986 + }, + { + "epoch": 1.7556556116846036, + "grad_norm": 1.872985601425171, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7054737210273743, + "num_tokens": 398974560.0, + "step": 15987 + }, + { + "epoch": 1.7557654293872171, + "grad_norm": 1.9914480447769165, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.706252932548523, + "num_tokens": 399006893.0, + "step": 15988 + }, + { + "epoch": 1.755875247089831, + "grad_norm": 2.2980313301086426, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7303199172019958, + "num_tokens": 399031601.0, + "step": 15989 + }, + { + "epoch": 1.7559850647924446, + "grad_norm": 2.407623529434204, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.6939336061477661, + "num_tokens": 399055828.0, + "step": 15990 + }, + { + "epoch": 1.7560948824950582, + "grad_norm": 2.6240673065185547, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7277432680130005, + "num_tokens": 399076246.0, + "step": 15991 + }, + { + "epoch": 1.7562047001976717, + "grad_norm": 2.187070369720459, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7112231254577637, + "num_tokens": 399103075.0, + "step": 15992 + }, + { + "epoch": 1.7563145179002855, + "grad_norm": 2.386237859725952, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7345384359359741, + "num_tokens": 399125376.0, + "step": 15993 + }, + { + "epoch": 1.7564243356028992, + "grad_norm": 2.385472059249878, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7383353114128113, + "num_tokens": 399148733.0, + "step": 15994 + }, + { + "epoch": 1.756534153305513, + "grad_norm": 2.3277814388275146, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.7404499650001526, + "num_tokens": 399172696.0, + "step": 15995 + }, + { + "epoch": 1.7566439710081265, + "grad_norm": 2.33288311958313, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7153047919273376, + "num_tokens": 399195310.0, + "step": 15996 + }, + { + "epoch": 1.75675378871074, + "grad_norm": 2.471343517303467, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7288432121276855, + "num_tokens": 399216646.0, + "step": 15997 + }, + { + "epoch": 1.7568636064133538, + "grad_norm": 2.3526318073272705, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7453662157058716, + "num_tokens": 399239446.0, + "step": 15998 + }, + { + "epoch": 1.7569734241159676, + "grad_norm": 2.567627429962158, + "learning_rate": 1e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7480077743530273, + "num_tokens": 399257917.0, + "step": 15999 + }, + { + "epoch": 1.757083241818581, + "grad_norm": 2.4214882850646973, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7383633852005005, + "num_tokens": 399280580.0, + "step": 16000 + }, + { + "epoch": 1.7571930595211949, + "grad_norm": 2.0499556064605713, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6882389783859253, + "num_tokens": 399311514.0, + "step": 16001 + }, + { + "epoch": 1.7573028772238084, + "grad_norm": 2.49385404586792, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7223720550537109, + "num_tokens": 399333489.0, + "step": 16002 + }, + { + "epoch": 1.7574126949264222, + "grad_norm": 2.2569055557250977, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7233942747116089, + "num_tokens": 399359428.0, + "step": 16003 + }, + { + "epoch": 1.757522512629036, + "grad_norm": 2.112671136856079, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7090322971343994, + "num_tokens": 399387547.0, + "step": 16004 + }, + { + "epoch": 1.7576323303316495, + "grad_norm": 2.095207691192627, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7208762764930725, + "num_tokens": 399415815.0, + "step": 16005 + }, + { + "epoch": 1.757742148034263, + "grad_norm": 2.2074272632598877, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7235640287399292, + "num_tokens": 399442115.0, + "step": 16006 + }, + { + "epoch": 1.7578519657368767, + "grad_norm": 2.5311849117279053, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7241874933242798, + "num_tokens": 399465018.0, + "step": 16007 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 2.2906856536865234, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7141162157058716, + "num_tokens": 399488874.0, + "step": 16008 + }, + { + "epoch": 1.7580716011421043, + "grad_norm": 2.3587303161621094, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7029354572296143, + "num_tokens": 399513814.0, + "step": 16009 + }, + { + "epoch": 1.7581814188447178, + "grad_norm": 2.494840145111084, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7111093997955322, + "num_tokens": 399535357.0, + "step": 16010 + }, + { + "epoch": 1.7582912365473313, + "grad_norm": 2.5373706817626953, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7370319366455078, + "num_tokens": 399555943.0, + "step": 16011 + }, + { + "epoch": 1.758401054249945, + "grad_norm": 2.1014657020568848, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6961509585380554, + "num_tokens": 399586096.0, + "step": 16012 + }, + { + "epoch": 1.7585108719525588, + "grad_norm": 2.3969478607177734, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7289177179336548, + "num_tokens": 399608226.0, + "step": 16013 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 2.1031653881073, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7526859045028687, + "num_tokens": 399635242.0, + "step": 16014 + }, + { + "epoch": 1.758730507357786, + "grad_norm": 2.0523552894592285, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7087399363517761, + "num_tokens": 399664272.0, + "step": 16015 + }, + { + "epoch": 1.7588403250603997, + "grad_norm": 2.147278070449829, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7072662115097046, + "num_tokens": 399691544.0, + "step": 16016 + }, + { + "epoch": 1.7589501427630134, + "grad_norm": 2.039295196533203, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7149399518966675, + "num_tokens": 399722492.0, + "step": 16017 + }, + { + "epoch": 1.7590599604656272, + "grad_norm": 2.1426949501037598, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.699560284614563, + "num_tokens": 399750820.0, + "step": 16018 + }, + { + "epoch": 1.7591697781682407, + "grad_norm": 2.705960750579834, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7462793588638306, + "num_tokens": 399769313.0, + "step": 16019 + }, + { + "epoch": 1.7592795958708543, + "grad_norm": 2.5616514682769775, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.720754861831665, + "num_tokens": 399790028.0, + "step": 16020 + }, + { + "epoch": 1.759389413573468, + "grad_norm": 2.48311710357666, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7501425743103027, + "num_tokens": 399808750.0, + "step": 16021 + }, + { + "epoch": 1.7594992312760818, + "grad_norm": 2.2438411712646484, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7114031314849854, + "num_tokens": 399832522.0, + "step": 16022 + }, + { + "epoch": 1.7596090489786955, + "grad_norm": 2.059683084487915, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7134629487991333, + "num_tokens": 399862673.0, + "step": 16023 + }, + { + "epoch": 1.759718866681309, + "grad_norm": 2.12264347076416, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.7430106401443481, + "num_tokens": 399888306.0, + "step": 16024 + }, + { + "epoch": 1.7598286843839226, + "grad_norm": 2.250993013381958, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7032417058944702, + "num_tokens": 399916118.0, + "step": 16025 + }, + { + "epoch": 1.7599385020865363, + "grad_norm": 2.1941370964050293, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.710638165473938, + "num_tokens": 399942379.0, + "step": 16026 + }, + { + "epoch": 1.76004831978915, + "grad_norm": 2.665632963180542, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7194013595581055, + "num_tokens": 399960743.0, + "step": 16027 + }, + { + "epoch": 1.7601581374917636, + "grad_norm": 1.888936996459961, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.6970916986465454, + "num_tokens": 399995557.0, + "step": 16028 + }, + { + "epoch": 1.7602679551943772, + "grad_norm": 2.1101818084716797, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7165840268135071, + "num_tokens": 400026215.0, + "step": 16029 + }, + { + "epoch": 1.760377772896991, + "grad_norm": 2.2203376293182373, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7293417453765869, + "num_tokens": 400052070.0, + "step": 16030 + }, + { + "epoch": 1.7604875905996047, + "grad_norm": 2.1341943740844727, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7398696541786194, + "num_tokens": 400078502.0, + "step": 16031 + }, + { + "epoch": 1.7605974083022184, + "grad_norm": 2.2735061645507812, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7312328219413757, + "num_tokens": 400103261.0, + "step": 16032 + }, + { + "epoch": 1.760707226004832, + "grad_norm": 2.238461494445801, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7241082191467285, + "num_tokens": 400128928.0, + "step": 16033 + }, + { + "epoch": 1.7608170437074455, + "grad_norm": 2.1325314044952393, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7148395776748657, + "num_tokens": 400157672.0, + "step": 16034 + }, + { + "epoch": 1.7609268614100593, + "grad_norm": 2.106823444366455, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7383390665054321, + "num_tokens": 400185702.0, + "step": 16035 + }, + { + "epoch": 1.761036679112673, + "grad_norm": 2.248821258544922, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7218388319015503, + "num_tokens": 400211668.0, + "step": 16036 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 2.1297881603240967, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7017564177513123, + "num_tokens": 400241029.0, + "step": 16037 + }, + { + "epoch": 1.7612563145179003, + "grad_norm": 2.1903610229492188, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7259863018989563, + "num_tokens": 400267925.0, + "step": 16038 + }, + { + "epoch": 1.7613661322205139, + "grad_norm": 2.130364179611206, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7086974382400513, + "num_tokens": 400294982.0, + "step": 16039 + }, + { + "epoch": 1.7614759499231276, + "grad_norm": 2.41668963432312, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6976480484008789, + "num_tokens": 400316724.0, + "step": 16040 + }, + { + "epoch": 1.7615857676257414, + "grad_norm": 2.109694004058838, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7212945222854614, + "num_tokens": 400343625.0, + "step": 16041 + }, + { + "epoch": 1.761695585328355, + "grad_norm": 2.428447723388672, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7240003347396851, + "num_tokens": 400364669.0, + "step": 16042 + }, + { + "epoch": 1.7618054030309684, + "grad_norm": 2.2832517623901367, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7174028754234314, + "num_tokens": 400391321.0, + "step": 16043 + }, + { + "epoch": 1.7619152207335822, + "grad_norm": 2.1120641231536865, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.733558714389801, + "num_tokens": 400419677.0, + "step": 16044 + }, + { + "epoch": 1.762025038436196, + "grad_norm": 2.2165393829345703, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7049512267112732, + "num_tokens": 400446133.0, + "step": 16045 + }, + { + "epoch": 1.7621348561388097, + "grad_norm": 2.5510950088500977, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7461622953414917, + "num_tokens": 400466797.0, + "step": 16046 + }, + { + "epoch": 1.7622446738414232, + "grad_norm": 2.3833162784576416, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7250207662582397, + "num_tokens": 400489502.0, + "step": 16047 + }, + { + "epoch": 1.7623544915440368, + "grad_norm": 2.3643453121185303, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7020038962364197, + "num_tokens": 400510818.0, + "step": 16048 + }, + { + "epoch": 1.7624643092466505, + "grad_norm": 2.6287145614624023, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7283515334129333, + "num_tokens": 400530747.0, + "step": 16049 + }, + { + "epoch": 1.7625741269492643, + "grad_norm": 2.9212987422943115, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.7444493770599365, + "num_tokens": 400547392.0, + "step": 16050 + }, + { + "epoch": 1.7626839446518778, + "grad_norm": 2.3725454807281494, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7231603860855103, + "num_tokens": 400569552.0, + "step": 16051 + }, + { + "epoch": 1.7627937623544916, + "grad_norm": 2.260737895965576, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7240670919418335, + "num_tokens": 400593105.0, + "step": 16052 + }, + { + "epoch": 1.7629035800571051, + "grad_norm": 1.9081134796142578, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.746489942073822, + "num_tokens": 400622093.0, + "step": 16053 + }, + { + "epoch": 1.7630133977597189, + "grad_norm": 2.4995882511138916, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7021157741546631, + "num_tokens": 400644279.0, + "step": 16054 + }, + { + "epoch": 1.7631232154623326, + "grad_norm": 2.318002462387085, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7186844944953918, + "num_tokens": 400667443.0, + "step": 16055 + }, + { + "epoch": 1.7632330331649462, + "grad_norm": 2.180746555328369, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7058238983154297, + "num_tokens": 400694922.0, + "step": 16056 + }, + { + "epoch": 1.7633428508675597, + "grad_norm": 2.1862175464630127, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7307826280593872, + "num_tokens": 400720673.0, + "step": 16057 + }, + { + "epoch": 1.7634526685701735, + "grad_norm": 2.2462148666381836, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.742476224899292, + "num_tokens": 400746748.0, + "step": 16058 + }, + { + "epoch": 1.7635624862727872, + "grad_norm": 2.109015941619873, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7169965505599976, + "num_tokens": 400773615.0, + "step": 16059 + }, + { + "epoch": 1.763672303975401, + "grad_norm": 2.2360165119171143, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7291644215583801, + "num_tokens": 400799136.0, + "step": 16060 + }, + { + "epoch": 1.7637821216780145, + "grad_norm": 2.365001678466797, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7249048948287964, + "num_tokens": 400822797.0, + "step": 16061 + }, + { + "epoch": 1.763891939380628, + "grad_norm": 2.356583833694458, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7094889879226685, + "num_tokens": 400846896.0, + "step": 16062 + }, + { + "epoch": 1.7640017570832418, + "grad_norm": 2.3175272941589355, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7443785667419434, + "num_tokens": 400870481.0, + "step": 16063 + }, + { + "epoch": 1.7641115747858556, + "grad_norm": 2.4776828289031982, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7146801948547363, + "num_tokens": 400891448.0, + "step": 16064 + }, + { + "epoch": 1.764221392488469, + "grad_norm": 2.350123882293701, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7309380769729614, + "num_tokens": 400915721.0, + "step": 16065 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 2.324256420135498, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7219459414482117, + "num_tokens": 400939177.0, + "step": 16066 + }, + { + "epoch": 1.7644410278936964, + "grad_norm": 2.4436655044555664, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7326265573501587, + "num_tokens": 400961673.0, + "step": 16067 + }, + { + "epoch": 1.7645508455963101, + "grad_norm": 2.449315071105957, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7204147577285767, + "num_tokens": 400984741.0, + "step": 16068 + }, + { + "epoch": 1.764660663298924, + "grad_norm": 1.9631210565567017, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.6963049173355103, + "num_tokens": 401015801.0, + "step": 16069 + }, + { + "epoch": 1.7647704810015374, + "grad_norm": 2.5807173252105713, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7381914854049683, + "num_tokens": 401036025.0, + "step": 16070 + }, + { + "epoch": 1.764880298704151, + "grad_norm": 2.3442811965942383, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7212792634963989, + "num_tokens": 401059561.0, + "step": 16071 + }, + { + "epoch": 1.7649901164067647, + "grad_norm": 2.247004747390747, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7235922813415527, + "num_tokens": 401087686.0, + "step": 16072 + }, + { + "epoch": 1.7650999341093785, + "grad_norm": 2.777395486831665, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7196730971336365, + "num_tokens": 401104546.0, + "step": 16073 + }, + { + "epoch": 1.7652097518119922, + "grad_norm": 2.399210214614868, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7217053174972534, + "num_tokens": 401128555.0, + "step": 16074 + }, + { + "epoch": 1.7653195695146058, + "grad_norm": 2.1336662769317627, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7085416316986084, + "num_tokens": 401157324.0, + "step": 16075 + }, + { + "epoch": 1.7654293872172193, + "grad_norm": 2.589972972869873, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7364175319671631, + "num_tokens": 401177773.0, + "step": 16076 + }, + { + "epoch": 1.765539204919833, + "grad_norm": 2.1485044956207275, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7347235679626465, + "num_tokens": 401204734.0, + "step": 16077 + }, + { + "epoch": 1.7656490226224468, + "grad_norm": 2.1785507202148438, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7394211292266846, + "num_tokens": 401232604.0, + "step": 16078 + }, + { + "epoch": 1.7657588403250604, + "grad_norm": 2.3316075801849365, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7442225813865662, + "num_tokens": 401255206.0, + "step": 16079 + }, + { + "epoch": 1.765868658027674, + "grad_norm": 2.4499330520629883, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.727802574634552, + "num_tokens": 401278173.0, + "step": 16080 + }, + { + "epoch": 1.7659784757302877, + "grad_norm": 2.354856491088867, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7100353240966797, + "num_tokens": 401302472.0, + "step": 16081 + }, + { + "epoch": 1.7660882934329014, + "grad_norm": 2.890582799911499, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7215170860290527, + "num_tokens": 401318535.0, + "step": 16082 + }, + { + "epoch": 1.7661981111355152, + "grad_norm": 2.130422353744507, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7114688158035278, + "num_tokens": 401345087.0, + "step": 16083 + }, + { + "epoch": 1.7663079288381287, + "grad_norm": 2.36869740486145, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7357219457626343, + "num_tokens": 401366789.0, + "step": 16084 + }, + { + "epoch": 1.7664177465407422, + "grad_norm": 2.35504412651062, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7300417423248291, + "num_tokens": 401389033.0, + "step": 16085 + }, + { + "epoch": 1.766527564243356, + "grad_norm": 2.027869701385498, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.703269362449646, + "num_tokens": 401420305.0, + "step": 16086 + }, + { + "epoch": 1.7666373819459698, + "grad_norm": 2.334230422973633, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7073764801025391, + "num_tokens": 401444602.0, + "step": 16087 + }, + { + "epoch": 1.7667471996485835, + "grad_norm": 2.041593074798584, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6939190626144409, + "num_tokens": 401475360.0, + "step": 16088 + }, + { + "epoch": 1.766857017351197, + "grad_norm": 2.161017656326294, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7168487906455994, + "num_tokens": 401500605.0, + "step": 16089 + }, + { + "epoch": 1.7669668350538106, + "grad_norm": 2.1671230792999268, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7292741537094116, + "num_tokens": 401526653.0, + "step": 16090 + }, + { + "epoch": 1.7670766527564243, + "grad_norm": 2.433001756668091, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7091126441955566, + "num_tokens": 401549169.0, + "step": 16091 + }, + { + "epoch": 1.767186470459038, + "grad_norm": 2.171111583709717, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7186194658279419, + "num_tokens": 401575545.0, + "step": 16092 + }, + { + "epoch": 1.7672962881616516, + "grad_norm": 2.1338202953338623, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7106508016586304, + "num_tokens": 401603075.0, + "step": 16093 + }, + { + "epoch": 1.7674061058642652, + "grad_norm": 2.180983304977417, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7262284755706787, + "num_tokens": 401629064.0, + "step": 16094 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 2.3117008209228516, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7213439345359802, + "num_tokens": 401653410.0, + "step": 16095 + }, + { + "epoch": 1.7676257412694927, + "grad_norm": 2.272972822189331, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7132196426391602, + "num_tokens": 401677807.0, + "step": 16096 + }, + { + "epoch": 1.7677355589721064, + "grad_norm": 2.0701112747192383, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7167527079582214, + "num_tokens": 401706424.0, + "step": 16097 + }, + { + "epoch": 1.76784537667472, + "grad_norm": 2.1973800659179688, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7360213398933411, + "num_tokens": 401731154.0, + "step": 16098 + }, + { + "epoch": 1.7679551943773335, + "grad_norm": 2.2755517959594727, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7099214792251587, + "num_tokens": 401755825.0, + "step": 16099 + }, + { + "epoch": 1.7680650120799473, + "grad_norm": 2.254809617996216, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7335219383239746, + "num_tokens": 401781843.0, + "step": 16100 + }, + { + "epoch": 1.768174829782561, + "grad_norm": 2.476825714111328, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7206733226776123, + "num_tokens": 401802843.0, + "step": 16101 + }, + { + "epoch": 1.7682846474851748, + "grad_norm": 2.17427921295166, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7255457639694214, + "num_tokens": 401829275.0, + "step": 16102 + }, + { + "epoch": 1.7683944651877883, + "grad_norm": 2.0786919593811035, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7060055732727051, + "num_tokens": 401859020.0, + "step": 16103 + }, + { + "epoch": 1.7685042828904018, + "grad_norm": 2.3462917804718018, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7294932007789612, + "num_tokens": 401882988.0, + "step": 16104 + }, + { + "epoch": 1.7686141005930156, + "grad_norm": 2.3456692695617676, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7305477857589722, + "num_tokens": 401907405.0, + "step": 16105 + }, + { + "epoch": 1.7687239182956294, + "grad_norm": 2.2222232818603516, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.733343243598938, + "num_tokens": 401933528.0, + "step": 16106 + }, + { + "epoch": 1.768833735998243, + "grad_norm": 2.4229750633239746, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6958322525024414, + "num_tokens": 401957240.0, + "step": 16107 + }, + { + "epoch": 1.7689435537008564, + "grad_norm": 2.246034860610962, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7267993688583374, + "num_tokens": 401982719.0, + "step": 16108 + }, + { + "epoch": 1.7690533714034702, + "grad_norm": 2.672032117843628, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7321407198905945, + "num_tokens": 402001709.0, + "step": 16109 + }, + { + "epoch": 1.769163189106084, + "grad_norm": 2.1845574378967285, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.6994546055793762, + "num_tokens": 402028459.0, + "step": 16110 + }, + { + "epoch": 1.7692730068086977, + "grad_norm": 2.20910382270813, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7151480317115784, + "num_tokens": 402054776.0, + "step": 16111 + }, + { + "epoch": 1.7693828245113112, + "grad_norm": 2.0898752212524414, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7237839698791504, + "num_tokens": 402081506.0, + "step": 16112 + }, + { + "epoch": 1.7694926422139248, + "grad_norm": 2.465940475463867, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7076364159584045, + "num_tokens": 402103121.0, + "step": 16113 + }, + { + "epoch": 1.7696024599165385, + "grad_norm": 2.5161123275756836, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7120589017868042, + "num_tokens": 402123556.0, + "step": 16114 + }, + { + "epoch": 1.7697122776191523, + "grad_norm": 2.289374351501465, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7090878486633301, + "num_tokens": 402149744.0, + "step": 16115 + }, + { + "epoch": 1.7698220953217658, + "grad_norm": 2.5632002353668213, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.730506181716919, + "num_tokens": 402169673.0, + "step": 16116 + }, + { + "epoch": 1.7699319130243796, + "grad_norm": 2.408622980117798, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7299373149871826, + "num_tokens": 402191617.0, + "step": 16117 + }, + { + "epoch": 1.7700417307269931, + "grad_norm": 2.2253804206848145, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7261502742767334, + "num_tokens": 402216166.0, + "step": 16118 + }, + { + "epoch": 1.7701515484296069, + "grad_norm": 2.172135829925537, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7160193920135498, + "num_tokens": 402243248.0, + "step": 16119 + }, + { + "epoch": 1.7702613661322206, + "grad_norm": 2.288740396499634, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7225685119628906, + "num_tokens": 402268019.0, + "step": 16120 + }, + { + "epoch": 1.7703711838348342, + "grad_norm": 2.393150806427002, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7232129573822021, + "num_tokens": 402290656.0, + "step": 16121 + }, + { + "epoch": 1.7704810015374477, + "grad_norm": 2.226104497909546, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7436175346374512, + "num_tokens": 402316204.0, + "step": 16122 + }, + { + "epoch": 1.7705908192400615, + "grad_norm": 2.6881139278411865, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7464470863342285, + "num_tokens": 402334959.0, + "step": 16123 + }, + { + "epoch": 1.7707006369426752, + "grad_norm": 2.463355541229248, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.734565258026123, + "num_tokens": 402355864.0, + "step": 16124 + }, + { + "epoch": 1.770810454645289, + "grad_norm": 2.1600100994110107, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7203704118728638, + "num_tokens": 402384537.0, + "step": 16125 + }, + { + "epoch": 1.7709202723479025, + "grad_norm": 1.8960669040679932, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7172024250030518, + "num_tokens": 402419020.0, + "step": 16126 + }, + { + "epoch": 1.771030090050516, + "grad_norm": 2.0607004165649414, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.714939534664154, + "num_tokens": 402448161.0, + "step": 16127 + }, + { + "epoch": 1.7711399077531298, + "grad_norm": 2.042705535888672, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.6983422636985779, + "num_tokens": 402477725.0, + "step": 16128 + }, + { + "epoch": 1.7712497254557436, + "grad_norm": 2.287292718887329, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7385586500167847, + "num_tokens": 402501388.0, + "step": 16129 + }, + { + "epoch": 1.771359543158357, + "grad_norm": 2.0974669456481934, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7183130979537964, + "num_tokens": 402530307.0, + "step": 16130 + }, + { + "epoch": 1.7714693608609708, + "grad_norm": 2.285973310470581, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7304507493972778, + "num_tokens": 402554673.0, + "step": 16131 + }, + { + "epoch": 1.7715791785635844, + "grad_norm": 2.2399542331695557, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7114688158035278, + "num_tokens": 402580946.0, + "step": 16132 + }, + { + "epoch": 1.7716889962661981, + "grad_norm": 2.4288811683654785, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.744416356086731, + "num_tokens": 402601234.0, + "step": 16133 + }, + { + "epoch": 1.771798813968812, + "grad_norm": 2.1705474853515625, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7254738807678223, + "num_tokens": 402628373.0, + "step": 16134 + }, + { + "epoch": 1.7719086316714254, + "grad_norm": 2.309051036834717, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7414814233779907, + "num_tokens": 402654181.0, + "step": 16135 + }, + { + "epoch": 1.772018449374039, + "grad_norm": 2.3727822303771973, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7187913060188293, + "num_tokens": 402676570.0, + "step": 16136 + }, + { + "epoch": 1.7721282670766527, + "grad_norm": 2.5878357887268066, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7299034595489502, + "num_tokens": 402697131.0, + "step": 16137 + }, + { + "epoch": 1.7722380847792665, + "grad_norm": 2.2103590965270996, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7141866087913513, + "num_tokens": 402723808.0, + "step": 16138 + }, + { + "epoch": 1.7723479024818802, + "grad_norm": 2.3684537410736084, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.7415879368782043, + "num_tokens": 402746568.0, + "step": 16139 + }, + { + "epoch": 1.7724577201844938, + "grad_norm": 2.351639986038208, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7357563376426697, + "num_tokens": 402769449.0, + "step": 16140 + }, + { + "epoch": 1.7725675378871073, + "grad_norm": 2.48740816116333, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7500169277191162, + "num_tokens": 402790238.0, + "step": 16141 + }, + { + "epoch": 1.772677355589721, + "grad_norm": 2.6680853366851807, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7429687976837158, + "num_tokens": 402808330.0, + "step": 16142 + }, + { + "epoch": 1.7727871732923348, + "grad_norm": 2.070207357406616, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.710739254951477, + "num_tokens": 402836255.0, + "step": 16143 + }, + { + "epoch": 1.7728969909949484, + "grad_norm": 2.394908905029297, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7166169881820679, + "num_tokens": 402858830.0, + "step": 16144 + }, + { + "epoch": 1.7730068086975619, + "grad_norm": 2.110356569290161, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7297719717025757, + "num_tokens": 402886536.0, + "step": 16145 + }, + { + "epoch": 1.7731166264001756, + "grad_norm": 2.4926960468292236, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7342473864555359, + "num_tokens": 402907632.0, + "step": 16146 + }, + { + "epoch": 1.7732264441027894, + "grad_norm": 2.280106782913208, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7196555137634277, + "num_tokens": 402933038.0, + "step": 16147 + }, + { + "epoch": 1.7733362618054032, + "grad_norm": 2.0702009201049805, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6970229744911194, + "num_tokens": 402963080.0, + "step": 16148 + }, + { + "epoch": 1.7734460795080167, + "grad_norm": 2.011582374572754, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7199563384056091, + "num_tokens": 402993844.0, + "step": 16149 + }, + { + "epoch": 1.7735558972106302, + "grad_norm": 2.6556410789489746, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.737177312374115, + "num_tokens": 403013115.0, + "step": 16150 + }, + { + "epoch": 1.773665714913244, + "grad_norm": 2.054175615310669, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7337313890457153, + "num_tokens": 403042030.0, + "step": 16151 + }, + { + "epoch": 1.7737755326158577, + "grad_norm": 2.0347447395324707, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7349299788475037, + "num_tokens": 403070315.0, + "step": 16152 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 2.4429523944854736, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7119534611701965, + "num_tokens": 403094336.0, + "step": 16153 + }, + { + "epoch": 1.773995168021085, + "grad_norm": 2.3006978034973145, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7321829795837402, + "num_tokens": 403116572.0, + "step": 16154 + }, + { + "epoch": 1.7741049857236986, + "grad_norm": 2.028928756713867, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6883602738380432, + "num_tokens": 403149626.0, + "step": 16155 + }, + { + "epoch": 1.7742148034263123, + "grad_norm": 2.5040743350982666, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7166652679443359, + "num_tokens": 403171886.0, + "step": 16156 + }, + { + "epoch": 1.774324621128926, + "grad_norm": 2.2396841049194336, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7101163268089294, + "num_tokens": 403196045.0, + "step": 16157 + }, + { + "epoch": 1.7744344388315396, + "grad_norm": 1.996945858001709, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7225635647773743, + "num_tokens": 403227252.0, + "step": 16158 + }, + { + "epoch": 1.7745442565341532, + "grad_norm": 2.1437997817993164, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.72993004322052, + "num_tokens": 403254809.0, + "step": 16159 + }, + { + "epoch": 1.774654074236767, + "grad_norm": 2.4044384956359863, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7387222647666931, + "num_tokens": 403276715.0, + "step": 16160 + }, + { + "epoch": 1.7747638919393807, + "grad_norm": 1.9455262422561646, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7178161144256592, + "num_tokens": 403310193.0, + "step": 16161 + }, + { + "epoch": 1.7748737096419944, + "grad_norm": 2.483095407485962, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7543261051177979, + "num_tokens": 403330973.0, + "step": 16162 + }, + { + "epoch": 1.774983527344608, + "grad_norm": 2.1075615882873535, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.719393253326416, + "num_tokens": 403359700.0, + "step": 16163 + }, + { + "epoch": 1.7750933450472215, + "grad_norm": 2.2825472354888916, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7322258949279785, + "num_tokens": 403384649.0, + "step": 16164 + }, + { + "epoch": 1.7752031627498353, + "grad_norm": 1.7722591161727905, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7134196758270264, + "num_tokens": 403420501.0, + "step": 16165 + }, + { + "epoch": 1.775312980452449, + "grad_norm": 2.3647871017456055, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7083638906478882, + "num_tokens": 403445438.0, + "step": 16166 + }, + { + "epoch": 1.7754227981550625, + "grad_norm": 2.557368755340576, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7340222001075745, + "num_tokens": 403465426.0, + "step": 16167 + }, + { + "epoch": 1.7755326158576763, + "grad_norm": 2.3806426525115967, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7091312408447266, + "num_tokens": 403490310.0, + "step": 16168 + }, + { + "epoch": 1.7756424335602898, + "grad_norm": 2.2785863876342773, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7466346025466919, + "num_tokens": 403514004.0, + "step": 16169 + }, + { + "epoch": 1.7757522512629036, + "grad_norm": 2.296299457550049, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7217337489128113, + "num_tokens": 403538427.0, + "step": 16170 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 2.122422456741333, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7003690600395203, + "num_tokens": 403568139.0, + "step": 16171 + }, + { + "epoch": 1.7759718866681309, + "grad_norm": 2.0550546646118164, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7247950434684753, + "num_tokens": 403596966.0, + "step": 16172 + }, + { + "epoch": 1.7760817043707444, + "grad_norm": 2.354243516921997, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7293967008590698, + "num_tokens": 403619125.0, + "step": 16173 + }, + { + "epoch": 1.7761915220733582, + "grad_norm": 2.1949448585510254, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7049166560173035, + "num_tokens": 403645088.0, + "step": 16174 + }, + { + "epoch": 1.776301339775972, + "grad_norm": 2.51749849319458, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7339543104171753, + "num_tokens": 403665661.0, + "step": 16175 + }, + { + "epoch": 1.7764111574785857, + "grad_norm": 2.311648368835449, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.728968620300293, + "num_tokens": 403689453.0, + "step": 16176 + }, + { + "epoch": 1.7765209751811992, + "grad_norm": 2.3096399307250977, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7007541656494141, + "num_tokens": 403714652.0, + "step": 16177 + }, + { + "epoch": 1.7766307928838128, + "grad_norm": 2.4130640029907227, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7259670495986938, + "num_tokens": 403736340.0, + "step": 16178 + }, + { + "epoch": 1.7767406105864265, + "grad_norm": 2.1990368366241455, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6962984204292297, + "num_tokens": 403763742.0, + "step": 16179 + }, + { + "epoch": 1.7768504282890403, + "grad_norm": 2.193408489227295, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7340590357780457, + "num_tokens": 403789188.0, + "step": 16180 + }, + { + "epoch": 1.7769602459916538, + "grad_norm": 2.328062057495117, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7163950204849243, + "num_tokens": 403813692.0, + "step": 16181 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 2.1903462409973145, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7247546911239624, + "num_tokens": 403840076.0, + "step": 16182 + }, + { + "epoch": 1.777179881396881, + "grad_norm": 2.0712413787841797, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7107855081558228, + "num_tokens": 403868057.0, + "step": 16183 + }, + { + "epoch": 1.7772896990994949, + "grad_norm": 2.3516461849212646, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7353314161300659, + "num_tokens": 403890282.0, + "step": 16184 + }, + { + "epoch": 1.7773995168021086, + "grad_norm": 1.9577301740646362, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7082548141479492, + "num_tokens": 403923269.0, + "step": 16185 + }, + { + "epoch": 1.7775093345047221, + "grad_norm": 2.0703303813934326, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.713355541229248, + "num_tokens": 403951275.0, + "step": 16186 + }, + { + "epoch": 1.7776191522073357, + "grad_norm": 2.068227767944336, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7320570945739746, + "num_tokens": 403978507.0, + "step": 16187 + }, + { + "epoch": 1.7777289699099494, + "grad_norm": 2.8051068782806396, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7503459453582764, + "num_tokens": 403995412.0, + "step": 16188 + }, + { + "epoch": 1.7778387876125632, + "grad_norm": 2.463444709777832, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7233885526657104, + "num_tokens": 404015833.0, + "step": 16189 + }, + { + "epoch": 1.777948605315177, + "grad_norm": 2.4108803272247314, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7211694717407227, + "num_tokens": 404038016.0, + "step": 16190 + }, + { + "epoch": 1.7780584230177905, + "grad_norm": 2.2101969718933105, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7331995368003845, + "num_tokens": 404066614.0, + "step": 16191 + }, + { + "epoch": 1.778168240720404, + "grad_norm": 2.455512762069702, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7255988121032715, + "num_tokens": 404089081.0, + "step": 16192 + }, + { + "epoch": 1.7782780584230178, + "grad_norm": 2.268303155899048, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.6981937885284424, + "num_tokens": 404115912.0, + "step": 16193 + }, + { + "epoch": 1.7783878761256315, + "grad_norm": 2.2476236820220947, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7097842693328857, + "num_tokens": 404141214.0, + "step": 16194 + }, + { + "epoch": 1.778497693828245, + "grad_norm": 2.2189760208129883, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7057862281799316, + "num_tokens": 404167660.0, + "step": 16195 + }, + { + "epoch": 1.7786075115308586, + "grad_norm": 2.1057980060577393, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7411749362945557, + "num_tokens": 404193308.0, + "step": 16196 + }, + { + "epoch": 1.7787173292334724, + "grad_norm": 2.362258195877075, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.703120231628418, + "num_tokens": 404218699.0, + "step": 16197 + }, + { + "epoch": 1.7788271469360861, + "grad_norm": 2.3568830490112305, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7275736331939697, + "num_tokens": 404242543.0, + "step": 16198 + }, + { + "epoch": 1.7789369646386999, + "grad_norm": 2.1833391189575195, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.6993165612220764, + "num_tokens": 404271078.0, + "step": 16199 + }, + { + "epoch": 1.7790467823413134, + "grad_norm": 2.255699872970581, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7097485661506653, + "num_tokens": 404295300.0, + "step": 16200 + }, + { + "epoch": 1.779156600043927, + "grad_norm": 2.6206517219543457, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7405000925064087, + "num_tokens": 404313863.0, + "step": 16201 + }, + { + "epoch": 1.7792664177465407, + "grad_norm": 2.4172182083129883, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7340712547302246, + "num_tokens": 404335101.0, + "step": 16202 + }, + { + "epoch": 1.7793762354491545, + "grad_norm": 2.244774103164673, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7146692276000977, + "num_tokens": 404360860.0, + "step": 16203 + }, + { + "epoch": 1.7794860531517682, + "grad_norm": 2.312760353088379, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7232328653335571, + "num_tokens": 404385163.0, + "step": 16204 + }, + { + "epoch": 1.7795958708543818, + "grad_norm": 1.9984030723571777, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7258108258247375, + "num_tokens": 404416228.0, + "step": 16205 + }, + { + "epoch": 1.7797056885569953, + "grad_norm": 2.298173189163208, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7368898391723633, + "num_tokens": 404440425.0, + "step": 16206 + }, + { + "epoch": 1.779815506259609, + "grad_norm": 2.0987937450408936, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7065024375915527, + "num_tokens": 404470176.0, + "step": 16207 + }, + { + "epoch": 1.7799253239622228, + "grad_norm": 2.5218517780303955, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7210348844528198, + "num_tokens": 404491648.0, + "step": 16208 + }, + { + "epoch": 1.7800351416648363, + "grad_norm": 2.385206460952759, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.7552732825279236, + "num_tokens": 404514315.0, + "step": 16209 + }, + { + "epoch": 1.7801449593674499, + "grad_norm": 2.343400001525879, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7158627510070801, + "num_tokens": 404538579.0, + "step": 16210 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 2.220303773880005, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7242364883422852, + "num_tokens": 404565235.0, + "step": 16211 + }, + { + "epoch": 1.7803645947726774, + "grad_norm": 2.399773120880127, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7569701075553894, + "num_tokens": 404585637.0, + "step": 16212 + }, + { + "epoch": 1.7804744124752911, + "grad_norm": 2.2344937324523926, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7213155627250671, + "num_tokens": 404609902.0, + "step": 16213 + }, + { + "epoch": 1.7805842301779047, + "grad_norm": 2.0603268146514893, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.70059734582901, + "num_tokens": 404640829.0, + "step": 16214 + }, + { + "epoch": 1.7806940478805182, + "grad_norm": 2.367823839187622, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7161790132522583, + "num_tokens": 404665289.0, + "step": 16215 + }, + { + "epoch": 1.780803865583132, + "grad_norm": 2.8219082355499268, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7370902299880981, + "num_tokens": 404683290.0, + "step": 16216 + }, + { + "epoch": 1.7809136832857457, + "grad_norm": 2.4961740970611572, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7052099704742432, + "num_tokens": 404704945.0, + "step": 16217 + }, + { + "epoch": 1.7810235009883595, + "grad_norm": 2.2627382278442383, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7324128150939941, + "num_tokens": 404729363.0, + "step": 16218 + }, + { + "epoch": 1.781133318690973, + "grad_norm": 2.1092638969421387, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7154207229614258, + "num_tokens": 404758876.0, + "step": 16219 + }, + { + "epoch": 1.7812431363935866, + "grad_norm": 2.114392042160034, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7032785415649414, + "num_tokens": 404788570.0, + "step": 16220 + }, + { + "epoch": 1.7813529540962003, + "grad_norm": 2.4689924716949463, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7149922847747803, + "num_tokens": 404810114.0, + "step": 16221 + }, + { + "epoch": 1.781462771798814, + "grad_norm": 2.1276047229766846, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6882778406143188, + "num_tokens": 404837855.0, + "step": 16222 + }, + { + "epoch": 1.7815725895014276, + "grad_norm": 2.3422977924346924, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7156200408935547, + "num_tokens": 404860460.0, + "step": 16223 + }, + { + "epoch": 1.7816824072040411, + "grad_norm": 2.1856298446655273, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.732018232345581, + "num_tokens": 404885459.0, + "step": 16224 + }, + { + "epoch": 1.781792224906655, + "grad_norm": 2.2379744052886963, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7045190930366516, + "num_tokens": 404911483.0, + "step": 16225 + }, + { + "epoch": 1.7819020426092687, + "grad_norm": 2.780008316040039, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7307077050209045, + "num_tokens": 404929083.0, + "step": 16226 + }, + { + "epoch": 1.7820118603118824, + "grad_norm": 2.149466037750244, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7279061079025269, + "num_tokens": 404955973.0, + "step": 16227 + }, + { + "epoch": 1.782121678014496, + "grad_norm": 2.2208592891693115, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7115286588668823, + "num_tokens": 404982666.0, + "step": 16228 + }, + { + "epoch": 1.7822314957171095, + "grad_norm": 2.149765968322754, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.732866108417511, + "num_tokens": 405008643.0, + "step": 16229 + }, + { + "epoch": 1.7823413134197232, + "grad_norm": 2.3894128799438477, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7326490879058838, + "num_tokens": 405031883.0, + "step": 16230 + }, + { + "epoch": 1.782451131122337, + "grad_norm": 2.362288236618042, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7178506851196289, + "num_tokens": 405053735.0, + "step": 16231 + }, + { + "epoch": 1.7825609488249505, + "grad_norm": 2.778083324432373, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.72634357213974, + "num_tokens": 405072155.0, + "step": 16232 + }, + { + "epoch": 1.7826707665275643, + "grad_norm": 2.2684693336486816, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7183336615562439, + "num_tokens": 405097918.0, + "step": 16233 + }, + { + "epoch": 1.7827805842301778, + "grad_norm": 2.434108257293701, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7304903268814087, + "num_tokens": 405120707.0, + "step": 16234 + }, + { + "epoch": 1.7828904019327916, + "grad_norm": 2.180628776550293, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7218359708786011, + "num_tokens": 405147038.0, + "step": 16235 + }, + { + "epoch": 1.7830002196354053, + "grad_norm": 2.15899658203125, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.701167106628418, + "num_tokens": 405175146.0, + "step": 16236 + }, + { + "epoch": 1.7831100373380189, + "grad_norm": 2.1178829669952393, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7130736112594604, + "num_tokens": 405202512.0, + "step": 16237 + }, + { + "epoch": 1.7832198550406324, + "grad_norm": 2.164716958999634, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7289997339248657, + "num_tokens": 405227777.0, + "step": 16238 + }, + { + "epoch": 1.7833296727432462, + "grad_norm": 2.330690860748291, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7016291618347168, + "num_tokens": 405251343.0, + "step": 16239 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 2.3212382793426514, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6983746886253357, + "num_tokens": 405277213.0, + "step": 16240 + }, + { + "epoch": 1.7835493081484737, + "grad_norm": 2.5405139923095703, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7158865928649902, + "num_tokens": 405299999.0, + "step": 16241 + }, + { + "epoch": 1.7836591258510872, + "grad_norm": 2.2584128379821777, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7341134548187256, + "num_tokens": 405325543.0, + "step": 16242 + }, + { + "epoch": 1.7837689435537007, + "grad_norm": 2.3132903575897217, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7358841300010681, + "num_tokens": 405350541.0, + "step": 16243 + }, + { + "epoch": 1.7838787612563145, + "grad_norm": 2.479499340057373, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7216525673866272, + "num_tokens": 405372796.0, + "step": 16244 + }, + { + "epoch": 1.7839885789589283, + "grad_norm": 2.3166046142578125, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7477553486824036, + "num_tokens": 405395698.0, + "step": 16245 + }, + { + "epoch": 1.7840983966615418, + "grad_norm": 2.0246963500976562, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7043066024780273, + "num_tokens": 405425108.0, + "step": 16246 + }, + { + "epoch": 1.7842082143641556, + "grad_norm": 2.0603456497192383, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.716160237789154, + "num_tokens": 405453831.0, + "step": 16247 + }, + { + "epoch": 1.784318032066769, + "grad_norm": 2.038264513015747, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7094912528991699, + "num_tokens": 405485099.0, + "step": 16248 + }, + { + "epoch": 1.7844278497693828, + "grad_norm": 2.1804873943328857, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7305241823196411, + "num_tokens": 405514009.0, + "step": 16249 + }, + { + "epoch": 1.7845376674719966, + "grad_norm": 2.5660297870635986, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7308815121650696, + "num_tokens": 405534510.0, + "step": 16250 + }, + { + "epoch": 1.7846474851746101, + "grad_norm": 2.1693460941314697, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7344712615013123, + "num_tokens": 405560942.0, + "step": 16251 + }, + { + "epoch": 1.7847573028772237, + "grad_norm": 2.230741262435913, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7228940725326538, + "num_tokens": 405586366.0, + "step": 16252 + }, + { + "epoch": 1.7848671205798374, + "grad_norm": 2.53017258644104, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7386273741722107, + "num_tokens": 405607457.0, + "step": 16253 + }, + { + "epoch": 1.7849769382824512, + "grad_norm": 2.531942844390869, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7390719652175903, + "num_tokens": 405627209.0, + "step": 16254 + }, + { + "epoch": 1.785086755985065, + "grad_norm": 2.669943332672119, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7756878137588501, + "num_tokens": 405644614.0, + "step": 16255 + }, + { + "epoch": 1.7851965736876785, + "grad_norm": 2.3208694458007812, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7122281193733215, + "num_tokens": 405670094.0, + "step": 16256 + }, + { + "epoch": 1.785306391390292, + "grad_norm": 2.189948558807373, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7309349775314331, + "num_tokens": 405695243.0, + "step": 16257 + }, + { + "epoch": 1.7854162090929058, + "grad_norm": 2.0771846771240234, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7322024703025818, + "num_tokens": 405723918.0, + "step": 16258 + }, + { + "epoch": 1.7855260267955195, + "grad_norm": 2.2084643840789795, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7222329378128052, + "num_tokens": 405752033.0, + "step": 16259 + }, + { + "epoch": 1.785635844498133, + "grad_norm": 2.253638982772827, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7243086099624634, + "num_tokens": 405776805.0, + "step": 16260 + }, + { + "epoch": 1.7857456622007466, + "grad_norm": 2.3177568912506104, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7251142859458923, + "num_tokens": 405801660.0, + "step": 16261 + }, + { + "epoch": 1.7858554799033604, + "grad_norm": 2.2882497310638428, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7141348123550415, + "num_tokens": 405827846.0, + "step": 16262 + }, + { + "epoch": 1.7859652976059741, + "grad_norm": 2.447819709777832, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7352312803268433, + "num_tokens": 405848572.0, + "step": 16263 + }, + { + "epoch": 1.7860751153085879, + "grad_norm": 2.611375570297241, + "learning_rate": 1e-06, + "loss": 0.7544, + "mean_token_accuracy": 0.7590883374214172, + "num_tokens": 405868075.0, + "step": 16264 + }, + { + "epoch": 1.7861849330112014, + "grad_norm": 2.075226306915283, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7379639148712158, + "num_tokens": 405895828.0, + "step": 16265 + }, + { + "epoch": 1.786294750713815, + "grad_norm": 2.521946907043457, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7359164953231812, + "num_tokens": 405918042.0, + "step": 16266 + }, + { + "epoch": 1.7864045684164287, + "grad_norm": 2.5875484943389893, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7281999588012695, + "num_tokens": 405938744.0, + "step": 16267 + }, + { + "epoch": 1.7865143861190425, + "grad_norm": 2.2157933712005615, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7135586142539978, + "num_tokens": 405964294.0, + "step": 16268 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 2.1773877143859863, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7184991240501404, + "num_tokens": 405992285.0, + "step": 16269 + }, + { + "epoch": 1.7867340215242697, + "grad_norm": 2.6546268463134766, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7444902062416077, + "num_tokens": 406010255.0, + "step": 16270 + }, + { + "epoch": 1.7868438392268833, + "grad_norm": 2.706800937652588, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7279731035232544, + "num_tokens": 406029474.0, + "step": 16271 + }, + { + "epoch": 1.786953656929497, + "grad_norm": 2.2065818309783936, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6971306800842285, + "num_tokens": 406054362.0, + "step": 16272 + }, + { + "epoch": 1.7870634746321108, + "grad_norm": 2.150773763656616, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.718769371509552, + "num_tokens": 406080842.0, + "step": 16273 + }, + { + "epoch": 1.7871732923347243, + "grad_norm": 2.446584701538086, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7263213396072388, + "num_tokens": 406101468.0, + "step": 16274 + }, + { + "epoch": 1.7872831100373379, + "grad_norm": 2.2884368896484375, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7045691013336182, + "num_tokens": 406126631.0, + "step": 16275 + }, + { + "epoch": 1.7873929277399516, + "grad_norm": 2.12441349029541, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7198226451873779, + "num_tokens": 406154152.0, + "step": 16276 + }, + { + "epoch": 1.7875027454425654, + "grad_norm": 2.3224332332611084, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7102200984954834, + "num_tokens": 406177419.0, + "step": 16277 + }, + { + "epoch": 1.7876125631451791, + "grad_norm": 2.3241491317749023, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7349498867988586, + "num_tokens": 406197838.0, + "step": 16278 + }, + { + "epoch": 1.7877223808477927, + "grad_norm": 1.9902292490005493, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7273978590965271, + "num_tokens": 406226936.0, + "step": 16279 + }, + { + "epoch": 1.7878321985504062, + "grad_norm": 2.0925164222717285, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6947985887527466, + "num_tokens": 406258274.0, + "step": 16280 + }, + { + "epoch": 1.78794201625302, + "grad_norm": 2.433130979537964, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6996726393699646, + "num_tokens": 406282319.0, + "step": 16281 + }, + { + "epoch": 1.7880518339556337, + "grad_norm": 2.3648898601531982, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7533188462257385, + "num_tokens": 406303353.0, + "step": 16282 + }, + { + "epoch": 1.7881616516582475, + "grad_norm": 2.1269209384918213, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7161111831665039, + "num_tokens": 406331208.0, + "step": 16283 + }, + { + "epoch": 1.788271469360861, + "grad_norm": 2.5852720737457275, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7447479963302612, + "num_tokens": 406349741.0, + "step": 16284 + }, + { + "epoch": 1.7883812870634745, + "grad_norm": 2.3078131675720215, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7012531757354736, + "num_tokens": 406376855.0, + "step": 16285 + }, + { + "epoch": 1.7884911047660883, + "grad_norm": 2.294586658477783, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7218124866485596, + "num_tokens": 406402713.0, + "step": 16286 + }, + { + "epoch": 1.788600922468702, + "grad_norm": 1.9215940237045288, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7239755988121033, + "num_tokens": 406434167.0, + "step": 16287 + }, + { + "epoch": 1.7887107401713156, + "grad_norm": 2.5861058235168457, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7356066107749939, + "num_tokens": 406454276.0, + "step": 16288 + }, + { + "epoch": 1.7888205578739291, + "grad_norm": 2.117722511291504, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7129753232002258, + "num_tokens": 406481068.0, + "step": 16289 + }, + { + "epoch": 1.7889303755765429, + "grad_norm": 2.4398486614227295, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7137612104415894, + "num_tokens": 406503632.0, + "step": 16290 + }, + { + "epoch": 1.7890401932791566, + "grad_norm": 2.0256805419921875, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6941249370574951, + "num_tokens": 406536785.0, + "step": 16291 + }, + { + "epoch": 1.7891500109817704, + "grad_norm": 2.3911168575286865, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7157529592514038, + "num_tokens": 406559075.0, + "step": 16292 + }, + { + "epoch": 1.789259828684384, + "grad_norm": 2.377622365951538, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7078985571861267, + "num_tokens": 406583175.0, + "step": 16293 + }, + { + "epoch": 1.7893696463869975, + "grad_norm": 2.4535000324249268, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7274354100227356, + "num_tokens": 406603794.0, + "step": 16294 + }, + { + "epoch": 1.7894794640896112, + "grad_norm": 2.6145060062408447, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.707837700843811, + "num_tokens": 406625044.0, + "step": 16295 + }, + { + "epoch": 1.789589281792225, + "grad_norm": 2.077667474746704, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7033401727676392, + "num_tokens": 406653787.0, + "step": 16296 + }, + { + "epoch": 1.7896990994948385, + "grad_norm": 2.304331064224243, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.731369137763977, + "num_tokens": 406678263.0, + "step": 16297 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 2.237379789352417, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6940822601318359, + "num_tokens": 406707108.0, + "step": 16298 + }, + { + "epoch": 1.7899187349000658, + "grad_norm": 2.1639695167541504, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7183170318603516, + "num_tokens": 406734489.0, + "step": 16299 + }, + { + "epoch": 1.7900285526026796, + "grad_norm": 2.313762664794922, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7101219892501831, + "num_tokens": 406759096.0, + "step": 16300 + }, + { + "epoch": 1.7901383703052933, + "grad_norm": 2.114727735519409, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7260959148406982, + "num_tokens": 406785709.0, + "step": 16301 + }, + { + "epoch": 1.7902481880079069, + "grad_norm": 2.089015483856201, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7276571989059448, + "num_tokens": 406814423.0, + "step": 16302 + }, + { + "epoch": 1.7903580057105204, + "grad_norm": 2.694952964782715, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7360577583312988, + "num_tokens": 406832096.0, + "step": 16303 + }, + { + "epoch": 1.7904678234131342, + "grad_norm": 2.113227367401123, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7056690454483032, + "num_tokens": 406862245.0, + "step": 16304 + }, + { + "epoch": 1.790577641115748, + "grad_norm": 2.259138584136963, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7286441326141357, + "num_tokens": 406886724.0, + "step": 16305 + }, + { + "epoch": 1.7906874588183617, + "grad_norm": 2.3922293186187744, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7401931881904602, + "num_tokens": 406909626.0, + "step": 16306 + }, + { + "epoch": 1.7907972765209752, + "grad_norm": 2.0476605892181396, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7137583494186401, + "num_tokens": 406938962.0, + "step": 16307 + }, + { + "epoch": 1.7909070942235887, + "grad_norm": 2.179074287414551, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7097810506820679, + "num_tokens": 406969181.0, + "step": 16308 + }, + { + "epoch": 1.7910169119262025, + "grad_norm": 2.365455150604248, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7239333391189575, + "num_tokens": 406992808.0, + "step": 16309 + }, + { + "epoch": 1.7911267296288162, + "grad_norm": 2.3090569972991943, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7090740203857422, + "num_tokens": 407017431.0, + "step": 16310 + }, + { + "epoch": 1.7912365473314298, + "grad_norm": 2.6190381050109863, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7344744205474854, + "num_tokens": 407036831.0, + "step": 16311 + }, + { + "epoch": 1.7913463650340435, + "grad_norm": 2.2417242527008057, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7328723669052124, + "num_tokens": 407062082.0, + "step": 16312 + }, + { + "epoch": 1.791456182736657, + "grad_norm": 2.29166316986084, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.70501708984375, + "num_tokens": 407085606.0, + "step": 16313 + }, + { + "epoch": 1.7915660004392708, + "grad_norm": 2.4293739795684814, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7271189093589783, + "num_tokens": 407108128.0, + "step": 16314 + }, + { + "epoch": 1.7916758181418846, + "grad_norm": 2.35787034034729, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7280862331390381, + "num_tokens": 407131230.0, + "step": 16315 + }, + { + "epoch": 1.7917856358444981, + "grad_norm": 2.9819045066833496, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7458686828613281, + "num_tokens": 407147521.0, + "step": 16316 + }, + { + "epoch": 1.7918954535471117, + "grad_norm": 2.4636213779449463, + "learning_rate": 1e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7217547297477722, + "num_tokens": 407168957.0, + "step": 16317 + }, + { + "epoch": 1.7920052712497254, + "grad_norm": 2.16947603225708, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.6956875920295715, + "num_tokens": 407195924.0, + "step": 16318 + }, + { + "epoch": 1.7921150889523392, + "grad_norm": 1.9601399898529053, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7155629992485046, + "num_tokens": 407227585.0, + "step": 16319 + }, + { + "epoch": 1.792224906654953, + "grad_norm": 2.2698886394500732, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7218210697174072, + "num_tokens": 407253119.0, + "step": 16320 + }, + { + "epoch": 1.7923347243575665, + "grad_norm": 2.0388290882110596, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7126380205154419, + "num_tokens": 407281224.0, + "step": 16321 + }, + { + "epoch": 1.79244454206018, + "grad_norm": 2.2597525119781494, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6997883915901184, + "num_tokens": 407308979.0, + "step": 16322 + }, + { + "epoch": 1.7925543597627938, + "grad_norm": 2.4150354862213135, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7303013801574707, + "num_tokens": 407332132.0, + "step": 16323 + }, + { + "epoch": 1.7926641774654075, + "grad_norm": 2.088534355163574, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7386146187782288, + "num_tokens": 407358359.0, + "step": 16324 + }, + { + "epoch": 1.792773995168021, + "grad_norm": 2.2308428287506104, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7267903089523315, + "num_tokens": 407385039.0, + "step": 16325 + }, + { + "epoch": 1.7928838128706346, + "grad_norm": 2.3539464473724365, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6957008242607117, + "num_tokens": 407411943.0, + "step": 16326 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 2.2254416942596436, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7161263227462769, + "num_tokens": 407438295.0, + "step": 16327 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 2.059372663497925, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7335481643676758, + "num_tokens": 407466324.0, + "step": 16328 + }, + { + "epoch": 1.7932132659784759, + "grad_norm": 2.6132547855377197, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7409765124320984, + "num_tokens": 407485067.0, + "step": 16329 + }, + { + "epoch": 1.7933230836810894, + "grad_norm": 2.2812588214874268, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7124326229095459, + "num_tokens": 407509600.0, + "step": 16330 + }, + { + "epoch": 1.793432901383703, + "grad_norm": 1.8712366819381714, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7014734745025635, + "num_tokens": 407545488.0, + "step": 16331 + }, + { + "epoch": 1.7935427190863167, + "grad_norm": 2.5446529388427734, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7245752811431885, + "num_tokens": 407568038.0, + "step": 16332 + }, + { + "epoch": 1.7936525367889304, + "grad_norm": 2.3954548835754395, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7334441542625427, + "num_tokens": 407591142.0, + "step": 16333 + }, + { + "epoch": 1.7937623544915442, + "grad_norm": 2.473376989364624, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7383519411087036, + "num_tokens": 407613658.0, + "step": 16334 + }, + { + "epoch": 1.7938721721941577, + "grad_norm": 2.3132290840148926, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7283183336257935, + "num_tokens": 407636827.0, + "step": 16335 + }, + { + "epoch": 1.7939819898967713, + "grad_norm": 2.6156342029571533, + "learning_rate": 1e-06, + "loss": 0.794, + "mean_token_accuracy": 0.7449017763137817, + "num_tokens": 407656043.0, + "step": 16336 + }, + { + "epoch": 1.794091807599385, + "grad_norm": 2.402972936630249, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7329833507537842, + "num_tokens": 407678161.0, + "step": 16337 + }, + { + "epoch": 1.7942016253019988, + "grad_norm": 2.062915325164795, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7218295335769653, + "num_tokens": 407705774.0, + "step": 16338 + }, + { + "epoch": 1.7943114430046123, + "grad_norm": 2.019282579421997, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6968165636062622, + "num_tokens": 407736180.0, + "step": 16339 + }, + { + "epoch": 1.7944212607072259, + "grad_norm": 2.427337169647217, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7394531965255737, + "num_tokens": 407758344.0, + "step": 16340 + }, + { + "epoch": 1.7945310784098396, + "grad_norm": 2.6103694438934326, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7179235219955444, + "num_tokens": 407778644.0, + "step": 16341 + }, + { + "epoch": 1.7946408961124534, + "grad_norm": 2.2388856410980225, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.71831876039505, + "num_tokens": 407805775.0, + "step": 16342 + }, + { + "epoch": 1.7947507138150671, + "grad_norm": 2.0868794918060303, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7252024412155151, + "num_tokens": 407835173.0, + "step": 16343 + }, + { + "epoch": 1.7948605315176807, + "grad_norm": 2.0798535346984863, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7253372669219971, + "num_tokens": 407864553.0, + "step": 16344 + }, + { + "epoch": 1.7949703492202942, + "grad_norm": 2.0776469707489014, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7148511409759521, + "num_tokens": 407896923.0, + "step": 16345 + }, + { + "epoch": 1.795080166922908, + "grad_norm": 2.972644329071045, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.741899847984314, + "num_tokens": 407912768.0, + "step": 16346 + }, + { + "epoch": 1.7951899846255217, + "grad_norm": 2.3547847270965576, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7423883676528931, + "num_tokens": 407935601.0, + "step": 16347 + }, + { + "epoch": 1.7952998023281352, + "grad_norm": 2.205301523208618, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.691571831703186, + "num_tokens": 407961036.0, + "step": 16348 + }, + { + "epoch": 1.795409620030749, + "grad_norm": 2.3389127254486084, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7253611087799072, + "num_tokens": 407985611.0, + "step": 16349 + }, + { + "epoch": 1.7955194377333625, + "grad_norm": 2.422044038772583, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.714545488357544, + "num_tokens": 408009142.0, + "step": 16350 + }, + { + "epoch": 1.7956292554359763, + "grad_norm": 2.3229057788848877, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.696010410785675, + "num_tokens": 408035160.0, + "step": 16351 + }, + { + "epoch": 1.79573907313859, + "grad_norm": 2.193389654159546, + "learning_rate": 1e-06, + "loss": 0.7812, + "mean_token_accuracy": 0.7550227642059326, + "num_tokens": 408059829.0, + "step": 16352 + }, + { + "epoch": 1.7958488908412036, + "grad_norm": 2.4982120990753174, + "learning_rate": 1e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7360419034957886, + "num_tokens": 408083481.0, + "step": 16353 + }, + { + "epoch": 1.7959587085438171, + "grad_norm": 2.480665445327759, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7159715890884399, + "num_tokens": 408106815.0, + "step": 16354 + }, + { + "epoch": 1.7960685262464309, + "grad_norm": 2.0691163539886475, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7378121614456177, + "num_tokens": 408136138.0, + "step": 16355 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 2.0759646892547607, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7144896984100342, + "num_tokens": 408166090.0, + "step": 16356 + }, + { + "epoch": 1.7962881616516584, + "grad_norm": 2.5860936641693115, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7260228991508484, + "num_tokens": 408187797.0, + "step": 16357 + }, + { + "epoch": 1.796397979354272, + "grad_norm": 1.9330286979675293, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7118849754333496, + "num_tokens": 408220021.0, + "step": 16358 + }, + { + "epoch": 1.7965077970568855, + "grad_norm": 2.1236515045166016, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6977328658103943, + "num_tokens": 408249242.0, + "step": 16359 + }, + { + "epoch": 1.7966176147594992, + "grad_norm": 2.2392852306365967, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7123541235923767, + "num_tokens": 408273710.0, + "step": 16360 + }, + { + "epoch": 1.796727432462113, + "grad_norm": 2.3612060546875, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7466369271278381, + "num_tokens": 408297720.0, + "step": 16361 + }, + { + "epoch": 1.7968372501647265, + "grad_norm": 2.401305913925171, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7122308015823364, + "num_tokens": 408320237.0, + "step": 16362 + }, + { + "epoch": 1.7969470678673403, + "grad_norm": 2.4507434368133545, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.719566822052002, + "num_tokens": 408343563.0, + "step": 16363 + }, + { + "epoch": 1.7970568855699538, + "grad_norm": 2.233351707458496, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7342804074287415, + "num_tokens": 408368785.0, + "step": 16364 + }, + { + "epoch": 1.7971667032725676, + "grad_norm": 1.9494928121566772, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.707043468952179, + "num_tokens": 408400929.0, + "step": 16365 + }, + { + "epoch": 1.7972765209751813, + "grad_norm": 2.300071954727173, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7327790260314941, + "num_tokens": 408425436.0, + "step": 16366 + }, + { + "epoch": 1.7973863386777948, + "grad_norm": 2.367164373397827, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7305151224136353, + "num_tokens": 408447125.0, + "step": 16367 + }, + { + "epoch": 1.7974961563804084, + "grad_norm": 2.3421947956085205, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7058984637260437, + "num_tokens": 408474271.0, + "step": 16368 + }, + { + "epoch": 1.7976059740830221, + "grad_norm": 2.229043483734131, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7309748530387878, + "num_tokens": 408499959.0, + "step": 16369 + }, + { + "epoch": 1.797715791785636, + "grad_norm": 2.5029358863830566, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7383452653884888, + "num_tokens": 408519922.0, + "step": 16370 + }, + { + "epoch": 1.7978256094882497, + "grad_norm": 2.2633965015411377, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7231107950210571, + "num_tokens": 408544139.0, + "step": 16371 + }, + { + "epoch": 1.7979354271908632, + "grad_norm": 2.5291740894317627, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7141977548599243, + "num_tokens": 408564586.0, + "step": 16372 + }, + { + "epoch": 1.7980452448934767, + "grad_norm": 1.9215065240859985, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.728303074836731, + "num_tokens": 408596355.0, + "step": 16373 + }, + { + "epoch": 1.7981550625960905, + "grad_norm": 2.70383620262146, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7195096015930176, + "num_tokens": 408616275.0, + "step": 16374 + }, + { + "epoch": 1.7982648802987042, + "grad_norm": 2.573094367980957, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7523545026779175, + "num_tokens": 408639637.0, + "step": 16375 + }, + { + "epoch": 1.7983746980013178, + "grad_norm": 2.1641974449157715, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7396687269210815, + "num_tokens": 408665834.0, + "step": 16376 + }, + { + "epoch": 1.7984845157039313, + "grad_norm": 2.3181068897247314, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7275624871253967, + "num_tokens": 408689237.0, + "step": 16377 + }, + { + "epoch": 1.798594333406545, + "grad_norm": 2.387049436569214, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7277277708053589, + "num_tokens": 408712690.0, + "step": 16378 + }, + { + "epoch": 1.7987041511091588, + "grad_norm": 2.0781490802764893, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7022265195846558, + "num_tokens": 408740692.0, + "step": 16379 + }, + { + "epoch": 1.7988139688117726, + "grad_norm": 2.112736940383911, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7108609080314636, + "num_tokens": 408768507.0, + "step": 16380 + }, + { + "epoch": 1.7989237865143861, + "grad_norm": 1.9739216566085815, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7249664068222046, + "num_tokens": 408801343.0, + "step": 16381 + }, + { + "epoch": 1.7990336042169996, + "grad_norm": 2.1261842250823975, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7306389808654785, + "num_tokens": 408828754.0, + "step": 16382 + }, + { + "epoch": 1.7991434219196134, + "grad_norm": 2.077908515930176, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7105711698532104, + "num_tokens": 408859816.0, + "step": 16383 + }, + { + "epoch": 1.7992532396222272, + "grad_norm": 2.3696162700653076, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.745693564414978, + "num_tokens": 408882951.0, + "step": 16384 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 2.3967626094818115, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7093391418457031, + "num_tokens": 408905401.0, + "step": 16385 + }, + { + "epoch": 1.7994728750274545, + "grad_norm": 2.044848680496216, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7181056141853333, + "num_tokens": 408935666.0, + "step": 16386 + }, + { + "epoch": 1.799582692730068, + "grad_norm": 2.639901876449585, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7009641528129578, + "num_tokens": 408956838.0, + "step": 16387 + }, + { + "epoch": 1.7996925104326817, + "grad_norm": 2.604205846786499, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.7545632123947144, + "num_tokens": 408975730.0, + "step": 16388 + }, + { + "epoch": 1.7998023281352955, + "grad_norm": 2.4601447582244873, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7294567823410034, + "num_tokens": 408996258.0, + "step": 16389 + }, + { + "epoch": 1.799912145837909, + "grad_norm": 2.1706840991973877, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7058107256889343, + "num_tokens": 409022461.0, + "step": 16390 + }, + { + "epoch": 1.8000219635405226, + "grad_norm": 2.1077842712402344, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7430319786071777, + "num_tokens": 409047888.0, + "step": 16391 + }, + { + "epoch": 1.8001317812431363, + "grad_norm": 2.250321865081787, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7080004215240479, + "num_tokens": 409072523.0, + "step": 16392 + }, + { + "epoch": 1.80024159894575, + "grad_norm": 2.3094184398651123, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.6988434195518494, + "num_tokens": 409096034.0, + "step": 16393 + }, + { + "epoch": 1.8003514166483638, + "grad_norm": 1.9364103078842163, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.6989756226539612, + "num_tokens": 409127766.0, + "step": 16394 + }, + { + "epoch": 1.8004612343509774, + "grad_norm": 2.0322320461273193, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.722099781036377, + "num_tokens": 409157363.0, + "step": 16395 + }, + { + "epoch": 1.800571052053591, + "grad_norm": 2.3839762210845947, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7180840969085693, + "num_tokens": 409182450.0, + "step": 16396 + }, + { + "epoch": 1.8006808697562047, + "grad_norm": 2.228081703186035, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7196192145347595, + "num_tokens": 409209257.0, + "step": 16397 + }, + { + "epoch": 1.8007906874588184, + "grad_norm": 2.4453258514404297, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7318418025970459, + "num_tokens": 409232586.0, + "step": 16398 + }, + { + "epoch": 1.8009005051614322, + "grad_norm": 2.2609517574310303, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7393742203712463, + "num_tokens": 409256372.0, + "step": 16399 + }, + { + "epoch": 1.8010103228640457, + "grad_norm": 2.133507251739502, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7103216648101807, + "num_tokens": 409283061.0, + "step": 16400 + }, + { + "epoch": 1.8011201405666593, + "grad_norm": 2.706306219100952, + "learning_rate": 1e-06, + "loss": 0.7767, + "mean_token_accuracy": 0.7592325210571289, + "num_tokens": 409301085.0, + "step": 16401 + }, + { + "epoch": 1.801229958269273, + "grad_norm": 2.2262399196624756, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7243704199790955, + "num_tokens": 409327799.0, + "step": 16402 + }, + { + "epoch": 1.8013397759718868, + "grad_norm": 2.2490806579589844, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7354457378387451, + "num_tokens": 409352146.0, + "step": 16403 + }, + { + "epoch": 1.8014495936745003, + "grad_norm": 2.666085720062256, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7327040433883667, + "num_tokens": 409370586.0, + "step": 16404 + }, + { + "epoch": 1.8015594113771138, + "grad_norm": 2.388629913330078, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7197566032409668, + "num_tokens": 409393671.0, + "step": 16405 + }, + { + "epoch": 1.8016692290797276, + "grad_norm": 2.0678460597991943, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7228858470916748, + "num_tokens": 409422672.0, + "step": 16406 + }, + { + "epoch": 1.8017790467823414, + "grad_norm": 2.2719383239746094, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7065753936767578, + "num_tokens": 409450937.0, + "step": 16407 + }, + { + "epoch": 1.801888864484955, + "grad_norm": 2.584275484085083, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7256768345832825, + "num_tokens": 409472475.0, + "step": 16408 + }, + { + "epoch": 1.8019986821875686, + "grad_norm": 2.2526042461395264, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7004668712615967, + "num_tokens": 409501120.0, + "step": 16409 + }, + { + "epoch": 1.8021084998901822, + "grad_norm": 2.086669445037842, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7289353609085083, + "num_tokens": 409529654.0, + "step": 16410 + }, + { + "epoch": 1.802218317592796, + "grad_norm": 2.304981231689453, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7390145063400269, + "num_tokens": 409555595.0, + "step": 16411 + }, + { + "epoch": 1.8023281352954097, + "grad_norm": 2.0292015075683594, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7308671474456787, + "num_tokens": 409584212.0, + "step": 16412 + }, + { + "epoch": 1.8024379529980232, + "grad_norm": 2.106283664703369, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7248871326446533, + "num_tokens": 409612754.0, + "step": 16413 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 2.3329544067382812, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.6974665522575378, + "num_tokens": 409637708.0, + "step": 16414 + }, + { + "epoch": 1.8026575884032505, + "grad_norm": 1.9279323816299438, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6872425675392151, + "num_tokens": 409675261.0, + "step": 16415 + }, + { + "epoch": 1.8027674061058643, + "grad_norm": 2.067199230194092, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7199357151985168, + "num_tokens": 409704284.0, + "step": 16416 + }, + { + "epoch": 1.802877223808478, + "grad_norm": 1.912447452545166, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7136472463607788, + "num_tokens": 409738409.0, + "step": 16417 + }, + { + "epoch": 1.8029870415110916, + "grad_norm": 2.4189069271087646, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7289246916770935, + "num_tokens": 409759948.0, + "step": 16418 + }, + { + "epoch": 1.803096859213705, + "grad_norm": 2.1042673587799072, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.735146164894104, + "num_tokens": 409786032.0, + "step": 16419 + }, + { + "epoch": 1.8032066769163189, + "grad_norm": 2.3404018878936768, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6974696516990662, + "num_tokens": 409809486.0, + "step": 16420 + }, + { + "epoch": 1.8033164946189326, + "grad_norm": 2.5469634532928467, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7020780444145203, + "num_tokens": 409831313.0, + "step": 16421 + }, + { + "epoch": 1.8034263123215464, + "grad_norm": 2.3235156536102295, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7128618955612183, + "num_tokens": 409855087.0, + "step": 16422 + }, + { + "epoch": 1.80353613002416, + "grad_norm": 2.3160319328308105, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7172677516937256, + "num_tokens": 409878320.0, + "step": 16423 + }, + { + "epoch": 1.8036459477267734, + "grad_norm": 2.0757968425750732, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.686795711517334, + "num_tokens": 409908826.0, + "step": 16424 + }, + { + "epoch": 1.8037557654293872, + "grad_norm": 2.245736598968506, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.699380099773407, + "num_tokens": 409936926.0, + "step": 16425 + }, + { + "epoch": 1.803865583132001, + "grad_norm": 2.4065868854522705, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.711373507976532, + "num_tokens": 409959970.0, + "step": 16426 + }, + { + "epoch": 1.8039754008346145, + "grad_norm": 2.238609552383423, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7375340461730957, + "num_tokens": 409984844.0, + "step": 16427 + }, + { + "epoch": 1.8040852185372283, + "grad_norm": 2.002753973007202, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7418603897094727, + "num_tokens": 410012132.0, + "step": 16428 + }, + { + "epoch": 1.8041950362398418, + "grad_norm": 2.553086757659912, + "learning_rate": 1e-06, + "loss": 0.7264, + "mean_token_accuracy": 0.7665351629257202, + "num_tokens": 410030955.0, + "step": 16429 + }, + { + "epoch": 1.8043048539424555, + "grad_norm": 2.203705310821533, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.719181478023529, + "num_tokens": 410057311.0, + "step": 16430 + }, + { + "epoch": 1.8044146716450693, + "grad_norm": 1.9388773441314697, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7037495374679565, + "num_tokens": 410090909.0, + "step": 16431 + }, + { + "epoch": 1.8045244893476828, + "grad_norm": 2.373936176300049, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7301738262176514, + "num_tokens": 410113097.0, + "step": 16432 + }, + { + "epoch": 1.8046343070502964, + "grad_norm": 1.991616129875183, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7333858013153076, + "num_tokens": 410142455.0, + "step": 16433 + }, + { + "epoch": 1.8047441247529101, + "grad_norm": 2.154460906982422, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7223672270774841, + "num_tokens": 410169224.0, + "step": 16434 + }, + { + "epoch": 1.8048539424555239, + "grad_norm": 2.0707318782806396, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7159214019775391, + "num_tokens": 410198844.0, + "step": 16435 + }, + { + "epoch": 1.8049637601581376, + "grad_norm": 2.7594151496887207, + "learning_rate": 1e-06, + "loss": 0.7502, + "mean_token_accuracy": 0.7638446092605591, + "num_tokens": 410216651.0, + "step": 16436 + }, + { + "epoch": 1.8050735778607512, + "grad_norm": 2.3000845909118652, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7276592254638672, + "num_tokens": 410239249.0, + "step": 16437 + }, + { + "epoch": 1.8051833955633647, + "grad_norm": 2.023087501525879, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6980339884757996, + "num_tokens": 410271116.0, + "step": 16438 + }, + { + "epoch": 1.8052932132659785, + "grad_norm": 2.3591771125793457, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7287503480911255, + "num_tokens": 410294897.0, + "step": 16439 + }, + { + "epoch": 1.8054030309685922, + "grad_norm": 2.2303271293640137, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6811608076095581, + "num_tokens": 410323377.0, + "step": 16440 + }, + { + "epoch": 1.8055128486712058, + "grad_norm": 2.3331873416900635, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6958070993423462, + "num_tokens": 410348213.0, + "step": 16441 + }, + { + "epoch": 1.8056226663738193, + "grad_norm": 2.317533254623413, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7262157201766968, + "num_tokens": 410373320.0, + "step": 16442 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 2.0470800399780273, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7040292620658875, + "num_tokens": 410405778.0, + "step": 16443 + }, + { + "epoch": 1.8058423017790468, + "grad_norm": 2.221186637878418, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7161465883255005, + "num_tokens": 410433576.0, + "step": 16444 + }, + { + "epoch": 1.8059521194816606, + "grad_norm": 2.618849039077759, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7299209237098694, + "num_tokens": 410454165.0, + "step": 16445 + }, + { + "epoch": 1.806061937184274, + "grad_norm": 2.4762985706329346, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7402239441871643, + "num_tokens": 410475901.0, + "step": 16446 + }, + { + "epoch": 1.8061717548868876, + "grad_norm": 2.195596694946289, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7259582281112671, + "num_tokens": 410502596.0, + "step": 16447 + }, + { + "epoch": 1.8062815725895014, + "grad_norm": 2.502598762512207, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7510702013969421, + "num_tokens": 410523335.0, + "step": 16448 + }, + { + "epoch": 1.8063913902921152, + "grad_norm": 2.1661715507507324, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7247186899185181, + "num_tokens": 410549823.0, + "step": 16449 + }, + { + "epoch": 1.806501207994729, + "grad_norm": 2.6121909618377686, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7439596056938171, + "num_tokens": 410571166.0, + "step": 16450 + }, + { + "epoch": 1.8066110256973424, + "grad_norm": 2.447359323501587, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7065019607543945, + "num_tokens": 410593927.0, + "step": 16451 + }, + { + "epoch": 1.806720843399956, + "grad_norm": 2.298999071121216, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7214462757110596, + "num_tokens": 410618629.0, + "step": 16452 + }, + { + "epoch": 1.8068306611025697, + "grad_norm": 2.0327372550964355, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7395415902137756, + "num_tokens": 410646945.0, + "step": 16453 + }, + { + "epoch": 1.8069404788051835, + "grad_norm": 2.23425030708313, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.727525532245636, + "num_tokens": 410672016.0, + "step": 16454 + }, + { + "epoch": 1.807050296507797, + "grad_norm": 2.6179568767547607, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7376726269721985, + "num_tokens": 410691266.0, + "step": 16455 + }, + { + "epoch": 1.8071601142104106, + "grad_norm": 2.056321859359741, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7139502763748169, + "num_tokens": 410720804.0, + "step": 16456 + }, + { + "epoch": 1.8072699319130243, + "grad_norm": 2.26043438911438, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.716494619846344, + "num_tokens": 410746229.0, + "step": 16457 + }, + { + "epoch": 1.807379749615638, + "grad_norm": 2.093600273132324, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7067803144454956, + "num_tokens": 410775108.0, + "step": 16458 + }, + { + "epoch": 1.8074895673182518, + "grad_norm": 1.9530394077301025, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7417178750038147, + "num_tokens": 410805011.0, + "step": 16459 + }, + { + "epoch": 1.8075993850208654, + "grad_norm": 2.650235414505005, + "learning_rate": 1e-06, + "loss": 0.7516, + "mean_token_accuracy": 0.7569635510444641, + "num_tokens": 410823504.0, + "step": 16460 + }, + { + "epoch": 1.807709202723479, + "grad_norm": 2.375020980834961, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7211959958076477, + "num_tokens": 410849723.0, + "step": 16461 + }, + { + "epoch": 1.8078190204260927, + "grad_norm": 2.380587577819824, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7429326176643372, + "num_tokens": 410871778.0, + "step": 16462 + }, + { + "epoch": 1.8079288381287064, + "grad_norm": 1.915224552154541, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7224122285842896, + "num_tokens": 410904454.0, + "step": 16463 + }, + { + "epoch": 1.8080386558313202, + "grad_norm": 2.444033145904541, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7191844582557678, + "num_tokens": 410927002.0, + "step": 16464 + }, + { + "epoch": 1.8081484735339337, + "grad_norm": 2.1021788120269775, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7248428463935852, + "num_tokens": 410952993.0, + "step": 16465 + }, + { + "epoch": 1.8082582912365472, + "grad_norm": 2.159289836883545, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7067314386367798, + "num_tokens": 410980256.0, + "step": 16466 + }, + { + "epoch": 1.808368108939161, + "grad_norm": 1.898322582244873, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7173247337341309, + "num_tokens": 411014864.0, + "step": 16467 + }, + { + "epoch": 1.8084779266417748, + "grad_norm": 2.1093881130218506, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7077177166938782, + "num_tokens": 411045155.0, + "step": 16468 + }, + { + "epoch": 1.8085877443443883, + "grad_norm": 2.09226393699646, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7328051328659058, + "num_tokens": 411074385.0, + "step": 16469 + }, + { + "epoch": 1.8086975620470018, + "grad_norm": 2.375495433807373, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7247516512870789, + "num_tokens": 411099377.0, + "step": 16470 + }, + { + "epoch": 1.8088073797496156, + "grad_norm": 1.9377191066741943, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7107630968093872, + "num_tokens": 411132368.0, + "step": 16471 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 2.3333513736724854, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7015247941017151, + "num_tokens": 411156099.0, + "step": 16472 + }, + { + "epoch": 1.809027015154843, + "grad_norm": 2.3875863552093506, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7213644981384277, + "num_tokens": 411180326.0, + "step": 16473 + }, + { + "epoch": 1.8091368328574566, + "grad_norm": 2.4119579792022705, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6966539621353149, + "num_tokens": 411204839.0, + "step": 16474 + }, + { + "epoch": 1.8092466505600702, + "grad_norm": 2.2133564949035645, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7203452587127686, + "num_tokens": 411228944.0, + "step": 16475 + }, + { + "epoch": 1.809356468262684, + "grad_norm": 2.2614235877990723, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7052533626556396, + "num_tokens": 411253291.0, + "step": 16476 + }, + { + "epoch": 1.8094662859652977, + "grad_norm": 1.9989478588104248, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6947019100189209, + "num_tokens": 411284176.0, + "step": 16477 + }, + { + "epoch": 1.8095761036679112, + "grad_norm": 2.697655200958252, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7355194091796875, + "num_tokens": 411303944.0, + "step": 16478 + }, + { + "epoch": 1.809685921370525, + "grad_norm": 2.2617335319519043, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7242705821990967, + "num_tokens": 411326795.0, + "step": 16479 + }, + { + "epoch": 1.8097957390731385, + "grad_norm": 2.4755866527557373, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7389963269233704, + "num_tokens": 411349358.0, + "step": 16480 + }, + { + "epoch": 1.8099055567757523, + "grad_norm": 2.512791633605957, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7199152708053589, + "num_tokens": 411371500.0, + "step": 16481 + }, + { + "epoch": 1.810015374478366, + "grad_norm": 2.6244723796844482, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7292087078094482, + "num_tokens": 411390900.0, + "step": 16482 + }, + { + "epoch": 1.8101251921809796, + "grad_norm": 2.091743230819702, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7294602990150452, + "num_tokens": 411420430.0, + "step": 16483 + }, + { + "epoch": 1.810235009883593, + "grad_norm": 2.4378321170806885, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7373170852661133, + "num_tokens": 411443895.0, + "step": 16484 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 2.233525514602661, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7226210832595825, + "num_tokens": 411469767.0, + "step": 16485 + }, + { + "epoch": 1.8104546452888206, + "grad_norm": 2.452627658843994, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7330241203308105, + "num_tokens": 411491543.0, + "step": 16486 + }, + { + "epoch": 1.8105644629914344, + "grad_norm": 2.3091917037963867, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7103465795516968, + "num_tokens": 411516167.0, + "step": 16487 + }, + { + "epoch": 1.810674280694048, + "grad_norm": 2.5536234378814697, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7279547452926636, + "num_tokens": 411536590.0, + "step": 16488 + }, + { + "epoch": 1.8107840983966614, + "grad_norm": 1.91145658493042, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.722129225730896, + "num_tokens": 411570599.0, + "step": 16489 + }, + { + "epoch": 1.8108939160992752, + "grad_norm": 2.0994515419006348, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7101449370384216, + "num_tokens": 411598841.0, + "step": 16490 + }, + { + "epoch": 1.811003733801889, + "grad_norm": 2.0113956928253174, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7163823843002319, + "num_tokens": 411627247.0, + "step": 16491 + }, + { + "epoch": 1.8111135515045025, + "grad_norm": 2.3175864219665527, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7208932042121887, + "num_tokens": 411651319.0, + "step": 16492 + }, + { + "epoch": 1.8112233692071162, + "grad_norm": 2.190559148788452, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7253267168998718, + "num_tokens": 411677708.0, + "step": 16493 + }, + { + "epoch": 1.8113331869097298, + "grad_norm": 2.7955634593963623, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7322116494178772, + "num_tokens": 411695261.0, + "step": 16494 + }, + { + "epoch": 1.8114430046123435, + "grad_norm": 2.4124221801757812, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7104760408401489, + "num_tokens": 411718513.0, + "step": 16495 + }, + { + "epoch": 1.8115528223149573, + "grad_norm": 2.085289239883423, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7351757884025574, + "num_tokens": 411748594.0, + "step": 16496 + }, + { + "epoch": 1.8116626400175708, + "grad_norm": 2.5755422115325928, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7225075960159302, + "num_tokens": 411768770.0, + "step": 16497 + }, + { + "epoch": 1.8117724577201844, + "grad_norm": 2.5752463340759277, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7511337995529175, + "num_tokens": 411787303.0, + "step": 16498 + }, + { + "epoch": 1.8118822754227981, + "grad_norm": 2.0205576419830322, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7227569222450256, + "num_tokens": 411818127.0, + "step": 16499 + }, + { + "epoch": 1.8119920931254119, + "grad_norm": 2.230149984359741, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7296503186225891, + "num_tokens": 411843077.0, + "step": 16500 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 2.151480197906494, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7161306142807007, + "num_tokens": 411869701.0, + "step": 16501 + }, + { + "epoch": 1.8122117285306392, + "grad_norm": 2.456530809402466, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7188583612442017, + "num_tokens": 411893856.0, + "step": 16502 + }, + { + "epoch": 1.8123215462332527, + "grad_norm": 1.91135573387146, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7070685625076294, + "num_tokens": 411928663.0, + "step": 16503 + }, + { + "epoch": 1.8124313639358665, + "grad_norm": 2.388641595840454, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7248789072036743, + "num_tokens": 411952239.0, + "step": 16504 + }, + { + "epoch": 1.8125411816384802, + "grad_norm": 2.252774953842163, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7033119201660156, + "num_tokens": 411980156.0, + "step": 16505 + }, + { + "epoch": 1.8126509993410937, + "grad_norm": 2.48203444480896, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7049647569656372, + "num_tokens": 412004316.0, + "step": 16506 + }, + { + "epoch": 1.8127608170437073, + "grad_norm": 2.3505208492279053, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7055047154426575, + "num_tokens": 412029255.0, + "step": 16507 + }, + { + "epoch": 1.812870634746321, + "grad_norm": 2.6570141315460205, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7199538946151733, + "num_tokens": 412049421.0, + "step": 16508 + }, + { + "epoch": 1.8129804524489348, + "grad_norm": 2.243861675262451, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7231814861297607, + "num_tokens": 412074908.0, + "step": 16509 + }, + { + "epoch": 1.8130902701515486, + "grad_norm": 1.9438776969909668, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7190878391265869, + "num_tokens": 412109479.0, + "step": 16510 + }, + { + "epoch": 1.813200087854162, + "grad_norm": 2.4611246585845947, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7194629311561584, + "num_tokens": 412131892.0, + "step": 16511 + }, + { + "epoch": 1.8133099055567756, + "grad_norm": 2.421386241912842, + "learning_rate": 1e-06, + "loss": 0.7123, + "mean_token_accuracy": 0.7741692662239075, + "num_tokens": 412151833.0, + "step": 16512 + }, + { + "epoch": 1.8134197232593894, + "grad_norm": 2.4369359016418457, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.71478271484375, + "num_tokens": 412175084.0, + "step": 16513 + }, + { + "epoch": 1.8135295409620031, + "grad_norm": 2.479322910308838, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7217245101928711, + "num_tokens": 412198704.0, + "step": 16514 + }, + { + "epoch": 1.813639358664617, + "grad_norm": 2.0586860179901123, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7320349216461182, + "num_tokens": 412229903.0, + "step": 16515 + }, + { + "epoch": 1.8137491763672304, + "grad_norm": 2.3017306327819824, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7219897508621216, + "num_tokens": 412253898.0, + "step": 16516 + }, + { + "epoch": 1.813858994069844, + "grad_norm": 2.6758759021759033, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7307473421096802, + "num_tokens": 412273157.0, + "step": 16517 + }, + { + "epoch": 1.8139688117724577, + "grad_norm": 2.1502137184143066, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7274609804153442, + "num_tokens": 412299129.0, + "step": 16518 + }, + { + "epoch": 1.8140786294750715, + "grad_norm": 2.1987974643707275, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.709226131439209, + "num_tokens": 412325685.0, + "step": 16519 + }, + { + "epoch": 1.814188447177685, + "grad_norm": 2.4050638675689697, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7345033288002014, + "num_tokens": 412347976.0, + "step": 16520 + }, + { + "epoch": 1.8142982648802986, + "grad_norm": 2.2921485900878906, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7452351450920105, + "num_tokens": 412370833.0, + "step": 16521 + }, + { + "epoch": 1.8144080825829123, + "grad_norm": 2.1146745681762695, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7139443755149841, + "num_tokens": 412399093.0, + "step": 16522 + }, + { + "epoch": 1.814517900285526, + "grad_norm": 2.5070223808288574, + "learning_rate": 1e-06, + "loss": 0.7681, + "mean_token_accuracy": 0.7521735429763794, + "num_tokens": 412418801.0, + "step": 16523 + }, + { + "epoch": 1.8146277179881398, + "grad_norm": 2.603681802749634, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7266373038291931, + "num_tokens": 412440003.0, + "step": 16524 + }, + { + "epoch": 1.8147375356907534, + "grad_norm": 2.5767223834991455, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7265616655349731, + "num_tokens": 412459046.0, + "step": 16525 + }, + { + "epoch": 1.814847353393367, + "grad_norm": 2.5707504749298096, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7297884225845337, + "num_tokens": 412479872.0, + "step": 16526 + }, + { + "epoch": 1.8149571710959806, + "grad_norm": 2.390368938446045, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7240742444992065, + "num_tokens": 412504312.0, + "step": 16527 + }, + { + "epoch": 1.8150669887985944, + "grad_norm": 2.5385923385620117, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7254775762557983, + "num_tokens": 412525014.0, + "step": 16528 + }, + { + "epoch": 1.815176806501208, + "grad_norm": 2.21541690826416, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7160086631774902, + "num_tokens": 412551724.0, + "step": 16529 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 2.323873281478882, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7382200360298157, + "num_tokens": 412576594.0, + "step": 16530 + }, + { + "epoch": 1.8153964419064352, + "grad_norm": 2.5711779594421387, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7098382711410522, + "num_tokens": 412598216.0, + "step": 16531 + }, + { + "epoch": 1.815506259609049, + "grad_norm": 2.052615165710449, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7160509824752808, + "num_tokens": 412627522.0, + "step": 16532 + }, + { + "epoch": 1.8156160773116627, + "grad_norm": 2.220163583755493, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7228381633758545, + "num_tokens": 412652656.0, + "step": 16533 + }, + { + "epoch": 1.8157258950142763, + "grad_norm": 2.0437052249908447, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7068150043487549, + "num_tokens": 412681227.0, + "step": 16534 + }, + { + "epoch": 1.8158357127168898, + "grad_norm": 2.119778871536255, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7197060585021973, + "num_tokens": 412709097.0, + "step": 16535 + }, + { + "epoch": 1.8159455304195036, + "grad_norm": 2.330974817276001, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6889061331748962, + "num_tokens": 412734286.0, + "step": 16536 + }, + { + "epoch": 1.8160553481221173, + "grad_norm": 2.429036855697632, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7232418656349182, + "num_tokens": 412759329.0, + "step": 16537 + }, + { + "epoch": 1.816165165824731, + "grad_norm": 2.4931490421295166, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7296625375747681, + "num_tokens": 412780333.0, + "step": 16538 + }, + { + "epoch": 1.8162749835273446, + "grad_norm": 2.1010324954986572, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7133650779724121, + "num_tokens": 412810447.0, + "step": 16539 + }, + { + "epoch": 1.8163848012299582, + "grad_norm": 2.420534610748291, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.727447509765625, + "num_tokens": 412832334.0, + "step": 16540 + }, + { + "epoch": 1.816494618932572, + "grad_norm": 2.0123114585876465, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7150336503982544, + "num_tokens": 412866081.0, + "step": 16541 + }, + { + "epoch": 1.8166044366351857, + "grad_norm": 2.475153923034668, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7346030473709106, + "num_tokens": 412887447.0, + "step": 16542 + }, + { + "epoch": 1.8167142543377992, + "grad_norm": 2.3073477745056152, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7208172082901001, + "num_tokens": 412912867.0, + "step": 16543 + }, + { + "epoch": 1.816824072040413, + "grad_norm": 1.9183628559112549, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6941204071044922, + "num_tokens": 412947155.0, + "step": 16544 + }, + { + "epoch": 1.8169338897430265, + "grad_norm": 2.127399206161499, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7127463817596436, + "num_tokens": 412974618.0, + "step": 16545 + }, + { + "epoch": 1.8170437074456403, + "grad_norm": 2.471359968185425, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7345484495162964, + "num_tokens": 412996431.0, + "step": 16546 + }, + { + "epoch": 1.817153525148254, + "grad_norm": 2.0905840396881104, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7150237560272217, + "num_tokens": 413024337.0, + "step": 16547 + }, + { + "epoch": 1.8172633428508675, + "grad_norm": 2.5376150608062744, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7049964666366577, + "num_tokens": 413046847.0, + "step": 16548 + }, + { + "epoch": 1.817373160553481, + "grad_norm": 2.269092082977295, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.715420663356781, + "num_tokens": 413070816.0, + "step": 16549 + }, + { + "epoch": 1.8174829782560948, + "grad_norm": 2.518146514892578, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7247177958488464, + "num_tokens": 413093156.0, + "step": 16550 + }, + { + "epoch": 1.8175927959587086, + "grad_norm": 2.094651937484741, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7101137638092041, + "num_tokens": 413123306.0, + "step": 16551 + }, + { + "epoch": 1.8177026136613224, + "grad_norm": 2.415161609649658, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7290162444114685, + "num_tokens": 413146589.0, + "step": 16552 + }, + { + "epoch": 1.8178124313639359, + "grad_norm": 2.0517685413360596, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7349666357040405, + "num_tokens": 413176204.0, + "step": 16553 + }, + { + "epoch": 1.8179222490665494, + "grad_norm": 2.197435140609741, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7546756267547607, + "num_tokens": 413199302.0, + "step": 16554 + }, + { + "epoch": 1.8180320667691632, + "grad_norm": 2.5314865112304688, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7010799050331116, + "num_tokens": 413224559.0, + "step": 16555 + }, + { + "epoch": 1.818141884471777, + "grad_norm": 2.29789400100708, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7069987654685974, + "num_tokens": 413249575.0, + "step": 16556 + }, + { + "epoch": 1.8182517021743905, + "grad_norm": 2.4495162963867188, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7282509803771973, + "num_tokens": 413269855.0, + "step": 16557 + }, + { + "epoch": 1.818361519877004, + "grad_norm": 2.137641668319702, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.6952900290489197, + "num_tokens": 413298671.0, + "step": 16558 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 2.253983736038208, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7327065467834473, + "num_tokens": 413323436.0, + "step": 16559 + }, + { + "epoch": 1.8185811552822315, + "grad_norm": 2.2150607109069824, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7053554654121399, + "num_tokens": 413349708.0, + "step": 16560 + }, + { + "epoch": 1.8186909729848453, + "grad_norm": 2.259413003921509, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7248801589012146, + "num_tokens": 413373091.0, + "step": 16561 + }, + { + "epoch": 1.8188007906874588, + "grad_norm": 2.349989891052246, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7118383646011353, + "num_tokens": 413396465.0, + "step": 16562 + }, + { + "epoch": 1.8189106083900723, + "grad_norm": 2.746974468231201, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7272309064865112, + "num_tokens": 413416628.0, + "step": 16563 + }, + { + "epoch": 1.819020426092686, + "grad_norm": 2.093463182449341, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7141004800796509, + "num_tokens": 413446039.0, + "step": 16564 + }, + { + "epoch": 1.8191302437952999, + "grad_norm": 2.2910261154174805, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.726334810256958, + "num_tokens": 413469515.0, + "step": 16565 + }, + { + "epoch": 1.8192400614979136, + "grad_norm": 2.566133499145508, + "learning_rate": 1e-06, + "loss": 0.7583, + "mean_token_accuracy": 0.7554364800453186, + "num_tokens": 413489491.0, + "step": 16566 + }, + { + "epoch": 1.8193498792005272, + "grad_norm": 2.1640939712524414, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7350923418998718, + "num_tokens": 413514942.0, + "step": 16567 + }, + { + "epoch": 1.8194596969031407, + "grad_norm": 2.314892530441284, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7337527275085449, + "num_tokens": 413536127.0, + "step": 16568 + }, + { + "epoch": 1.8195695146057544, + "grad_norm": 2.3135485649108887, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7333630919456482, + "num_tokens": 413558505.0, + "step": 16569 + }, + { + "epoch": 1.8196793323083682, + "grad_norm": 2.491364002227783, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7088807821273804, + "num_tokens": 413580889.0, + "step": 16570 + }, + { + "epoch": 1.8197891500109817, + "grad_norm": 2.9127256870269775, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7195815443992615, + "num_tokens": 413597941.0, + "step": 16571 + }, + { + "epoch": 1.8198989677135953, + "grad_norm": 2.3118152618408203, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.6981574296951294, + "num_tokens": 413623472.0, + "step": 16572 + }, + { + "epoch": 1.820008785416209, + "grad_norm": 2.43495512008667, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7317349910736084, + "num_tokens": 413645125.0, + "step": 16573 + }, + { + "epoch": 1.8201186031188228, + "grad_norm": 2.2239925861358643, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7086763381958008, + "num_tokens": 413671361.0, + "step": 16574 + }, + { + "epoch": 1.8202284208214365, + "grad_norm": 2.160707950592041, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7260290384292603, + "num_tokens": 413696813.0, + "step": 16575 + }, + { + "epoch": 1.82033823852405, + "grad_norm": 2.6169378757476807, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7278938293457031, + "num_tokens": 413715824.0, + "step": 16576 + }, + { + "epoch": 1.8204480562266636, + "grad_norm": 2.4321963787078857, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7207365036010742, + "num_tokens": 413740495.0, + "step": 16577 + }, + { + "epoch": 1.8205578739292774, + "grad_norm": 2.40841007232666, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.703636884689331, + "num_tokens": 413763494.0, + "step": 16578 + }, + { + "epoch": 1.8206676916318911, + "grad_norm": 2.1895337104797363, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.704594612121582, + "num_tokens": 413788956.0, + "step": 16579 + }, + { + "epoch": 1.8207775093345049, + "grad_norm": 2.2898244857788086, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7354162335395813, + "num_tokens": 413812187.0, + "step": 16580 + }, + { + "epoch": 1.8208873270371184, + "grad_norm": 2.536785364151001, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7486093044281006, + "num_tokens": 413831997.0, + "step": 16581 + }, + { + "epoch": 1.820997144739732, + "grad_norm": 2.549513578414917, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.731929361820221, + "num_tokens": 413852694.0, + "step": 16582 + }, + { + "epoch": 1.8211069624423457, + "grad_norm": 2.1583468914031982, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7342461347579956, + "num_tokens": 413880464.0, + "step": 16583 + }, + { + "epoch": 1.8212167801449595, + "grad_norm": 2.1166374683380127, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6885294318199158, + "num_tokens": 413909387.0, + "step": 16584 + }, + { + "epoch": 1.821326597847573, + "grad_norm": 2.463531970977783, + "learning_rate": 1e-06, + "loss": 0.7633, + "mean_token_accuracy": 0.7499340772628784, + "num_tokens": 413929641.0, + "step": 16585 + }, + { + "epoch": 1.8214364155501865, + "grad_norm": 2.1520862579345703, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7150282859802246, + "num_tokens": 413957518.0, + "step": 16586 + }, + { + "epoch": 1.8215462332528003, + "grad_norm": 2.2549495697021484, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7392183542251587, + "num_tokens": 413982145.0, + "step": 16587 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 1.93537175655365, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7073737382888794, + "num_tokens": 414016598.0, + "step": 16588 + }, + { + "epoch": 1.8217658686580278, + "grad_norm": 2.165642261505127, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7217108011245728, + "num_tokens": 414043220.0, + "step": 16589 + }, + { + "epoch": 1.8218756863606413, + "grad_norm": 2.2411446571350098, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7262085676193237, + "num_tokens": 414067649.0, + "step": 16590 + }, + { + "epoch": 1.8219855040632549, + "grad_norm": 1.9930028915405273, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7105992436408997, + "num_tokens": 414096016.0, + "step": 16591 + }, + { + "epoch": 1.8220953217658686, + "grad_norm": 2.209334373474121, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7234801650047302, + "num_tokens": 414123105.0, + "step": 16592 + }, + { + "epoch": 1.8222051394684824, + "grad_norm": 2.227156400680542, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7188192009925842, + "num_tokens": 414148374.0, + "step": 16593 + }, + { + "epoch": 1.822314957171096, + "grad_norm": 2.1296627521514893, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7140228152275085, + "num_tokens": 414176595.0, + "step": 16594 + }, + { + "epoch": 1.8224247748737097, + "grad_norm": 2.3485405445098877, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7314304113388062, + "num_tokens": 414199453.0, + "step": 16595 + }, + { + "epoch": 1.8225345925763232, + "grad_norm": 2.2252182960510254, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7166597247123718, + "num_tokens": 414226083.0, + "step": 16596 + }, + { + "epoch": 1.822644410278937, + "grad_norm": 2.2269766330718994, + "learning_rate": 1e-06, + "loss": 1.0965, + "mean_token_accuracy": 0.6844906210899353, + "num_tokens": 414254879.0, + "step": 16597 + }, + { + "epoch": 1.8227542279815507, + "grad_norm": 2.294563055038452, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7120819091796875, + "num_tokens": 414278880.0, + "step": 16598 + }, + { + "epoch": 1.8228640456841643, + "grad_norm": 2.1607706546783447, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7274695634841919, + "num_tokens": 414306646.0, + "step": 16599 + }, + { + "epoch": 1.8229738633867778, + "grad_norm": 2.1996631622314453, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7176074385643005, + "num_tokens": 414331634.0, + "step": 16600 + }, + { + "epoch": 1.8230836810893916, + "grad_norm": 2.171973466873169, + "learning_rate": 1e-06, + "loss": 0.7927, + "mean_token_accuracy": 0.7539200782775879, + "num_tokens": 414359282.0, + "step": 16601 + }, + { + "epoch": 1.8231934987920053, + "grad_norm": 2.6793136596679688, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7139182090759277, + "num_tokens": 414379461.0, + "step": 16602 + }, + { + "epoch": 1.823303316494619, + "grad_norm": 2.1102349758148193, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7255479693412781, + "num_tokens": 414407029.0, + "step": 16603 + }, + { + "epoch": 1.8234131341972326, + "grad_norm": 2.176133632659912, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7273063659667969, + "num_tokens": 414432525.0, + "step": 16604 + }, + { + "epoch": 1.8235229518998461, + "grad_norm": 2.1374542713165283, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7062885761260986, + "num_tokens": 414459274.0, + "step": 16605 + }, + { + "epoch": 1.82363276960246, + "grad_norm": 2.3067445755004883, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7337735891342163, + "num_tokens": 414482894.0, + "step": 16606 + }, + { + "epoch": 1.8237425873050737, + "grad_norm": 2.64745831489563, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7072492837905884, + "num_tokens": 414502713.0, + "step": 16607 + }, + { + "epoch": 1.8238524050076872, + "grad_norm": 2.1460752487182617, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7051941752433777, + "num_tokens": 414528911.0, + "step": 16608 + }, + { + "epoch": 1.823962222710301, + "grad_norm": 2.4790563583374023, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7142953872680664, + "num_tokens": 414550236.0, + "step": 16609 + }, + { + "epoch": 1.8240720404129145, + "grad_norm": 2.223055362701416, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7196311950683594, + "num_tokens": 414575647.0, + "step": 16610 + }, + { + "epoch": 1.8241818581155282, + "grad_norm": 2.5483944416046143, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7009334564208984, + "num_tokens": 414597777.0, + "step": 16611 + }, + { + "epoch": 1.824291675818142, + "grad_norm": 2.284355401992798, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7270928025245667, + "num_tokens": 414621525.0, + "step": 16612 + }, + { + "epoch": 1.8244014935207555, + "grad_norm": 2.1450307369232178, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7252147197723389, + "num_tokens": 414649587.0, + "step": 16613 + }, + { + "epoch": 1.824511311223369, + "grad_norm": 2.0611979961395264, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7201097011566162, + "num_tokens": 414677397.0, + "step": 16614 + }, + { + "epoch": 1.8246211289259828, + "grad_norm": 2.2967352867126465, + "learning_rate": 1e-06, + "loss": 0.7305, + "mean_token_accuracy": 0.7576724886894226, + "num_tokens": 414701109.0, + "step": 16615 + }, + { + "epoch": 1.8247309466285966, + "grad_norm": 2.2155654430389404, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7439494132995605, + "num_tokens": 414725010.0, + "step": 16616 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 2.1692521572113037, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7165178060531616, + "num_tokens": 414752665.0, + "step": 16617 + }, + { + "epoch": 1.8249505820338239, + "grad_norm": 2.0427072048187256, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7019599676132202, + "num_tokens": 414784736.0, + "step": 16618 + }, + { + "epoch": 1.8250603997364374, + "grad_norm": 2.1072769165039062, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7189203500747681, + "num_tokens": 414812181.0, + "step": 16619 + }, + { + "epoch": 1.8251702174390512, + "grad_norm": 2.325669050216675, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7362533807754517, + "num_tokens": 414838107.0, + "step": 16620 + }, + { + "epoch": 1.825280035141665, + "grad_norm": 2.3294787406921387, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7103439569473267, + "num_tokens": 414861563.0, + "step": 16621 + }, + { + "epoch": 1.8253898528442785, + "grad_norm": 2.2096097469329834, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7323740720748901, + "num_tokens": 414887368.0, + "step": 16622 + }, + { + "epoch": 1.825499670546892, + "grad_norm": 2.5720715522766113, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7187463045120239, + "num_tokens": 414910716.0, + "step": 16623 + }, + { + "epoch": 1.8256094882495058, + "grad_norm": 2.547301769256592, + "learning_rate": 1e-06, + "loss": 0.7077, + "mean_token_accuracy": 0.7730089426040649, + "num_tokens": 414929493.0, + "step": 16624 + }, + { + "epoch": 1.8257193059521195, + "grad_norm": 2.244252920150757, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.723497211933136, + "num_tokens": 414956210.0, + "step": 16625 + }, + { + "epoch": 1.8258291236547333, + "grad_norm": 2.3107056617736816, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7094119191169739, + "num_tokens": 414982275.0, + "step": 16626 + }, + { + "epoch": 1.8259389413573468, + "grad_norm": 2.1917693614959717, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7180650234222412, + "num_tokens": 415007621.0, + "step": 16627 + }, + { + "epoch": 1.8260487590599603, + "grad_norm": 2.165659189224243, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7326262593269348, + "num_tokens": 415034435.0, + "step": 16628 + }, + { + "epoch": 1.826158576762574, + "grad_norm": 2.171943426132202, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7199849486351013, + "num_tokens": 415060674.0, + "step": 16629 + }, + { + "epoch": 1.8262683944651878, + "grad_norm": 2.202676773071289, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.71775883436203, + "num_tokens": 415085689.0, + "step": 16630 + }, + { + "epoch": 1.8263782121678016, + "grad_norm": 2.209780216217041, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7033324837684631, + "num_tokens": 415114928.0, + "step": 16631 + }, + { + "epoch": 1.8264880298704151, + "grad_norm": 2.340705394744873, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7047103643417358, + "num_tokens": 415139166.0, + "step": 16632 + }, + { + "epoch": 1.8265978475730287, + "grad_norm": 2.072000026702881, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7154402732849121, + "num_tokens": 415167077.0, + "step": 16633 + }, + { + "epoch": 1.8267076652756424, + "grad_norm": 2.042966365814209, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7237391471862793, + "num_tokens": 415197413.0, + "step": 16634 + }, + { + "epoch": 1.8268174829782562, + "grad_norm": 2.42152738571167, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.721001386642456, + "num_tokens": 415221608.0, + "step": 16635 + }, + { + "epoch": 1.8269273006808697, + "grad_norm": 2.5380825996398926, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7226303815841675, + "num_tokens": 415242274.0, + "step": 16636 + }, + { + "epoch": 1.8270371183834833, + "grad_norm": 2.2788076400756836, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7040496468544006, + "num_tokens": 415266541.0, + "step": 16637 + }, + { + "epoch": 1.827146936086097, + "grad_norm": 2.004918336868286, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7098246812820435, + "num_tokens": 415297229.0, + "step": 16638 + }, + { + "epoch": 1.8272567537887108, + "grad_norm": 2.043224573135376, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7306660413742065, + "num_tokens": 415324752.0, + "step": 16639 + }, + { + "epoch": 1.8273665714913245, + "grad_norm": 2.006437301635742, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7059097290039062, + "num_tokens": 415356083.0, + "step": 16640 + }, + { + "epoch": 1.827476389193938, + "grad_norm": 2.6966586112976074, + "learning_rate": 1e-06, + "loss": 0.7365, + "mean_token_accuracy": 0.7649003267288208, + "num_tokens": 415371943.0, + "step": 16641 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 2.3127944469451904, + "learning_rate": 1e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7350714206695557, + "num_tokens": 415396057.0, + "step": 16642 + }, + { + "epoch": 1.8276960245991654, + "grad_norm": 2.2964870929718018, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7091642618179321, + "num_tokens": 415421939.0, + "step": 16643 + }, + { + "epoch": 1.8278058423017791, + "grad_norm": 1.934992790222168, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.691363513469696, + "num_tokens": 415455381.0, + "step": 16644 + }, + { + "epoch": 1.8279156600043929, + "grad_norm": 2.4654324054718018, + "learning_rate": 1e-06, + "loss": 0.7527, + "mean_token_accuracy": 0.7586190104484558, + "num_tokens": 415476287.0, + "step": 16645 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 2.092169761657715, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7385313510894775, + "num_tokens": 415503425.0, + "step": 16646 + }, + { + "epoch": 1.82813529540962, + "grad_norm": 2.4777567386627197, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7075023055076599, + "num_tokens": 415524125.0, + "step": 16647 + }, + { + "epoch": 1.8282451131122337, + "grad_norm": 2.3874740600585938, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.708293080329895, + "num_tokens": 415543854.0, + "step": 16648 + }, + { + "epoch": 1.8283549308148475, + "grad_norm": 2.1768176555633545, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7270240783691406, + "num_tokens": 415569103.0, + "step": 16649 + }, + { + "epoch": 1.828464748517461, + "grad_norm": 2.6017134189605713, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7046956419944763, + "num_tokens": 415589100.0, + "step": 16650 + }, + { + "epoch": 1.8285745662200745, + "grad_norm": 2.193028688430786, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7258617281913757, + "num_tokens": 415615968.0, + "step": 16651 + }, + { + "epoch": 1.8286843839226883, + "grad_norm": 2.0267438888549805, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7187056541442871, + "num_tokens": 415644540.0, + "step": 16652 + }, + { + "epoch": 1.828794201625302, + "grad_norm": 2.1941912174224854, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7334576845169067, + "num_tokens": 415669813.0, + "step": 16653 + }, + { + "epoch": 1.8289040193279158, + "grad_norm": 2.360383987426758, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7138352394104004, + "num_tokens": 415694228.0, + "step": 16654 + }, + { + "epoch": 1.8290138370305293, + "grad_norm": 2.1529247760772705, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7202973365783691, + "num_tokens": 415722779.0, + "step": 16655 + }, + { + "epoch": 1.8291236547331429, + "grad_norm": 2.041456460952759, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7199190855026245, + "num_tokens": 415751529.0, + "step": 16656 + }, + { + "epoch": 1.8292334724357566, + "grad_norm": 2.481639862060547, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7299622893333435, + "num_tokens": 415772715.0, + "step": 16657 + }, + { + "epoch": 1.8293432901383704, + "grad_norm": 2.411546468734741, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7085132598876953, + "num_tokens": 415797832.0, + "step": 16658 + }, + { + "epoch": 1.829453107840984, + "grad_norm": 2.874953031539917, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7201701402664185, + "num_tokens": 415815774.0, + "step": 16659 + }, + { + "epoch": 1.8295629255435977, + "grad_norm": 2.320948600769043, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7238830924034119, + "num_tokens": 415843496.0, + "step": 16660 + }, + { + "epoch": 1.8296727432462112, + "grad_norm": 2.5048890113830566, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7336401343345642, + "num_tokens": 415865889.0, + "step": 16661 + }, + { + "epoch": 1.829782560948825, + "grad_norm": 2.0645124912261963, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.716160237789154, + "num_tokens": 415893623.0, + "step": 16662 + }, + { + "epoch": 1.8298923786514387, + "grad_norm": 2.2442283630371094, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7313742637634277, + "num_tokens": 415916770.0, + "step": 16663 + }, + { + "epoch": 1.8300021963540523, + "grad_norm": 2.3589799404144287, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7094371318817139, + "num_tokens": 415939726.0, + "step": 16664 + }, + { + "epoch": 1.8301120140566658, + "grad_norm": 2.5751020908355713, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7169869542121887, + "num_tokens": 415960747.0, + "step": 16665 + }, + { + "epoch": 1.8302218317592795, + "grad_norm": 2.2616586685180664, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7159126996994019, + "num_tokens": 415985729.0, + "step": 16666 + }, + { + "epoch": 1.8303316494618933, + "grad_norm": 2.152585506439209, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7426282167434692, + "num_tokens": 416013913.0, + "step": 16667 + }, + { + "epoch": 1.830441467164507, + "grad_norm": 2.3567445278167725, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7567487955093384, + "num_tokens": 416036344.0, + "step": 16668 + }, + { + "epoch": 1.8305512848671206, + "grad_norm": 2.3485164642333984, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.728142499923706, + "num_tokens": 416058175.0, + "step": 16669 + }, + { + "epoch": 1.8306611025697341, + "grad_norm": 2.5107362270355225, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7286526560783386, + "num_tokens": 416078708.0, + "step": 16670 + }, + { + "epoch": 1.830770920272348, + "grad_norm": 2.0220227241516113, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7222843766212463, + "num_tokens": 416106686.0, + "step": 16671 + }, + { + "epoch": 1.8308807379749616, + "grad_norm": 2.176706552505493, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7152414321899414, + "num_tokens": 416135936.0, + "step": 16672 + }, + { + "epoch": 1.8309905556775752, + "grad_norm": 1.9882780313491821, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.6951256990432739, + "num_tokens": 416167407.0, + "step": 16673 + }, + { + "epoch": 1.831100373380189, + "grad_norm": 2.5893712043762207, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.731285572052002, + "num_tokens": 416188856.0, + "step": 16674 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 2.401578187942505, + "learning_rate": 1e-06, + "loss": 0.7791, + "mean_token_accuracy": 0.7497542500495911, + "num_tokens": 416211484.0, + "step": 16675 + }, + { + "epoch": 1.8313200087854162, + "grad_norm": 2.5061800479888916, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7104076743125916, + "num_tokens": 416235600.0, + "step": 16676 + }, + { + "epoch": 1.83142982648803, + "grad_norm": 2.243523120880127, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7268290519714355, + "num_tokens": 416261743.0, + "step": 16677 + }, + { + "epoch": 1.8315396441906435, + "grad_norm": 2.8522372245788574, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7287938594818115, + "num_tokens": 416279893.0, + "step": 16678 + }, + { + "epoch": 1.831649461893257, + "grad_norm": 2.3372690677642822, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7168916463851929, + "num_tokens": 416303667.0, + "step": 16679 + }, + { + "epoch": 1.8317592795958708, + "grad_norm": 2.042518138885498, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7339837551116943, + "num_tokens": 416333036.0, + "step": 16680 + }, + { + "epoch": 1.8318690972984846, + "grad_norm": 2.2225770950317383, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7135680913925171, + "num_tokens": 416358355.0, + "step": 16681 + }, + { + "epoch": 1.8319789150010983, + "grad_norm": 2.398390769958496, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7387986183166504, + "num_tokens": 416381398.0, + "step": 16682 + }, + { + "epoch": 1.8320887327037119, + "grad_norm": 2.438534736633301, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7455627918243408, + "num_tokens": 416401744.0, + "step": 16683 + }, + { + "epoch": 1.8321985504063254, + "grad_norm": 2.1760470867156982, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6941537857055664, + "num_tokens": 416429947.0, + "step": 16684 + }, + { + "epoch": 1.8323083681089392, + "grad_norm": 2.4511282444000244, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.720912754535675, + "num_tokens": 416452329.0, + "step": 16685 + }, + { + "epoch": 1.832418185811553, + "grad_norm": 2.285756826400757, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7240257263183594, + "num_tokens": 416475524.0, + "step": 16686 + }, + { + "epoch": 1.8325280035141664, + "grad_norm": 2.1157751083374023, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7092263102531433, + "num_tokens": 416503964.0, + "step": 16687 + }, + { + "epoch": 1.83263782121678, + "grad_norm": 2.4425365924835205, + "learning_rate": 1e-06, + "loss": 0.8042, + "mean_token_accuracy": 0.744400143623352, + "num_tokens": 416525954.0, + "step": 16688 + }, + { + "epoch": 1.8327476389193937, + "grad_norm": 2.3445534706115723, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7283516526222229, + "num_tokens": 416547772.0, + "step": 16689 + }, + { + "epoch": 1.8328574566220075, + "grad_norm": 2.327014923095703, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7177252769470215, + "num_tokens": 416574209.0, + "step": 16690 + }, + { + "epoch": 1.8329672743246213, + "grad_norm": 2.300130605697632, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7042394876480103, + "num_tokens": 416599355.0, + "step": 16691 + }, + { + "epoch": 1.8330770920272348, + "grad_norm": 2.0295071601867676, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7122164368629456, + "num_tokens": 416627863.0, + "step": 16692 + }, + { + "epoch": 1.8331869097298483, + "grad_norm": 2.21614146232605, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7319560647010803, + "num_tokens": 416653930.0, + "step": 16693 + }, + { + "epoch": 1.833296727432462, + "grad_norm": 1.9157593250274658, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7090979218482971, + "num_tokens": 416686899.0, + "step": 16694 + }, + { + "epoch": 1.8334065451350758, + "grad_norm": 2.1759724617004395, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7320264577865601, + "num_tokens": 416712773.0, + "step": 16695 + }, + { + "epoch": 1.8335163628376896, + "grad_norm": 2.609517812728882, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7274959087371826, + "num_tokens": 416732351.0, + "step": 16696 + }, + { + "epoch": 1.8336261805403031, + "grad_norm": 2.0331239700317383, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7009645104408264, + "num_tokens": 416762963.0, + "step": 16697 + }, + { + "epoch": 1.8337359982429167, + "grad_norm": 2.389686346054077, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7431169748306274, + "num_tokens": 416785746.0, + "step": 16698 + }, + { + "epoch": 1.8338458159455304, + "grad_norm": 2.1892993450164795, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7209511399269104, + "num_tokens": 416814913.0, + "step": 16699 + }, + { + "epoch": 1.8339556336481442, + "grad_norm": 2.302107810974121, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.725874662399292, + "num_tokens": 416841060.0, + "step": 16700 + }, + { + "epoch": 1.8340654513507577, + "grad_norm": 2.1999666690826416, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7168960571289062, + "num_tokens": 416866387.0, + "step": 16701 + }, + { + "epoch": 1.8341752690533712, + "grad_norm": 2.630232334136963, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7235285043716431, + "num_tokens": 416887024.0, + "step": 16702 + }, + { + "epoch": 1.834285086755985, + "grad_norm": 2.230111598968506, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7176342606544495, + "num_tokens": 416912548.0, + "step": 16703 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 2.0703651905059814, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7273758053779602, + "num_tokens": 416940371.0, + "step": 16704 + }, + { + "epoch": 1.8345047221612125, + "grad_norm": 2.0313773155212402, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7124704718589783, + "num_tokens": 416972176.0, + "step": 16705 + }, + { + "epoch": 1.834614539863826, + "grad_norm": 2.2416844367980957, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7183929681777954, + "num_tokens": 416994846.0, + "step": 16706 + }, + { + "epoch": 1.8347243575664396, + "grad_norm": 2.3869073390960693, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7335402965545654, + "num_tokens": 417017642.0, + "step": 16707 + }, + { + "epoch": 1.8348341752690533, + "grad_norm": 2.231599807739258, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7136552929878235, + "num_tokens": 417045097.0, + "step": 16708 + }, + { + "epoch": 1.834943992971667, + "grad_norm": 2.207587718963623, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7060457468032837, + "num_tokens": 417071694.0, + "step": 16709 + }, + { + "epoch": 1.8350538106742806, + "grad_norm": 2.2501015663146973, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7122783660888672, + "num_tokens": 417096587.0, + "step": 16710 + }, + { + "epoch": 1.8351636283768944, + "grad_norm": 2.492424488067627, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7361952662467957, + "num_tokens": 417117469.0, + "step": 16711 + }, + { + "epoch": 1.835273446079508, + "grad_norm": 2.389066457748413, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.724992036819458, + "num_tokens": 417140727.0, + "step": 16712 + }, + { + "epoch": 1.8353832637821217, + "grad_norm": 2.529796838760376, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7416025400161743, + "num_tokens": 417162291.0, + "step": 16713 + }, + { + "epoch": 1.8354930814847354, + "grad_norm": 2.41200590133667, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7237075567245483, + "num_tokens": 417184913.0, + "step": 16714 + }, + { + "epoch": 1.835602899187349, + "grad_norm": 2.0926685333251953, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7232872247695923, + "num_tokens": 417212760.0, + "step": 16715 + }, + { + "epoch": 1.8357127168899625, + "grad_norm": 2.231926441192627, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7178100943565369, + "num_tokens": 417238766.0, + "step": 16716 + }, + { + "epoch": 1.8358225345925763, + "grad_norm": 2.093045711517334, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7231497764587402, + "num_tokens": 417268687.0, + "step": 16717 + }, + { + "epoch": 1.83593235229519, + "grad_norm": 2.1712119579315186, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7326819896697998, + "num_tokens": 417296566.0, + "step": 16718 + }, + { + "epoch": 1.8360421699978038, + "grad_norm": 2.071169853210449, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7169907689094543, + "num_tokens": 417328812.0, + "step": 16719 + }, + { + "epoch": 1.8361519877004173, + "grad_norm": 2.38991379737854, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7342166304588318, + "num_tokens": 417349998.0, + "step": 16720 + }, + { + "epoch": 1.8362618054030309, + "grad_norm": 2.209486484527588, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.735786497592926, + "num_tokens": 417375489.0, + "step": 16721 + }, + { + "epoch": 1.8363716231056446, + "grad_norm": 2.0820577144622803, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.706299901008606, + "num_tokens": 417403936.0, + "step": 16722 + }, + { + "epoch": 1.8364814408082584, + "grad_norm": 2.1853296756744385, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.6986838579177856, + "num_tokens": 417430889.0, + "step": 16723 + }, + { + "epoch": 1.836591258510872, + "grad_norm": 2.176100015640259, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7150222659111023, + "num_tokens": 417460327.0, + "step": 16724 + }, + { + "epoch": 1.8367010762134857, + "grad_norm": 2.4082255363464355, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.744709312915802, + "num_tokens": 417482209.0, + "step": 16725 + }, + { + "epoch": 1.8368108939160992, + "grad_norm": 2.6244733333587646, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7329838871955872, + "num_tokens": 417501696.0, + "step": 16726 + }, + { + "epoch": 1.836920711618713, + "grad_norm": 2.2983527183532715, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7292450666427612, + "num_tokens": 417525631.0, + "step": 16727 + }, + { + "epoch": 1.8370305293213267, + "grad_norm": 2.58105731010437, + "learning_rate": 1e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.754806399345398, + "num_tokens": 417545103.0, + "step": 16728 + }, + { + "epoch": 1.8371403470239402, + "grad_norm": 2.3964710235595703, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7414389252662659, + "num_tokens": 417568635.0, + "step": 16729 + }, + { + "epoch": 1.8372501647265538, + "grad_norm": 2.0573742389678955, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7251219749450684, + "num_tokens": 417600557.0, + "step": 16730 + }, + { + "epoch": 1.8373599824291675, + "grad_norm": 2.555560350418091, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7270194292068481, + "num_tokens": 417621539.0, + "step": 16731 + }, + { + "epoch": 1.8374698001317813, + "grad_norm": 2.1301777362823486, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7112300395965576, + "num_tokens": 417648570.0, + "step": 16732 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 2.292830467224121, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.729915976524353, + "num_tokens": 417671445.0, + "step": 16733 + }, + { + "epoch": 1.8376894355370086, + "grad_norm": 2.5403354167938232, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7083807587623596, + "num_tokens": 417693354.0, + "step": 16734 + }, + { + "epoch": 1.8377992532396221, + "grad_norm": 2.2990052700042725, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7282770872116089, + "num_tokens": 417715398.0, + "step": 16735 + }, + { + "epoch": 1.8379090709422359, + "grad_norm": 2.2843639850616455, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.726386547088623, + "num_tokens": 417740289.0, + "step": 16736 + }, + { + "epoch": 1.8380188886448496, + "grad_norm": 2.2597997188568115, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7143532037734985, + "num_tokens": 417766690.0, + "step": 16737 + }, + { + "epoch": 1.8381287063474632, + "grad_norm": 2.7096498012542725, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7313790321350098, + "num_tokens": 417786798.0, + "step": 16738 + }, + { + "epoch": 1.838238524050077, + "grad_norm": 2.0396111011505127, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7195748090744019, + "num_tokens": 417816354.0, + "step": 16739 + }, + { + "epoch": 1.8383483417526905, + "grad_norm": 2.170722484588623, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7136943340301514, + "num_tokens": 417843653.0, + "step": 16740 + }, + { + "epoch": 1.8384581594553042, + "grad_norm": 2.109793186187744, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7258834838867188, + "num_tokens": 417872095.0, + "step": 16741 + }, + { + "epoch": 1.838567977157918, + "grad_norm": 2.3660762310028076, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7397993206977844, + "num_tokens": 417893360.0, + "step": 16742 + }, + { + "epoch": 1.8386777948605315, + "grad_norm": 2.538609743118286, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7156053781509399, + "num_tokens": 417915359.0, + "step": 16743 + }, + { + "epoch": 1.838787612563145, + "grad_norm": 2.4532456398010254, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7247433066368103, + "num_tokens": 417937221.0, + "step": 16744 + }, + { + "epoch": 1.8388974302657588, + "grad_norm": 3.113009452819824, + "learning_rate": 1e-06, + "loss": 0.7672, + "mean_token_accuracy": 0.7545762062072754, + "num_tokens": 417952482.0, + "step": 16745 + }, + { + "epoch": 1.8390072479683726, + "grad_norm": 2.286590814590454, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6957966685295105, + "num_tokens": 417981151.0, + "step": 16746 + }, + { + "epoch": 1.8391170656709863, + "grad_norm": 2.1496951580047607, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7260030508041382, + "num_tokens": 418007063.0, + "step": 16747 + }, + { + "epoch": 1.8392268833735999, + "grad_norm": 2.0879695415496826, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7273184657096863, + "num_tokens": 418037497.0, + "step": 16748 + }, + { + "epoch": 1.8393367010762134, + "grad_norm": 2.1869256496429443, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.729893147945404, + "num_tokens": 418063173.0, + "step": 16749 + }, + { + "epoch": 1.8394465187788271, + "grad_norm": 2.8079962730407715, + "learning_rate": 1e-06, + "loss": 0.7581, + "mean_token_accuracy": 0.7516976594924927, + "num_tokens": 418079211.0, + "step": 16750 + }, + { + "epoch": 1.839556336481441, + "grad_norm": 2.093012809753418, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6971781253814697, + "num_tokens": 418109395.0, + "step": 16751 + }, + { + "epoch": 1.8396661541840544, + "grad_norm": 2.1743392944335938, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7075586318969727, + "num_tokens": 418136372.0, + "step": 16752 + }, + { + "epoch": 1.839775971886668, + "grad_norm": 2.2688615322113037, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7139549851417542, + "num_tokens": 418162251.0, + "step": 16753 + }, + { + "epoch": 1.8398857895892817, + "grad_norm": 2.340549945831299, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.7384248375892639, + "num_tokens": 418185572.0, + "step": 16754 + }, + { + "epoch": 1.8399956072918955, + "grad_norm": 2.6074469089508057, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.741237998008728, + "num_tokens": 418204590.0, + "step": 16755 + }, + { + "epoch": 1.8401054249945092, + "grad_norm": 2.3014020919799805, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.731917142868042, + "num_tokens": 418227212.0, + "step": 16756 + }, + { + "epoch": 1.8402152426971228, + "grad_norm": 2.080974578857422, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7254382371902466, + "num_tokens": 418253308.0, + "step": 16757 + }, + { + "epoch": 1.8403250603997363, + "grad_norm": 2.3069376945495605, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7148710489273071, + "num_tokens": 418280482.0, + "step": 16758 + }, + { + "epoch": 1.84043487810235, + "grad_norm": 2.357753276824951, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7234716415405273, + "num_tokens": 418304693.0, + "step": 16759 + }, + { + "epoch": 1.8405446958049638, + "grad_norm": 2.453070878982544, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7268719673156738, + "num_tokens": 418325467.0, + "step": 16760 + }, + { + "epoch": 1.8406545135075776, + "grad_norm": 1.9122051000595093, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7136167287826538, + "num_tokens": 418358332.0, + "step": 16761 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 2.2709522247314453, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.714565634727478, + "num_tokens": 418384103.0, + "step": 16762 + }, + { + "epoch": 1.8408741489128047, + "grad_norm": 2.254962921142578, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7217167615890503, + "num_tokens": 418411194.0, + "step": 16763 + }, + { + "epoch": 1.8409839666154184, + "grad_norm": 2.2398173809051514, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.718262791633606, + "num_tokens": 418437695.0, + "step": 16764 + }, + { + "epoch": 1.8410937843180322, + "grad_norm": 2.614781141281128, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7330031394958496, + "num_tokens": 418458159.0, + "step": 16765 + }, + { + "epoch": 1.8412036020206457, + "grad_norm": 2.129326343536377, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7224084734916687, + "num_tokens": 418484552.0, + "step": 16766 + }, + { + "epoch": 1.8413134197232592, + "grad_norm": 2.022519111633301, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7314627766609192, + "num_tokens": 418515065.0, + "step": 16767 + }, + { + "epoch": 1.841423237425873, + "grad_norm": 2.4058661460876465, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7257134318351746, + "num_tokens": 418535552.0, + "step": 16768 + }, + { + "epoch": 1.8415330551284868, + "grad_norm": 2.2525486946105957, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7146592736244202, + "num_tokens": 418563835.0, + "step": 16769 + }, + { + "epoch": 1.8416428728311005, + "grad_norm": 2.0462090969085693, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.709627628326416, + "num_tokens": 418595251.0, + "step": 16770 + }, + { + "epoch": 1.841752690533714, + "grad_norm": 2.2790377140045166, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7282354831695557, + "num_tokens": 418620423.0, + "step": 16771 + }, + { + "epoch": 1.8418625082363276, + "grad_norm": 2.453465461730957, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.719863772392273, + "num_tokens": 418642387.0, + "step": 16772 + }, + { + "epoch": 1.8419723259389413, + "grad_norm": 2.3595714569091797, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7068765759468079, + "num_tokens": 418668801.0, + "step": 16773 + }, + { + "epoch": 1.842082143641555, + "grad_norm": 2.1186351776123047, + "learning_rate": 1e-06, + "loss": 0.7437, + "mean_token_accuracy": 0.7546581029891968, + "num_tokens": 418693052.0, + "step": 16774 + }, + { + "epoch": 1.8421919613441686, + "grad_norm": 1.9931014776229858, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7187511920928955, + "num_tokens": 418723228.0, + "step": 16775 + }, + { + "epoch": 1.8423017790467824, + "grad_norm": 2.400947093963623, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.760718047618866, + "num_tokens": 418746209.0, + "step": 16776 + }, + { + "epoch": 1.842411596749396, + "grad_norm": 2.0527963638305664, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7191693782806396, + "num_tokens": 418774287.0, + "step": 16777 + }, + { + "epoch": 1.8425214144520097, + "grad_norm": 2.2185847759246826, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7322999835014343, + "num_tokens": 418799382.0, + "step": 16778 + }, + { + "epoch": 1.8426312321546234, + "grad_norm": 2.3505747318267822, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7122344970703125, + "num_tokens": 418825212.0, + "step": 16779 + }, + { + "epoch": 1.842741049857237, + "grad_norm": 2.213703155517578, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7308810949325562, + "num_tokens": 418850511.0, + "step": 16780 + }, + { + "epoch": 1.8428508675598505, + "grad_norm": 2.114046096801758, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7326589822769165, + "num_tokens": 418876453.0, + "step": 16781 + }, + { + "epoch": 1.8429606852624643, + "grad_norm": 2.0984554290771484, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7050915360450745, + "num_tokens": 418904035.0, + "step": 16782 + }, + { + "epoch": 1.843070502965078, + "grad_norm": 2.237452983856201, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7198325395584106, + "num_tokens": 418930493.0, + "step": 16783 + }, + { + "epoch": 1.8431803206676918, + "grad_norm": 2.4600346088409424, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7345595359802246, + "num_tokens": 418952173.0, + "step": 16784 + }, + { + "epoch": 1.8432901383703053, + "grad_norm": 2.551234722137451, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7485230565071106, + "num_tokens": 418972295.0, + "step": 16785 + }, + { + "epoch": 1.8433999560729188, + "grad_norm": 2.138378620147705, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.722778856754303, + "num_tokens": 418999293.0, + "step": 16786 + }, + { + "epoch": 1.8435097737755326, + "grad_norm": 2.1704518795013428, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.6960963606834412, + "num_tokens": 419026515.0, + "step": 16787 + }, + { + "epoch": 1.8436195914781464, + "grad_norm": 2.2955799102783203, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7286868095397949, + "num_tokens": 419051141.0, + "step": 16788 + }, + { + "epoch": 1.84372940918076, + "grad_norm": 2.232670545578003, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7340739369392395, + "num_tokens": 419075846.0, + "step": 16789 + }, + { + "epoch": 1.8438392268833736, + "grad_norm": 2.357811689376831, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7369287014007568, + "num_tokens": 419099282.0, + "step": 16790 + }, + { + "epoch": 1.8439490445859872, + "grad_norm": 2.434241533279419, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7515889406204224, + "num_tokens": 419122739.0, + "step": 16791 + }, + { + "epoch": 1.844058862288601, + "grad_norm": 2.3419699668884277, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.721667468547821, + "num_tokens": 419144985.0, + "step": 16792 + }, + { + "epoch": 1.8441686799912147, + "grad_norm": 2.377723455429077, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7107936143875122, + "num_tokens": 419168746.0, + "step": 16793 + }, + { + "epoch": 1.8442784976938282, + "grad_norm": 2.3550820350646973, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7345249652862549, + "num_tokens": 419192400.0, + "step": 16794 + }, + { + "epoch": 1.8443883153964418, + "grad_norm": 2.1415483951568604, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7073392271995544, + "num_tokens": 419221308.0, + "step": 16795 + }, + { + "epoch": 1.8444981330990555, + "grad_norm": 2.149350643157959, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7007601261138916, + "num_tokens": 419247926.0, + "step": 16796 + }, + { + "epoch": 1.8446079508016693, + "grad_norm": 2.004115581512451, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7151432633399963, + "num_tokens": 419277545.0, + "step": 16797 + }, + { + "epoch": 1.844717768504283, + "grad_norm": 2.2628636360168457, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7077122926712036, + "num_tokens": 419305459.0, + "step": 16798 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 2.466125249862671, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7222736477851868, + "num_tokens": 419326686.0, + "step": 16799 + }, + { + "epoch": 1.84493740390951, + "grad_norm": 2.1499106884002686, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.693382740020752, + "num_tokens": 419355129.0, + "step": 16800 + }, + { + "epoch": 1.8450472216121239, + "grad_norm": 2.449403762817383, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6948788166046143, + "num_tokens": 419378039.0, + "step": 16801 + }, + { + "epoch": 1.8451570393147376, + "grad_norm": 2.40753436088562, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7242951393127441, + "num_tokens": 419399632.0, + "step": 16802 + }, + { + "epoch": 1.8452668570173512, + "grad_norm": 2.4704434871673584, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7182010412216187, + "num_tokens": 419421703.0, + "step": 16803 + }, + { + "epoch": 1.8453766747199647, + "grad_norm": 2.213825225830078, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7206516265869141, + "num_tokens": 419447809.0, + "step": 16804 + }, + { + "epoch": 1.8454864924225785, + "grad_norm": 2.1808218955993652, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7141996026039124, + "num_tokens": 419474052.0, + "step": 16805 + }, + { + "epoch": 1.8455963101251922, + "grad_norm": 2.0323641300201416, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7152184247970581, + "num_tokens": 419504553.0, + "step": 16806 + }, + { + "epoch": 1.845706127827806, + "grad_norm": 2.5434622764587402, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7226867079734802, + "num_tokens": 419525980.0, + "step": 16807 + }, + { + "epoch": 1.8458159455304195, + "grad_norm": 2.2790141105651855, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7325143814086914, + "num_tokens": 419552045.0, + "step": 16808 + }, + { + "epoch": 1.845925763233033, + "grad_norm": 2.0853564739227295, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7249714136123657, + "num_tokens": 419580121.0, + "step": 16809 + }, + { + "epoch": 1.8460355809356468, + "grad_norm": 2.0699305534362793, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7091911435127258, + "num_tokens": 419609198.0, + "step": 16810 + }, + { + "epoch": 1.8461453986382605, + "grad_norm": 2.8410396575927734, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7521029710769653, + "num_tokens": 419626683.0, + "step": 16811 + }, + { + "epoch": 1.8462552163408743, + "grad_norm": 2.103026866912842, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7104175686836243, + "num_tokens": 419654123.0, + "step": 16812 + }, + { + "epoch": 1.8463650340434878, + "grad_norm": 2.3611624240875244, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7303043603897095, + "num_tokens": 419675526.0, + "step": 16813 + }, + { + "epoch": 1.8464748517461014, + "grad_norm": 2.3856098651885986, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7249423265457153, + "num_tokens": 419698774.0, + "step": 16814 + }, + { + "epoch": 1.8465846694487151, + "grad_norm": 2.290400266647339, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7119778394699097, + "num_tokens": 419724882.0, + "step": 16815 + }, + { + "epoch": 1.846694487151329, + "grad_norm": 2.147446393966675, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7347368597984314, + "num_tokens": 419749933.0, + "step": 16816 + }, + { + "epoch": 1.8468043048539424, + "grad_norm": 2.1176912784576416, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7300504446029663, + "num_tokens": 419779619.0, + "step": 16817 + }, + { + "epoch": 1.846914122556556, + "grad_norm": 2.7892704010009766, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7290492653846741, + "num_tokens": 419797102.0, + "step": 16818 + }, + { + "epoch": 1.8470239402591697, + "grad_norm": 2.1635143756866455, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7269970774650574, + "num_tokens": 419823158.0, + "step": 16819 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 2.5621728897094727, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7298442125320435, + "num_tokens": 419844021.0, + "step": 16820 + }, + { + "epoch": 1.8472435756643972, + "grad_norm": 2.3196873664855957, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.707300066947937, + "num_tokens": 419869701.0, + "step": 16821 + }, + { + "epoch": 1.8473533933670108, + "grad_norm": 2.1793291568756104, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7385631799697876, + "num_tokens": 419894109.0, + "step": 16822 + }, + { + "epoch": 1.8474632110696243, + "grad_norm": 1.989833950996399, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7220540046691895, + "num_tokens": 419923494.0, + "step": 16823 + }, + { + "epoch": 1.847573028772238, + "grad_norm": 2.140301465988159, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7365237474441528, + "num_tokens": 419949603.0, + "step": 16824 + }, + { + "epoch": 1.8476828464748518, + "grad_norm": 1.9634144306182861, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7041763067245483, + "num_tokens": 419981552.0, + "step": 16825 + }, + { + "epoch": 1.8477926641774656, + "grad_norm": 2.018949031829834, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7392364740371704, + "num_tokens": 420010189.0, + "step": 16826 + }, + { + "epoch": 1.847902481880079, + "grad_norm": 2.296076536178589, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7228841185569763, + "num_tokens": 420033774.0, + "step": 16827 + }, + { + "epoch": 1.8480122995826926, + "grad_norm": 2.5697505474090576, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.739428699016571, + "num_tokens": 420053601.0, + "step": 16828 + }, + { + "epoch": 1.8481221172853064, + "grad_norm": 2.0924601554870605, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7338913679122925, + "num_tokens": 420079901.0, + "step": 16829 + }, + { + "epoch": 1.8482319349879202, + "grad_norm": 2.3498544692993164, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7381213903427124, + "num_tokens": 420102766.0, + "step": 16830 + }, + { + "epoch": 1.8483417526905337, + "grad_norm": 2.102278709411621, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7257312536239624, + "num_tokens": 420131500.0, + "step": 16831 + }, + { + "epoch": 1.8484515703931472, + "grad_norm": 2.2305257320404053, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7194919586181641, + "num_tokens": 420156547.0, + "step": 16832 + }, + { + "epoch": 1.848561388095761, + "grad_norm": 2.11787486076355, + "learning_rate": 1e-06, + "loss": 1.0623, + "mean_token_accuracy": 0.6785366535186768, + "num_tokens": 420186672.0, + "step": 16833 + }, + { + "epoch": 1.8486712057983747, + "grad_norm": 2.202185869216919, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7285735607147217, + "num_tokens": 420212159.0, + "step": 16834 + }, + { + "epoch": 1.8487810235009885, + "grad_norm": 2.2367804050445557, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7185274958610535, + "num_tokens": 420237344.0, + "step": 16835 + }, + { + "epoch": 1.848890841203602, + "grad_norm": 2.0369160175323486, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.700232744216919, + "num_tokens": 420268924.0, + "step": 16836 + }, + { + "epoch": 1.8490006589062156, + "grad_norm": 2.1899986267089844, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7029422521591187, + "num_tokens": 420296915.0, + "step": 16837 + }, + { + "epoch": 1.8491104766088293, + "grad_norm": 2.7165231704711914, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7337824106216431, + "num_tokens": 420315497.0, + "step": 16838 + }, + { + "epoch": 1.849220294311443, + "grad_norm": 2.3474647998809814, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7256223559379578, + "num_tokens": 420340981.0, + "step": 16839 + }, + { + "epoch": 1.8493301120140566, + "grad_norm": 1.931312918663025, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7002435922622681, + "num_tokens": 420374200.0, + "step": 16840 + }, + { + "epoch": 1.8494399297166704, + "grad_norm": 2.5797133445739746, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7506102323532104, + "num_tokens": 420393539.0, + "step": 16841 + }, + { + "epoch": 1.849549747419284, + "grad_norm": 1.9821863174438477, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.69915372133255, + "num_tokens": 420427383.0, + "step": 16842 + }, + { + "epoch": 1.8496595651218977, + "grad_norm": 2.380383014678955, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7227379679679871, + "num_tokens": 420449160.0, + "step": 16843 + }, + { + "epoch": 1.8497693828245114, + "grad_norm": 2.5428481101989746, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.737545371055603, + "num_tokens": 420469122.0, + "step": 16844 + }, + { + "epoch": 1.849879200527125, + "grad_norm": 1.9949318170547485, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7279608845710754, + "num_tokens": 420498901.0, + "step": 16845 + }, + { + "epoch": 1.8499890182297385, + "grad_norm": 2.135011911392212, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6991020441055298, + "num_tokens": 420526017.0, + "step": 16846 + }, + { + "epoch": 1.8500988359323522, + "grad_norm": 2.176442861557007, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7004770040512085, + "num_tokens": 420555707.0, + "step": 16847 + }, + { + "epoch": 1.850208653634966, + "grad_norm": 2.2492103576660156, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7170891165733337, + "num_tokens": 420582010.0, + "step": 16848 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 2.470885992050171, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7264832854270935, + "num_tokens": 420603321.0, + "step": 16849 + }, + { + "epoch": 1.8504282890401933, + "grad_norm": 2.1840951442718506, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7267730236053467, + "num_tokens": 420628790.0, + "step": 16850 + }, + { + "epoch": 1.8505381067428068, + "grad_norm": 2.0691075325012207, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7213743925094604, + "num_tokens": 420656883.0, + "step": 16851 + }, + { + "epoch": 1.8506479244454206, + "grad_norm": 2.469513416290283, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.754059910774231, + "num_tokens": 420677068.0, + "step": 16852 + }, + { + "epoch": 1.8507577421480343, + "grad_norm": 2.338918924331665, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7240958213806152, + "num_tokens": 420699890.0, + "step": 16853 + }, + { + "epoch": 1.8508675598506479, + "grad_norm": 2.2992310523986816, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7211026549339294, + "num_tokens": 420725146.0, + "step": 16854 + }, + { + "epoch": 1.8509773775532616, + "grad_norm": 2.3965141773223877, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7083613872528076, + "num_tokens": 420749691.0, + "step": 16855 + }, + { + "epoch": 1.8510871952558752, + "grad_norm": 2.1796422004699707, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7159606218338013, + "num_tokens": 420777627.0, + "step": 16856 + }, + { + "epoch": 1.851197012958489, + "grad_norm": 2.149925947189331, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7249319553375244, + "num_tokens": 420803591.0, + "step": 16857 + }, + { + "epoch": 1.8513068306611027, + "grad_norm": 2.2933876514434814, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6985969543457031, + "num_tokens": 420828483.0, + "step": 16858 + }, + { + "epoch": 1.8514166483637162, + "grad_norm": 2.650970220565796, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7249526977539062, + "num_tokens": 420848556.0, + "step": 16859 + }, + { + "epoch": 1.8515264660663298, + "grad_norm": 2.350496292114258, + "learning_rate": 1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.738166332244873, + "num_tokens": 420871628.0, + "step": 16860 + }, + { + "epoch": 1.8516362837689435, + "grad_norm": 2.306100368499756, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7417687773704529, + "num_tokens": 420895338.0, + "step": 16861 + }, + { + "epoch": 1.8517461014715573, + "grad_norm": 2.3080217838287354, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7319947481155396, + "num_tokens": 420919435.0, + "step": 16862 + }, + { + "epoch": 1.851855919174171, + "grad_norm": 2.170494556427002, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7160236835479736, + "num_tokens": 420945372.0, + "step": 16863 + }, + { + "epoch": 1.8519657368767846, + "grad_norm": 2.3747167587280273, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7429782152175903, + "num_tokens": 420966758.0, + "step": 16864 + }, + { + "epoch": 1.852075554579398, + "grad_norm": 1.9385871887207031, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.71045982837677, + "num_tokens": 420997486.0, + "step": 16865 + }, + { + "epoch": 1.8521853722820119, + "grad_norm": 2.2348415851593018, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7067974805831909, + "num_tokens": 421023076.0, + "step": 16866 + }, + { + "epoch": 1.8522951899846256, + "grad_norm": 2.0110983848571777, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7170974016189575, + "num_tokens": 421054727.0, + "step": 16867 + }, + { + "epoch": 1.8524050076872391, + "grad_norm": 2.2115323543548584, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7234748005867004, + "num_tokens": 421080494.0, + "step": 16868 + }, + { + "epoch": 1.8525148253898527, + "grad_norm": 2.205735206604004, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7299714088439941, + "num_tokens": 421107625.0, + "step": 16869 + }, + { + "epoch": 1.8526246430924664, + "grad_norm": 2.015258550643921, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.737467885017395, + "num_tokens": 421135543.0, + "step": 16870 + }, + { + "epoch": 1.8527344607950802, + "grad_norm": 2.181102991104126, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7388654947280884, + "num_tokens": 421161299.0, + "step": 16871 + }, + { + "epoch": 1.852844278497694, + "grad_norm": 2.128307819366455, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.722338080406189, + "num_tokens": 421190695.0, + "step": 16872 + }, + { + "epoch": 1.8529540962003075, + "grad_norm": 2.3911654949188232, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7239556908607483, + "num_tokens": 421212557.0, + "step": 16873 + }, + { + "epoch": 1.853063913902921, + "grad_norm": 2.362004041671753, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.718310534954071, + "num_tokens": 421237444.0, + "step": 16874 + }, + { + "epoch": 1.8531737316055348, + "grad_norm": 2.2956864833831787, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7188159823417664, + "num_tokens": 421263146.0, + "step": 16875 + }, + { + "epoch": 1.8532835493081485, + "grad_norm": 2.3501598834991455, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7281500101089478, + "num_tokens": 421285893.0, + "step": 16876 + }, + { + "epoch": 1.8533933670107623, + "grad_norm": 2.3726308345794678, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7064494490623474, + "num_tokens": 421310832.0, + "step": 16877 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 2.303084135055542, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7572243809700012, + "num_tokens": 421334327.0, + "step": 16878 + }, + { + "epoch": 1.8536130024159894, + "grad_norm": 2.0126616954803467, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7343040108680725, + "num_tokens": 421363691.0, + "step": 16879 + }, + { + "epoch": 1.8537228201186031, + "grad_norm": 2.411010980606079, + "learning_rate": 1e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7528102397918701, + "num_tokens": 421386835.0, + "step": 16880 + }, + { + "epoch": 1.8538326378212169, + "grad_norm": 2.263364553451538, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7006707787513733, + "num_tokens": 421412067.0, + "step": 16881 + }, + { + "epoch": 1.8539424555238304, + "grad_norm": 2.1360554695129395, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7168124318122864, + "num_tokens": 421439513.0, + "step": 16882 + }, + { + "epoch": 1.854052273226444, + "grad_norm": 2.1768813133239746, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7224489450454712, + "num_tokens": 421466671.0, + "step": 16883 + }, + { + "epoch": 1.8541620909290577, + "grad_norm": 2.4536831378936768, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7385179996490479, + "num_tokens": 421487368.0, + "step": 16884 + }, + { + "epoch": 1.8542719086316715, + "grad_norm": 2.4386098384857178, + "learning_rate": 1e-06, + "loss": 0.7717, + "mean_token_accuracy": 0.7491145133972168, + "num_tokens": 421507880.0, + "step": 16885 + }, + { + "epoch": 1.8543817263342852, + "grad_norm": 2.4569146633148193, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7409548759460449, + "num_tokens": 421528858.0, + "step": 16886 + }, + { + "epoch": 1.8544915440368988, + "grad_norm": 2.2785327434539795, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7168301343917847, + "num_tokens": 421552180.0, + "step": 16887 + }, + { + "epoch": 1.8546013617395123, + "grad_norm": 2.3794872760772705, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7173522710800171, + "num_tokens": 421576691.0, + "step": 16888 + }, + { + "epoch": 1.854711179442126, + "grad_norm": 2.185176372528076, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7184034585952759, + "num_tokens": 421602442.0, + "step": 16889 + }, + { + "epoch": 1.8548209971447398, + "grad_norm": 2.5270869731903076, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7407004833221436, + "num_tokens": 421622686.0, + "step": 16890 + }, + { + "epoch": 1.8549308148473536, + "grad_norm": 2.1911063194274902, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6979551315307617, + "num_tokens": 421651389.0, + "step": 16891 + }, + { + "epoch": 1.855040632549967, + "grad_norm": 2.5993475914001465, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7375636100769043, + "num_tokens": 421669857.0, + "step": 16892 + }, + { + "epoch": 1.8551504502525806, + "grad_norm": 2.5185115337371826, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7115824818611145, + "num_tokens": 421691714.0, + "step": 16893 + }, + { + "epoch": 1.8552602679551944, + "grad_norm": 2.2014245986938477, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.691421627998352, + "num_tokens": 421720010.0, + "step": 16894 + }, + { + "epoch": 1.8553700856578081, + "grad_norm": 2.464782238006592, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7291770577430725, + "num_tokens": 421741914.0, + "step": 16895 + }, + { + "epoch": 1.8554799033604217, + "grad_norm": 2.477027654647827, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7339040040969849, + "num_tokens": 421763406.0, + "step": 16896 + }, + { + "epoch": 1.8555897210630352, + "grad_norm": 2.260481357574463, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7092639207839966, + "num_tokens": 421789580.0, + "step": 16897 + }, + { + "epoch": 1.855699538765649, + "grad_norm": 2.445340156555176, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7349700331687927, + "num_tokens": 421811201.0, + "step": 16898 + }, + { + "epoch": 1.8558093564682627, + "grad_norm": 2.3358051776885986, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6938337087631226, + "num_tokens": 421838033.0, + "step": 16899 + }, + { + "epoch": 1.8559191741708765, + "grad_norm": 2.4296886920928955, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7128807306289673, + "num_tokens": 421859927.0, + "step": 16900 + }, + { + "epoch": 1.85602899187349, + "grad_norm": 2.2403175830841064, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7293920516967773, + "num_tokens": 421887355.0, + "step": 16901 + }, + { + "epoch": 1.8561388095761036, + "grad_norm": 2.3067843914031982, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7210494875907898, + "num_tokens": 421910011.0, + "step": 16902 + }, + { + "epoch": 1.8562486272787173, + "grad_norm": 2.485189914703369, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7010140419006348, + "num_tokens": 421933942.0, + "step": 16903 + }, + { + "epoch": 1.856358444981331, + "grad_norm": 2.087111473083496, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7120366096496582, + "num_tokens": 421962463.0, + "step": 16904 + }, + { + "epoch": 1.8564682626839446, + "grad_norm": 2.1932756900787354, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7282596826553345, + "num_tokens": 421989108.0, + "step": 16905 + }, + { + "epoch": 1.8565780803865584, + "grad_norm": 2.383603572845459, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7221804261207581, + "num_tokens": 422012858.0, + "step": 16906 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 2.4560227394104004, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7408558130264282, + "num_tokens": 422034707.0, + "step": 16907 + }, + { + "epoch": 1.8567977157917857, + "grad_norm": 2.250624895095825, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7287546992301941, + "num_tokens": 422059791.0, + "step": 16908 + }, + { + "epoch": 1.8569075334943994, + "grad_norm": 2.6972975730895996, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7071929574012756, + "num_tokens": 422080768.0, + "step": 16909 + }, + { + "epoch": 1.857017351197013, + "grad_norm": 2.0131683349609375, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7156673669815063, + "num_tokens": 422112759.0, + "step": 16910 + }, + { + "epoch": 1.8571271688996265, + "grad_norm": 2.5169076919555664, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.723395049571991, + "num_tokens": 422133702.0, + "step": 16911 + }, + { + "epoch": 1.8572369866022402, + "grad_norm": 2.355362892150879, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7373207211494446, + "num_tokens": 422155427.0, + "step": 16912 + }, + { + "epoch": 1.857346804304854, + "grad_norm": 2.215651035308838, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7080600261688232, + "num_tokens": 422181903.0, + "step": 16913 + }, + { + "epoch": 1.8574566220074678, + "grad_norm": 2.450641393661499, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7188971042633057, + "num_tokens": 422202993.0, + "step": 16914 + }, + { + "epoch": 1.8575664397100813, + "grad_norm": 2.1968202590942383, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7440924644470215, + "num_tokens": 422227872.0, + "step": 16915 + }, + { + "epoch": 1.8576762574126948, + "grad_norm": 2.488018035888672, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7186465263366699, + "num_tokens": 422250628.0, + "step": 16916 + }, + { + "epoch": 1.8577860751153086, + "grad_norm": 2.139331102371216, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6922885179519653, + "num_tokens": 422279603.0, + "step": 16917 + }, + { + "epoch": 1.8578958928179223, + "grad_norm": 2.1379594802856445, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7038048505783081, + "num_tokens": 422309492.0, + "step": 16918 + }, + { + "epoch": 1.8580057105205359, + "grad_norm": 2.3281731605529785, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7025788426399231, + "num_tokens": 422332674.0, + "step": 16919 + }, + { + "epoch": 1.8581155282231496, + "grad_norm": 2.6036643981933594, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.734435498714447, + "num_tokens": 422352743.0, + "step": 16920 + }, + { + "epoch": 1.8582253459257632, + "grad_norm": 2.369277238845825, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7093610763549805, + "num_tokens": 422376189.0, + "step": 16921 + }, + { + "epoch": 1.858335163628377, + "grad_norm": 2.1726062297821045, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7026312351226807, + "num_tokens": 422404942.0, + "step": 16922 + }, + { + "epoch": 1.8584449813309907, + "grad_norm": 2.8743624687194824, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7382592558860779, + "num_tokens": 422422559.0, + "step": 16923 + }, + { + "epoch": 1.8585547990336042, + "grad_norm": 2.305323362350464, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7240453362464905, + "num_tokens": 422446437.0, + "step": 16924 + }, + { + "epoch": 1.8586646167362177, + "grad_norm": 2.0705726146698, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7366855144500732, + "num_tokens": 422473150.0, + "step": 16925 + }, + { + "epoch": 1.8587744344388315, + "grad_norm": 2.022372007369995, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7260621786117554, + "num_tokens": 422501957.0, + "step": 16926 + }, + { + "epoch": 1.8588842521414453, + "grad_norm": 2.195058822631836, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.712926983833313, + "num_tokens": 422527754.0, + "step": 16927 + }, + { + "epoch": 1.858994069844059, + "grad_norm": 2.3829612731933594, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.73261559009552, + "num_tokens": 422550648.0, + "step": 16928 + }, + { + "epoch": 1.8591038875466726, + "grad_norm": 2.125778913497925, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6856697797775269, + "num_tokens": 422580905.0, + "step": 16929 + }, + { + "epoch": 1.859213705249286, + "grad_norm": 2.3389692306518555, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7045285701751709, + "num_tokens": 422605616.0, + "step": 16930 + }, + { + "epoch": 1.8593235229518998, + "grad_norm": 2.759970188140869, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7474470138549805, + "num_tokens": 422625098.0, + "step": 16931 + }, + { + "epoch": 1.8594333406545136, + "grad_norm": 2.5849106311798096, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7252925634384155, + "num_tokens": 422646574.0, + "step": 16932 + }, + { + "epoch": 1.8595431583571271, + "grad_norm": 2.2539689540863037, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7193180322647095, + "num_tokens": 422672630.0, + "step": 16933 + }, + { + "epoch": 1.8596529760597407, + "grad_norm": 2.388066053390503, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7459090948104858, + "num_tokens": 422695484.0, + "step": 16934 + }, + { + "epoch": 1.8597627937623544, + "grad_norm": 2.2897183895111084, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7399383783340454, + "num_tokens": 422718764.0, + "step": 16935 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 2.304104804992676, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7016124725341797, + "num_tokens": 422743292.0, + "step": 16936 + }, + { + "epoch": 1.859982429167582, + "grad_norm": 2.4505255222320557, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7195057272911072, + "num_tokens": 422766045.0, + "step": 16937 + }, + { + "epoch": 1.8600922468701955, + "grad_norm": 2.4980926513671875, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.722507655620575, + "num_tokens": 422787930.0, + "step": 16938 + }, + { + "epoch": 1.860202064572809, + "grad_norm": 2.5313282012939453, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7514076232910156, + "num_tokens": 422807451.0, + "step": 16939 + }, + { + "epoch": 1.8603118822754228, + "grad_norm": 2.0489261150360107, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7180799245834351, + "num_tokens": 422835876.0, + "step": 16940 + }, + { + "epoch": 1.8604216999780365, + "grad_norm": 2.216165542602539, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7154252529144287, + "num_tokens": 422860342.0, + "step": 16941 + }, + { + "epoch": 1.8605315176806503, + "grad_norm": 2.3405520915985107, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.740043580532074, + "num_tokens": 422883438.0, + "step": 16942 + }, + { + "epoch": 1.8606413353832638, + "grad_norm": 1.9427121877670288, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7238191366195679, + "num_tokens": 422915081.0, + "step": 16943 + }, + { + "epoch": 1.8607511530858774, + "grad_norm": 2.3941073417663574, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7335478067398071, + "num_tokens": 422937884.0, + "step": 16944 + }, + { + "epoch": 1.860860970788491, + "grad_norm": 2.2230498790740967, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7187806367874146, + "num_tokens": 422964831.0, + "step": 16945 + }, + { + "epoch": 1.8609707884911049, + "grad_norm": 2.2437822818756104, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7178466320037842, + "num_tokens": 422990409.0, + "step": 16946 + }, + { + "epoch": 1.8610806061937184, + "grad_norm": 2.1959426403045654, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7023775577545166, + "num_tokens": 423018181.0, + "step": 16947 + }, + { + "epoch": 1.861190423896332, + "grad_norm": 2.2616026401519775, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7239365577697754, + "num_tokens": 423043197.0, + "step": 16948 + }, + { + "epoch": 1.8613002415989457, + "grad_norm": 1.8565276861190796, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7201849222183228, + "num_tokens": 423078077.0, + "step": 16949 + }, + { + "epoch": 1.8614100593015594, + "grad_norm": 2.1034035682678223, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.709854245185852, + "num_tokens": 423107593.0, + "step": 16950 + }, + { + "epoch": 1.8615198770041732, + "grad_norm": 2.253520965576172, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7211616039276123, + "num_tokens": 423132539.0, + "step": 16951 + }, + { + "epoch": 1.8616296947067867, + "grad_norm": 2.4050636291503906, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7287935018539429, + "num_tokens": 423154490.0, + "step": 16952 + }, + { + "epoch": 1.8617395124094003, + "grad_norm": 2.4117510318756104, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7072474360466003, + "num_tokens": 423177375.0, + "step": 16953 + }, + { + "epoch": 1.861849330112014, + "grad_norm": 2.2087152004241943, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.713839054107666, + "num_tokens": 423203372.0, + "step": 16954 + }, + { + "epoch": 1.8619591478146278, + "grad_norm": 2.458077907562256, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7401291131973267, + "num_tokens": 423224961.0, + "step": 16955 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 2.096508502960205, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.6986489295959473, + "num_tokens": 423253321.0, + "step": 16956 + }, + { + "epoch": 1.862178783219855, + "grad_norm": 2.049893856048584, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7015026807785034, + "num_tokens": 423281195.0, + "step": 16957 + }, + { + "epoch": 1.8622886009224686, + "grad_norm": 2.1980934143066406, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6976829767227173, + "num_tokens": 423310609.0, + "step": 16958 + }, + { + "epoch": 1.8623984186250824, + "grad_norm": 2.162123441696167, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7089248299598694, + "num_tokens": 423340180.0, + "step": 16959 + }, + { + "epoch": 1.8625082363276961, + "grad_norm": 1.9019556045532227, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7117958664894104, + "num_tokens": 423372917.0, + "step": 16960 + }, + { + "epoch": 1.8626180540303097, + "grad_norm": 2.2452316284179688, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7286756634712219, + "num_tokens": 423396817.0, + "step": 16961 + }, + { + "epoch": 1.8627278717329232, + "grad_norm": 2.4900662899017334, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7138474583625793, + "num_tokens": 423418195.0, + "step": 16962 + }, + { + "epoch": 1.862837689435537, + "grad_norm": 2.1985249519348145, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6917517185211182, + "num_tokens": 423445272.0, + "step": 16963 + }, + { + "epoch": 1.8629475071381507, + "grad_norm": 2.5024943351745605, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7053940296173096, + "num_tokens": 423468286.0, + "step": 16964 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 2.084423065185547, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.707412600517273, + "num_tokens": 423499088.0, + "step": 16965 + }, + { + "epoch": 1.863167142543378, + "grad_norm": 2.1108646392822266, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7332099080085754, + "num_tokens": 423526328.0, + "step": 16966 + }, + { + "epoch": 1.8632769602459915, + "grad_norm": 2.686353921890259, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7266218066215515, + "num_tokens": 423545460.0, + "step": 16967 + }, + { + "epoch": 1.8633867779486053, + "grad_norm": 2.2359282970428467, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7082449793815613, + "num_tokens": 423573183.0, + "step": 16968 + }, + { + "epoch": 1.863496595651219, + "grad_norm": 2.098742723464966, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6991580724716187, + "num_tokens": 423602399.0, + "step": 16969 + }, + { + "epoch": 1.8636064133538326, + "grad_norm": 2.0999011993408203, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6957794427871704, + "num_tokens": 423632719.0, + "step": 16970 + }, + { + "epoch": 1.8637162310564463, + "grad_norm": 2.4298746585845947, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7196890115737915, + "num_tokens": 423655033.0, + "step": 16971 + }, + { + "epoch": 1.8638260487590599, + "grad_norm": 2.37685227394104, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.738777756690979, + "num_tokens": 423677281.0, + "step": 16972 + }, + { + "epoch": 1.8639358664616736, + "grad_norm": 2.293654203414917, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7312397956848145, + "num_tokens": 423700252.0, + "step": 16973 + }, + { + "epoch": 1.8640456841642874, + "grad_norm": 2.110555410385132, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7086764574050903, + "num_tokens": 423729398.0, + "step": 16974 + }, + { + "epoch": 1.864155501866901, + "grad_norm": 2.162841796875, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7327896356582642, + "num_tokens": 423756021.0, + "step": 16975 + }, + { + "epoch": 1.8642653195695145, + "grad_norm": 2.0509722232818604, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7200613021850586, + "num_tokens": 423783738.0, + "step": 16976 + }, + { + "epoch": 1.8643751372721282, + "grad_norm": 2.2861833572387695, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7238309383392334, + "num_tokens": 423806955.0, + "step": 16977 + }, + { + "epoch": 1.864484954974742, + "grad_norm": 2.378599166870117, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7383548021316528, + "num_tokens": 423829945.0, + "step": 16978 + }, + { + "epoch": 1.8645947726773557, + "grad_norm": 2.7912633419036865, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7467657327651978, + "num_tokens": 423847205.0, + "step": 16979 + }, + { + "epoch": 1.8647045903799693, + "grad_norm": 2.2306220531463623, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7041212320327759, + "num_tokens": 423873054.0, + "step": 16980 + }, + { + "epoch": 1.8648144080825828, + "grad_norm": 2.0765719413757324, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7270135879516602, + "num_tokens": 423899694.0, + "step": 16981 + }, + { + "epoch": 1.8649242257851966, + "grad_norm": 2.3485944271087646, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7083137035369873, + "num_tokens": 423924649.0, + "step": 16982 + }, + { + "epoch": 1.8650340434878103, + "grad_norm": 2.4264581203460693, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7095785737037659, + "num_tokens": 423947255.0, + "step": 16983 + }, + { + "epoch": 1.8651438611904239, + "grad_norm": 2.381319284439087, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7253550291061401, + "num_tokens": 423969972.0, + "step": 16984 + }, + { + "epoch": 1.8652536788930374, + "grad_norm": 2.2930338382720947, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7253739833831787, + "num_tokens": 423993507.0, + "step": 16985 + }, + { + "epoch": 1.8653634965956511, + "grad_norm": 2.2130208015441895, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7002314925193787, + "num_tokens": 424022530.0, + "step": 16986 + }, + { + "epoch": 1.865473314298265, + "grad_norm": 2.3046505451202393, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7234470844268799, + "num_tokens": 424047456.0, + "step": 16987 + }, + { + "epoch": 1.8655831320008787, + "grad_norm": 2.3218986988067627, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7079329490661621, + "num_tokens": 424073290.0, + "step": 16988 + }, + { + "epoch": 1.8656929497034922, + "grad_norm": 2.349740982055664, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7187917232513428, + "num_tokens": 424097572.0, + "step": 16989 + }, + { + "epoch": 1.8658027674061057, + "grad_norm": 2.4597573280334473, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7216266989707947, + "num_tokens": 424119241.0, + "step": 16990 + }, + { + "epoch": 1.8659125851087195, + "grad_norm": 2.3199803829193115, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7200783491134644, + "num_tokens": 424145285.0, + "step": 16991 + }, + { + "epoch": 1.8660224028113332, + "grad_norm": 2.328709125518799, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7136659026145935, + "num_tokens": 424170135.0, + "step": 16992 + }, + { + "epoch": 1.866132220513947, + "grad_norm": 2.6777501106262207, + "learning_rate": 1e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7379685640335083, + "num_tokens": 424189958.0, + "step": 16993 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 2.298105239868164, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7377450466156006, + "num_tokens": 424213591.0, + "step": 16994 + }, + { + "epoch": 1.866351855919174, + "grad_norm": 2.5491490364074707, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7500450611114502, + "num_tokens": 424233327.0, + "step": 16995 + }, + { + "epoch": 1.8664616736217878, + "grad_norm": 2.4315106868743896, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7273195385932922, + "num_tokens": 424255911.0, + "step": 16996 + }, + { + "epoch": 1.8665714913244016, + "grad_norm": 2.436753034591675, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7419641017913818, + "num_tokens": 424277587.0, + "step": 16997 + }, + { + "epoch": 1.8666813090270151, + "grad_norm": 2.242583990097046, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.715843915939331, + "num_tokens": 424303713.0, + "step": 16998 + }, + { + "epoch": 1.8667911267296287, + "grad_norm": 2.5839009284973145, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7447259426116943, + "num_tokens": 424324422.0, + "step": 16999 + }, + { + "epoch": 1.8669009444322424, + "grad_norm": 2.2862532138824463, + "learning_rate": 1e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7516312003135681, + "num_tokens": 424347012.0, + "step": 17000 + }, + { + "epoch": 1.8670107621348562, + "grad_norm": 2.1719532012939453, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7076015472412109, + "num_tokens": 424373297.0, + "step": 17001 + }, + { + "epoch": 1.86712057983747, + "grad_norm": 2.382404327392578, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7491960525512695, + "num_tokens": 424394216.0, + "step": 17002 + }, + { + "epoch": 1.8672303975400835, + "grad_norm": 2.28202748298645, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7448451519012451, + "num_tokens": 424418438.0, + "step": 17003 + }, + { + "epoch": 1.867340215242697, + "grad_norm": 2.21531343460083, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7147462368011475, + "num_tokens": 424442720.0, + "step": 17004 + }, + { + "epoch": 1.8674500329453108, + "grad_norm": 2.3429973125457764, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7368212938308716, + "num_tokens": 424463918.0, + "step": 17005 + }, + { + "epoch": 1.8675598506479245, + "grad_norm": 2.2110414505004883, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7144597768783569, + "num_tokens": 424487775.0, + "step": 17006 + }, + { + "epoch": 1.8676696683505383, + "grad_norm": 2.40769624710083, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7204513549804688, + "num_tokens": 424512136.0, + "step": 17007 + }, + { + "epoch": 1.8677794860531518, + "grad_norm": 2.4045050144195557, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7256121039390564, + "num_tokens": 424534715.0, + "step": 17008 + }, + { + "epoch": 1.8678893037557653, + "grad_norm": 2.566042900085449, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7167333364486694, + "num_tokens": 424554975.0, + "step": 17009 + }, + { + "epoch": 1.867999121458379, + "grad_norm": 2.3566079139709473, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.714751124382019, + "num_tokens": 424578775.0, + "step": 17010 + }, + { + "epoch": 1.8681089391609929, + "grad_norm": 2.373290777206421, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7358875274658203, + "num_tokens": 424601606.0, + "step": 17011 + }, + { + "epoch": 1.8682187568636064, + "grad_norm": 2.324472665786743, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7340531349182129, + "num_tokens": 424626344.0, + "step": 17012 + }, + { + "epoch": 1.86832857456622, + "grad_norm": 2.242514133453369, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7246731519699097, + "num_tokens": 424652144.0, + "step": 17013 + }, + { + "epoch": 1.8684383922688337, + "grad_norm": 2.4180710315704346, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.718963086605072, + "num_tokens": 424675587.0, + "step": 17014 + }, + { + "epoch": 1.8685482099714474, + "grad_norm": 2.491119623184204, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7227155566215515, + "num_tokens": 424697051.0, + "step": 17015 + }, + { + "epoch": 1.8686580276740612, + "grad_norm": 2.2551000118255615, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7058566212654114, + "num_tokens": 424723276.0, + "step": 17016 + }, + { + "epoch": 1.8687678453766747, + "grad_norm": 2.0121941566467285, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7119382619857788, + "num_tokens": 424753660.0, + "step": 17017 + }, + { + "epoch": 1.8688776630792883, + "grad_norm": 2.1285440921783447, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7217258214950562, + "num_tokens": 424781939.0, + "step": 17018 + }, + { + "epoch": 1.868987480781902, + "grad_norm": 1.941448450088501, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.705020546913147, + "num_tokens": 424815388.0, + "step": 17019 + }, + { + "epoch": 1.8690972984845158, + "grad_norm": 2.350632905960083, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7537658214569092, + "num_tokens": 424837863.0, + "step": 17020 + }, + { + "epoch": 1.8692071161871293, + "grad_norm": 2.2471907138824463, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7259441018104553, + "num_tokens": 424861715.0, + "step": 17021 + }, + { + "epoch": 1.869316933889743, + "grad_norm": 2.092576026916504, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7254289388656616, + "num_tokens": 424887393.0, + "step": 17022 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 2.1477959156036377, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7322610020637512, + "num_tokens": 424913696.0, + "step": 17023 + }, + { + "epoch": 1.8695365692949704, + "grad_norm": 2.2214386463165283, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7171374559402466, + "num_tokens": 424940140.0, + "step": 17024 + }, + { + "epoch": 1.8696463869975841, + "grad_norm": 2.5036919116973877, + "learning_rate": 1e-06, + "loss": 0.7956, + "mean_token_accuracy": 0.7448497414588928, + "num_tokens": 424962388.0, + "step": 17025 + }, + { + "epoch": 1.8697562047001977, + "grad_norm": 2.2729029655456543, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7363659739494324, + "num_tokens": 424985349.0, + "step": 17026 + }, + { + "epoch": 1.8698660224028112, + "grad_norm": 2.282813310623169, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7015841007232666, + "num_tokens": 425011480.0, + "step": 17027 + }, + { + "epoch": 1.869975840105425, + "grad_norm": 2.373563051223755, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7269847393035889, + "num_tokens": 425033675.0, + "step": 17028 + }, + { + "epoch": 1.8700856578080387, + "grad_norm": 2.4406087398529053, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7059610486030579, + "num_tokens": 425055910.0, + "step": 17029 + }, + { + "epoch": 1.8701954755106525, + "grad_norm": 2.280327796936035, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7124055624008179, + "num_tokens": 425082611.0, + "step": 17030 + }, + { + "epoch": 1.870305293213266, + "grad_norm": 2.9715771675109863, + "learning_rate": 1e-06, + "loss": 0.7366, + "mean_token_accuracy": 0.75215083360672, + "num_tokens": 425097876.0, + "step": 17031 + }, + { + "epoch": 1.8704151109158795, + "grad_norm": 2.180040121078491, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7319345474243164, + "num_tokens": 425123117.0, + "step": 17032 + }, + { + "epoch": 1.8705249286184933, + "grad_norm": 2.5054233074188232, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7322598099708557, + "num_tokens": 425144999.0, + "step": 17033 + }, + { + "epoch": 1.870634746321107, + "grad_norm": 2.0858232975006104, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7205927968025208, + "num_tokens": 425174136.0, + "step": 17034 + }, + { + "epoch": 1.8707445640237206, + "grad_norm": 2.2870867252349854, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7190238237380981, + "num_tokens": 425198947.0, + "step": 17035 + }, + { + "epoch": 1.8708543817263343, + "grad_norm": 2.182520866394043, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7053493857383728, + "num_tokens": 425226753.0, + "step": 17036 + }, + { + "epoch": 1.8709641994289479, + "grad_norm": 2.277707815170288, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7095094919204712, + "num_tokens": 425250697.0, + "step": 17037 + }, + { + "epoch": 1.8710740171315616, + "grad_norm": 2.4617836475372314, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7408182621002197, + "num_tokens": 425273278.0, + "step": 17038 + }, + { + "epoch": 1.8711838348341754, + "grad_norm": 2.1429080963134766, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.72507643699646, + "num_tokens": 425298721.0, + "step": 17039 + }, + { + "epoch": 1.871293652536789, + "grad_norm": 2.233267068862915, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7013262510299683, + "num_tokens": 425324860.0, + "step": 17040 + }, + { + "epoch": 1.8714034702394025, + "grad_norm": 2.669832468032837, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7451267242431641, + "num_tokens": 425343324.0, + "step": 17041 + }, + { + "epoch": 1.8715132879420162, + "grad_norm": 2.265364408493042, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7360581755638123, + "num_tokens": 425368238.0, + "step": 17042 + }, + { + "epoch": 1.87162310564463, + "grad_norm": 2.141990900039673, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7395217418670654, + "num_tokens": 425393829.0, + "step": 17043 + }, + { + "epoch": 1.8717329233472437, + "grad_norm": 2.1371681690216064, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.717847466468811, + "num_tokens": 425419755.0, + "step": 17044 + }, + { + "epoch": 1.8718427410498573, + "grad_norm": 2.483044147491455, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7555833458900452, + "num_tokens": 425440252.0, + "step": 17045 + }, + { + "epoch": 1.8719525587524708, + "grad_norm": 2.0783586502075195, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7031256556510925, + "num_tokens": 425474541.0, + "step": 17046 + }, + { + "epoch": 1.8720623764550846, + "grad_norm": 2.3921353816986084, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7206621170043945, + "num_tokens": 425495809.0, + "step": 17047 + }, + { + "epoch": 1.8721721941576983, + "grad_norm": 2.1685984134674072, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7255503535270691, + "num_tokens": 425521259.0, + "step": 17048 + }, + { + "epoch": 1.8722820118603118, + "grad_norm": 2.276707649230957, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7026098370552063, + "num_tokens": 425547183.0, + "step": 17049 + }, + { + "epoch": 1.8723918295629254, + "grad_norm": 2.543304204940796, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.717856764793396, + "num_tokens": 425570312.0, + "step": 17050 + }, + { + "epoch": 1.8725016472655391, + "grad_norm": 2.284532308578491, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7260856628417969, + "num_tokens": 425595175.0, + "step": 17051 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 2.159193277359009, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7007855176925659, + "num_tokens": 425623213.0, + "step": 17052 + }, + { + "epoch": 1.8727212826707667, + "grad_norm": 2.3240063190460205, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7365149855613708, + "num_tokens": 425645944.0, + "step": 17053 + }, + { + "epoch": 1.8728311003733802, + "grad_norm": 2.1659252643585205, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7044064402580261, + "num_tokens": 425673614.0, + "step": 17054 + }, + { + "epoch": 1.8729409180759937, + "grad_norm": 2.1637966632843018, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7063982486724854, + "num_tokens": 425701434.0, + "step": 17055 + }, + { + "epoch": 1.8730507357786075, + "grad_norm": 2.291830062866211, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7188868522644043, + "num_tokens": 425725567.0, + "step": 17056 + }, + { + "epoch": 1.8731605534812212, + "grad_norm": 2.156121253967285, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7258478403091431, + "num_tokens": 425753340.0, + "step": 17057 + }, + { + "epoch": 1.873270371183835, + "grad_norm": 2.907919406890869, + "learning_rate": 1e-06, + "loss": 0.7846, + "mean_token_accuracy": 0.7473509311676025, + "num_tokens": 425769969.0, + "step": 17058 + }, + { + "epoch": 1.8733801888864485, + "grad_norm": 2.7724177837371826, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7338670492172241, + "num_tokens": 425788098.0, + "step": 17059 + }, + { + "epoch": 1.873490006589062, + "grad_norm": 2.0496628284454346, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7120847105979919, + "num_tokens": 425819602.0, + "step": 17060 + }, + { + "epoch": 1.8735998242916758, + "grad_norm": 2.004012107849121, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6881368160247803, + "num_tokens": 425854522.0, + "step": 17061 + }, + { + "epoch": 1.8737096419942896, + "grad_norm": 2.372708559036255, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7357120513916016, + "num_tokens": 425876437.0, + "step": 17062 + }, + { + "epoch": 1.8738194596969031, + "grad_norm": 2.3690264225006104, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7114850282669067, + "num_tokens": 425900771.0, + "step": 17063 + }, + { + "epoch": 1.8739292773995166, + "grad_norm": 2.0981714725494385, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7063107490539551, + "num_tokens": 425930394.0, + "step": 17064 + }, + { + "epoch": 1.8740390951021304, + "grad_norm": 2.2105257511138916, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7453148365020752, + "num_tokens": 425955278.0, + "step": 17065 + }, + { + "epoch": 1.8741489128047442, + "grad_norm": 2.5998432636260986, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7406901121139526, + "num_tokens": 425974469.0, + "step": 17066 + }, + { + "epoch": 1.874258730507358, + "grad_norm": 2.166161060333252, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6973289847373962, + "num_tokens": 426001069.0, + "step": 17067 + }, + { + "epoch": 1.8743685482099715, + "grad_norm": 2.1688456535339355, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7273500561714172, + "num_tokens": 426030368.0, + "step": 17068 + }, + { + "epoch": 1.874478365912585, + "grad_norm": 2.1972804069519043, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7285505533218384, + "num_tokens": 426056377.0, + "step": 17069 + }, + { + "epoch": 1.8745881836151987, + "grad_norm": 2.1666371822357178, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7312946915626526, + "num_tokens": 426082611.0, + "step": 17070 + }, + { + "epoch": 1.8746980013178125, + "grad_norm": 2.340787172317505, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7205824255943298, + "num_tokens": 426108825.0, + "step": 17071 + }, + { + "epoch": 1.8748078190204263, + "grad_norm": 2.046234607696533, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7078562378883362, + "num_tokens": 426139937.0, + "step": 17072 + }, + { + "epoch": 1.8749176367230398, + "grad_norm": 2.2492520809173584, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7200601100921631, + "num_tokens": 426164937.0, + "step": 17073 + }, + { + "epoch": 1.8750274544256533, + "grad_norm": 2.8857898712158203, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7171468734741211, + "num_tokens": 426183041.0, + "step": 17074 + }, + { + "epoch": 1.875137272128267, + "grad_norm": 2.354579210281372, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7262046337127686, + "num_tokens": 426206096.0, + "step": 17075 + }, + { + "epoch": 1.8752470898308808, + "grad_norm": 2.370650291442871, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7206472158432007, + "num_tokens": 426229815.0, + "step": 17076 + }, + { + "epoch": 1.8753569075334944, + "grad_norm": 2.3240771293640137, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7160611152648926, + "num_tokens": 426254190.0, + "step": 17077 + }, + { + "epoch": 1.875466725236108, + "grad_norm": 2.473966121673584, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7392264008522034, + "num_tokens": 426275037.0, + "step": 17078 + }, + { + "epoch": 1.8755765429387217, + "grad_norm": 2.018944501876831, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.728804349899292, + "num_tokens": 426306482.0, + "step": 17079 + }, + { + "epoch": 1.8756863606413354, + "grad_norm": 2.3903558254241943, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7435446977615356, + "num_tokens": 426330931.0, + "step": 17080 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 2.550844669342041, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7314472198486328, + "num_tokens": 426354389.0, + "step": 17081 + }, + { + "epoch": 1.8759059960465627, + "grad_norm": 2.3649213314056396, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7363665103912354, + "num_tokens": 426377719.0, + "step": 17082 + }, + { + "epoch": 1.8760158137491763, + "grad_norm": 2.5743155479431152, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7242505550384521, + "num_tokens": 426398431.0, + "step": 17083 + }, + { + "epoch": 1.87612563145179, + "grad_norm": 2.297394037246704, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7268288135528564, + "num_tokens": 426421671.0, + "step": 17084 + }, + { + "epoch": 1.8762354491544038, + "grad_norm": 2.1919941902160645, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7305214405059814, + "num_tokens": 426448989.0, + "step": 17085 + }, + { + "epoch": 1.8763452668570173, + "grad_norm": 2.0417537689208984, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7103028893470764, + "num_tokens": 426480341.0, + "step": 17086 + }, + { + "epoch": 1.876455084559631, + "grad_norm": 2.3251211643218994, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7285966873168945, + "num_tokens": 426504364.0, + "step": 17087 + }, + { + "epoch": 1.8765649022622446, + "grad_norm": 2.459294319152832, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7234380841255188, + "num_tokens": 426525066.0, + "step": 17088 + }, + { + "epoch": 1.8766747199648584, + "grad_norm": 2.3583409786224365, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7192301154136658, + "num_tokens": 426548886.0, + "step": 17089 + }, + { + "epoch": 1.876784537667472, + "grad_norm": 2.20790958404541, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7170122861862183, + "num_tokens": 426573533.0, + "step": 17090 + }, + { + "epoch": 1.8768943553700856, + "grad_norm": 2.258289337158203, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7433551549911499, + "num_tokens": 426596634.0, + "step": 17091 + }, + { + "epoch": 1.8770041730726992, + "grad_norm": 2.2459876537323, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7120105028152466, + "num_tokens": 426622024.0, + "step": 17092 + }, + { + "epoch": 1.877113990775313, + "grad_norm": 2.237539768218994, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6901944875717163, + "num_tokens": 426647125.0, + "step": 17093 + }, + { + "epoch": 1.8772238084779267, + "grad_norm": 2.7786755561828613, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7353034019470215, + "num_tokens": 426666192.0, + "step": 17094 + }, + { + "epoch": 1.8773336261805404, + "grad_norm": 2.1865694522857666, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7545472383499146, + "num_tokens": 426689780.0, + "step": 17095 + }, + { + "epoch": 1.877443443883154, + "grad_norm": 2.2025322914123535, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7230196595191956, + "num_tokens": 426714694.0, + "step": 17096 + }, + { + "epoch": 1.8775532615857675, + "grad_norm": 2.0865375995635986, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7297119498252869, + "num_tokens": 426741380.0, + "step": 17097 + }, + { + "epoch": 1.8776630792883813, + "grad_norm": 2.4380581378936768, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7346689701080322, + "num_tokens": 426762003.0, + "step": 17098 + }, + { + "epoch": 1.877772896990995, + "grad_norm": 2.4231348037719727, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7455312609672546, + "num_tokens": 426782628.0, + "step": 17099 + }, + { + "epoch": 1.8778827146936086, + "grad_norm": 1.914167881011963, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7063184976577759, + "num_tokens": 426817151.0, + "step": 17100 + }, + { + "epoch": 1.8779925323962223, + "grad_norm": 2.3913938999176025, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.708952784538269, + "num_tokens": 426841984.0, + "step": 17101 + }, + { + "epoch": 1.8781023500988359, + "grad_norm": 2.384551525115967, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7177640199661255, + "num_tokens": 426865082.0, + "step": 17102 + }, + { + "epoch": 1.8782121678014496, + "grad_norm": 2.2296395301818848, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.696002721786499, + "num_tokens": 426891264.0, + "step": 17103 + }, + { + "epoch": 1.8783219855040634, + "grad_norm": 2.602104902267456, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7447395324707031, + "num_tokens": 426910831.0, + "step": 17104 + }, + { + "epoch": 1.878431803206677, + "grad_norm": 2.156949996948242, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7049986124038696, + "num_tokens": 426939259.0, + "step": 17105 + }, + { + "epoch": 1.8785416209092904, + "grad_norm": 2.0295369625091553, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.723971962928772, + "num_tokens": 426973223.0, + "step": 17106 + }, + { + "epoch": 1.8786514386119042, + "grad_norm": 2.474287271499634, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7223271727561951, + "num_tokens": 426997083.0, + "step": 17107 + }, + { + "epoch": 1.878761256314518, + "grad_norm": 2.516094446182251, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7320630550384521, + "num_tokens": 427016895.0, + "step": 17108 + }, + { + "epoch": 1.8788710740171317, + "grad_norm": 2.067502737045288, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.720118522644043, + "num_tokens": 427044574.0, + "step": 17109 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 1.9158610105514526, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7119696140289307, + "num_tokens": 427078220.0, + "step": 17110 + }, + { + "epoch": 1.8790907094223588, + "grad_norm": 2.0391812324523926, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7056393623352051, + "num_tokens": 427107186.0, + "step": 17111 + }, + { + "epoch": 1.8792005271249725, + "grad_norm": 2.4018115997314453, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7300054430961609, + "num_tokens": 427130749.0, + "step": 17112 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 2.2416040897369385, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7350705862045288, + "num_tokens": 427155367.0, + "step": 17113 + }, + { + "epoch": 1.8794201625301998, + "grad_norm": 2.296753406524658, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7012729048728943, + "num_tokens": 427181799.0, + "step": 17114 + }, + { + "epoch": 1.8795299802328134, + "grad_norm": 2.5337371826171875, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7280040979385376, + "num_tokens": 427202424.0, + "step": 17115 + }, + { + "epoch": 1.8796397979354271, + "grad_norm": 2.0509510040283203, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7220684289932251, + "num_tokens": 427230633.0, + "step": 17116 + }, + { + "epoch": 1.8797496156380409, + "grad_norm": 2.6381921768188477, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7143756151199341, + "num_tokens": 427249307.0, + "step": 17117 + }, + { + "epoch": 1.8798594333406546, + "grad_norm": 2.388352394104004, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7500847578048706, + "num_tokens": 427270617.0, + "step": 17118 + }, + { + "epoch": 1.8799692510432682, + "grad_norm": 2.2449190616607666, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7040280103683472, + "num_tokens": 427297356.0, + "step": 17119 + }, + { + "epoch": 1.8800790687458817, + "grad_norm": 2.518731117248535, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7229564785957336, + "num_tokens": 427318824.0, + "step": 17120 + }, + { + "epoch": 1.8801888864484955, + "grad_norm": 2.5858638286590576, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7439841032028198, + "num_tokens": 427337348.0, + "step": 17121 + }, + { + "epoch": 1.8802987041511092, + "grad_norm": 2.0095999240875244, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7273061275482178, + "num_tokens": 427366785.0, + "step": 17122 + }, + { + "epoch": 1.880408521853723, + "grad_norm": 2.188184976577759, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7115921974182129, + "num_tokens": 427393518.0, + "step": 17123 + }, + { + "epoch": 1.8805183395563365, + "grad_norm": 2.1673712730407715, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7187342047691345, + "num_tokens": 427419547.0, + "step": 17124 + }, + { + "epoch": 1.88062815725895, + "grad_norm": 2.0416884422302246, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.707555890083313, + "num_tokens": 427448841.0, + "step": 17125 + }, + { + "epoch": 1.8807379749615638, + "grad_norm": 1.9893816709518433, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7170761823654175, + "num_tokens": 427479485.0, + "step": 17126 + }, + { + "epoch": 1.8808477926641776, + "grad_norm": 2.376127004623413, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7258179187774658, + "num_tokens": 427502817.0, + "step": 17127 + }, + { + "epoch": 1.880957610366791, + "grad_norm": 2.2452192306518555, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7089493870735168, + "num_tokens": 427530769.0, + "step": 17128 + }, + { + "epoch": 1.8810674280694046, + "grad_norm": 2.2969913482666016, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7289578318595886, + "num_tokens": 427555573.0, + "step": 17129 + }, + { + "epoch": 1.8811772457720184, + "grad_norm": 2.369133949279785, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7403689026832581, + "num_tokens": 427576555.0, + "step": 17130 + }, + { + "epoch": 1.8812870634746321, + "grad_norm": 2.3116579055786133, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7201417684555054, + "num_tokens": 427602849.0, + "step": 17131 + }, + { + "epoch": 1.881396881177246, + "grad_norm": 2.3656349182128906, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.713268518447876, + "num_tokens": 427626977.0, + "step": 17132 + }, + { + "epoch": 1.8815066988798594, + "grad_norm": 2.359498977661133, + "learning_rate": 1e-06, + "loss": 0.8016, + "mean_token_accuracy": 0.7530920505523682, + "num_tokens": 427649005.0, + "step": 17133 + }, + { + "epoch": 1.881616516582473, + "grad_norm": 2.1057956218719482, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7141686081886292, + "num_tokens": 427676837.0, + "step": 17134 + }, + { + "epoch": 1.8817263342850867, + "grad_norm": 2.323279857635498, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7365612387657166, + "num_tokens": 427700864.0, + "step": 17135 + }, + { + "epoch": 1.8818361519877005, + "grad_norm": 2.4664714336395264, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7398669719696045, + "num_tokens": 427722430.0, + "step": 17136 + }, + { + "epoch": 1.881945969690314, + "grad_norm": 2.4171884059906006, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.724605917930603, + "num_tokens": 427746606.0, + "step": 17137 + }, + { + "epoch": 1.8820557873929278, + "grad_norm": 2.5232958793640137, + "learning_rate": 1e-06, + "loss": 0.7761, + "mean_token_accuracy": 0.7464760541915894, + "num_tokens": 427766639.0, + "step": 17138 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 2.2831945419311523, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7440522313117981, + "num_tokens": 427789723.0, + "step": 17139 + }, + { + "epoch": 1.882275422798155, + "grad_norm": 2.380127429962158, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.701980471611023, + "num_tokens": 427815552.0, + "step": 17140 + }, + { + "epoch": 1.8823852405007688, + "grad_norm": 2.34635853767395, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7280064821243286, + "num_tokens": 427840285.0, + "step": 17141 + }, + { + "epoch": 1.8824950582033824, + "grad_norm": 2.02231502532959, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7442220449447632, + "num_tokens": 427868577.0, + "step": 17142 + }, + { + "epoch": 1.882604875905996, + "grad_norm": 2.242497444152832, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7255587577819824, + "num_tokens": 427893566.0, + "step": 17143 + }, + { + "epoch": 1.8827146936086097, + "grad_norm": 2.348033905029297, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7397497892379761, + "num_tokens": 427918352.0, + "step": 17144 + }, + { + "epoch": 1.8828245113112234, + "grad_norm": 2.377819538116455, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7127686738967896, + "num_tokens": 427941208.0, + "step": 17145 + }, + { + "epoch": 1.8829343290138372, + "grad_norm": 2.3696110248565674, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7316621541976929, + "num_tokens": 427963127.0, + "step": 17146 + }, + { + "epoch": 1.8830441467164507, + "grad_norm": 2.351776123046875, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6996138095855713, + "num_tokens": 427988303.0, + "step": 17147 + }, + { + "epoch": 1.8831539644190642, + "grad_norm": 2.034804582595825, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7054828405380249, + "num_tokens": 428018827.0, + "step": 17148 + }, + { + "epoch": 1.883263782121678, + "grad_norm": 2.1997478008270264, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.738439679145813, + "num_tokens": 428041771.0, + "step": 17149 + }, + { + "epoch": 1.8833735998242918, + "grad_norm": 2.1292531490325928, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7153512239456177, + "num_tokens": 428069452.0, + "step": 17150 + }, + { + "epoch": 1.8834834175269053, + "grad_norm": 2.291588068008423, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7294728755950928, + "num_tokens": 428093475.0, + "step": 17151 + }, + { + "epoch": 1.883593235229519, + "grad_norm": 2.2385284900665283, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7408638000488281, + "num_tokens": 428117541.0, + "step": 17152 + }, + { + "epoch": 1.8837030529321326, + "grad_norm": 2.611435651779175, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7245777249336243, + "num_tokens": 428137271.0, + "step": 17153 + }, + { + "epoch": 1.8838128706347463, + "grad_norm": 2.289424180984497, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7267724275588989, + "num_tokens": 428160663.0, + "step": 17154 + }, + { + "epoch": 1.88392268833736, + "grad_norm": 2.2693915367126465, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7251066565513611, + "num_tokens": 428185123.0, + "step": 17155 + }, + { + "epoch": 1.8840325060399736, + "grad_norm": 2.271989107131958, + "learning_rate": 1e-06, + "loss": 0.8069, + "mean_token_accuracy": 0.737200915813446, + "num_tokens": 428207450.0, + "step": 17156 + }, + { + "epoch": 1.8841423237425872, + "grad_norm": 2.4357523918151855, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7228268384933472, + "num_tokens": 428229997.0, + "step": 17157 + }, + { + "epoch": 1.884252141445201, + "grad_norm": 2.215501308441162, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7255862951278687, + "num_tokens": 428255323.0, + "step": 17158 + }, + { + "epoch": 1.8843619591478147, + "grad_norm": 2.6585848331451416, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7355534434318542, + "num_tokens": 428274868.0, + "step": 17159 + }, + { + "epoch": 1.8844717768504284, + "grad_norm": 2.1753032207489014, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7338358163833618, + "num_tokens": 428300686.0, + "step": 17160 + }, + { + "epoch": 1.884581594553042, + "grad_norm": 2.448194980621338, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7202293872833252, + "num_tokens": 428321735.0, + "step": 17161 + }, + { + "epoch": 1.8846914122556555, + "grad_norm": 2.3172569274902344, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7335232496261597, + "num_tokens": 428344058.0, + "step": 17162 + }, + { + "epoch": 1.8848012299582693, + "grad_norm": 2.1801764965057373, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7375586032867432, + "num_tokens": 428370341.0, + "step": 17163 + }, + { + "epoch": 1.884911047660883, + "grad_norm": 2.119206666946411, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7233858704566956, + "num_tokens": 428398280.0, + "step": 17164 + }, + { + "epoch": 1.8850208653634966, + "grad_norm": 2.207878351211548, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7004490494728088, + "num_tokens": 428426504.0, + "step": 17165 + }, + { + "epoch": 1.88513068306611, + "grad_norm": 2.2101218700408936, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7142938375473022, + "num_tokens": 428452707.0, + "step": 17166 + }, + { + "epoch": 1.8852405007687238, + "grad_norm": 2.3266000747680664, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7248073816299438, + "num_tokens": 428477102.0, + "step": 17167 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 2.260146379470825, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7357447743415833, + "num_tokens": 428501515.0, + "step": 17168 + }, + { + "epoch": 1.8854601361739514, + "grad_norm": 2.2196688652038574, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7284480333328247, + "num_tokens": 428527432.0, + "step": 17169 + }, + { + "epoch": 1.885569953876565, + "grad_norm": 2.1006410121917725, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7191981077194214, + "num_tokens": 428553795.0, + "step": 17170 + }, + { + "epoch": 1.8856797715791784, + "grad_norm": 2.0446267127990723, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7113271951675415, + "num_tokens": 428584139.0, + "step": 17171 + }, + { + "epoch": 1.8857895892817922, + "grad_norm": 2.360231399536133, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7344276905059814, + "num_tokens": 428606512.0, + "step": 17172 + }, + { + "epoch": 1.885899406984406, + "grad_norm": 2.2620372772216797, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7067517638206482, + "num_tokens": 428630284.0, + "step": 17173 + }, + { + "epoch": 1.8860092246870197, + "grad_norm": 2.683114528656006, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7231302857398987, + "num_tokens": 428651046.0, + "step": 17174 + }, + { + "epoch": 1.8861190423896332, + "grad_norm": 2.2509825229644775, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7006111741065979, + "num_tokens": 428676545.0, + "step": 17175 + }, + { + "epoch": 1.8862288600922468, + "grad_norm": 2.2001290321350098, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7085575461387634, + "num_tokens": 428703177.0, + "step": 17176 + }, + { + "epoch": 1.8863386777948605, + "grad_norm": 2.1609296798706055, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7143923044204712, + "num_tokens": 428729578.0, + "step": 17177 + }, + { + "epoch": 1.8864484954974743, + "grad_norm": 2.247302532196045, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7177579998970032, + "num_tokens": 428755271.0, + "step": 17178 + }, + { + "epoch": 1.8865583132000878, + "grad_norm": 2.3354766368865967, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.721164345741272, + "num_tokens": 428778957.0, + "step": 17179 + }, + { + "epoch": 1.8866681309027014, + "grad_norm": 1.9122745990753174, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7264751195907593, + "num_tokens": 428809563.0, + "step": 17180 + }, + { + "epoch": 1.8867779486053151, + "grad_norm": 2.068516254425049, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6910396814346313, + "num_tokens": 428841533.0, + "step": 17181 + }, + { + "epoch": 1.8868877663079289, + "grad_norm": 1.991145372390747, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7011972665786743, + "num_tokens": 428874445.0, + "step": 17182 + }, + { + "epoch": 1.8869975840105426, + "grad_norm": 2.5635085105895996, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7337855696678162, + "num_tokens": 428894719.0, + "step": 17183 + }, + { + "epoch": 1.8871074017131562, + "grad_norm": 2.1795449256896973, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.739966869354248, + "num_tokens": 428921313.0, + "step": 17184 + }, + { + "epoch": 1.8872172194157697, + "grad_norm": 2.2913084030151367, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6978441476821899, + "num_tokens": 428947670.0, + "step": 17185 + }, + { + "epoch": 1.8873270371183835, + "grad_norm": 2.6477811336517334, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7290152311325073, + "num_tokens": 428967455.0, + "step": 17186 + }, + { + "epoch": 1.8874368548209972, + "grad_norm": 2.2386534214019775, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7184680700302124, + "num_tokens": 428993092.0, + "step": 17187 + }, + { + "epoch": 1.887546672523611, + "grad_norm": 2.1138648986816406, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7295125722885132, + "num_tokens": 429020099.0, + "step": 17188 + }, + { + "epoch": 1.8876564902262245, + "grad_norm": 2.659801721572876, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7507453560829163, + "num_tokens": 429038537.0, + "step": 17189 + }, + { + "epoch": 1.887766307928838, + "grad_norm": 2.40299654006958, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6869024038314819, + "num_tokens": 429063092.0, + "step": 17190 + }, + { + "epoch": 1.8878761256314518, + "grad_norm": 2.21167254447937, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7333587408065796, + "num_tokens": 429089962.0, + "step": 17191 + }, + { + "epoch": 1.8879859433340656, + "grad_norm": 2.0248141288757324, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7163610458374023, + "num_tokens": 429121019.0, + "step": 17192 + }, + { + "epoch": 1.888095761036679, + "grad_norm": 2.279040813446045, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7317493557929993, + "num_tokens": 429145925.0, + "step": 17193 + }, + { + "epoch": 1.8882055787392926, + "grad_norm": 2.1383204460144043, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7196515202522278, + "num_tokens": 429173599.0, + "step": 17194 + }, + { + "epoch": 1.8883153964419064, + "grad_norm": 2.3100955486297607, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7115254402160645, + "num_tokens": 429197372.0, + "step": 17195 + }, + { + "epoch": 1.8884252141445201, + "grad_norm": 2.139362335205078, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7269154787063599, + "num_tokens": 429223722.0, + "step": 17196 + }, + { + "epoch": 1.888535031847134, + "grad_norm": 2.456662178039551, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7335153818130493, + "num_tokens": 429243805.0, + "step": 17197 + }, + { + "epoch": 1.8886448495497474, + "grad_norm": 2.3018195629119873, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.711056113243103, + "num_tokens": 429268755.0, + "step": 17198 + }, + { + "epoch": 1.888754667252361, + "grad_norm": 2.296262741088867, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7154205441474915, + "num_tokens": 429296874.0, + "step": 17199 + }, + { + "epoch": 1.8888644849549747, + "grad_norm": 2.1595652103424072, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.727015495300293, + "num_tokens": 429323856.0, + "step": 17200 + }, + { + "epoch": 1.8889743026575885, + "grad_norm": 2.4178354740142822, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7101311087608337, + "num_tokens": 429347568.0, + "step": 17201 + }, + { + "epoch": 1.889084120360202, + "grad_norm": 2.44795823097229, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7400898933410645, + "num_tokens": 429368513.0, + "step": 17202 + }, + { + "epoch": 1.8891939380628158, + "grad_norm": 2.1460416316986084, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6895486116409302, + "num_tokens": 429396837.0, + "step": 17203 + }, + { + "epoch": 1.8893037557654293, + "grad_norm": 2.3448126316070557, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7396059632301331, + "num_tokens": 429419867.0, + "step": 17204 + }, + { + "epoch": 1.889413573468043, + "grad_norm": 2.055769205093384, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7157171964645386, + "num_tokens": 429450915.0, + "step": 17205 + }, + { + "epoch": 1.8895233911706568, + "grad_norm": 2.5001466274261475, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7168033123016357, + "num_tokens": 429472686.0, + "step": 17206 + }, + { + "epoch": 1.8896332088732704, + "grad_norm": 2.154188394546509, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7416762113571167, + "num_tokens": 429498173.0, + "step": 17207 + }, + { + "epoch": 1.8897430265758839, + "grad_norm": 2.2037625312805176, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7285157442092896, + "num_tokens": 429523790.0, + "step": 17208 + }, + { + "epoch": 1.8898528442784976, + "grad_norm": 2.409608840942383, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7551517486572266, + "num_tokens": 429546503.0, + "step": 17209 + }, + { + "epoch": 1.8899626619811114, + "grad_norm": 1.9284597635269165, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.707647442817688, + "num_tokens": 429578533.0, + "step": 17210 + }, + { + "epoch": 1.8900724796837252, + "grad_norm": 2.371837615966797, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7264962196350098, + "num_tokens": 429603433.0, + "step": 17211 + }, + { + "epoch": 1.8901822973863387, + "grad_norm": 2.15370512008667, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7344580888748169, + "num_tokens": 429629176.0, + "step": 17212 + }, + { + "epoch": 1.8902921150889522, + "grad_norm": 2.3942317962646484, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7442911863327026, + "num_tokens": 429650985.0, + "step": 17213 + }, + { + "epoch": 1.890401932791566, + "grad_norm": 2.737621784210205, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7426059246063232, + "num_tokens": 429669925.0, + "step": 17214 + }, + { + "epoch": 1.8905117504941797, + "grad_norm": 2.46773362159729, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7327287793159485, + "num_tokens": 429689962.0, + "step": 17215 + }, + { + "epoch": 1.8906215681967933, + "grad_norm": 2.3639800548553467, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7385218143463135, + "num_tokens": 429712363.0, + "step": 17216 + }, + { + "epoch": 1.890731385899407, + "grad_norm": 2.2903435230255127, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7295088768005371, + "num_tokens": 429737327.0, + "step": 17217 + }, + { + "epoch": 1.8908412036020206, + "grad_norm": 2.363269090652466, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7302908897399902, + "num_tokens": 429758794.0, + "step": 17218 + }, + { + "epoch": 1.8909510213046343, + "grad_norm": 2.267012596130371, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7234383821487427, + "num_tokens": 429782411.0, + "step": 17219 + }, + { + "epoch": 1.891060839007248, + "grad_norm": 2.265911102294922, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.727470874786377, + "num_tokens": 429805986.0, + "step": 17220 + }, + { + "epoch": 1.8911706567098616, + "grad_norm": 2.113440990447998, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.720885694026947, + "num_tokens": 429833319.0, + "step": 17221 + }, + { + "epoch": 1.8912804744124752, + "grad_norm": 2.230073928833008, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7280434370040894, + "num_tokens": 429857507.0, + "step": 17222 + }, + { + "epoch": 1.891390292115089, + "grad_norm": 2.329054117202759, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7312428951263428, + "num_tokens": 429881283.0, + "step": 17223 + }, + { + "epoch": 1.8915001098177027, + "grad_norm": 2.0178134441375732, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7032545804977417, + "num_tokens": 429910485.0, + "step": 17224 + }, + { + "epoch": 1.8916099275203164, + "grad_norm": 2.2474005222320557, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7086013555526733, + "num_tokens": 429938910.0, + "step": 17225 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 2.4418294429779053, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7182352542877197, + "num_tokens": 429960749.0, + "step": 17226 + }, + { + "epoch": 1.8918295629255435, + "grad_norm": 2.1375133991241455, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7091568112373352, + "num_tokens": 429988773.0, + "step": 17227 + }, + { + "epoch": 1.8919393806281573, + "grad_norm": 2.2896382808685303, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7129673957824707, + "num_tokens": 430011934.0, + "step": 17228 + }, + { + "epoch": 1.892049198330771, + "grad_norm": 2.212554454803467, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7121046781539917, + "num_tokens": 430036834.0, + "step": 17229 + }, + { + "epoch": 1.8921590160333845, + "grad_norm": 2.4546000957489014, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7345290184020996, + "num_tokens": 430058797.0, + "step": 17230 + }, + { + "epoch": 1.892268833735998, + "grad_norm": 2.5406131744384766, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7254201173782349, + "num_tokens": 430079868.0, + "step": 17231 + }, + { + "epoch": 1.8923786514386118, + "grad_norm": 2.0968713760375977, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7259669899940491, + "num_tokens": 430107993.0, + "step": 17232 + }, + { + "epoch": 1.8924884691412256, + "grad_norm": 2.226612091064453, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7419281005859375, + "num_tokens": 430131464.0, + "step": 17233 + }, + { + "epoch": 1.8925982868438394, + "grad_norm": 2.0747151374816895, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7487165927886963, + "num_tokens": 430158031.0, + "step": 17234 + }, + { + "epoch": 1.8927081045464529, + "grad_norm": 2.253305196762085, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7224323749542236, + "num_tokens": 430187503.0, + "step": 17235 + }, + { + "epoch": 1.8928179222490664, + "grad_norm": 2.1998023986816406, + "learning_rate": 1e-06, + "loss": 0.8073, + "mean_token_accuracy": 0.73967045545578, + "num_tokens": 430211894.0, + "step": 17236 + }, + { + "epoch": 1.8929277399516802, + "grad_norm": 2.185377836227417, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7400206923484802, + "num_tokens": 430240198.0, + "step": 17237 + }, + { + "epoch": 1.893037557654294, + "grad_norm": 2.2479162216186523, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7165621519088745, + "num_tokens": 430265828.0, + "step": 17238 + }, + { + "epoch": 1.8931473753569077, + "grad_norm": 2.332714796066284, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7295548915863037, + "num_tokens": 430289770.0, + "step": 17239 + }, + { + "epoch": 1.8932571930595212, + "grad_norm": 2.014355182647705, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7023516893386841, + "num_tokens": 430320361.0, + "step": 17240 + }, + { + "epoch": 1.8933670107621348, + "grad_norm": 2.2982404232025146, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7394524812698364, + "num_tokens": 430342643.0, + "step": 17241 + }, + { + "epoch": 1.8934768284647485, + "grad_norm": 2.225590705871582, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7076491117477417, + "num_tokens": 430371234.0, + "step": 17242 + }, + { + "epoch": 1.8935866461673623, + "grad_norm": 2.4099340438842773, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7346441149711609, + "num_tokens": 430394194.0, + "step": 17243 + }, + { + "epoch": 1.8936964638699758, + "grad_norm": 2.309537410736084, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7321663498878479, + "num_tokens": 430417348.0, + "step": 17244 + }, + { + "epoch": 1.8938062815725893, + "grad_norm": 2.466818332672119, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.719740629196167, + "num_tokens": 430439660.0, + "step": 17245 + }, + { + "epoch": 1.893916099275203, + "grad_norm": 2.3812572956085205, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7344725131988525, + "num_tokens": 430462570.0, + "step": 17246 + }, + { + "epoch": 1.8940259169778169, + "grad_norm": 2.1278231143951416, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.7056437730789185, + "num_tokens": 430491734.0, + "step": 17247 + }, + { + "epoch": 1.8941357346804306, + "grad_norm": 2.2167446613311768, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7125623226165771, + "num_tokens": 430519424.0, + "step": 17248 + }, + { + "epoch": 1.8942455523830442, + "grad_norm": 2.347100257873535, + "learning_rate": 1e-06, + "loss": 0.7971, + "mean_token_accuracy": 0.7449371814727783, + "num_tokens": 430543510.0, + "step": 17249 + }, + { + "epoch": 1.8943553700856577, + "grad_norm": 2.3414180278778076, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.726685643196106, + "num_tokens": 430565988.0, + "step": 17250 + }, + { + "epoch": 1.8944651877882714, + "grad_norm": 2.046255588531494, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7200065851211548, + "num_tokens": 430595847.0, + "step": 17251 + }, + { + "epoch": 1.8945750054908852, + "grad_norm": 2.1206247806549072, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.701353132724762, + "num_tokens": 430624881.0, + "step": 17252 + }, + { + "epoch": 1.894684823193499, + "grad_norm": 2.1428816318511963, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7277035713195801, + "num_tokens": 430652255.0, + "step": 17253 + }, + { + "epoch": 1.8947946408961125, + "grad_norm": 2.461127758026123, + "learning_rate": 1e-06, + "loss": 0.7321, + "mean_token_accuracy": 0.7685519456863403, + "num_tokens": 430672137.0, + "step": 17254 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 2.416142702102661, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7254550457000732, + "num_tokens": 430694258.0, + "step": 17255 + }, + { + "epoch": 1.8950142763013398, + "grad_norm": 2.0560505390167236, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7235115170478821, + "num_tokens": 430723835.0, + "step": 17256 + }, + { + "epoch": 1.8951240940039535, + "grad_norm": 2.344362497329712, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7240090370178223, + "num_tokens": 430747580.0, + "step": 17257 + }, + { + "epoch": 1.895233911706567, + "grad_norm": 2.176842451095581, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7232760190963745, + "num_tokens": 430772913.0, + "step": 17258 + }, + { + "epoch": 1.8953437294091806, + "grad_norm": 2.3985483646392822, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7272948622703552, + "num_tokens": 430796189.0, + "step": 17259 + }, + { + "epoch": 1.8954535471117944, + "grad_norm": 2.3448660373687744, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7311666011810303, + "num_tokens": 430820196.0, + "step": 17260 + }, + { + "epoch": 1.8955633648144081, + "grad_norm": 2.3121092319488525, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7138437032699585, + "num_tokens": 430848360.0, + "step": 17261 + }, + { + "epoch": 1.8956731825170219, + "grad_norm": 2.354828357696533, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.747786819934845, + "num_tokens": 430869909.0, + "step": 17262 + }, + { + "epoch": 1.8957830002196354, + "grad_norm": 2.2318553924560547, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7025749683380127, + "num_tokens": 430895310.0, + "step": 17263 + }, + { + "epoch": 1.895892817922249, + "grad_norm": 2.4153084754943848, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7120473980903625, + "num_tokens": 430919029.0, + "step": 17264 + }, + { + "epoch": 1.8960026356248627, + "grad_norm": 2.1021595001220703, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7213392853736877, + "num_tokens": 430946756.0, + "step": 17265 + }, + { + "epoch": 1.8961124533274765, + "grad_norm": 2.038623332977295, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7104123830795288, + "num_tokens": 430975664.0, + "step": 17266 + }, + { + "epoch": 1.89622227103009, + "grad_norm": 2.480294942855835, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7266497611999512, + "num_tokens": 430996004.0, + "step": 17267 + }, + { + "epoch": 1.8963320887327038, + "grad_norm": 2.2652676105499268, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7211047410964966, + "num_tokens": 431020264.0, + "step": 17268 + }, + { + "epoch": 1.8964419064353173, + "grad_norm": 2.387667417526245, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7090885639190674, + "num_tokens": 431044638.0, + "step": 17269 + }, + { + "epoch": 1.896551724137931, + "grad_norm": 2.0219357013702393, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7386503219604492, + "num_tokens": 431073326.0, + "step": 17270 + }, + { + "epoch": 1.8966615418405448, + "grad_norm": 2.456353187561035, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7035433650016785, + "num_tokens": 431096202.0, + "step": 17271 + }, + { + "epoch": 1.8967713595431583, + "grad_norm": 2.3148069381713867, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7302997708320618, + "num_tokens": 431120549.0, + "step": 17272 + }, + { + "epoch": 1.8968811772457719, + "grad_norm": 2.1696367263793945, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7330451607704163, + "num_tokens": 431146565.0, + "step": 17273 + }, + { + "epoch": 1.8969909949483856, + "grad_norm": 2.3117570877075195, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7352220416069031, + "num_tokens": 431170126.0, + "step": 17274 + }, + { + "epoch": 1.8971008126509994, + "grad_norm": 2.5701608657836914, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7207770347595215, + "num_tokens": 431192133.0, + "step": 17275 + }, + { + "epoch": 1.8972106303536131, + "grad_norm": 2.445000410079956, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7160568833351135, + "num_tokens": 431214798.0, + "step": 17276 + }, + { + "epoch": 1.8973204480562267, + "grad_norm": 2.670471429824829, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.716332197189331, + "num_tokens": 431235335.0, + "step": 17277 + }, + { + "epoch": 1.8974302657588402, + "grad_norm": 2.0397891998291016, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.697083592414856, + "num_tokens": 431263696.0, + "step": 17278 + }, + { + "epoch": 1.897540083461454, + "grad_norm": 2.2757461071014404, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7395410537719727, + "num_tokens": 431288171.0, + "step": 17279 + }, + { + "epoch": 1.8976499011640677, + "grad_norm": 2.117489814758301, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7445769309997559, + "num_tokens": 431314595.0, + "step": 17280 + }, + { + "epoch": 1.8977597188666813, + "grad_norm": 2.6651713848114014, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7353609800338745, + "num_tokens": 431332761.0, + "step": 17281 + }, + { + "epoch": 1.897869536569295, + "grad_norm": 2.2493948936462402, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7195179462432861, + "num_tokens": 431358741.0, + "step": 17282 + }, + { + "epoch": 1.8979793542719086, + "grad_norm": 2.3863685131073, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7148603200912476, + "num_tokens": 431382055.0, + "step": 17283 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 2.2789535522460938, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7106850147247314, + "num_tokens": 431410316.0, + "step": 17284 + }, + { + "epoch": 1.898198989677136, + "grad_norm": 2.3084876537323, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7155790328979492, + "num_tokens": 431435353.0, + "step": 17285 + }, + { + "epoch": 1.8983088073797496, + "grad_norm": 2.1754093170166016, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.727776825428009, + "num_tokens": 431462499.0, + "step": 17286 + }, + { + "epoch": 1.8984186250823631, + "grad_norm": 2.0681281089782715, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7335071563720703, + "num_tokens": 431491001.0, + "step": 17287 + }, + { + "epoch": 1.898528442784977, + "grad_norm": 2.3063197135925293, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7225995063781738, + "num_tokens": 431516783.0, + "step": 17288 + }, + { + "epoch": 1.8986382604875907, + "grad_norm": 2.091601848602295, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7208684682846069, + "num_tokens": 431548045.0, + "step": 17289 + }, + { + "epoch": 1.8987480781902044, + "grad_norm": 2.148202419281006, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7065749168395996, + "num_tokens": 431578384.0, + "step": 17290 + }, + { + "epoch": 1.898857895892818, + "grad_norm": 2.5722479820251465, + "learning_rate": 1e-06, + "loss": 0.7795, + "mean_token_accuracy": 0.7539461851119995, + "num_tokens": 431598326.0, + "step": 17291 + }, + { + "epoch": 1.8989677135954315, + "grad_norm": 2.5102195739746094, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7358789443969727, + "num_tokens": 431618857.0, + "step": 17292 + }, + { + "epoch": 1.8990775312980452, + "grad_norm": 2.336056709289551, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7081833481788635, + "num_tokens": 431643147.0, + "step": 17293 + }, + { + "epoch": 1.899187349000659, + "grad_norm": 2.8128840923309326, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7378852963447571, + "num_tokens": 431661283.0, + "step": 17294 + }, + { + "epoch": 1.8992971667032725, + "grad_norm": 2.062049388885498, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7162976264953613, + "num_tokens": 431692456.0, + "step": 17295 + }, + { + "epoch": 1.899406984405886, + "grad_norm": 2.5482559204101562, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7213267087936401, + "num_tokens": 431713530.0, + "step": 17296 + }, + { + "epoch": 1.8995168021084998, + "grad_norm": 2.621424913406372, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7449256181716919, + "num_tokens": 431732154.0, + "step": 17297 + }, + { + "epoch": 1.8996266198111136, + "grad_norm": 2.6422231197357178, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7529289722442627, + "num_tokens": 431751045.0, + "step": 17298 + }, + { + "epoch": 1.8997364375137273, + "grad_norm": 2.825772285461426, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7274708151817322, + "num_tokens": 431769163.0, + "step": 17299 + }, + { + "epoch": 1.8998462552163409, + "grad_norm": 2.0972349643707275, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7065224051475525, + "num_tokens": 431799006.0, + "step": 17300 + }, + { + "epoch": 1.8999560729189544, + "grad_norm": 2.261791229248047, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7216989398002625, + "num_tokens": 431823470.0, + "step": 17301 + }, + { + "epoch": 1.9000658906215682, + "grad_norm": 2.3487837314605713, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7120072841644287, + "num_tokens": 431847686.0, + "step": 17302 + }, + { + "epoch": 1.900175708324182, + "grad_norm": 2.2654967308044434, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.729503870010376, + "num_tokens": 431873333.0, + "step": 17303 + }, + { + "epoch": 1.9002855260267957, + "grad_norm": 2.1570754051208496, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7165457606315613, + "num_tokens": 431900292.0, + "step": 17304 + }, + { + "epoch": 1.9003953437294092, + "grad_norm": 2.314675807952881, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.725396990776062, + "num_tokens": 431924531.0, + "step": 17305 + }, + { + "epoch": 1.9005051614320227, + "grad_norm": 2.364210367202759, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7109431028366089, + "num_tokens": 431948412.0, + "step": 17306 + }, + { + "epoch": 1.9006149791346365, + "grad_norm": 2.2049143314361572, + "learning_rate": 1e-06, + "loss": 0.8028, + "mean_token_accuracy": 0.748715877532959, + "num_tokens": 431971702.0, + "step": 17307 + }, + { + "epoch": 1.9007247968372503, + "grad_norm": 2.1964211463928223, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7452532052993774, + "num_tokens": 431996845.0, + "step": 17308 + }, + { + "epoch": 1.9008346145398638, + "grad_norm": 2.243748426437378, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.724838137626648, + "num_tokens": 432022974.0, + "step": 17309 + }, + { + "epoch": 1.9009444322424773, + "grad_norm": 2.2333552837371826, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7326385974884033, + "num_tokens": 432048246.0, + "step": 17310 + }, + { + "epoch": 1.901054249945091, + "grad_norm": 2.1217002868652344, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7074300646781921, + "num_tokens": 432076850.0, + "step": 17311 + }, + { + "epoch": 1.9011640676477048, + "grad_norm": 2.244818925857544, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7420852184295654, + "num_tokens": 432101214.0, + "step": 17312 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 2.3924055099487305, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7199739217758179, + "num_tokens": 432124034.0, + "step": 17313 + }, + { + "epoch": 1.9013837030529321, + "grad_norm": 2.1829144954681396, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7154136896133423, + "num_tokens": 432152556.0, + "step": 17314 + }, + { + "epoch": 1.9014935207555457, + "grad_norm": 2.4082252979278564, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7321943044662476, + "num_tokens": 432174578.0, + "step": 17315 + }, + { + "epoch": 1.9016033384581594, + "grad_norm": 2.7162821292877197, + "learning_rate": 1e-06, + "loss": 0.7356, + "mean_token_accuracy": 0.7726122140884399, + "num_tokens": 432191801.0, + "step": 17316 + }, + { + "epoch": 1.9017131561607732, + "grad_norm": 2.1156375408172607, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7208393812179565, + "num_tokens": 432220543.0, + "step": 17317 + }, + { + "epoch": 1.9018229738633867, + "grad_norm": 2.3522069454193115, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7157745361328125, + "num_tokens": 432243243.0, + "step": 17318 + }, + { + "epoch": 1.9019327915660005, + "grad_norm": 2.4538347721099854, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7293319702148438, + "num_tokens": 432264803.0, + "step": 17319 + }, + { + "epoch": 1.902042609268614, + "grad_norm": 2.12921142578125, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7220548391342163, + "num_tokens": 432292127.0, + "step": 17320 + }, + { + "epoch": 1.9021524269712278, + "grad_norm": 2.3474655151367188, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.72674161195755, + "num_tokens": 432315029.0, + "step": 17321 + }, + { + "epoch": 1.9022622446738415, + "grad_norm": 2.3904356956481934, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7213214635848999, + "num_tokens": 432338155.0, + "step": 17322 + }, + { + "epoch": 1.902372062376455, + "grad_norm": 2.061509370803833, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7004746794700623, + "num_tokens": 432369168.0, + "step": 17323 + }, + { + "epoch": 1.9024818800790686, + "grad_norm": 2.2768473625183105, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7345141172409058, + "num_tokens": 432393974.0, + "step": 17324 + }, + { + "epoch": 1.9025916977816824, + "grad_norm": 2.415717601776123, + "learning_rate": 1e-06, + "loss": 0.7597, + "mean_token_accuracy": 0.7647847533226013, + "num_tokens": 432414193.0, + "step": 17325 + }, + { + "epoch": 1.9027015154842961, + "grad_norm": 2.4475746154785156, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7242275476455688, + "num_tokens": 432435994.0, + "step": 17326 + }, + { + "epoch": 1.9028113331869099, + "grad_norm": 2.282482147216797, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7220380306243896, + "num_tokens": 432461376.0, + "step": 17327 + }, + { + "epoch": 1.9029211508895234, + "grad_norm": 2.2727277278900146, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7265636920928955, + "num_tokens": 432485571.0, + "step": 17328 + }, + { + "epoch": 1.903030968592137, + "grad_norm": 2.69134783744812, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7096267342567444, + "num_tokens": 432506559.0, + "step": 17329 + }, + { + "epoch": 1.9031407862947507, + "grad_norm": 2.1201319694519043, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7327520847320557, + "num_tokens": 432532527.0, + "step": 17330 + }, + { + "epoch": 1.9032506039973645, + "grad_norm": 2.0188307762145996, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7114579677581787, + "num_tokens": 432565076.0, + "step": 17331 + }, + { + "epoch": 1.903360421699978, + "grad_norm": 2.25205135345459, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7195457816123962, + "num_tokens": 432590106.0, + "step": 17332 + }, + { + "epoch": 1.9034702394025917, + "grad_norm": 2.1755881309509277, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6989755630493164, + "num_tokens": 432617531.0, + "step": 17333 + }, + { + "epoch": 1.9035800571052053, + "grad_norm": 2.147102117538452, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.726424515247345, + "num_tokens": 432646568.0, + "step": 17334 + }, + { + "epoch": 1.903689874807819, + "grad_norm": 2.2381720542907715, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7176851034164429, + "num_tokens": 432670957.0, + "step": 17335 + }, + { + "epoch": 1.9037996925104328, + "grad_norm": 2.2020580768585205, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7044677734375, + "num_tokens": 432696134.0, + "step": 17336 + }, + { + "epoch": 1.9039095102130463, + "grad_norm": 2.2364349365234375, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7228965163230896, + "num_tokens": 432722320.0, + "step": 17337 + }, + { + "epoch": 1.9040193279156599, + "grad_norm": 2.5798604488372803, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7425042390823364, + "num_tokens": 432744280.0, + "step": 17338 + }, + { + "epoch": 1.9041291456182736, + "grad_norm": 1.9023005962371826, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7080788612365723, + "num_tokens": 432780191.0, + "step": 17339 + }, + { + "epoch": 1.9042389633208874, + "grad_norm": 2.3684659004211426, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7230371236801147, + "num_tokens": 432803538.0, + "step": 17340 + }, + { + "epoch": 1.9043487810235011, + "grad_norm": 2.322828769683838, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7100708484649658, + "num_tokens": 432829068.0, + "step": 17341 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 1.8613908290863037, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7362577319145203, + "num_tokens": 432864654.0, + "step": 17342 + }, + { + "epoch": 1.9045684164287282, + "grad_norm": 2.2336552143096924, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7238823175430298, + "num_tokens": 432889102.0, + "step": 17343 + }, + { + "epoch": 1.904678234131342, + "grad_norm": 2.1019372940063477, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7311056852340698, + "num_tokens": 432915280.0, + "step": 17344 + }, + { + "epoch": 1.9047880518339557, + "grad_norm": 2.190009117126465, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7340171337127686, + "num_tokens": 432940824.0, + "step": 17345 + }, + { + "epoch": 1.9048978695365693, + "grad_norm": 2.19665789604187, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7209590673446655, + "num_tokens": 432966906.0, + "step": 17346 + }, + { + "epoch": 1.9050076872391828, + "grad_norm": 2.6971116065979004, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7354686260223389, + "num_tokens": 432984758.0, + "step": 17347 + }, + { + "epoch": 1.9051175049417965, + "grad_norm": 2.6643569469451904, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7240861654281616, + "num_tokens": 433004580.0, + "step": 17348 + }, + { + "epoch": 1.9052273226444103, + "grad_norm": 2.2478866577148438, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7177952527999878, + "num_tokens": 433029493.0, + "step": 17349 + }, + { + "epoch": 1.905337140347024, + "grad_norm": 2.1988048553466797, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7198824882507324, + "num_tokens": 433054606.0, + "step": 17350 + }, + { + "epoch": 1.9054469580496376, + "grad_norm": 2.4906277656555176, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7302510142326355, + "num_tokens": 433075146.0, + "step": 17351 + }, + { + "epoch": 1.9055567757522511, + "grad_norm": 2.1466479301452637, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7221713066101074, + "num_tokens": 433104450.0, + "step": 17352 + }, + { + "epoch": 1.9056665934548649, + "grad_norm": 2.2731142044067383, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7211214900016785, + "num_tokens": 433130491.0, + "step": 17353 + }, + { + "epoch": 1.9057764111574786, + "grad_norm": 2.3171534538269043, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7322030663490295, + "num_tokens": 433153837.0, + "step": 17354 + }, + { + "epoch": 1.9058862288600924, + "grad_norm": 2.0241782665252686, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6894144415855408, + "num_tokens": 433184463.0, + "step": 17355 + }, + { + "epoch": 1.905996046562706, + "grad_norm": 2.5140528678894043, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7494093179702759, + "num_tokens": 433205366.0, + "step": 17356 + }, + { + "epoch": 1.9061058642653195, + "grad_norm": 2.3720498085021973, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7397510409355164, + "num_tokens": 433226377.0, + "step": 17357 + }, + { + "epoch": 1.9062156819679332, + "grad_norm": 2.229217767715454, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7003151178359985, + "num_tokens": 433253746.0, + "step": 17358 + }, + { + "epoch": 1.906325499670547, + "grad_norm": 2.2339470386505127, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7397010326385498, + "num_tokens": 433277292.0, + "step": 17359 + }, + { + "epoch": 1.9064353173731605, + "grad_norm": 2.1776986122131348, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7272369861602783, + "num_tokens": 433303949.0, + "step": 17360 + }, + { + "epoch": 1.906545135075774, + "grad_norm": 2.2043633460998535, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.7405759692192078, + "num_tokens": 433329819.0, + "step": 17361 + }, + { + "epoch": 1.9066549527783878, + "grad_norm": 2.3191542625427246, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7247523069381714, + "num_tokens": 433355761.0, + "step": 17362 + }, + { + "epoch": 1.9067647704810016, + "grad_norm": 2.591860055923462, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7251981496810913, + "num_tokens": 433376750.0, + "step": 17363 + }, + { + "epoch": 1.9068745881836153, + "grad_norm": 2.3871073722839355, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7373659014701843, + "num_tokens": 433398562.0, + "step": 17364 + }, + { + "epoch": 1.9069844058862289, + "grad_norm": 2.3819642066955566, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7294436097145081, + "num_tokens": 433422870.0, + "step": 17365 + }, + { + "epoch": 1.9070942235888424, + "grad_norm": 2.137627363204956, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7539121508598328, + "num_tokens": 433447543.0, + "step": 17366 + }, + { + "epoch": 1.9072040412914562, + "grad_norm": 2.1426000595092773, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7024492025375366, + "num_tokens": 433477001.0, + "step": 17367 + }, + { + "epoch": 1.90731385899407, + "grad_norm": 2.305813789367676, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7262722253799438, + "num_tokens": 433503119.0, + "step": 17368 + }, + { + "epoch": 1.9074236766966837, + "grad_norm": 2.213273525238037, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7386764287948608, + "num_tokens": 433527350.0, + "step": 17369 + }, + { + "epoch": 1.9075334943992972, + "grad_norm": 2.1765923500061035, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7050251960754395, + "num_tokens": 433554293.0, + "step": 17370 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 1.9160473346710205, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7209699153900146, + "num_tokens": 433589783.0, + "step": 17371 + }, + { + "epoch": 1.9077531298045245, + "grad_norm": 2.1014106273651123, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.6864326596260071, + "num_tokens": 433620355.0, + "step": 17372 + }, + { + "epoch": 1.9078629475071383, + "grad_norm": 2.1380348205566406, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7028776407241821, + "num_tokens": 433647612.0, + "step": 17373 + }, + { + "epoch": 1.9079727652097518, + "grad_norm": 2.244324207305908, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7328691482543945, + "num_tokens": 433671641.0, + "step": 17374 + }, + { + "epoch": 1.9080825829123653, + "grad_norm": 2.382830858230591, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7182903289794922, + "num_tokens": 433694615.0, + "step": 17375 + }, + { + "epoch": 1.908192400614979, + "grad_norm": 2.1793956756591797, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7306340336799622, + "num_tokens": 433719809.0, + "step": 17376 + }, + { + "epoch": 1.9083022183175928, + "grad_norm": 2.222980260848999, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7173341512680054, + "num_tokens": 433746175.0, + "step": 17377 + }, + { + "epoch": 1.9084120360202066, + "grad_norm": 2.484586238861084, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7432072162628174, + "num_tokens": 433768028.0, + "step": 17378 + }, + { + "epoch": 1.9085218537228201, + "grad_norm": 2.5981128215789795, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7310913801193237, + "num_tokens": 433788125.0, + "step": 17379 + }, + { + "epoch": 1.9086316714254337, + "grad_norm": 2.2840301990509033, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7192926406860352, + "num_tokens": 433813143.0, + "step": 17380 + }, + { + "epoch": 1.9087414891280474, + "grad_norm": 2.4515109062194824, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.717532217502594, + "num_tokens": 433835105.0, + "step": 17381 + }, + { + "epoch": 1.9088513068306612, + "grad_norm": 2.4843108654022217, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7194846868515015, + "num_tokens": 433856978.0, + "step": 17382 + }, + { + "epoch": 1.9089611245332747, + "grad_norm": 2.2062294483184814, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7343456149101257, + "num_tokens": 433881535.0, + "step": 17383 + }, + { + "epoch": 1.9090709422358885, + "grad_norm": 2.6094343662261963, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.744477391242981, + "num_tokens": 433899349.0, + "step": 17384 + }, + { + "epoch": 1.909180759938502, + "grad_norm": 2.2545156478881836, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7215709686279297, + "num_tokens": 433922486.0, + "step": 17385 + }, + { + "epoch": 1.9092905776411158, + "grad_norm": 2.098964214324951, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7020083665847778, + "num_tokens": 433952884.0, + "step": 17386 + }, + { + "epoch": 1.9094003953437295, + "grad_norm": 2.3151049613952637, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7230309247970581, + "num_tokens": 433977952.0, + "step": 17387 + }, + { + "epoch": 1.909510213046343, + "grad_norm": 2.3580985069274902, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7151626944541931, + "num_tokens": 434001914.0, + "step": 17388 + }, + { + "epoch": 1.9096200307489566, + "grad_norm": 2.2029953002929688, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.715693473815918, + "num_tokens": 434028374.0, + "step": 17389 + }, + { + "epoch": 1.9097298484515703, + "grad_norm": 2.522465944290161, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7363364100456238, + "num_tokens": 434049857.0, + "step": 17390 + }, + { + "epoch": 1.909839666154184, + "grad_norm": 2.4024083614349365, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7199163436889648, + "num_tokens": 434073206.0, + "step": 17391 + }, + { + "epoch": 1.9099494838567979, + "grad_norm": 2.190631628036499, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.717162549495697, + "num_tokens": 434099188.0, + "step": 17392 + }, + { + "epoch": 1.9100593015594114, + "grad_norm": 2.6243977546691895, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7158672213554382, + "num_tokens": 434118267.0, + "step": 17393 + }, + { + "epoch": 1.910169119262025, + "grad_norm": 2.2373850345611572, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7127975821495056, + "num_tokens": 434145069.0, + "step": 17394 + }, + { + "epoch": 1.9102789369646387, + "grad_norm": 2.1847245693206787, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7309848070144653, + "num_tokens": 434170041.0, + "step": 17395 + }, + { + "epoch": 1.9103887546672524, + "grad_norm": 2.0533034801483154, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7164083123207092, + "num_tokens": 434200109.0, + "step": 17396 + }, + { + "epoch": 1.910498572369866, + "grad_norm": 2.5003490447998047, + "learning_rate": 1e-06, + "loss": 0.7253, + "mean_token_accuracy": 0.7659720778465271, + "num_tokens": 434219508.0, + "step": 17397 + }, + { + "epoch": 1.9106083900724797, + "grad_norm": 2.062075138092041, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7120324969291687, + "num_tokens": 434250169.0, + "step": 17398 + }, + { + "epoch": 1.9107182077750933, + "grad_norm": 2.4457571506500244, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7423768639564514, + "num_tokens": 434273386.0, + "step": 17399 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 2.371655225753784, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7451849579811096, + "num_tokens": 434295495.0, + "step": 17400 + }, + { + "epoch": 1.9109378431803208, + "grad_norm": 2.275775909423828, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7260529398918152, + "num_tokens": 434319276.0, + "step": 17401 + }, + { + "epoch": 1.9110476608829343, + "grad_norm": 2.2278428077697754, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7225956916809082, + "num_tokens": 434343826.0, + "step": 17402 + }, + { + "epoch": 1.9111574785855479, + "grad_norm": 2.711451292037964, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7239084839820862, + "num_tokens": 434362421.0, + "step": 17403 + }, + { + "epoch": 1.9112672962881616, + "grad_norm": 2.167778968811035, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7207438945770264, + "num_tokens": 434389030.0, + "step": 17404 + }, + { + "epoch": 1.9113771139907754, + "grad_norm": 2.5479512214660645, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7297619581222534, + "num_tokens": 434409510.0, + "step": 17405 + }, + { + "epoch": 1.9114869316933891, + "grad_norm": 2.4851460456848145, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7339727878570557, + "num_tokens": 434432347.0, + "step": 17406 + }, + { + "epoch": 1.9115967493960027, + "grad_norm": 2.461756706237793, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7336841225624084, + "num_tokens": 434452115.0, + "step": 17407 + }, + { + "epoch": 1.9117065670986162, + "grad_norm": 2.3663594722747803, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7267646789550781, + "num_tokens": 434474909.0, + "step": 17408 + }, + { + "epoch": 1.91181638480123, + "grad_norm": 2.2246997356414795, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7202316522598267, + "num_tokens": 434503676.0, + "step": 17409 + }, + { + "epoch": 1.9119262025038437, + "grad_norm": 2.307887554168701, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7296500205993652, + "num_tokens": 434528383.0, + "step": 17410 + }, + { + "epoch": 1.9120360202064572, + "grad_norm": 2.4290611743927, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7100168466567993, + "num_tokens": 434550755.0, + "step": 17411 + }, + { + "epoch": 1.9121458379090708, + "grad_norm": 2.2055106163024902, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7341967225074768, + "num_tokens": 434576910.0, + "step": 17412 + }, + { + "epoch": 1.9122556556116845, + "grad_norm": 2.0887200832366943, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7457231283187866, + "num_tokens": 434603246.0, + "step": 17413 + }, + { + "epoch": 1.9123654733142983, + "grad_norm": 2.36407208442688, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7335405945777893, + "num_tokens": 434626997.0, + "step": 17414 + }, + { + "epoch": 1.912475291016912, + "grad_norm": 2.231252908706665, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.699927031993866, + "num_tokens": 434653526.0, + "step": 17415 + }, + { + "epoch": 1.9125851087195256, + "grad_norm": 2.3274097442626953, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7226946353912354, + "num_tokens": 434677745.0, + "step": 17416 + }, + { + "epoch": 1.9126949264221391, + "grad_norm": 2.1360161304473877, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7396663427352905, + "num_tokens": 434704128.0, + "step": 17417 + }, + { + "epoch": 1.9128047441247529, + "grad_norm": 2.307722568511963, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7377898693084717, + "num_tokens": 434728178.0, + "step": 17418 + }, + { + "epoch": 1.9129145618273666, + "grad_norm": 2.1548705101013184, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7462992668151855, + "num_tokens": 434754092.0, + "step": 17419 + }, + { + "epoch": 1.9130243795299804, + "grad_norm": 2.5750489234924316, + "learning_rate": 1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7334259152412415, + "num_tokens": 434774728.0, + "step": 17420 + }, + { + "epoch": 1.913134197232594, + "grad_norm": 2.469583034515381, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7433699369430542, + "num_tokens": 434796967.0, + "step": 17421 + }, + { + "epoch": 1.9132440149352075, + "grad_norm": 2.2718169689178467, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7169447541236877, + "num_tokens": 434824218.0, + "step": 17422 + }, + { + "epoch": 1.9133538326378212, + "grad_norm": 2.4578161239624023, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7116581201553345, + "num_tokens": 434849219.0, + "step": 17423 + }, + { + "epoch": 1.913463650340435, + "grad_norm": 2.470677614212036, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7212173938751221, + "num_tokens": 434873409.0, + "step": 17424 + }, + { + "epoch": 1.9135734680430485, + "grad_norm": 2.269946336746216, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7320975065231323, + "num_tokens": 434897530.0, + "step": 17425 + }, + { + "epoch": 1.913683285745662, + "grad_norm": 2.2950191497802734, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7303376793861389, + "num_tokens": 434920960.0, + "step": 17426 + }, + { + "epoch": 1.9137931034482758, + "grad_norm": 2.332146406173706, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7194353342056274, + "num_tokens": 434944507.0, + "step": 17427 + }, + { + "epoch": 1.9139029211508896, + "grad_norm": 1.8732355833053589, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7144832015037537, + "num_tokens": 434979642.0, + "step": 17428 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 2.5137970447540283, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.721744954586029, + "num_tokens": 435000834.0, + "step": 17429 + }, + { + "epoch": 1.9141225565561168, + "grad_norm": 2.2139031887054443, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7095823287963867, + "num_tokens": 435027970.0, + "step": 17430 + }, + { + "epoch": 1.9142323742587304, + "grad_norm": 2.5135159492492676, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7163450717926025, + "num_tokens": 435049422.0, + "step": 17431 + }, + { + "epoch": 1.9143421919613441, + "grad_norm": 2.196300745010376, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7183458805084229, + "num_tokens": 435074536.0, + "step": 17432 + }, + { + "epoch": 1.914452009663958, + "grad_norm": 2.1679229736328125, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7507972717285156, + "num_tokens": 435100325.0, + "step": 17433 + }, + { + "epoch": 1.9145618273665717, + "grad_norm": 2.4188437461853027, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7374168038368225, + "num_tokens": 435121768.0, + "step": 17434 + }, + { + "epoch": 1.9146716450691852, + "grad_norm": 2.4295616149902344, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7332555651664734, + "num_tokens": 435142916.0, + "step": 17435 + }, + { + "epoch": 1.9147814627717987, + "grad_norm": 2.2944514751434326, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.701828122138977, + "num_tokens": 435167987.0, + "step": 17436 + }, + { + "epoch": 1.9148912804744125, + "grad_norm": 2.6109485626220703, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7004976272583008, + "num_tokens": 435190391.0, + "step": 17437 + }, + { + "epoch": 1.9150010981770262, + "grad_norm": 2.294735908508301, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7045449018478394, + "num_tokens": 435216655.0, + "step": 17438 + }, + { + "epoch": 1.9151109158796398, + "grad_norm": 2.294247627258301, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7191649675369263, + "num_tokens": 435242974.0, + "step": 17439 + }, + { + "epoch": 1.9152207335822533, + "grad_norm": 2.468066930770874, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7315509915351868, + "num_tokens": 435265293.0, + "step": 17440 + }, + { + "epoch": 1.915330551284867, + "grad_norm": 2.1518709659576416, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7426623702049255, + "num_tokens": 435291173.0, + "step": 17441 + }, + { + "epoch": 1.9154403689874808, + "grad_norm": 2.2521321773529053, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7536028027534485, + "num_tokens": 435314656.0, + "step": 17442 + }, + { + "epoch": 1.9155501866900946, + "grad_norm": 1.9737085103988647, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7118839621543884, + "num_tokens": 435347420.0, + "step": 17443 + }, + { + "epoch": 1.9156600043927081, + "grad_norm": 2.5089778900146484, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.729631781578064, + "num_tokens": 435369497.0, + "step": 17444 + }, + { + "epoch": 1.9157698220953217, + "grad_norm": 2.235718250274658, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7151105403900146, + "num_tokens": 435395624.0, + "step": 17445 + }, + { + "epoch": 1.9158796397979354, + "grad_norm": 2.4290640354156494, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7365822792053223, + "num_tokens": 435417270.0, + "step": 17446 + }, + { + "epoch": 1.9159894575005492, + "grad_norm": 2.2201197147369385, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7052654027938843, + "num_tokens": 435444475.0, + "step": 17447 + }, + { + "epoch": 1.9160992752031627, + "grad_norm": 2.0606319904327393, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7042871117591858, + "num_tokens": 435474128.0, + "step": 17448 + }, + { + "epoch": 1.9162090929057765, + "grad_norm": 2.0696628093719482, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7290998697280884, + "num_tokens": 435503401.0, + "step": 17449 + }, + { + "epoch": 1.91631891060839, + "grad_norm": 2.049469470977783, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.712565541267395, + "num_tokens": 435532692.0, + "step": 17450 + }, + { + "epoch": 1.9164287283110037, + "grad_norm": 2.0953707695007324, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7235610485076904, + "num_tokens": 435560191.0, + "step": 17451 + }, + { + "epoch": 1.9165385460136175, + "grad_norm": 2.3005716800689697, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.697712779045105, + "num_tokens": 435586242.0, + "step": 17452 + }, + { + "epoch": 1.916648363716231, + "grad_norm": 2.2218880653381348, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7147378325462341, + "num_tokens": 435610882.0, + "step": 17453 + }, + { + "epoch": 1.9167581814188446, + "grad_norm": 2.253469228744507, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7333104610443115, + "num_tokens": 435636026.0, + "step": 17454 + }, + { + "epoch": 1.9168679991214583, + "grad_norm": 2.231043815612793, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7236462235450745, + "num_tokens": 435661925.0, + "step": 17455 + }, + { + "epoch": 1.916977816824072, + "grad_norm": 2.1646406650543213, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7142850756645203, + "num_tokens": 435687816.0, + "step": 17456 + }, + { + "epoch": 1.9170876345266858, + "grad_norm": 2.50516939163208, + "learning_rate": 1e-06, + "loss": 0.8127, + "mean_token_accuracy": 0.7411960959434509, + "num_tokens": 435708640.0, + "step": 17457 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 2.4555084705352783, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7029364705085754, + "num_tokens": 435734002.0, + "step": 17458 + }, + { + "epoch": 1.917307269931913, + "grad_norm": 2.1419458389282227, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7163670063018799, + "num_tokens": 435761677.0, + "step": 17459 + }, + { + "epoch": 1.9174170876345267, + "grad_norm": 2.072655439376831, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7121316194534302, + "num_tokens": 435790677.0, + "step": 17460 + }, + { + "epoch": 1.9175269053371404, + "grad_norm": 2.5717525482177734, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.746091902256012, + "num_tokens": 435812167.0, + "step": 17461 + }, + { + "epoch": 1.917636723039754, + "grad_norm": 2.139111042022705, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7225315570831299, + "num_tokens": 435839552.0, + "step": 17462 + }, + { + "epoch": 1.9177465407423677, + "grad_norm": 2.285313367843628, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7306090593338013, + "num_tokens": 435862430.0, + "step": 17463 + }, + { + "epoch": 1.9178563584449813, + "grad_norm": 2.423147439956665, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7090113162994385, + "num_tokens": 435884952.0, + "step": 17464 + }, + { + "epoch": 1.917966176147595, + "grad_norm": 2.1343986988067627, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7382722496986389, + "num_tokens": 435910345.0, + "step": 17465 + }, + { + "epoch": 1.9180759938502088, + "grad_norm": 2.2304747104644775, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7044218182563782, + "num_tokens": 435938151.0, + "step": 17466 + }, + { + "epoch": 1.9181858115528223, + "grad_norm": 2.133488416671753, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7254945039749146, + "num_tokens": 435965183.0, + "step": 17467 + }, + { + "epoch": 1.9182956292554358, + "grad_norm": 2.363677501678467, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7144008874893188, + "num_tokens": 435989314.0, + "step": 17468 + }, + { + "epoch": 1.9184054469580496, + "grad_norm": 2.4027857780456543, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7233700752258301, + "num_tokens": 436012055.0, + "step": 17469 + }, + { + "epoch": 1.9185152646606634, + "grad_norm": 2.4118032455444336, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.712165117263794, + "num_tokens": 436033879.0, + "step": 17470 + }, + { + "epoch": 1.9186250823632771, + "grad_norm": 2.224367141723633, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7000679969787598, + "num_tokens": 436060081.0, + "step": 17471 + }, + { + "epoch": 1.9187349000658906, + "grad_norm": 2.563385248184204, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7101380825042725, + "num_tokens": 436082032.0, + "step": 17472 + }, + { + "epoch": 1.9188447177685042, + "grad_norm": 2.27067494392395, + "learning_rate": 1e-06, + "loss": 0.7458, + "mean_token_accuracy": 0.7620100975036621, + "num_tokens": 436104241.0, + "step": 17473 + }, + { + "epoch": 1.918954535471118, + "grad_norm": 2.2045438289642334, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7193079590797424, + "num_tokens": 436130932.0, + "step": 17474 + }, + { + "epoch": 1.9190643531737317, + "grad_norm": 2.078126907348633, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7362126708030701, + "num_tokens": 436158544.0, + "step": 17475 + }, + { + "epoch": 1.9191741708763452, + "grad_norm": 1.8950328826904297, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7160358428955078, + "num_tokens": 436190899.0, + "step": 17476 + }, + { + "epoch": 1.9192839885789588, + "grad_norm": 2.4729442596435547, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7421406507492065, + "num_tokens": 436211798.0, + "step": 17477 + }, + { + "epoch": 1.9193938062815725, + "grad_norm": 2.0084595680236816, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6996480226516724, + "num_tokens": 436242482.0, + "step": 17478 + }, + { + "epoch": 1.9195036239841863, + "grad_norm": 1.9871312379837036, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7215209603309631, + "num_tokens": 436273099.0, + "step": 17479 + }, + { + "epoch": 1.9196134416868, + "grad_norm": 2.5686519145965576, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7330783605575562, + "num_tokens": 436292193.0, + "step": 17480 + }, + { + "epoch": 1.9197232593894136, + "grad_norm": 2.799353837966919, + "learning_rate": 1e-06, + "loss": 0.7567, + "mean_token_accuracy": 0.7550519704818726, + "num_tokens": 436309270.0, + "step": 17481 + }, + { + "epoch": 1.919833077092027, + "grad_norm": 2.1602189540863037, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7250237464904785, + "num_tokens": 436336500.0, + "step": 17482 + }, + { + "epoch": 1.9199428947946409, + "grad_norm": 2.3352603912353516, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7422798871994019, + "num_tokens": 436358220.0, + "step": 17483 + }, + { + "epoch": 1.9200527124972546, + "grad_norm": 2.1233437061309814, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7058745622634888, + "num_tokens": 436387001.0, + "step": 17484 + }, + { + "epoch": 1.9201625301998684, + "grad_norm": 2.1151485443115234, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.704454779624939, + "num_tokens": 436414722.0, + "step": 17485 + }, + { + "epoch": 1.920272347902482, + "grad_norm": 2.177250623703003, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7050946354866028, + "num_tokens": 436443487.0, + "step": 17486 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 2.4798264503479004, + "learning_rate": 1e-06, + "loss": 0.7554, + "mean_token_accuracy": 0.7555056214332581, + "num_tokens": 436462063.0, + "step": 17487 + }, + { + "epoch": 1.9204919833077092, + "grad_norm": 2.443528413772583, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7137737274169922, + "num_tokens": 436483676.0, + "step": 17488 + }, + { + "epoch": 1.920601801010323, + "grad_norm": 2.0010409355163574, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7227235436439514, + "num_tokens": 436516164.0, + "step": 17489 + }, + { + "epoch": 1.9207116187129365, + "grad_norm": 2.4757208824157715, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7148981094360352, + "num_tokens": 436540436.0, + "step": 17490 + }, + { + "epoch": 1.92082143641555, + "grad_norm": 2.417649984359741, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.707399845123291, + "num_tokens": 436565481.0, + "step": 17491 + }, + { + "epoch": 1.9209312541181638, + "grad_norm": 2.6495676040649414, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7427377700805664, + "num_tokens": 436584171.0, + "step": 17492 + }, + { + "epoch": 1.9210410718207775, + "grad_norm": 2.1669023036956787, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.721554160118103, + "num_tokens": 436613068.0, + "step": 17493 + }, + { + "epoch": 1.9211508895233913, + "grad_norm": 2.681629180908203, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.74567711353302, + "num_tokens": 436633648.0, + "step": 17494 + }, + { + "epoch": 1.9212607072260048, + "grad_norm": 2.23949933052063, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7307376861572266, + "num_tokens": 436658946.0, + "step": 17495 + }, + { + "epoch": 1.9213705249286184, + "grad_norm": 2.123288869857788, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7226438522338867, + "num_tokens": 436690496.0, + "step": 17496 + }, + { + "epoch": 1.9214803426312321, + "grad_norm": 2.0665488243103027, + "learning_rate": 1e-06, + "loss": 0.7851, + "mean_token_accuracy": 0.751416802406311, + "num_tokens": 436718658.0, + "step": 17497 + }, + { + "epoch": 1.9215901603338459, + "grad_norm": 2.063748836517334, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7203004360198975, + "num_tokens": 436747901.0, + "step": 17498 + }, + { + "epoch": 1.9216999780364594, + "grad_norm": 2.208202362060547, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7083156108856201, + "num_tokens": 436775293.0, + "step": 17499 + }, + { + "epoch": 1.9218097957390732, + "grad_norm": 2.2457315921783447, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6966729164123535, + "num_tokens": 436802475.0, + "step": 17500 + }, + { + "epoch": 1.9219196134416867, + "grad_norm": 2.294490337371826, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7028179168701172, + "num_tokens": 436827773.0, + "step": 17501 + }, + { + "epoch": 1.9220294311443005, + "grad_norm": 2.7035958766937256, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7122321128845215, + "num_tokens": 436846769.0, + "step": 17502 + }, + { + "epoch": 1.9221392488469142, + "grad_norm": 2.451916456222534, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7393077611923218, + "num_tokens": 436868967.0, + "step": 17503 + }, + { + "epoch": 1.9222490665495278, + "grad_norm": 2.2443220615386963, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7177998423576355, + "num_tokens": 436895095.0, + "step": 17504 + }, + { + "epoch": 1.9223588842521413, + "grad_norm": 2.4379324913024902, + "learning_rate": 1e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7448081374168396, + "num_tokens": 436916103.0, + "step": 17505 + }, + { + "epoch": 1.922468701954755, + "grad_norm": 2.076282024383545, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6900902986526489, + "num_tokens": 436946619.0, + "step": 17506 + }, + { + "epoch": 1.9225785196573688, + "grad_norm": 2.54599928855896, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7308703064918518, + "num_tokens": 436966249.0, + "step": 17507 + }, + { + "epoch": 1.9226883373599826, + "grad_norm": 2.5840601921081543, + "learning_rate": 1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7592013478279114, + "num_tokens": 436984850.0, + "step": 17508 + }, + { + "epoch": 1.922798155062596, + "grad_norm": 2.5891101360321045, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7249057292938232, + "num_tokens": 437005013.0, + "step": 17509 + }, + { + "epoch": 1.9229079727652096, + "grad_norm": 2.215163230895996, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7205499410629272, + "num_tokens": 437031346.0, + "step": 17510 + }, + { + "epoch": 1.9230177904678234, + "grad_norm": 2.584632396697998, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7435819506645203, + "num_tokens": 437052393.0, + "step": 17511 + }, + { + "epoch": 1.9231276081704372, + "grad_norm": 1.9587634801864624, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7034995555877686, + "num_tokens": 437085886.0, + "step": 17512 + }, + { + "epoch": 1.9232374258730507, + "grad_norm": 2.0744268894195557, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7120760679244995, + "num_tokens": 437115119.0, + "step": 17513 + }, + { + "epoch": 1.9233472435756644, + "grad_norm": 2.473477840423584, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7129000425338745, + "num_tokens": 437137454.0, + "step": 17514 + }, + { + "epoch": 1.923457061278278, + "grad_norm": 2.2923197746276855, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6887272000312805, + "num_tokens": 437164386.0, + "step": 17515 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 2.2623469829559326, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7369268536567688, + "num_tokens": 437188051.0, + "step": 17516 + }, + { + "epoch": 1.9236766966835055, + "grad_norm": 2.0848851203918457, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7283515930175781, + "num_tokens": 437216721.0, + "step": 17517 + }, + { + "epoch": 1.923786514386119, + "grad_norm": 2.726680040359497, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.727649986743927, + "num_tokens": 437235261.0, + "step": 17518 + }, + { + "epoch": 1.9238963320887326, + "grad_norm": 2.3141705989837646, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7101922035217285, + "num_tokens": 437258809.0, + "step": 17519 + }, + { + "epoch": 1.9240061497913463, + "grad_norm": 2.5297815799713135, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7300750017166138, + "num_tokens": 437279352.0, + "step": 17520 + }, + { + "epoch": 1.92411596749396, + "grad_norm": 2.249706268310547, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7106271982192993, + "num_tokens": 437305315.0, + "step": 17521 + }, + { + "epoch": 1.9242257851965738, + "grad_norm": 2.2270336151123047, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7038204073905945, + "num_tokens": 437333296.0, + "step": 17522 + }, + { + "epoch": 1.9243356028991874, + "grad_norm": 2.312967300415039, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7194861173629761, + "num_tokens": 437357572.0, + "step": 17523 + }, + { + "epoch": 1.924445420601801, + "grad_norm": 2.2523193359375, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7108675241470337, + "num_tokens": 437383519.0, + "step": 17524 + }, + { + "epoch": 1.9245552383044147, + "grad_norm": 2.2159807682037354, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7233786582946777, + "num_tokens": 437408700.0, + "step": 17525 + }, + { + "epoch": 1.9246650560070284, + "grad_norm": 2.360701560974121, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7339006662368774, + "num_tokens": 437432191.0, + "step": 17526 + }, + { + "epoch": 1.924774873709642, + "grad_norm": 2.630640745162964, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7226746678352356, + "num_tokens": 437452119.0, + "step": 17527 + }, + { + "epoch": 1.9248846914122557, + "grad_norm": 2.5063369274139404, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7306604385375977, + "num_tokens": 437474593.0, + "step": 17528 + }, + { + "epoch": 1.9249945091148692, + "grad_norm": 2.2912096977233887, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7287831902503967, + "num_tokens": 437500804.0, + "step": 17529 + }, + { + "epoch": 1.925104326817483, + "grad_norm": 2.6117751598358154, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7149080038070679, + "num_tokens": 437521174.0, + "step": 17530 + }, + { + "epoch": 1.9252141445200968, + "grad_norm": 2.3604979515075684, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7202773094177246, + "num_tokens": 437545660.0, + "step": 17531 + }, + { + "epoch": 1.9253239622227103, + "grad_norm": 2.315232992172241, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7552844285964966, + "num_tokens": 437571499.0, + "step": 17532 + }, + { + "epoch": 1.9254337799253238, + "grad_norm": 2.0293848514556885, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7032036185264587, + "num_tokens": 437599920.0, + "step": 17533 + }, + { + "epoch": 1.9255435976279376, + "grad_norm": 2.4324684143066406, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7379453182220459, + "num_tokens": 437621206.0, + "step": 17534 + }, + { + "epoch": 1.9256534153305513, + "grad_norm": 2.347508668899536, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7104411125183105, + "num_tokens": 437646354.0, + "step": 17535 + }, + { + "epoch": 1.925763233033165, + "grad_norm": 2.1907992362976074, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.705396294593811, + "num_tokens": 437672883.0, + "step": 17536 + }, + { + "epoch": 1.9258730507357786, + "grad_norm": 2.297154664993286, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7116631865501404, + "num_tokens": 437698658.0, + "step": 17537 + }, + { + "epoch": 1.9259828684383922, + "grad_norm": 2.1775314807891846, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7034453749656677, + "num_tokens": 437725869.0, + "step": 17538 + }, + { + "epoch": 1.926092686141006, + "grad_norm": 2.174243211746216, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7362493276596069, + "num_tokens": 437751913.0, + "step": 17539 + }, + { + "epoch": 1.9262025038436197, + "grad_norm": 2.3422634601593018, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.726715087890625, + "num_tokens": 437776797.0, + "step": 17540 + }, + { + "epoch": 1.9263123215462332, + "grad_norm": 2.5186994075775146, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7427828311920166, + "num_tokens": 437797107.0, + "step": 17541 + }, + { + "epoch": 1.9264221392488468, + "grad_norm": 2.386497735977173, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7393009066581726, + "num_tokens": 437822608.0, + "step": 17542 + }, + { + "epoch": 1.9265319569514605, + "grad_norm": 2.27229380607605, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.700998067855835, + "num_tokens": 437851104.0, + "step": 17543 + }, + { + "epoch": 1.9266417746540743, + "grad_norm": 2.454345703125, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7131738066673279, + "num_tokens": 437873153.0, + "step": 17544 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 2.4555180072784424, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7354714870452881, + "num_tokens": 437894557.0, + "step": 17545 + }, + { + "epoch": 1.9268614100593016, + "grad_norm": 2.0347228050231934, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7125930190086365, + "num_tokens": 437924424.0, + "step": 17546 + }, + { + "epoch": 1.926971227761915, + "grad_norm": 2.3975963592529297, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7389736175537109, + "num_tokens": 437947143.0, + "step": 17547 + }, + { + "epoch": 1.9270810454645289, + "grad_norm": 2.0515782833099365, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6924047470092773, + "num_tokens": 437977773.0, + "step": 17548 + }, + { + "epoch": 1.9271908631671426, + "grad_norm": 2.2394306659698486, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7013660669326782, + "num_tokens": 438004219.0, + "step": 17549 + }, + { + "epoch": 1.9273006808697564, + "grad_norm": 2.303387403488159, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7203797101974487, + "num_tokens": 438028600.0, + "step": 17550 + }, + { + "epoch": 1.92741049857237, + "grad_norm": 2.0854196548461914, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.723358690738678, + "num_tokens": 438058240.0, + "step": 17551 + }, + { + "epoch": 1.9275203162749834, + "grad_norm": 2.2824771404266357, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6943476796150208, + "num_tokens": 438083642.0, + "step": 17552 + }, + { + "epoch": 1.9276301339775972, + "grad_norm": 2.247532844543457, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7205274105072021, + "num_tokens": 438109877.0, + "step": 17553 + }, + { + "epoch": 1.927739951680211, + "grad_norm": 2.3865373134613037, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7289358973503113, + "num_tokens": 438131543.0, + "step": 17554 + }, + { + "epoch": 1.9278497693828245, + "grad_norm": 2.113546133041382, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7089650630950928, + "num_tokens": 438158202.0, + "step": 17555 + }, + { + "epoch": 1.927959587085438, + "grad_norm": 2.367300510406494, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7176623940467834, + "num_tokens": 438182718.0, + "step": 17556 + }, + { + "epoch": 1.9280694047880518, + "grad_norm": 2.209088087081909, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7051968574523926, + "num_tokens": 438208935.0, + "step": 17557 + }, + { + "epoch": 1.9281792224906655, + "grad_norm": 2.2417728900909424, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.730039119720459, + "num_tokens": 438235167.0, + "step": 17558 + }, + { + "epoch": 1.9282890401932793, + "grad_norm": 2.2783374786376953, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7067036628723145, + "num_tokens": 438260864.0, + "step": 17559 + }, + { + "epoch": 1.9283988578958928, + "grad_norm": 2.5963752269744873, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7383104562759399, + "num_tokens": 438280302.0, + "step": 17560 + }, + { + "epoch": 1.9285086755985064, + "grad_norm": 2.064434051513672, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6972541213035583, + "num_tokens": 438312310.0, + "step": 17561 + }, + { + "epoch": 1.9286184933011201, + "grad_norm": 2.0161662101745605, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6886002421379089, + "num_tokens": 438342094.0, + "step": 17562 + }, + { + "epoch": 1.9287283110037339, + "grad_norm": 2.246314287185669, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7105754017829895, + "num_tokens": 438369496.0, + "step": 17563 + }, + { + "epoch": 1.9288381287063474, + "grad_norm": 2.464679002761841, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7222431302070618, + "num_tokens": 438393567.0, + "step": 17564 + }, + { + "epoch": 1.9289479464089612, + "grad_norm": 2.11722993850708, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.733335018157959, + "num_tokens": 438420868.0, + "step": 17565 + }, + { + "epoch": 1.9290577641115747, + "grad_norm": 2.240006685256958, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.715809166431427, + "num_tokens": 438444215.0, + "step": 17566 + }, + { + "epoch": 1.9291675818141885, + "grad_norm": 2.2301297187805176, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7014455795288086, + "num_tokens": 438470695.0, + "step": 17567 + }, + { + "epoch": 1.9292773995168022, + "grad_norm": 2.498201847076416, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7151287794113159, + "num_tokens": 438491089.0, + "step": 17568 + }, + { + "epoch": 1.9293872172194158, + "grad_norm": 2.390706777572632, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7172684669494629, + "num_tokens": 438512644.0, + "step": 17569 + }, + { + "epoch": 1.9294970349220293, + "grad_norm": 2.3447494506835938, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7042317390441895, + "num_tokens": 438537897.0, + "step": 17570 + }, + { + "epoch": 1.929606852624643, + "grad_norm": 2.377312660217285, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7291868329048157, + "num_tokens": 438561877.0, + "step": 17571 + }, + { + "epoch": 1.9297166703272568, + "grad_norm": 2.2950987815856934, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7061895728111267, + "num_tokens": 438588187.0, + "step": 17572 + }, + { + "epoch": 1.9298264880298706, + "grad_norm": 2.55049729347229, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7213271856307983, + "num_tokens": 438608342.0, + "step": 17573 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 2.754300117492676, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.74415123462677, + "num_tokens": 438626406.0, + "step": 17574 + }, + { + "epoch": 1.9300461234350976, + "grad_norm": 2.5654561519622803, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7223984599113464, + "num_tokens": 438647176.0, + "step": 17575 + }, + { + "epoch": 1.9301559411377114, + "grad_norm": 2.4592690467834473, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7289109826087952, + "num_tokens": 438668067.0, + "step": 17576 + }, + { + "epoch": 1.9302657588403251, + "grad_norm": 2.585728406906128, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7236595749855042, + "num_tokens": 438688951.0, + "step": 17577 + }, + { + "epoch": 1.9303755765429387, + "grad_norm": 2.2420241832733154, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7329710721969604, + "num_tokens": 438715006.0, + "step": 17578 + }, + { + "epoch": 1.9304853942455524, + "grad_norm": 2.172381639480591, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7167963981628418, + "num_tokens": 438740391.0, + "step": 17579 + }, + { + "epoch": 1.930595211948166, + "grad_norm": 2.5417466163635254, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7299714088439941, + "num_tokens": 438761147.0, + "step": 17580 + }, + { + "epoch": 1.9307050296507797, + "grad_norm": 2.164486885070801, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7061846256256104, + "num_tokens": 438789457.0, + "step": 17581 + }, + { + "epoch": 1.9308148473533935, + "grad_norm": 2.3799731731414795, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7276869416236877, + "num_tokens": 438813625.0, + "step": 17582 + }, + { + "epoch": 1.930924665056007, + "grad_norm": 2.3381168842315674, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.705979585647583, + "num_tokens": 438837728.0, + "step": 17583 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 2.595421552658081, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7289830446243286, + "num_tokens": 438858341.0, + "step": 17584 + }, + { + "epoch": 1.9311443004612343, + "grad_norm": 2.3474841117858887, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6954270005226135, + "num_tokens": 438883262.0, + "step": 17585 + }, + { + "epoch": 1.931254118163848, + "grad_norm": 2.1639084815979004, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7393043637275696, + "num_tokens": 438909369.0, + "step": 17586 + }, + { + "epoch": 1.9313639358664618, + "grad_norm": 2.141714096069336, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7281747460365295, + "num_tokens": 438934289.0, + "step": 17587 + }, + { + "epoch": 1.9314737535690754, + "grad_norm": 2.3161959648132324, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7259332537651062, + "num_tokens": 438958605.0, + "step": 17588 + }, + { + "epoch": 1.931583571271689, + "grad_norm": 2.5577175617218018, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7326464056968689, + "num_tokens": 438979190.0, + "step": 17589 + }, + { + "epoch": 1.9316933889743026, + "grad_norm": 2.328929901123047, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.725266695022583, + "num_tokens": 439004013.0, + "step": 17590 + }, + { + "epoch": 1.9318032066769164, + "grad_norm": 2.2701072692871094, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7142800092697144, + "num_tokens": 439029873.0, + "step": 17591 + }, + { + "epoch": 1.93191302437953, + "grad_norm": 2.0605154037475586, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7246832251548767, + "num_tokens": 439060567.0, + "step": 17592 + }, + { + "epoch": 1.9320228420821435, + "grad_norm": 2.1451900005340576, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.724030077457428, + "num_tokens": 439086900.0, + "step": 17593 + }, + { + "epoch": 1.9321326597847572, + "grad_norm": 2.6750540733337402, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7252742052078247, + "num_tokens": 439105604.0, + "step": 17594 + }, + { + "epoch": 1.932242477487371, + "grad_norm": 2.182713747024536, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7182754278182983, + "num_tokens": 439131129.0, + "step": 17595 + }, + { + "epoch": 1.9323522951899847, + "grad_norm": 2.0044867992401123, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7446199655532837, + "num_tokens": 439160520.0, + "step": 17596 + }, + { + "epoch": 1.9324621128925983, + "grad_norm": 2.2563204765319824, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7349448800086975, + "num_tokens": 439187378.0, + "step": 17597 + }, + { + "epoch": 1.9325719305952118, + "grad_norm": 2.361694097518921, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7520121335983276, + "num_tokens": 439209374.0, + "step": 17598 + }, + { + "epoch": 1.9326817482978256, + "grad_norm": 2.0292747020721436, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7190905213356018, + "num_tokens": 439240803.0, + "step": 17599 + }, + { + "epoch": 1.9327915660004393, + "grad_norm": 2.02738356590271, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7182168364524841, + "num_tokens": 439269715.0, + "step": 17600 + }, + { + "epoch": 1.932901383703053, + "grad_norm": 2.516326427459717, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7344500422477722, + "num_tokens": 439291191.0, + "step": 17601 + }, + { + "epoch": 1.9330112014056666, + "grad_norm": 2.271730422973633, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7279675006866455, + "num_tokens": 439316613.0, + "step": 17602 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 2.1226866245269775, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7337597608566284, + "num_tokens": 439345090.0, + "step": 17603 + }, + { + "epoch": 1.933230836810894, + "grad_norm": 2.4035353660583496, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7125351428985596, + "num_tokens": 439369124.0, + "step": 17604 + }, + { + "epoch": 1.9333406545135077, + "grad_norm": 1.9718985557556152, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7067421674728394, + "num_tokens": 439401902.0, + "step": 17605 + }, + { + "epoch": 1.9334504722161212, + "grad_norm": 2.334359884262085, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7041106224060059, + "num_tokens": 439426100.0, + "step": 17606 + }, + { + "epoch": 1.9335602899187347, + "grad_norm": 2.2279305458068848, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.699887216091156, + "num_tokens": 439454346.0, + "step": 17607 + }, + { + "epoch": 1.9336701076213485, + "grad_norm": 2.593799352645874, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7337331175804138, + "num_tokens": 439475032.0, + "step": 17608 + }, + { + "epoch": 1.9337799253239623, + "grad_norm": 2.1859617233276367, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7159925699234009, + "num_tokens": 439502255.0, + "step": 17609 + }, + { + "epoch": 1.933889743026576, + "grad_norm": 2.1291749477386475, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7110934257507324, + "num_tokens": 439532158.0, + "step": 17610 + }, + { + "epoch": 1.9339995607291895, + "grad_norm": 2.2699828147888184, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7085574865341187, + "num_tokens": 439559290.0, + "step": 17611 + }, + { + "epoch": 1.934109378431803, + "grad_norm": 2.38387131690979, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7307377457618713, + "num_tokens": 439581461.0, + "step": 17612 + }, + { + "epoch": 1.9342191961344168, + "grad_norm": 2.368014097213745, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7288289070129395, + "num_tokens": 439602930.0, + "step": 17613 + }, + { + "epoch": 1.9343290138370306, + "grad_norm": 2.2774817943573, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7379100322723389, + "num_tokens": 439626086.0, + "step": 17614 + }, + { + "epoch": 1.9344388315396444, + "grad_norm": 2.1417109966278076, + "learning_rate": 1e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.7491162419319153, + "num_tokens": 439651152.0, + "step": 17615 + }, + { + "epoch": 1.934548649242258, + "grad_norm": 2.373295307159424, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.744056224822998, + "num_tokens": 439673681.0, + "step": 17616 + }, + { + "epoch": 1.9346584669448714, + "grad_norm": 2.1041460037231445, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7109857797622681, + "num_tokens": 439701734.0, + "step": 17617 + }, + { + "epoch": 1.9347682846474852, + "grad_norm": 2.2629923820495605, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7126120328903198, + "num_tokens": 439728652.0, + "step": 17618 + }, + { + "epoch": 1.934878102350099, + "grad_norm": 2.223193407058716, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7189136743545532, + "num_tokens": 439755403.0, + "step": 17619 + }, + { + "epoch": 1.9349879200527125, + "grad_norm": 2.4630727767944336, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7323936820030212, + "num_tokens": 439776396.0, + "step": 17620 + }, + { + "epoch": 1.935097737755326, + "grad_norm": 2.2974801063537598, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7114837169647217, + "num_tokens": 439802969.0, + "step": 17621 + }, + { + "epoch": 1.9352075554579398, + "grad_norm": 2.1447079181671143, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.702272891998291, + "num_tokens": 439831380.0, + "step": 17622 + }, + { + "epoch": 1.9353173731605535, + "grad_norm": 2.632518768310547, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7440633773803711, + "num_tokens": 439849895.0, + "step": 17623 + }, + { + "epoch": 1.9354271908631673, + "grad_norm": 2.615280866622925, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7329895496368408, + "num_tokens": 439869678.0, + "step": 17624 + }, + { + "epoch": 1.9355370085657808, + "grad_norm": 2.138526678085327, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7121492624282837, + "num_tokens": 439896862.0, + "step": 17625 + }, + { + "epoch": 1.9356468262683943, + "grad_norm": 2.270127058029175, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7346705198287964, + "num_tokens": 439921037.0, + "step": 17626 + }, + { + "epoch": 1.935756643971008, + "grad_norm": 2.236173629760742, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7007927894592285, + "num_tokens": 439947388.0, + "step": 17627 + }, + { + "epoch": 1.9358664616736219, + "grad_norm": 2.1751322746276855, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7130075097084045, + "num_tokens": 439973941.0, + "step": 17628 + }, + { + "epoch": 1.9359762793762354, + "grad_norm": 2.165566921234131, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7278733253479004, + "num_tokens": 439998998.0, + "step": 17629 + }, + { + "epoch": 1.9360860970788492, + "grad_norm": 2.3419606685638428, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7329298257827759, + "num_tokens": 440023450.0, + "step": 17630 + }, + { + "epoch": 1.9361959147814627, + "grad_norm": 1.9979299306869507, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.699820876121521, + "num_tokens": 440055350.0, + "step": 17631 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 2.5314648151397705, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7162749767303467, + "num_tokens": 440077753.0, + "step": 17632 + }, + { + "epoch": 1.9364155501866902, + "grad_norm": 2.2583651542663574, + "learning_rate": 1e-06, + "loss": 0.7815, + "mean_token_accuracy": 0.7521328926086426, + "num_tokens": 440099436.0, + "step": 17633 + }, + { + "epoch": 1.9365253678893037, + "grad_norm": 2.0203094482421875, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7236100435256958, + "num_tokens": 440132237.0, + "step": 17634 + }, + { + "epoch": 1.9366351855919173, + "grad_norm": 1.8571159839630127, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7127425670623779, + "num_tokens": 440163978.0, + "step": 17635 + }, + { + "epoch": 1.936745003294531, + "grad_norm": 2.1094417572021484, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7175596356391907, + "num_tokens": 440190377.0, + "step": 17636 + }, + { + "epoch": 1.9368548209971448, + "grad_norm": 2.3860585689544678, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7234683036804199, + "num_tokens": 440211975.0, + "step": 17637 + }, + { + "epoch": 1.9369646386997585, + "grad_norm": 2.5538110733032227, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7468588352203369, + "num_tokens": 440231028.0, + "step": 17638 + }, + { + "epoch": 1.937074456402372, + "grad_norm": 2.107234001159668, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7362862825393677, + "num_tokens": 440256310.0, + "step": 17639 + }, + { + "epoch": 1.9371842741049856, + "grad_norm": 2.4113223552703857, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7182877063751221, + "num_tokens": 440280071.0, + "step": 17640 + }, + { + "epoch": 1.9372940918075994, + "grad_norm": 2.2626054286956787, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6990941762924194, + "num_tokens": 440307484.0, + "step": 17641 + }, + { + "epoch": 1.9374039095102131, + "grad_norm": 2.1417675018310547, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7085614204406738, + "num_tokens": 440333336.0, + "step": 17642 + }, + { + "epoch": 1.9375137272128267, + "grad_norm": 2.548795700073242, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.703572690486908, + "num_tokens": 440355058.0, + "step": 17643 + }, + { + "epoch": 1.9376235449154404, + "grad_norm": 2.257215976715088, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.721990168094635, + "num_tokens": 440379653.0, + "step": 17644 + }, + { + "epoch": 1.937733362618054, + "grad_norm": 2.2439732551574707, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7298349142074585, + "num_tokens": 440404259.0, + "step": 17645 + }, + { + "epoch": 1.9378431803206677, + "grad_norm": 1.8962798118591309, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7130768895149231, + "num_tokens": 440437365.0, + "step": 17646 + }, + { + "epoch": 1.9379529980232815, + "grad_norm": 2.185620069503784, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7320995926856995, + "num_tokens": 440464215.0, + "step": 17647 + }, + { + "epoch": 1.938062815725895, + "grad_norm": 2.2612662315368652, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7204616665840149, + "num_tokens": 440490244.0, + "step": 17648 + }, + { + "epoch": 1.9381726334285085, + "grad_norm": 2.328781843185425, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7205690741539001, + "num_tokens": 440513550.0, + "step": 17649 + }, + { + "epoch": 1.9382824511311223, + "grad_norm": 2.2711291313171387, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7359299659729004, + "num_tokens": 440537597.0, + "step": 17650 + }, + { + "epoch": 1.938392268833736, + "grad_norm": 2.320361614227295, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7237156629562378, + "num_tokens": 440559168.0, + "step": 17651 + }, + { + "epoch": 1.9385020865363498, + "grad_norm": 2.437431573867798, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7262292504310608, + "num_tokens": 440582552.0, + "step": 17652 + }, + { + "epoch": 1.9386119042389633, + "grad_norm": 2.382920742034912, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7152904868125916, + "num_tokens": 440605697.0, + "step": 17653 + }, + { + "epoch": 1.9387217219415769, + "grad_norm": 2.2528514862060547, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.735051691532135, + "num_tokens": 440630150.0, + "step": 17654 + }, + { + "epoch": 1.9388315396441906, + "grad_norm": 2.146665096282959, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7114142179489136, + "num_tokens": 440658102.0, + "step": 17655 + }, + { + "epoch": 1.9389413573468044, + "grad_norm": 2.5846924781799316, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7393434643745422, + "num_tokens": 440678357.0, + "step": 17656 + }, + { + "epoch": 1.939051175049418, + "grad_norm": 2.2550277709960938, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7053573131561279, + "num_tokens": 440703315.0, + "step": 17657 + }, + { + "epoch": 1.9391609927520315, + "grad_norm": 2.503836154937744, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7109023332595825, + "num_tokens": 440724659.0, + "step": 17658 + }, + { + "epoch": 1.9392708104546452, + "grad_norm": 1.8423734903335571, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7231417894363403, + "num_tokens": 440756622.0, + "step": 17659 + }, + { + "epoch": 1.939380628157259, + "grad_norm": 2.108790397644043, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6978685259819031, + "num_tokens": 440785561.0, + "step": 17660 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 2.3055882453918457, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.741389274597168, + "num_tokens": 440808296.0, + "step": 17661 + }, + { + "epoch": 1.9396002635624863, + "grad_norm": 2.4887912273406982, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7225756049156189, + "num_tokens": 440828949.0, + "step": 17662 + }, + { + "epoch": 1.9397100812650998, + "grad_norm": 2.540823459625244, + "learning_rate": 1e-06, + "loss": 0.7034, + "mean_token_accuracy": 0.7715174555778503, + "num_tokens": 440847840.0, + "step": 17663 + }, + { + "epoch": 1.9398198989677136, + "grad_norm": 2.4882123470306396, + "learning_rate": 1e-06, + "loss": 0.7134, + "mean_token_accuracy": 0.764075517654419, + "num_tokens": 440866460.0, + "step": 17664 + }, + { + "epoch": 1.9399297166703273, + "grad_norm": 2.315084934234619, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7032828330993652, + "num_tokens": 440890470.0, + "step": 17665 + }, + { + "epoch": 1.940039534372941, + "grad_norm": 2.0844881534576416, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.737436830997467, + "num_tokens": 440918790.0, + "step": 17666 + }, + { + "epoch": 1.9401493520755546, + "grad_norm": 2.2676918506622314, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7144347429275513, + "num_tokens": 440944255.0, + "step": 17667 + }, + { + "epoch": 1.9402591697781681, + "grad_norm": 2.4570152759552, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7228423357009888, + "num_tokens": 440966699.0, + "step": 17668 + }, + { + "epoch": 1.940368987480782, + "grad_norm": 2.469473123550415, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7433606386184692, + "num_tokens": 440988878.0, + "step": 17669 + }, + { + "epoch": 1.9404788051833957, + "grad_norm": 2.68119478225708, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7248759269714355, + "num_tokens": 441008642.0, + "step": 17670 + }, + { + "epoch": 1.9405886228860092, + "grad_norm": 2.1518688201904297, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7066199779510498, + "num_tokens": 441035467.0, + "step": 17671 + }, + { + "epoch": 1.9406984405886227, + "grad_norm": 2.326803684234619, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7203763723373413, + "num_tokens": 441057948.0, + "step": 17672 + }, + { + "epoch": 1.9408082582912365, + "grad_norm": 2.253612756729126, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7093812227249146, + "num_tokens": 441082391.0, + "step": 17673 + }, + { + "epoch": 1.9409180759938502, + "grad_norm": 2.162285089492798, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7082951068878174, + "num_tokens": 441109150.0, + "step": 17674 + }, + { + "epoch": 1.941027893696464, + "grad_norm": 2.1190762519836426, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7044569849967957, + "num_tokens": 441137341.0, + "step": 17675 + }, + { + "epoch": 1.9411377113990775, + "grad_norm": 2.3723955154418945, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7275953888893127, + "num_tokens": 441158977.0, + "step": 17676 + }, + { + "epoch": 1.941247529101691, + "grad_norm": 2.166243553161621, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7264959812164307, + "num_tokens": 441184624.0, + "step": 17677 + }, + { + "epoch": 1.9413573468043048, + "grad_norm": 2.177406072616577, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6929910182952881, + "num_tokens": 441213560.0, + "step": 17678 + }, + { + "epoch": 1.9414671645069186, + "grad_norm": 2.5288760662078857, + "learning_rate": 1e-06, + "loss": 0.7397, + "mean_token_accuracy": 0.7582921981811523, + "num_tokens": 441231412.0, + "step": 17679 + }, + { + "epoch": 1.9415769822095323, + "grad_norm": 2.205002546310425, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7293002605438232, + "num_tokens": 441257069.0, + "step": 17680 + }, + { + "epoch": 1.9416867999121459, + "grad_norm": 2.4823100566864014, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7288711071014404, + "num_tokens": 441277988.0, + "step": 17681 + }, + { + "epoch": 1.9417966176147594, + "grad_norm": 2.409451723098755, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7320519089698792, + "num_tokens": 441298583.0, + "step": 17682 + }, + { + "epoch": 1.9419064353173732, + "grad_norm": 2.421919107437134, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.716191291809082, + "num_tokens": 441327739.0, + "step": 17683 + }, + { + "epoch": 1.942016253019987, + "grad_norm": 2.359905958175659, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7326892018318176, + "num_tokens": 441350740.0, + "step": 17684 + }, + { + "epoch": 1.9421260707226005, + "grad_norm": 2.2027525901794434, + "learning_rate": 1e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7503782510757446, + "num_tokens": 441375752.0, + "step": 17685 + }, + { + "epoch": 1.942235888425214, + "grad_norm": 2.3217391967773438, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7351071834564209, + "num_tokens": 441401592.0, + "step": 17686 + }, + { + "epoch": 1.9423457061278278, + "grad_norm": 2.5798351764678955, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7270615100860596, + "num_tokens": 441422351.0, + "step": 17687 + }, + { + "epoch": 1.9424555238304415, + "grad_norm": 2.023674488067627, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7085701823234558, + "num_tokens": 441451920.0, + "step": 17688 + }, + { + "epoch": 1.9425653415330553, + "grad_norm": 2.6249642372131348, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7302250862121582, + "num_tokens": 441471490.0, + "step": 17689 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 2.0768535137176514, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7240833640098572, + "num_tokens": 441501489.0, + "step": 17690 + }, + { + "epoch": 1.9427849769382823, + "grad_norm": 2.397919178009033, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7327941060066223, + "num_tokens": 441524073.0, + "step": 17691 + }, + { + "epoch": 1.942894794640896, + "grad_norm": 2.033325672149658, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7214892506599426, + "num_tokens": 441555449.0, + "step": 17692 + }, + { + "epoch": 1.9430046123435099, + "grad_norm": 2.082080602645874, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7085989117622375, + "num_tokens": 441584733.0, + "step": 17693 + }, + { + "epoch": 1.9431144300461234, + "grad_norm": 1.9322972297668457, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7268747091293335, + "num_tokens": 441616632.0, + "step": 17694 + }, + { + "epoch": 1.9432242477487371, + "grad_norm": 2.356321096420288, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7303693294525146, + "num_tokens": 441637617.0, + "step": 17695 + }, + { + "epoch": 1.9433340654513507, + "grad_norm": 2.7908170223236084, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7260613441467285, + "num_tokens": 441655525.0, + "step": 17696 + }, + { + "epoch": 1.9434438831539644, + "grad_norm": 2.315290689468384, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7233335375785828, + "num_tokens": 441679902.0, + "step": 17697 + }, + { + "epoch": 1.9435537008565782, + "grad_norm": 2.2826974391937256, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7131445407867432, + "num_tokens": 441705931.0, + "step": 17698 + }, + { + "epoch": 1.9436635185591917, + "grad_norm": 2.1780238151550293, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.742300271987915, + "num_tokens": 441729535.0, + "step": 17699 + }, + { + "epoch": 1.9437733362618053, + "grad_norm": 2.0597383975982666, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7044094800949097, + "num_tokens": 441759764.0, + "step": 17700 + }, + { + "epoch": 1.943883153964419, + "grad_norm": 2.356672763824463, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7192332744598389, + "num_tokens": 441782441.0, + "step": 17701 + }, + { + "epoch": 1.9439929716670328, + "grad_norm": 2.0775065422058105, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7495747804641724, + "num_tokens": 441808832.0, + "step": 17702 + }, + { + "epoch": 1.9441027893696465, + "grad_norm": 2.087855339050293, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.709334671497345, + "num_tokens": 441837302.0, + "step": 17703 + }, + { + "epoch": 1.94421260707226, + "grad_norm": 2.3220977783203125, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.708895206451416, + "num_tokens": 441862932.0, + "step": 17704 + }, + { + "epoch": 1.9443224247748736, + "grad_norm": 2.188465118408203, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7248847484588623, + "num_tokens": 441888214.0, + "step": 17705 + }, + { + "epoch": 1.9444322424774874, + "grad_norm": 2.5902838706970215, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.736467719078064, + "num_tokens": 441907475.0, + "step": 17706 + }, + { + "epoch": 1.9445420601801011, + "grad_norm": 2.66133189201355, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7340044975280762, + "num_tokens": 441926887.0, + "step": 17707 + }, + { + "epoch": 1.9446518778827147, + "grad_norm": 2.238138437271118, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7049644589424133, + "num_tokens": 441954743.0, + "step": 17708 + }, + { + "epoch": 1.9447616955853284, + "grad_norm": 2.404921770095825, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7366464138031006, + "num_tokens": 441975865.0, + "step": 17709 + }, + { + "epoch": 1.944871513287942, + "grad_norm": 2.244126081466675, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7103553414344788, + "num_tokens": 442001299.0, + "step": 17710 + }, + { + "epoch": 1.9449813309905557, + "grad_norm": 2.2667534351348877, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7210679650306702, + "num_tokens": 442025961.0, + "step": 17711 + }, + { + "epoch": 1.9450911486931695, + "grad_norm": 2.346808910369873, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7193853855133057, + "num_tokens": 442050746.0, + "step": 17712 + }, + { + "epoch": 1.945200966395783, + "grad_norm": 2.2795705795288086, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.708178699016571, + "num_tokens": 442076541.0, + "step": 17713 + }, + { + "epoch": 1.9453107840983965, + "grad_norm": 1.9318190813064575, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7154431939125061, + "num_tokens": 442108110.0, + "step": 17714 + }, + { + "epoch": 1.9454206018010103, + "grad_norm": 2.2817177772521973, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7127337455749512, + "num_tokens": 442131589.0, + "step": 17715 + }, + { + "epoch": 1.945530419503624, + "grad_norm": 2.5659985542297363, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.718097448348999, + "num_tokens": 442152582.0, + "step": 17716 + }, + { + "epoch": 1.9456402372062378, + "grad_norm": 2.0298006534576416, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7069747447967529, + "num_tokens": 442183513.0, + "step": 17717 + }, + { + "epoch": 1.9457500549088513, + "grad_norm": 2.743659019470215, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7297898530960083, + "num_tokens": 442202307.0, + "step": 17718 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 2.0760750770568848, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7108941078186035, + "num_tokens": 442231450.0, + "step": 17719 + }, + { + "epoch": 1.9459696903140786, + "grad_norm": 2.7472646236419678, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7495874166488647, + "num_tokens": 442249556.0, + "step": 17720 + }, + { + "epoch": 1.9460795080166924, + "grad_norm": 2.427231550216675, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7139610052108765, + "num_tokens": 442272620.0, + "step": 17721 + }, + { + "epoch": 1.946189325719306, + "grad_norm": 1.9570063352584839, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7204064130783081, + "num_tokens": 442303126.0, + "step": 17722 + }, + { + "epoch": 1.9462991434219195, + "grad_norm": 2.718194007873535, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7342193126678467, + "num_tokens": 442321288.0, + "step": 17723 + }, + { + "epoch": 1.9464089611245332, + "grad_norm": 2.0690481662750244, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7439885139465332, + "num_tokens": 442347706.0, + "step": 17724 + }, + { + "epoch": 1.946518778827147, + "grad_norm": 2.492558240890503, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7384921312332153, + "num_tokens": 442369199.0, + "step": 17725 + }, + { + "epoch": 1.9466285965297607, + "grad_norm": 2.4230597019195557, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7280339002609253, + "num_tokens": 442391028.0, + "step": 17726 + }, + { + "epoch": 1.9467384142323743, + "grad_norm": 2.1522109508514404, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7430087924003601, + "num_tokens": 442415898.0, + "step": 17727 + }, + { + "epoch": 1.9468482319349878, + "grad_norm": 2.279275894165039, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7246578931808472, + "num_tokens": 442440493.0, + "step": 17728 + }, + { + "epoch": 1.9469580496376016, + "grad_norm": 2.313051223754883, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7461782693862915, + "num_tokens": 442462963.0, + "step": 17729 + }, + { + "epoch": 1.9470678673402153, + "grad_norm": 2.7830138206481934, + "learning_rate": 1e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7468870878219604, + "num_tokens": 442480106.0, + "step": 17730 + }, + { + "epoch": 1.947177685042829, + "grad_norm": 2.4026968479156494, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7026533484458923, + "num_tokens": 442504627.0, + "step": 17731 + }, + { + "epoch": 1.9472875027454426, + "grad_norm": 2.494969367980957, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7290789484977722, + "num_tokens": 442526294.0, + "step": 17732 + }, + { + "epoch": 1.9473973204480561, + "grad_norm": 2.3186211585998535, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7250393629074097, + "num_tokens": 442551312.0, + "step": 17733 + }, + { + "epoch": 1.94750713815067, + "grad_norm": 2.4049816131591797, + "learning_rate": 1e-06, + "loss": 0.7229, + "mean_token_accuracy": 0.767950177192688, + "num_tokens": 442572317.0, + "step": 17734 + }, + { + "epoch": 1.9476169558532836, + "grad_norm": 2.5990374088287354, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7611814737319946, + "num_tokens": 442590282.0, + "step": 17735 + }, + { + "epoch": 1.9477267735558972, + "grad_norm": 2.2266180515289307, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7134069204330444, + "num_tokens": 442615086.0, + "step": 17736 + }, + { + "epoch": 1.9478365912585107, + "grad_norm": 2.63415265083313, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7262436151504517, + "num_tokens": 442635236.0, + "step": 17737 + }, + { + "epoch": 1.9479464089611245, + "grad_norm": 2.080153465270996, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7094380855560303, + "num_tokens": 442662736.0, + "step": 17738 + }, + { + "epoch": 1.9480562266637382, + "grad_norm": 2.3315799236297607, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7237639427185059, + "num_tokens": 442687752.0, + "step": 17739 + }, + { + "epoch": 1.948166044366352, + "grad_norm": 2.4615390300750732, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7208828926086426, + "num_tokens": 442710053.0, + "step": 17740 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 2.1596968173980713, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7337979078292847, + "num_tokens": 442734842.0, + "step": 17741 + }, + { + "epoch": 1.948385679771579, + "grad_norm": 2.1119651794433594, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7179704308509827, + "num_tokens": 442760730.0, + "step": 17742 + }, + { + "epoch": 1.9484954974741928, + "grad_norm": 2.3813962936401367, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7424293756484985, + "num_tokens": 442782936.0, + "step": 17743 + }, + { + "epoch": 1.9486053151768066, + "grad_norm": 2.317246198654175, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7396762371063232, + "num_tokens": 442807300.0, + "step": 17744 + }, + { + "epoch": 1.94871513287942, + "grad_norm": 2.1949081420898438, + "learning_rate": 1e-06, + "loss": 0.8327, + "mean_token_accuracy": 0.7365360260009766, + "num_tokens": 442831404.0, + "step": 17745 + }, + { + "epoch": 1.9488249505820339, + "grad_norm": 2.1684229373931885, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7299978733062744, + "num_tokens": 442858289.0, + "step": 17746 + }, + { + "epoch": 1.9489347682846474, + "grad_norm": 2.142577648162842, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7279382944107056, + "num_tokens": 442885547.0, + "step": 17747 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 2.3112382888793945, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7230232954025269, + "num_tokens": 442909370.0, + "step": 17748 + }, + { + "epoch": 1.949154403689875, + "grad_norm": 2.5812149047851562, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7415106296539307, + "num_tokens": 442929623.0, + "step": 17749 + }, + { + "epoch": 1.9492642213924884, + "grad_norm": 1.9392752647399902, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7365277409553528, + "num_tokens": 442962161.0, + "step": 17750 + }, + { + "epoch": 1.949374039095102, + "grad_norm": 2.2217273712158203, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7236137390136719, + "num_tokens": 442989560.0, + "step": 17751 + }, + { + "epoch": 1.9494838567977157, + "grad_norm": 2.637101411819458, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7373225092887878, + "num_tokens": 443008302.0, + "step": 17752 + }, + { + "epoch": 1.9495936745003295, + "grad_norm": 2.2660040855407715, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7061460614204407, + "num_tokens": 443034774.0, + "step": 17753 + }, + { + "epoch": 1.9497034922029433, + "grad_norm": 2.4570181369781494, + "learning_rate": 1e-06, + "loss": 0.8167, + "mean_token_accuracy": 0.7407264709472656, + "num_tokens": 443054892.0, + "step": 17754 + }, + { + "epoch": 1.9498133099055568, + "grad_norm": 2.1564221382141113, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.716525137424469, + "num_tokens": 443079929.0, + "step": 17755 + }, + { + "epoch": 1.9499231276081703, + "grad_norm": 1.940035343170166, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.708715558052063, + "num_tokens": 443112325.0, + "step": 17756 + }, + { + "epoch": 1.950032945310784, + "grad_norm": 2.5619866847991943, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.7450215816497803, + "num_tokens": 443131233.0, + "step": 17757 + }, + { + "epoch": 1.9501427630133978, + "grad_norm": 2.072600841522217, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7248098850250244, + "num_tokens": 443161082.0, + "step": 17758 + }, + { + "epoch": 1.9502525807160114, + "grad_norm": 2.339120864868164, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7079459428787231, + "num_tokens": 443183917.0, + "step": 17759 + }, + { + "epoch": 1.9503623984186251, + "grad_norm": 2.2951958179473877, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6909171938896179, + "num_tokens": 443210068.0, + "step": 17760 + }, + { + "epoch": 1.9504722161212387, + "grad_norm": 2.4764158725738525, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7357722520828247, + "num_tokens": 443231536.0, + "step": 17761 + }, + { + "epoch": 1.9505820338238524, + "grad_norm": 2.2724833488464355, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7071724534034729, + "num_tokens": 443256032.0, + "step": 17762 + }, + { + "epoch": 1.9506918515264662, + "grad_norm": 1.9419353008270264, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7073350548744202, + "num_tokens": 443288495.0, + "step": 17763 + }, + { + "epoch": 1.9508016692290797, + "grad_norm": 2.1587939262390137, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7198089361190796, + "num_tokens": 443316215.0, + "step": 17764 + }, + { + "epoch": 1.9509114869316933, + "grad_norm": 2.078702211380005, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7097225189208984, + "num_tokens": 443344331.0, + "step": 17765 + }, + { + "epoch": 1.951021304634307, + "grad_norm": 2.238600015640259, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7404631972312927, + "num_tokens": 443369458.0, + "step": 17766 + }, + { + "epoch": 1.9511311223369208, + "grad_norm": 2.3592875003814697, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7404953241348267, + "num_tokens": 443392432.0, + "step": 17767 + }, + { + "epoch": 1.9512409400395345, + "grad_norm": 2.5037782192230225, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7268720865249634, + "num_tokens": 443414608.0, + "step": 17768 + }, + { + "epoch": 1.951350757742148, + "grad_norm": 2.3898932933807373, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7353976964950562, + "num_tokens": 443437492.0, + "step": 17769 + }, + { + "epoch": 1.9514605754447616, + "grad_norm": 2.171649694442749, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7191503047943115, + "num_tokens": 443465170.0, + "step": 17770 + }, + { + "epoch": 1.9515703931473753, + "grad_norm": 2.332973003387451, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7220925092697144, + "num_tokens": 443489338.0, + "step": 17771 + }, + { + "epoch": 1.951680210849989, + "grad_norm": 2.0129292011260986, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7246425747871399, + "num_tokens": 443521738.0, + "step": 17772 + }, + { + "epoch": 1.9517900285526026, + "grad_norm": 2.409489631652832, + "learning_rate": 1e-06, + "loss": 0.7329, + "mean_token_accuracy": 0.7625596523284912, + "num_tokens": 443542148.0, + "step": 17773 + }, + { + "epoch": 1.9518998462552162, + "grad_norm": 1.9983621835708618, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7429633736610413, + "num_tokens": 443570519.0, + "step": 17774 + }, + { + "epoch": 1.95200966395783, + "grad_norm": 2.2208566665649414, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7278140783309937, + "num_tokens": 443595733.0, + "step": 17775 + }, + { + "epoch": 1.9521194816604437, + "grad_norm": 1.9787013530731201, + "learning_rate": 1e-06, + "loss": 1.0227, + "mean_token_accuracy": 0.6877977848052979, + "num_tokens": 443627978.0, + "step": 17776 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 2.8558614253997803, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7431186437606812, + "num_tokens": 443644472.0, + "step": 17777 + }, + { + "epoch": 1.952339117065671, + "grad_norm": 2.105677843093872, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7425687909126282, + "num_tokens": 443672478.0, + "step": 17778 + }, + { + "epoch": 1.9524489347682845, + "grad_norm": 2.088284969329834, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.723487138748169, + "num_tokens": 443700047.0, + "step": 17779 + }, + { + "epoch": 1.9525587524708983, + "grad_norm": 2.7454652786254883, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7227715849876404, + "num_tokens": 443725108.0, + "step": 17780 + }, + { + "epoch": 1.952668570173512, + "grad_norm": 2.604743719100952, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7348077893257141, + "num_tokens": 443744471.0, + "step": 17781 + }, + { + "epoch": 1.9527783878761258, + "grad_norm": 2.2128655910491943, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7378065586090088, + "num_tokens": 443770026.0, + "step": 17782 + }, + { + "epoch": 1.9528882055787393, + "grad_norm": 2.558741331100464, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7463698983192444, + "num_tokens": 443791205.0, + "step": 17783 + }, + { + "epoch": 1.9529980232813529, + "grad_norm": 2.0564053058624268, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7142477035522461, + "num_tokens": 443822422.0, + "step": 17784 + }, + { + "epoch": 1.9531078409839666, + "grad_norm": 2.46675443649292, + "learning_rate": 1e-06, + "loss": 0.7818, + "mean_token_accuracy": 0.7609975337982178, + "num_tokens": 443843313.0, + "step": 17785 + }, + { + "epoch": 1.9532176586865804, + "grad_norm": 2.3327138423919678, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7071719765663147, + "num_tokens": 443869008.0, + "step": 17786 + }, + { + "epoch": 1.953327476389194, + "grad_norm": 2.2214694023132324, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7412601113319397, + "num_tokens": 443894898.0, + "step": 17787 + }, + { + "epoch": 1.9534372940918074, + "grad_norm": 2.1820993423461914, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7252464294433594, + "num_tokens": 443921989.0, + "step": 17788 + }, + { + "epoch": 1.9535471117944212, + "grad_norm": 1.9954825639724731, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7113178968429565, + "num_tokens": 443952195.0, + "step": 17789 + }, + { + "epoch": 1.953656929497035, + "grad_norm": 2.0225229263305664, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7044711112976074, + "num_tokens": 443981630.0, + "step": 17790 + }, + { + "epoch": 1.9537667471996487, + "grad_norm": 2.4214682579040527, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7396458983421326, + "num_tokens": 444004203.0, + "step": 17791 + }, + { + "epoch": 1.9538765649022622, + "grad_norm": 2.2007341384887695, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.6990821361541748, + "num_tokens": 444032788.0, + "step": 17792 + }, + { + "epoch": 1.9539863826048758, + "grad_norm": 2.171053171157837, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.721588671207428, + "num_tokens": 444060405.0, + "step": 17793 + }, + { + "epoch": 1.9540962003074895, + "grad_norm": 2.2928099632263184, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7412726879119873, + "num_tokens": 444081756.0, + "step": 17794 + }, + { + "epoch": 1.9542060180101033, + "grad_norm": 2.3387067317962646, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7403218150138855, + "num_tokens": 444105638.0, + "step": 17795 + }, + { + "epoch": 1.954315835712717, + "grad_norm": 2.0251684188842773, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7260309457778931, + "num_tokens": 444136339.0, + "step": 17796 + }, + { + "epoch": 1.9544256534153306, + "grad_norm": 2.161374807357788, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7032082080841064, + "num_tokens": 444164700.0, + "step": 17797 + }, + { + "epoch": 1.9545354711179441, + "grad_norm": 2.3913896083831787, + "learning_rate": 1e-06, + "loss": 0.8185, + "mean_token_accuracy": 0.7409569025039673, + "num_tokens": 444187697.0, + "step": 17798 + }, + { + "epoch": 1.9546452888205579, + "grad_norm": 2.254349708557129, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7284367680549622, + "num_tokens": 444212504.0, + "step": 17799 + }, + { + "epoch": 1.9547551065231716, + "grad_norm": 2.0837676525115967, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6893295049667358, + "num_tokens": 444242310.0, + "step": 17800 + }, + { + "epoch": 1.9548649242257852, + "grad_norm": 2.249302387237549, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.711706280708313, + "num_tokens": 444269299.0, + "step": 17801 + }, + { + "epoch": 1.9549747419283987, + "grad_norm": 2.3743085861206055, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7021915912628174, + "num_tokens": 444294469.0, + "step": 17802 + }, + { + "epoch": 1.9550845596310125, + "grad_norm": 2.3605761528015137, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.7392048835754395, + "num_tokens": 444318101.0, + "step": 17803 + }, + { + "epoch": 1.9551943773336262, + "grad_norm": 2.3191399574279785, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7192132472991943, + "num_tokens": 444342970.0, + "step": 17804 + }, + { + "epoch": 1.95530419503624, + "grad_norm": 2.2832212448120117, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7157164812088013, + "num_tokens": 444370140.0, + "step": 17805 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 2.3463594913482666, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7003905773162842, + "num_tokens": 444393907.0, + "step": 17806 + }, + { + "epoch": 1.955523830441467, + "grad_norm": 2.1058690547943115, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.6982563734054565, + "num_tokens": 444424184.0, + "step": 17807 + }, + { + "epoch": 1.9556336481440808, + "grad_norm": 2.4360058307647705, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.725890040397644, + "num_tokens": 444446494.0, + "step": 17808 + }, + { + "epoch": 1.9557434658466946, + "grad_norm": 2.2170751094818115, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7347400784492493, + "num_tokens": 444471463.0, + "step": 17809 + }, + { + "epoch": 1.955853283549308, + "grad_norm": 2.2090210914611816, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.704987645149231, + "num_tokens": 444499002.0, + "step": 17810 + }, + { + "epoch": 1.9559631012519219, + "grad_norm": 2.683976411819458, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7224498987197876, + "num_tokens": 444517623.0, + "step": 17811 + }, + { + "epoch": 1.9560729189545354, + "grad_norm": 2.0993387699127197, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7097451686859131, + "num_tokens": 444545929.0, + "step": 17812 + }, + { + "epoch": 1.9561827366571491, + "grad_norm": 2.231398582458496, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.6960911154747009, + "num_tokens": 444573989.0, + "step": 17813 + }, + { + "epoch": 1.956292554359763, + "grad_norm": 2.2147278785705566, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.716569721698761, + "num_tokens": 444599750.0, + "step": 17814 + }, + { + "epoch": 1.9564023720623764, + "grad_norm": 2.3646433353424072, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7240774631500244, + "num_tokens": 444622477.0, + "step": 17815 + }, + { + "epoch": 1.95651218976499, + "grad_norm": 2.324054002761841, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7031192779541016, + "num_tokens": 444647291.0, + "step": 17816 + }, + { + "epoch": 1.9566220074676037, + "grad_norm": 2.182108163833618, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.72325599193573, + "num_tokens": 444672618.0, + "step": 17817 + }, + { + "epoch": 1.9567318251702175, + "grad_norm": 2.3329811096191406, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.729882001876831, + "num_tokens": 444696333.0, + "step": 17818 + }, + { + "epoch": 1.9568416428728312, + "grad_norm": 2.2552666664123535, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7250372171401978, + "num_tokens": 444720559.0, + "step": 17819 + }, + { + "epoch": 1.9569514605754448, + "grad_norm": 2.0993478298187256, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7128004431724548, + "num_tokens": 444751887.0, + "step": 17820 + }, + { + "epoch": 1.9570612782780583, + "grad_norm": 2.5686075687408447, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7537357807159424, + "num_tokens": 444771399.0, + "step": 17821 + }, + { + "epoch": 1.957171095980672, + "grad_norm": 2.469242572784424, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7212201952934265, + "num_tokens": 444792260.0, + "step": 17822 + }, + { + "epoch": 1.9572809136832858, + "grad_norm": 2.5635554790496826, + "learning_rate": 1e-06, + "loss": 0.762, + "mean_token_accuracy": 0.7496106624603271, + "num_tokens": 444811555.0, + "step": 17823 + }, + { + "epoch": 1.9573907313858994, + "grad_norm": 2.5021634101867676, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7443876266479492, + "num_tokens": 444831783.0, + "step": 17824 + }, + { + "epoch": 1.9575005490885131, + "grad_norm": 2.497706890106201, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7206655144691467, + "num_tokens": 444852691.0, + "step": 17825 + }, + { + "epoch": 1.9576103667911267, + "grad_norm": 2.397088050842285, + "learning_rate": 1e-06, + "loss": 0.7693, + "mean_token_accuracy": 0.7602525949478149, + "num_tokens": 444874821.0, + "step": 17826 + }, + { + "epoch": 1.9577201844937404, + "grad_norm": 2.1428561210632324, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.735771119594574, + "num_tokens": 444900823.0, + "step": 17827 + }, + { + "epoch": 1.9578300021963542, + "grad_norm": 2.4863009452819824, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.6971774101257324, + "num_tokens": 444923057.0, + "step": 17828 + }, + { + "epoch": 1.9579398198989677, + "grad_norm": 2.091158866882324, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7063330411911011, + "num_tokens": 444951534.0, + "step": 17829 + }, + { + "epoch": 1.9580496376015812, + "grad_norm": 2.379629135131836, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.734153151512146, + "num_tokens": 444974123.0, + "step": 17830 + }, + { + "epoch": 1.958159455304195, + "grad_norm": 2.3107919692993164, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7040882706642151, + "num_tokens": 445000230.0, + "step": 17831 + }, + { + "epoch": 1.9582692730068088, + "grad_norm": 2.454241991043091, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7338643670082092, + "num_tokens": 445020505.0, + "step": 17832 + }, + { + "epoch": 1.9583790907094225, + "grad_norm": 1.9379327297210693, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7272765636444092, + "num_tokens": 445053322.0, + "step": 17833 + }, + { + "epoch": 1.958488908412036, + "grad_norm": 2.1857309341430664, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7338171005249023, + "num_tokens": 445079692.0, + "step": 17834 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 2.2032225131988525, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7167907953262329, + "num_tokens": 445106955.0, + "step": 17835 + }, + { + "epoch": 1.9587085438172633, + "grad_norm": 2.3702750205993652, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7243070602416992, + "num_tokens": 445131442.0, + "step": 17836 + }, + { + "epoch": 1.958818361519877, + "grad_norm": 2.060901403427124, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7122685313224792, + "num_tokens": 445161399.0, + "step": 17837 + }, + { + "epoch": 1.9589281792224906, + "grad_norm": 2.1430141925811768, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7169672250747681, + "num_tokens": 445188442.0, + "step": 17838 + }, + { + "epoch": 1.9590379969251042, + "grad_norm": 2.1750521659851074, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7270238399505615, + "num_tokens": 445214325.0, + "step": 17839 + }, + { + "epoch": 1.959147814627718, + "grad_norm": 2.086219072341919, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7288551926612854, + "num_tokens": 445239760.0, + "step": 17840 + }, + { + "epoch": 1.9592576323303317, + "grad_norm": 2.070457696914673, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7411139011383057, + "num_tokens": 445267577.0, + "step": 17841 + }, + { + "epoch": 1.9593674500329454, + "grad_norm": 2.294469118118286, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7157145142555237, + "num_tokens": 445293328.0, + "step": 17842 + }, + { + "epoch": 1.959477267735559, + "grad_norm": 2.277635097503662, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.716644287109375, + "num_tokens": 445318600.0, + "step": 17843 + }, + { + "epoch": 1.9595870854381725, + "grad_norm": 2.234482526779175, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7105916738510132, + "num_tokens": 445344993.0, + "step": 17844 + }, + { + "epoch": 1.9596969031407863, + "grad_norm": 2.0754756927490234, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7091694474220276, + "num_tokens": 445373071.0, + "step": 17845 + }, + { + "epoch": 1.9598067208434, + "grad_norm": 2.4329774379730225, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7127547860145569, + "num_tokens": 445394745.0, + "step": 17846 + }, + { + "epoch": 1.9599165385460138, + "grad_norm": 2.6351583003997803, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7251445651054382, + "num_tokens": 445413898.0, + "step": 17847 + }, + { + "epoch": 1.9600263562486273, + "grad_norm": 2.4918103218078613, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7201269865036011, + "num_tokens": 445435587.0, + "step": 17848 + }, + { + "epoch": 1.9601361739512408, + "grad_norm": 2.315674304962158, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7291927337646484, + "num_tokens": 445459373.0, + "step": 17849 + }, + { + "epoch": 1.9602459916538546, + "grad_norm": 2.5340051651000977, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7442729473114014, + "num_tokens": 445479729.0, + "step": 17850 + }, + { + "epoch": 1.9603558093564684, + "grad_norm": 2.4691436290740967, + "learning_rate": 1e-06, + "loss": 0.8221, + "mean_token_accuracy": 0.7421526312828064, + "num_tokens": 445501248.0, + "step": 17851 + }, + { + "epoch": 1.960465627059082, + "grad_norm": 2.030973434448242, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6860157251358032, + "num_tokens": 445531773.0, + "step": 17852 + }, + { + "epoch": 1.9605754447616954, + "grad_norm": 2.3784079551696777, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.71513831615448, + "num_tokens": 445554274.0, + "step": 17853 + }, + { + "epoch": 1.9606852624643092, + "grad_norm": 2.31658673286438, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7155106067657471, + "num_tokens": 445578212.0, + "step": 17854 + }, + { + "epoch": 1.960795080166923, + "grad_norm": 2.3199925422668457, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7148776054382324, + "num_tokens": 445603419.0, + "step": 17855 + }, + { + "epoch": 1.9609048978695367, + "grad_norm": 2.275171995162964, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7211350798606873, + "num_tokens": 445627719.0, + "step": 17856 + }, + { + "epoch": 1.9610147155721502, + "grad_norm": 2.023047685623169, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7030212879180908, + "num_tokens": 445658645.0, + "step": 17857 + }, + { + "epoch": 1.9611245332747638, + "grad_norm": 2.1977508068084717, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7397074103355408, + "num_tokens": 445684974.0, + "step": 17858 + }, + { + "epoch": 1.9612343509773775, + "grad_norm": 2.3333215713500977, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7113219499588013, + "num_tokens": 445711061.0, + "step": 17859 + }, + { + "epoch": 1.9613441686799913, + "grad_norm": 2.5779006481170654, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.719759464263916, + "num_tokens": 445733739.0, + "step": 17860 + }, + { + "epoch": 1.961453986382605, + "grad_norm": 2.278503894805908, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7252815365791321, + "num_tokens": 445757872.0, + "step": 17861 + }, + { + "epoch": 1.9615638040852186, + "grad_norm": 2.093384265899658, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7120819687843323, + "num_tokens": 445786189.0, + "step": 17862 + }, + { + "epoch": 1.9616736217878321, + "grad_norm": 2.264740228652954, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7180933952331543, + "num_tokens": 445812387.0, + "step": 17863 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 2.1416914463043213, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7030438184738159, + "num_tokens": 445841858.0, + "step": 17864 + }, + { + "epoch": 1.9618932571930596, + "grad_norm": 2.3990774154663086, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7064147591590881, + "num_tokens": 445865704.0, + "step": 17865 + }, + { + "epoch": 1.9620030748956732, + "grad_norm": 2.2809436321258545, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7033027410507202, + "num_tokens": 445891679.0, + "step": 17866 + }, + { + "epoch": 1.9621128925982867, + "grad_norm": 2.081958770751953, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7222921848297119, + "num_tokens": 445917872.0, + "step": 17867 + }, + { + "epoch": 1.9622227103009005, + "grad_norm": 2.129943370819092, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7013341784477234, + "num_tokens": 445944881.0, + "step": 17868 + }, + { + "epoch": 1.9623325280035142, + "grad_norm": 2.5654280185699463, + "learning_rate": 1e-06, + "loss": 0.7862, + "mean_token_accuracy": 0.7448102235794067, + "num_tokens": 445964838.0, + "step": 17869 + }, + { + "epoch": 1.962442345706128, + "grad_norm": 1.8895596265792847, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7175247669219971, + "num_tokens": 445997843.0, + "step": 17870 + }, + { + "epoch": 1.9625521634087415, + "grad_norm": 2.3441994190216064, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7235186100006104, + "num_tokens": 446020731.0, + "step": 17871 + }, + { + "epoch": 1.962661981111355, + "grad_norm": 2.1628482341766357, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7311340570449829, + "num_tokens": 446047836.0, + "step": 17872 + }, + { + "epoch": 1.9627717988139688, + "grad_norm": 2.137758493423462, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7274355292320251, + "num_tokens": 446077005.0, + "step": 17873 + }, + { + "epoch": 1.9628816165165826, + "grad_norm": 2.244171380996704, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7069122791290283, + "num_tokens": 446103506.0, + "step": 17874 + }, + { + "epoch": 1.962991434219196, + "grad_norm": 2.348362922668457, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7145452499389648, + "num_tokens": 446129286.0, + "step": 17875 + }, + { + "epoch": 1.9631012519218098, + "grad_norm": 2.898160219192505, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7294067144393921, + "num_tokens": 446147286.0, + "step": 17876 + }, + { + "epoch": 1.9632110696244234, + "grad_norm": 2.62085223197937, + "learning_rate": 1e-06, + "loss": 0.7943, + "mean_token_accuracy": 0.7498539686203003, + "num_tokens": 446167649.0, + "step": 17877 + }, + { + "epoch": 1.9633208873270371, + "grad_norm": 2.464303493499756, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7105405926704407, + "num_tokens": 446192072.0, + "step": 17878 + }, + { + "epoch": 1.963430705029651, + "grad_norm": 2.674166679382324, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7301146388053894, + "num_tokens": 446210503.0, + "step": 17879 + }, + { + "epoch": 1.9635405227322644, + "grad_norm": 2.5246024131774902, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7216634750366211, + "num_tokens": 446232059.0, + "step": 17880 + }, + { + "epoch": 1.963650340434878, + "grad_norm": 2.4017128944396973, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7324700355529785, + "num_tokens": 446255665.0, + "step": 17881 + }, + { + "epoch": 1.9637601581374917, + "grad_norm": 2.2579731941223145, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7236033082008362, + "num_tokens": 446281670.0, + "step": 17882 + }, + { + "epoch": 1.9638699758401055, + "grad_norm": 2.0934951305389404, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.726270318031311, + "num_tokens": 446309393.0, + "step": 17883 + }, + { + "epoch": 1.9639797935427192, + "grad_norm": 2.176659107208252, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7317976355552673, + "num_tokens": 446334306.0, + "step": 17884 + }, + { + "epoch": 1.9640896112453328, + "grad_norm": 2.184386968612671, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6884216070175171, + "num_tokens": 446361531.0, + "step": 17885 + }, + { + "epoch": 1.9641994289479463, + "grad_norm": 2.5040042400360107, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7265910506248474, + "num_tokens": 446382585.0, + "step": 17886 + }, + { + "epoch": 1.96430924665056, + "grad_norm": 2.5483028888702393, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7100759744644165, + "num_tokens": 446403606.0, + "step": 17887 + }, + { + "epoch": 1.9644190643531738, + "grad_norm": 2.4972987174987793, + "learning_rate": 1e-06, + "loss": 0.7907, + "mean_token_accuracy": 0.7491883635520935, + "num_tokens": 446423711.0, + "step": 17888 + }, + { + "epoch": 1.9645288820557874, + "grad_norm": 2.396207094192505, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7214447259902954, + "num_tokens": 446445552.0, + "step": 17889 + }, + { + "epoch": 1.964638699758401, + "grad_norm": 2.480771064758301, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7266260385513306, + "num_tokens": 446466077.0, + "step": 17890 + }, + { + "epoch": 1.9647485174610146, + "grad_norm": 2.554849863052368, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7319584488868713, + "num_tokens": 446488665.0, + "step": 17891 + }, + { + "epoch": 1.9648583351636284, + "grad_norm": 2.4026107788085938, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7231987714767456, + "num_tokens": 446511317.0, + "step": 17892 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 2.353688955307007, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7089117169380188, + "num_tokens": 446535447.0, + "step": 17893 + }, + { + "epoch": 1.9650779705688557, + "grad_norm": 2.345391273498535, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7310782074928284, + "num_tokens": 446558013.0, + "step": 17894 + }, + { + "epoch": 1.9651877882714692, + "grad_norm": 2.1864519119262695, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7180309891700745, + "num_tokens": 446583025.0, + "step": 17895 + }, + { + "epoch": 1.965297605974083, + "grad_norm": 2.4576752185821533, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7302495241165161, + "num_tokens": 446606431.0, + "step": 17896 + }, + { + "epoch": 1.9654074236766967, + "grad_norm": 2.3522591590881348, + "learning_rate": 1e-06, + "loss": 0.8244, + "mean_token_accuracy": 0.7362608313560486, + "num_tokens": 446630872.0, + "step": 17897 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 2.2568466663360596, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7159602642059326, + "num_tokens": 446655922.0, + "step": 17898 + }, + { + "epoch": 1.965627059081924, + "grad_norm": 2.563168525695801, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7477759122848511, + "num_tokens": 446676492.0, + "step": 17899 + }, + { + "epoch": 1.9657368767845376, + "grad_norm": 2.46718168258667, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.70649653673172, + "num_tokens": 446698654.0, + "step": 17900 + }, + { + "epoch": 1.9658466944871513, + "grad_norm": 2.209615707397461, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7248069047927856, + "num_tokens": 446725766.0, + "step": 17901 + }, + { + "epoch": 1.965956512189765, + "grad_norm": 2.221867799758911, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7343374490737915, + "num_tokens": 446751334.0, + "step": 17902 + }, + { + "epoch": 1.9660663298923786, + "grad_norm": 2.3272857666015625, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.72251296043396, + "num_tokens": 446775563.0, + "step": 17903 + }, + { + "epoch": 1.9661761475949922, + "grad_norm": 2.6918089389801025, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7134074568748474, + "num_tokens": 446795185.0, + "step": 17904 + }, + { + "epoch": 1.966285965297606, + "grad_norm": 1.8693437576293945, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6902410387992859, + "num_tokens": 446832448.0, + "step": 17905 + }, + { + "epoch": 1.9663957830002197, + "grad_norm": 2.353961229324341, + "learning_rate": 1e-06, + "loss": 0.7523, + "mean_token_accuracy": 0.7559767961502075, + "num_tokens": 446854268.0, + "step": 17906 + }, + { + "epoch": 1.9665056007028334, + "grad_norm": 2.485853433609009, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.739356279373169, + "num_tokens": 446875036.0, + "step": 17907 + }, + { + "epoch": 1.966615418405447, + "grad_norm": 2.2832295894622803, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7197592258453369, + "num_tokens": 446899751.0, + "step": 17908 + }, + { + "epoch": 1.9667252361080605, + "grad_norm": 2.0794570446014404, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.714181125164032, + "num_tokens": 446927557.0, + "step": 17909 + }, + { + "epoch": 1.9668350538106742, + "grad_norm": 1.9471131563186646, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6907455921173096, + "num_tokens": 446963178.0, + "step": 17910 + }, + { + "epoch": 1.966944871513288, + "grad_norm": 2.2009217739105225, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7424816489219666, + "num_tokens": 446988988.0, + "step": 17911 + }, + { + "epoch": 1.9670546892159018, + "grad_norm": 2.4251885414123535, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7268937826156616, + "num_tokens": 447011785.0, + "step": 17912 + }, + { + "epoch": 1.9671645069185153, + "grad_norm": 2.045227289199829, + "learning_rate": 1e-06, + "loss": 0.7256, + "mean_token_accuracy": 0.7635244727134705, + "num_tokens": 447036366.0, + "step": 17913 + }, + { + "epoch": 1.9672743246211288, + "grad_norm": 2.2470555305480957, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7347152233123779, + "num_tokens": 447061582.0, + "step": 17914 + }, + { + "epoch": 1.9673841423237426, + "grad_norm": 2.289278507232666, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7036974430084229, + "num_tokens": 447085851.0, + "step": 17915 + }, + { + "epoch": 1.9674939600263563, + "grad_norm": 2.1988542079925537, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7223630547523499, + "num_tokens": 447111778.0, + "step": 17916 + }, + { + "epoch": 1.9676037777289699, + "grad_norm": 2.2852349281311035, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7321828603744507, + "num_tokens": 447136615.0, + "step": 17917 + }, + { + "epoch": 1.9677135954315834, + "grad_norm": 2.164548873901367, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7080050110816956, + "num_tokens": 447164584.0, + "step": 17918 + }, + { + "epoch": 1.9678234131341972, + "grad_norm": 2.2724785804748535, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7392181158065796, + "num_tokens": 447187784.0, + "step": 17919 + }, + { + "epoch": 1.967933230836811, + "grad_norm": 2.5200042724609375, + "learning_rate": 1e-06, + "loss": 0.7873, + "mean_token_accuracy": 0.747266411781311, + "num_tokens": 447208193.0, + "step": 17920 + }, + { + "epoch": 1.9680430485394247, + "grad_norm": 2.0864317417144775, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7642276287078857, + "num_tokens": 447233618.0, + "step": 17921 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 2.5371205806732178, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7505212426185608, + "num_tokens": 447253179.0, + "step": 17922 + }, + { + "epoch": 1.9682626839446518, + "grad_norm": 2.012890100479126, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7171192169189453, + "num_tokens": 447282473.0, + "step": 17923 + }, + { + "epoch": 1.9683725016472655, + "grad_norm": 2.242680311203003, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7259107232093811, + "num_tokens": 447310034.0, + "step": 17924 + }, + { + "epoch": 1.9684823193498793, + "grad_norm": 2.369680881500244, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7487620711326599, + "num_tokens": 447331139.0, + "step": 17925 + }, + { + "epoch": 1.9685921370524928, + "grad_norm": 2.2984986305236816, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7441769242286682, + "num_tokens": 447354453.0, + "step": 17926 + }, + { + "epoch": 1.9687019547551066, + "grad_norm": 2.183762311935425, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6900984048843384, + "num_tokens": 447383508.0, + "step": 17927 + }, + { + "epoch": 1.96881177245772, + "grad_norm": 2.249610424041748, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7258517146110535, + "num_tokens": 447408463.0, + "step": 17928 + }, + { + "epoch": 1.9689215901603339, + "grad_norm": 1.9641388654708862, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7031674385070801, + "num_tokens": 447440518.0, + "step": 17929 + }, + { + "epoch": 1.9690314078629476, + "grad_norm": 2.573895215988159, + "learning_rate": 1e-06, + "loss": 0.7593, + "mean_token_accuracy": 0.7572951912879944, + "num_tokens": 447460564.0, + "step": 17930 + }, + { + "epoch": 1.9691412255655611, + "grad_norm": 2.441851854324341, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7281261682510376, + "num_tokens": 447482494.0, + "step": 17931 + }, + { + "epoch": 1.9692510432681747, + "grad_norm": 2.2950356006622314, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7030719518661499, + "num_tokens": 447508530.0, + "step": 17932 + }, + { + "epoch": 1.9693608609707884, + "grad_norm": 2.737499713897705, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7434272170066833, + "num_tokens": 447528709.0, + "step": 17933 + }, + { + "epoch": 1.9694706786734022, + "grad_norm": 2.1139042377471924, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7128223776817322, + "num_tokens": 447555007.0, + "step": 17934 + }, + { + "epoch": 1.969580496376016, + "grad_norm": 2.2584803104400635, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7094775438308716, + "num_tokens": 447580384.0, + "step": 17935 + }, + { + "epoch": 1.9696903140786295, + "grad_norm": 2.0869462490081787, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7200092673301697, + "num_tokens": 447609181.0, + "step": 17936 + }, + { + "epoch": 1.969800131781243, + "grad_norm": 2.1204965114593506, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.727120578289032, + "num_tokens": 447637002.0, + "step": 17937 + }, + { + "epoch": 1.9699099494838568, + "grad_norm": 2.361323118209839, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7243454456329346, + "num_tokens": 447660604.0, + "step": 17938 + }, + { + "epoch": 1.9700197671864705, + "grad_norm": 2.66241455078125, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7375041246414185, + "num_tokens": 447679678.0, + "step": 17939 + }, + { + "epoch": 1.970129584889084, + "grad_norm": 2.4908127784729004, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7416468858718872, + "num_tokens": 447700878.0, + "step": 17940 + }, + { + "epoch": 1.9702394025916978, + "grad_norm": 2.1526267528533936, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7278974056243896, + "num_tokens": 447727908.0, + "step": 17941 + }, + { + "epoch": 1.9703492202943114, + "grad_norm": 2.3301546573638916, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7492403388023376, + "num_tokens": 447749567.0, + "step": 17942 + }, + { + "epoch": 1.9704590379969251, + "grad_norm": 2.333292245864868, + "learning_rate": 1e-06, + "loss": 0.8084, + "mean_token_accuracy": 0.742202639579773, + "num_tokens": 447772259.0, + "step": 17943 + }, + { + "epoch": 1.9705688556995389, + "grad_norm": 2.3338029384613037, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7291619777679443, + "num_tokens": 447797095.0, + "step": 17944 + }, + { + "epoch": 1.9706786734021524, + "grad_norm": 2.1113216876983643, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7214304804801941, + "num_tokens": 447825978.0, + "step": 17945 + }, + { + "epoch": 1.970788491104766, + "grad_norm": 2.4274561405181885, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7168967723846436, + "num_tokens": 447848828.0, + "step": 17946 + }, + { + "epoch": 1.9708983088073797, + "grad_norm": 2.137864589691162, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7085424661636353, + "num_tokens": 447877346.0, + "step": 17947 + }, + { + "epoch": 1.9710081265099935, + "grad_norm": 2.240297317504883, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7200324535369873, + "num_tokens": 447903079.0, + "step": 17948 + }, + { + "epoch": 1.9711179442126072, + "grad_norm": 1.9619691371917725, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.705767035484314, + "num_tokens": 447934862.0, + "step": 17949 + }, + { + "epoch": 1.9712277619152208, + "grad_norm": 2.1553680896759033, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7090426683425903, + "num_tokens": 447961543.0, + "step": 17950 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 2.394714832305908, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.735702395439148, + "num_tokens": 447982696.0, + "step": 17951 + }, + { + "epoch": 1.971447397320448, + "grad_norm": 2.338912010192871, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7100105881690979, + "num_tokens": 448005639.0, + "step": 17952 + }, + { + "epoch": 1.9715572150230618, + "grad_norm": 2.1365177631378174, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7208997011184692, + "num_tokens": 448032810.0, + "step": 17953 + }, + { + "epoch": 1.9716670327256753, + "grad_norm": 2.1052162647247314, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.721125066280365, + "num_tokens": 448059632.0, + "step": 17954 + }, + { + "epoch": 1.9717768504282889, + "grad_norm": 2.2176408767700195, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7239344120025635, + "num_tokens": 448085330.0, + "step": 17955 + }, + { + "epoch": 1.9718866681309026, + "grad_norm": 2.1451010704040527, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7158453464508057, + "num_tokens": 448114721.0, + "step": 17956 + }, + { + "epoch": 1.9719964858335164, + "grad_norm": 2.1592228412628174, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7127213478088379, + "num_tokens": 448140251.0, + "step": 17957 + }, + { + "epoch": 1.9721063035361301, + "grad_norm": 2.1637072563171387, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7060748338699341, + "num_tokens": 448167121.0, + "step": 17958 + }, + { + "epoch": 1.9722161212387437, + "grad_norm": 2.257549285888672, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.715895414352417, + "num_tokens": 448193102.0, + "step": 17959 + }, + { + "epoch": 1.9723259389413572, + "grad_norm": 2.4340314865112305, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6916351914405823, + "num_tokens": 448218078.0, + "step": 17960 + }, + { + "epoch": 1.972435756643971, + "grad_norm": 2.2167487144470215, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7239567637443542, + "num_tokens": 448245058.0, + "step": 17961 + }, + { + "epoch": 1.9725455743465847, + "grad_norm": 2.131497383117676, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7117434740066528, + "num_tokens": 448273876.0, + "step": 17962 + }, + { + "epoch": 1.9726553920491985, + "grad_norm": 2.1333329677581787, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7269599437713623, + "num_tokens": 448300652.0, + "step": 17963 + }, + { + "epoch": 1.972765209751812, + "grad_norm": 2.1875221729278564, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7143243551254272, + "num_tokens": 448325032.0, + "step": 17964 + }, + { + "epoch": 1.9728750274544256, + "grad_norm": 2.2210538387298584, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7143591046333313, + "num_tokens": 448351571.0, + "step": 17965 + }, + { + "epoch": 1.9729848451570393, + "grad_norm": 2.074970006942749, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7067707777023315, + "num_tokens": 448379737.0, + "step": 17966 + }, + { + "epoch": 1.973094662859653, + "grad_norm": 2.4616641998291016, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7176873683929443, + "num_tokens": 448401809.0, + "step": 17967 + }, + { + "epoch": 1.9732044805622666, + "grad_norm": 1.9516974687576294, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7216781377792358, + "num_tokens": 448433062.0, + "step": 17968 + }, + { + "epoch": 1.9733142982648801, + "grad_norm": 2.3336117267608643, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7288030982017517, + "num_tokens": 448457201.0, + "step": 17969 + }, + { + "epoch": 1.973424115967494, + "grad_norm": 2.538466215133667, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7186663150787354, + "num_tokens": 448478356.0, + "step": 17970 + }, + { + "epoch": 1.9735339336701077, + "grad_norm": 2.1756715774536133, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7404696941375732, + "num_tokens": 448505448.0, + "step": 17971 + }, + { + "epoch": 1.9736437513727214, + "grad_norm": 2.32391619682312, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7227509617805481, + "num_tokens": 448529681.0, + "step": 17972 + }, + { + "epoch": 1.973753569075335, + "grad_norm": 2.4799485206604004, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7219152450561523, + "num_tokens": 448550175.0, + "step": 17973 + }, + { + "epoch": 1.9738633867779485, + "grad_norm": 2.0082359313964844, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7468165159225464, + "num_tokens": 448577405.0, + "step": 17974 + }, + { + "epoch": 1.9739732044805622, + "grad_norm": 2.373732089996338, + "learning_rate": 1e-06, + "loss": 0.8088, + "mean_token_accuracy": 0.7448851466178894, + "num_tokens": 448599810.0, + "step": 17975 + }, + { + "epoch": 1.974083022183176, + "grad_norm": 2.2168266773223877, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7198564410209656, + "num_tokens": 448624980.0, + "step": 17976 + }, + { + "epoch": 1.9741928398857898, + "grad_norm": 2.5687735080718994, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7108572721481323, + "num_tokens": 448646405.0, + "step": 17977 + }, + { + "epoch": 1.9743026575884033, + "grad_norm": 2.4286787509918213, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7392488121986389, + "num_tokens": 448668635.0, + "step": 17978 + }, + { + "epoch": 1.9744124752910168, + "grad_norm": 2.4646191596984863, + "learning_rate": 1e-06, + "loss": 0.7679, + "mean_token_accuracy": 0.7536198496818542, + "num_tokens": 448688928.0, + "step": 17979 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 2.195899724960327, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7133150696754456, + "num_tokens": 448715537.0, + "step": 17980 + }, + { + "epoch": 1.9746321106962443, + "grad_norm": 2.2376742362976074, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.710769534111023, + "num_tokens": 448741417.0, + "step": 17981 + }, + { + "epoch": 1.9747419283988579, + "grad_norm": 2.4295144081115723, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7370445132255554, + "num_tokens": 448763553.0, + "step": 17982 + }, + { + "epoch": 1.9748517461014714, + "grad_norm": 2.3795742988586426, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7413338422775269, + "num_tokens": 448785402.0, + "step": 17983 + }, + { + "epoch": 1.9749615638040852, + "grad_norm": 2.206361770629883, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6936573386192322, + "num_tokens": 448811646.0, + "step": 17984 + }, + { + "epoch": 1.975071381506699, + "grad_norm": 2.140453577041626, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7269984483718872, + "num_tokens": 448838054.0, + "step": 17985 + }, + { + "epoch": 1.9751811992093127, + "grad_norm": 2.27072811126709, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.718114972114563, + "num_tokens": 448862868.0, + "step": 17986 + }, + { + "epoch": 1.9752910169119262, + "grad_norm": 2.036618947982788, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.6992321014404297, + "num_tokens": 448891472.0, + "step": 17987 + }, + { + "epoch": 1.9754008346145397, + "grad_norm": 2.1051084995269775, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7241859436035156, + "num_tokens": 448917058.0, + "step": 17988 + }, + { + "epoch": 1.9755106523171535, + "grad_norm": 2.4884033203125, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7427792549133301, + "num_tokens": 448936927.0, + "step": 17989 + }, + { + "epoch": 1.9756204700197673, + "grad_norm": 2.5801327228546143, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7422071099281311, + "num_tokens": 448956821.0, + "step": 17990 + }, + { + "epoch": 1.9757302877223808, + "grad_norm": 2.4969215393066406, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7200382947921753, + "num_tokens": 448978349.0, + "step": 17991 + }, + { + "epoch": 1.9758401054249946, + "grad_norm": 2.4406304359436035, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7284018993377686, + "num_tokens": 449000290.0, + "step": 17992 + }, + { + "epoch": 1.975949923127608, + "grad_norm": 2.6060702800750732, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7390267252922058, + "num_tokens": 449019038.0, + "step": 17993 + }, + { + "epoch": 1.9760597408302218, + "grad_norm": 2.256483554840088, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.70556640625, + "num_tokens": 449045480.0, + "step": 17994 + }, + { + "epoch": 1.9761695585328356, + "grad_norm": 2.419743537902832, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7130903601646423, + "num_tokens": 449068408.0, + "step": 17995 + }, + { + "epoch": 1.9762793762354491, + "grad_norm": 2.1143321990966797, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7335055470466614, + "num_tokens": 449096434.0, + "step": 17996 + }, + { + "epoch": 1.9763891939380627, + "grad_norm": 2.2583746910095215, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.732873260974884, + "num_tokens": 449120290.0, + "step": 17997 + }, + { + "epoch": 1.9764990116406764, + "grad_norm": 2.171152353286743, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7104941606521606, + "num_tokens": 449147949.0, + "step": 17998 + }, + { + "epoch": 1.9766088293432902, + "grad_norm": 2.1690714359283447, + "learning_rate": 1e-06, + "loss": 0.8356, + "mean_token_accuracy": 0.7334901094436646, + "num_tokens": 449174200.0, + "step": 17999 + }, + { + "epoch": 1.976718647045904, + "grad_norm": 2.5856852531433105, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7230883836746216, + "num_tokens": 449194100.0, + "step": 18000 + }, + { + "epoch": 1.9768284647485175, + "grad_norm": 2.4050850868225098, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.705659031867981, + "num_tokens": 449218220.0, + "step": 18001 + }, + { + "epoch": 1.976938282451131, + "grad_norm": 2.25020694732666, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7301015257835388, + "num_tokens": 449241806.0, + "step": 18002 + }, + { + "epoch": 1.9770481001537448, + "grad_norm": 2.29276704788208, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7157902717590332, + "num_tokens": 449265063.0, + "step": 18003 + }, + { + "epoch": 1.9771579178563585, + "grad_norm": 2.255260944366455, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7037919759750366, + "num_tokens": 449291848.0, + "step": 18004 + }, + { + "epoch": 1.977267735558972, + "grad_norm": 2.467381238937378, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7448447942733765, + "num_tokens": 449314851.0, + "step": 18005 + }, + { + "epoch": 1.9773775532615858, + "grad_norm": 2.7426259517669678, + "learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7429485321044922, + "num_tokens": 449334825.0, + "step": 18006 + }, + { + "epoch": 1.9774873709641994, + "grad_norm": 2.3760313987731934, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7405954003334045, + "num_tokens": 449359142.0, + "step": 18007 + }, + { + "epoch": 1.977597188666813, + "grad_norm": 2.053924560546875, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6876287460327148, + "num_tokens": 449388817.0, + "step": 18008 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 2.1635751724243164, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6907666921615601, + "num_tokens": 449418281.0, + "step": 18009 + }, + { + "epoch": 1.9778168240720404, + "grad_norm": 2.4249074459075928, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7265962362289429, + "num_tokens": 449440420.0, + "step": 18010 + }, + { + "epoch": 1.977926641774654, + "grad_norm": 2.267178535461426, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7138279676437378, + "num_tokens": 449466059.0, + "step": 18011 + }, + { + "epoch": 1.9780364594772677, + "grad_norm": 2.3953588008880615, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7215611934661865, + "num_tokens": 449489026.0, + "step": 18012 + }, + { + "epoch": 1.9781462771798815, + "grad_norm": 2.227764368057251, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7303946018218994, + "num_tokens": 449512097.0, + "step": 18013 + }, + { + "epoch": 1.9782560948824952, + "grad_norm": 2.3687474727630615, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7122682332992554, + "num_tokens": 449533643.0, + "step": 18014 + }, + { + "epoch": 1.9783659125851087, + "grad_norm": 2.307152032852173, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7109332084655762, + "num_tokens": 449559677.0, + "step": 18015 + }, + { + "epoch": 1.9784757302877223, + "grad_norm": 2.5925562381744385, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7246785759925842, + "num_tokens": 449579295.0, + "step": 18016 + }, + { + "epoch": 1.978585547990336, + "grad_norm": 2.3700263500213623, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7069297432899475, + "num_tokens": 449605009.0, + "step": 18017 + }, + { + "epoch": 1.9786953656929498, + "grad_norm": 2.0970191955566406, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 0.7256039381027222, + "num_tokens": 449631446.0, + "step": 18018 + }, + { + "epoch": 1.9788051833955633, + "grad_norm": 2.300633668899536, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7100788354873657, + "num_tokens": 449656130.0, + "step": 18019 + }, + { + "epoch": 1.9789150010981769, + "grad_norm": 2.415278196334839, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7523229122161865, + "num_tokens": 449677535.0, + "step": 18020 + }, + { + "epoch": 1.9790248188007906, + "grad_norm": 2.05757212638855, + "learning_rate": 1e-06, + "loss": 0.809, + "mean_token_accuracy": 0.7408881187438965, + "num_tokens": 449704360.0, + "step": 18021 + }, + { + "epoch": 1.9791346365034044, + "grad_norm": 2.2481038570404053, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7130404710769653, + "num_tokens": 449729916.0, + "step": 18022 + }, + { + "epoch": 1.9792444542060181, + "grad_norm": 2.22715425491333, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7229616045951843, + "num_tokens": 449754058.0, + "step": 18023 + }, + { + "epoch": 1.9793542719086317, + "grad_norm": 2.6863536834716797, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7265527248382568, + "num_tokens": 449773847.0, + "step": 18024 + }, + { + "epoch": 1.9794640896112452, + "grad_norm": 2.3432602882385254, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7226488590240479, + "num_tokens": 449796992.0, + "step": 18025 + }, + { + "epoch": 1.979573907313859, + "grad_norm": 2.2613723278045654, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7010630369186401, + "num_tokens": 449822328.0, + "step": 18026 + }, + { + "epoch": 1.9796837250164727, + "grad_norm": 2.437098264694214, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7305720448493958, + "num_tokens": 449844990.0, + "step": 18027 + }, + { + "epoch": 1.9797935427190865, + "grad_norm": 2.172865390777588, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7217276096343994, + "num_tokens": 449871380.0, + "step": 18028 + }, + { + "epoch": 1.9799033604217, + "grad_norm": 2.460169553756714, + "learning_rate": 1e-06, + "loss": 0.7459, + "mean_token_accuracy": 0.7616143226623535, + "num_tokens": 449891796.0, + "step": 18029 + }, + { + "epoch": 1.9800131781243135, + "grad_norm": 2.1534297466278076, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7498814463615417, + "num_tokens": 449917808.0, + "step": 18030 + }, + { + "epoch": 1.9801229958269273, + "grad_norm": 2.609790802001953, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7377134561538696, + "num_tokens": 449939179.0, + "step": 18031 + }, + { + "epoch": 1.980232813529541, + "grad_norm": 2.222271680831909, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7205343246459961, + "num_tokens": 449964481.0, + "step": 18032 + }, + { + "epoch": 1.9803426312321546, + "grad_norm": 2.3262925148010254, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7263126373291016, + "num_tokens": 449987402.0, + "step": 18033 + }, + { + "epoch": 1.9804524489347681, + "grad_norm": 2.3416061401367188, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7381973266601562, + "num_tokens": 450010502.0, + "step": 18034 + }, + { + "epoch": 1.9805622666373819, + "grad_norm": 2.431913137435913, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7109683752059937, + "num_tokens": 450032803.0, + "step": 18035 + }, + { + "epoch": 1.9806720843399956, + "grad_norm": 2.3138253688812256, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7041430473327637, + "num_tokens": 450059558.0, + "step": 18036 + }, + { + "epoch": 1.9807819020426094, + "grad_norm": 2.3466081619262695, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7170382142066956, + "num_tokens": 450083874.0, + "step": 18037 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 2.3553104400634766, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7136943936347961, + "num_tokens": 450108568.0, + "step": 18038 + }, + { + "epoch": 1.9810015374478365, + "grad_norm": 2.3886969089508057, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7234207391738892, + "num_tokens": 450131638.0, + "step": 18039 + }, + { + "epoch": 1.9811113551504502, + "grad_norm": 2.3230814933776855, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7181806564331055, + "num_tokens": 450155309.0, + "step": 18040 + }, + { + "epoch": 1.981221172853064, + "grad_norm": 2.2738096714019775, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7134478092193604, + "num_tokens": 450180781.0, + "step": 18041 + }, + { + "epoch": 1.9813309905556777, + "grad_norm": 2.388685941696167, + "learning_rate": 1e-06, + "loss": 0.7968, + "mean_token_accuracy": 0.7454591989517212, + "num_tokens": 450202959.0, + "step": 18042 + }, + { + "epoch": 1.9814408082582913, + "grad_norm": 2.3197884559631348, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7406189441680908, + "num_tokens": 450227777.0, + "step": 18043 + }, + { + "epoch": 1.9815506259609048, + "grad_norm": 2.5825369358062744, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7134000062942505, + "num_tokens": 450248849.0, + "step": 18044 + }, + { + "epoch": 1.9816604436635186, + "grad_norm": 2.430647373199463, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7161348462104797, + "num_tokens": 450273456.0, + "step": 18045 + }, + { + "epoch": 1.9817702613661323, + "grad_norm": 2.13682222366333, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.712088942527771, + "num_tokens": 450300413.0, + "step": 18046 + }, + { + "epoch": 1.9818800790687459, + "grad_norm": 2.2283945083618164, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7420852184295654, + "num_tokens": 450325246.0, + "step": 18047 + }, + { + "epoch": 1.9819898967713594, + "grad_norm": 2.1798956394195557, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7229351997375488, + "num_tokens": 450350236.0, + "step": 18048 + }, + { + "epoch": 1.9820997144739732, + "grad_norm": 2.0452823638916016, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6942377686500549, + "num_tokens": 450381129.0, + "step": 18049 + }, + { + "epoch": 1.982209532176587, + "grad_norm": 2.4496912956237793, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7371785640716553, + "num_tokens": 450402670.0, + "step": 18050 + }, + { + "epoch": 1.9823193498792007, + "grad_norm": 2.079057455062866, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7128539085388184, + "num_tokens": 450431967.0, + "step": 18051 + }, + { + "epoch": 1.9824291675818142, + "grad_norm": 2.4162673950195312, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.716386616230011, + "num_tokens": 450454836.0, + "step": 18052 + }, + { + "epoch": 1.9825389852844277, + "grad_norm": 2.3769733905792236, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7301183938980103, + "num_tokens": 450477277.0, + "step": 18053 + }, + { + "epoch": 1.9826488029870415, + "grad_norm": 2.243180274963379, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7351850271224976, + "num_tokens": 450501462.0, + "step": 18054 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 2.11043643951416, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7064814567565918, + "num_tokens": 450530558.0, + "step": 18055 + }, + { + "epoch": 1.9828684383922688, + "grad_norm": 2.1814661026000977, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7207924127578735, + "num_tokens": 450556904.0, + "step": 18056 + }, + { + "epoch": 1.9829782560948825, + "grad_norm": 2.3630998134613037, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7071395516395569, + "num_tokens": 450581711.0, + "step": 18057 + }, + { + "epoch": 1.983088073797496, + "grad_norm": 2.183863639831543, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7364010810852051, + "num_tokens": 450607405.0, + "step": 18058 + }, + { + "epoch": 1.9831978915001098, + "grad_norm": 2.1981594562530518, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7076332569122314, + "num_tokens": 450635144.0, + "step": 18059 + }, + { + "epoch": 1.9833077092027236, + "grad_norm": 2.4341912269592285, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7167406678199768, + "num_tokens": 450657204.0, + "step": 18060 + }, + { + "epoch": 1.9834175269053371, + "grad_norm": 1.949718713760376, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.713355302810669, + "num_tokens": 450690459.0, + "step": 18061 + }, + { + "epoch": 1.9835273446079507, + "grad_norm": 2.052119255065918, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7217170596122742, + "num_tokens": 450722376.0, + "step": 18062 + }, + { + "epoch": 1.9836371623105644, + "grad_norm": 2.6269097328186035, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7136459946632385, + "num_tokens": 450743538.0, + "step": 18063 + }, + { + "epoch": 1.9837469800131782, + "grad_norm": 2.325552225112915, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7352274656295776, + "num_tokens": 450767323.0, + "step": 18064 + }, + { + "epoch": 1.983856797715792, + "grad_norm": 2.1389286518096924, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7204864025115967, + "num_tokens": 450793928.0, + "step": 18065 + }, + { + "epoch": 1.9839666154184055, + "grad_norm": 2.431835651397705, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7370541095733643, + "num_tokens": 450814747.0, + "step": 18066 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 2.2848105430603027, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7272015810012817, + "num_tokens": 450837537.0, + "step": 18067 + }, + { + "epoch": 1.9841862508236328, + "grad_norm": 2.139692544937134, + "learning_rate": 1e-06, + "loss": 0.7602, + "mean_token_accuracy": 0.7643154859542847, + "num_tokens": 450860629.0, + "step": 18068 + }, + { + "epoch": 1.9842960685262465, + "grad_norm": 2.748631477355957, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7391053438186646, + "num_tokens": 450879570.0, + "step": 18069 + }, + { + "epoch": 1.98440588622886, + "grad_norm": 2.1875030994415283, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7424381971359253, + "num_tokens": 450904641.0, + "step": 18070 + }, + { + "epoch": 1.9845157039314738, + "grad_norm": 2.334939479827881, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.7048804759979248, + "num_tokens": 450930216.0, + "step": 18071 + }, + { + "epoch": 1.9846255216340873, + "grad_norm": 1.8921335935592651, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7009292840957642, + "num_tokens": 450964557.0, + "step": 18072 + }, + { + "epoch": 1.984735339336701, + "grad_norm": 2.4829304218292236, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7227855920791626, + "num_tokens": 450987789.0, + "step": 18073 + }, + { + "epoch": 1.9848451570393149, + "grad_norm": 2.170596122741699, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7218508124351501, + "num_tokens": 451014293.0, + "step": 18074 + }, + { + "epoch": 1.9849549747419284, + "grad_norm": 2.5869624614715576, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7338154315948486, + "num_tokens": 451033408.0, + "step": 18075 + }, + { + "epoch": 1.985064792444542, + "grad_norm": 2.856987714767456, + "learning_rate": 1e-06, + "loss": 0.7703, + "mean_token_accuracy": 0.7609789967536926, + "num_tokens": 451049713.0, + "step": 18076 + }, + { + "epoch": 1.9851746101471557, + "grad_norm": 2.2382330894470215, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7330454587936401, + "num_tokens": 451074858.0, + "step": 18077 + }, + { + "epoch": 1.9852844278497694, + "grad_norm": 2.3373963832855225, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7201515436172485, + "num_tokens": 451099393.0, + "step": 18078 + }, + { + "epoch": 1.9853942455523832, + "grad_norm": 2.3276946544647217, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7211238145828247, + "num_tokens": 451123555.0, + "step": 18079 + }, + { + "epoch": 1.9855040632549967, + "grad_norm": 2.2993502616882324, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7412101030349731, + "num_tokens": 451147232.0, + "step": 18080 + }, + { + "epoch": 1.9856138809576103, + "grad_norm": 2.3272578716278076, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7149330973625183, + "num_tokens": 451174751.0, + "step": 18081 + }, + { + "epoch": 1.985723698660224, + "grad_norm": 2.499504566192627, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7076179385185242, + "num_tokens": 451198032.0, + "step": 18082 + }, + { + "epoch": 1.9858335163628378, + "grad_norm": 2.099102020263672, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7118748426437378, + "num_tokens": 451228293.0, + "step": 18083 + }, + { + "epoch": 1.9859433340654513, + "grad_norm": 2.246891736984253, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7383007407188416, + "num_tokens": 451252366.0, + "step": 18084 + }, + { + "epoch": 1.9860531517680649, + "grad_norm": 2.075765371322632, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.70790696144104, + "num_tokens": 451281579.0, + "step": 18085 + }, + { + "epoch": 1.9861629694706786, + "grad_norm": 2.2455501556396484, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7359562516212463, + "num_tokens": 451305676.0, + "step": 18086 + }, + { + "epoch": 1.9862727871732924, + "grad_norm": 2.4438183307647705, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7488219738006592, + "num_tokens": 451328492.0, + "step": 18087 + }, + { + "epoch": 1.9863826048759061, + "grad_norm": 2.2005090713500977, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6946504712104797, + "num_tokens": 451354005.0, + "step": 18088 + }, + { + "epoch": 1.9864924225785197, + "grad_norm": 2.369943141937256, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.7272248268127441, + "num_tokens": 451376757.0, + "step": 18089 + }, + { + "epoch": 1.9866022402811332, + "grad_norm": 2.4465887546539307, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7270739078521729, + "num_tokens": 451399021.0, + "step": 18090 + }, + { + "epoch": 1.986712057983747, + "grad_norm": 1.9185618162155151, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6930960416793823, + "num_tokens": 451431196.0, + "step": 18091 + }, + { + "epoch": 1.9868218756863607, + "grad_norm": 2.272023916244507, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7407705187797546, + "num_tokens": 451455632.0, + "step": 18092 + }, + { + "epoch": 1.9869316933889745, + "grad_norm": 2.218853235244751, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7222397327423096, + "num_tokens": 451482003.0, + "step": 18093 + }, + { + "epoch": 1.987041511091588, + "grad_norm": 2.007077217102051, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7124568223953247, + "num_tokens": 451513382.0, + "step": 18094 + }, + { + "epoch": 1.9871513287942015, + "grad_norm": 2.297497272491455, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7178716659545898, + "num_tokens": 451538386.0, + "step": 18095 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 2.393427610397339, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7115253210067749, + "num_tokens": 451560363.0, + "step": 18096 + }, + { + "epoch": 1.987370964199429, + "grad_norm": 2.494779586791992, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7274134159088135, + "num_tokens": 451583230.0, + "step": 18097 + }, + { + "epoch": 1.9874807819020426, + "grad_norm": 2.3900768756866455, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7116848230361938, + "num_tokens": 451605865.0, + "step": 18098 + }, + { + "epoch": 1.9875905996046561, + "grad_norm": 2.1967504024505615, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7242547869682312, + "num_tokens": 451630524.0, + "step": 18099 + }, + { + "epoch": 1.9877004173072699, + "grad_norm": 2.221613883972168, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7162202596664429, + "num_tokens": 451658365.0, + "step": 18100 + }, + { + "epoch": 1.9878102350098836, + "grad_norm": 2.2531070709228516, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7180790901184082, + "num_tokens": 451684773.0, + "step": 18101 + }, + { + "epoch": 1.9879200527124974, + "grad_norm": 2.3569743633270264, + "learning_rate": 1e-06, + "loss": 0.7163, + "mean_token_accuracy": 0.7610329389572144, + "num_tokens": 451706496.0, + "step": 18102 + }, + { + "epoch": 1.988029870415111, + "grad_norm": 2.0859756469726562, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7170388698577881, + "num_tokens": 451736854.0, + "step": 18103 + }, + { + "epoch": 1.9881396881177245, + "grad_norm": 2.2222087383270264, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7373222708702087, + "num_tokens": 451761773.0, + "step": 18104 + }, + { + "epoch": 1.9882495058203382, + "grad_norm": 2.2446486949920654, + "learning_rate": 1e-06, + "loss": 0.7793, + "mean_token_accuracy": 0.7574173808097839, + "num_tokens": 451786412.0, + "step": 18105 + }, + { + "epoch": 1.988359323522952, + "grad_norm": 2.4137330055236816, + "learning_rate": 1e-06, + "loss": 0.8046, + "mean_token_accuracy": 0.7539938688278198, + "num_tokens": 451808179.0, + "step": 18106 + }, + { + "epoch": 1.9884691412255655, + "grad_norm": 2.612119674682617, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7076596021652222, + "num_tokens": 451829557.0, + "step": 18107 + }, + { + "epoch": 1.9885789589281793, + "grad_norm": 2.1928164958953857, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7334586381912231, + "num_tokens": 451854476.0, + "step": 18108 + }, + { + "epoch": 1.9886887766307928, + "grad_norm": 2.7695436477661133, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7401938438415527, + "num_tokens": 451870179.0, + "step": 18109 + }, + { + "epoch": 1.9887985943334066, + "grad_norm": 2.2421271800994873, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7253528833389282, + "num_tokens": 451901784.0, + "step": 18110 + }, + { + "epoch": 1.9889084120360203, + "grad_norm": 2.4264187812805176, + "learning_rate": 1e-06, + "loss": 0.7521, + "mean_token_accuracy": 0.7535057067871094, + "num_tokens": 451921749.0, + "step": 18111 + }, + { + "epoch": 1.9890182297386338, + "grad_norm": 2.150806188583374, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7058162093162537, + "num_tokens": 451947971.0, + "step": 18112 + }, + { + "epoch": 1.9891280474412474, + "grad_norm": 2.0373103618621826, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7225596904754639, + "num_tokens": 451977182.0, + "step": 18113 + }, + { + "epoch": 1.9892378651438611, + "grad_norm": 2.2510859966278076, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7125576138496399, + "num_tokens": 452000374.0, + "step": 18114 + }, + { + "epoch": 1.989347682846475, + "grad_norm": 2.5198752880096436, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7239639163017273, + "num_tokens": 452021364.0, + "step": 18115 + }, + { + "epoch": 1.9894575005490887, + "grad_norm": 2.453807830810547, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7258867025375366, + "num_tokens": 452043516.0, + "step": 18116 + }, + { + "epoch": 1.9895673182517022, + "grad_norm": 1.8817280530929565, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7038326263427734, + "num_tokens": 452078071.0, + "step": 18117 + }, + { + "epoch": 1.9896771359543157, + "grad_norm": 2.7985353469848633, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7440019249916077, + "num_tokens": 452094615.0, + "step": 18118 + }, + { + "epoch": 1.9897869536569295, + "grad_norm": 2.243032693862915, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7338161468505859, + "num_tokens": 452119341.0, + "step": 18119 + }, + { + "epoch": 1.9898967713595432, + "grad_norm": 2.0159032344818115, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7002527713775635, + "num_tokens": 452149107.0, + "step": 18120 + }, + { + "epoch": 1.9900065890621568, + "grad_norm": 2.4166030883789062, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.731853723526001, + "num_tokens": 452169320.0, + "step": 18121 + }, + { + "epoch": 1.9901164067647705, + "grad_norm": 2.175046920776367, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7168732285499573, + "num_tokens": 452196167.0, + "step": 18122 + }, + { + "epoch": 1.990226224467384, + "grad_norm": 2.2081658840179443, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7324899435043335, + "num_tokens": 452222073.0, + "step": 18123 + }, + { + "epoch": 1.9903360421699978, + "grad_norm": 1.935715913772583, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7233655452728271, + "num_tokens": 452252976.0, + "step": 18124 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 2.3822758197784424, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7372235059738159, + "num_tokens": 452274978.0, + "step": 18125 + }, + { + "epoch": 1.9905556775752251, + "grad_norm": 2.246019124984741, + "learning_rate": 1e-06, + "loss": 0.8149, + "mean_token_accuracy": 0.7446852922439575, + "num_tokens": 452298359.0, + "step": 18126 + }, + { + "epoch": 1.9906654952778386, + "grad_norm": 2.2263529300689697, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7132717370986938, + "num_tokens": 452324474.0, + "step": 18127 + }, + { + "epoch": 1.9907753129804524, + "grad_norm": 2.3041558265686035, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7154650688171387, + "num_tokens": 452348309.0, + "step": 18128 + }, + { + "epoch": 1.9908851306830662, + "grad_norm": 2.160681962966919, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7157403230667114, + "num_tokens": 452377222.0, + "step": 18129 + }, + { + "epoch": 1.99099494838568, + "grad_norm": 2.3309755325317383, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7125942707061768, + "num_tokens": 452401985.0, + "step": 18130 + }, + { + "epoch": 1.9911047660882935, + "grad_norm": 2.3663363456726074, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7267409563064575, + "num_tokens": 452424922.0, + "step": 18131 + }, + { + "epoch": 1.991214583790907, + "grad_norm": 2.229875326156616, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7098571062088013, + "num_tokens": 452449920.0, + "step": 18132 + }, + { + "epoch": 1.9913244014935207, + "grad_norm": 2.793872356414795, + "learning_rate": 1e-06, + "loss": 0.7255, + "mean_token_accuracy": 0.7611332535743713, + "num_tokens": 452467042.0, + "step": 18133 + }, + { + "epoch": 1.9914342191961345, + "grad_norm": 2.3336265087127686, + "learning_rate": 1e-06, + "loss": 0.7768, + "mean_token_accuracy": 0.7492596507072449, + "num_tokens": 452489215.0, + "step": 18134 + }, + { + "epoch": 1.991544036898748, + "grad_norm": 2.560286045074463, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7251356840133667, + "num_tokens": 452509026.0, + "step": 18135 + }, + { + "epoch": 1.9916538546013618, + "grad_norm": 2.551635503768921, + "learning_rate": 1e-06, + "loss": 0.7796, + "mean_token_accuracy": 0.7506068348884583, + "num_tokens": 452530693.0, + "step": 18136 + }, + { + "epoch": 1.9917636723039753, + "grad_norm": 2.353372812271118, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6906814575195312, + "num_tokens": 452555908.0, + "step": 18137 + }, + { + "epoch": 1.991873490006589, + "grad_norm": 2.3284261226654053, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7385053634643555, + "num_tokens": 452578617.0, + "step": 18138 + }, + { + "epoch": 1.9919833077092028, + "grad_norm": 2.1287636756896973, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.710589587688446, + "num_tokens": 452607014.0, + "step": 18139 + }, + { + "epoch": 1.9920931254118164, + "grad_norm": 2.1393797397613525, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7137259244918823, + "num_tokens": 452635942.0, + "step": 18140 + }, + { + "epoch": 1.99220294311443, + "grad_norm": 2.009601593017578, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6970630884170532, + "num_tokens": 452667972.0, + "step": 18141 + }, + { + "epoch": 1.9923127608170437, + "grad_norm": 2.095749616622925, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6921632289886475, + "num_tokens": 452697064.0, + "step": 18142 + }, + { + "epoch": 1.9924225785196574, + "grad_norm": 2.3158175945281982, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7378567457199097, + "num_tokens": 452720034.0, + "step": 18143 + }, + { + "epoch": 1.9925323962222712, + "grad_norm": 2.4126386642456055, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.746001660823822, + "num_tokens": 452741627.0, + "step": 18144 + }, + { + "epoch": 1.9926422139248847, + "grad_norm": 2.2360005378723145, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7397927641868591, + "num_tokens": 452765443.0, + "step": 18145 + }, + { + "epoch": 1.9927520316274983, + "grad_norm": 2.0498504638671875, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7020303010940552, + "num_tokens": 452797120.0, + "step": 18146 + }, + { + "epoch": 1.992861849330112, + "grad_norm": 2.5152924060821533, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.727126955986023, + "num_tokens": 452818449.0, + "step": 18147 + }, + { + "epoch": 1.9929716670327258, + "grad_norm": 2.2950046062469482, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7065199017524719, + "num_tokens": 452843066.0, + "step": 18148 + }, + { + "epoch": 1.9930814847353393, + "grad_norm": 2.1171677112579346, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7059469223022461, + "num_tokens": 452872755.0, + "step": 18149 + }, + { + "epoch": 1.9931913024379528, + "grad_norm": 2.0897536277770996, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7151209712028503, + "num_tokens": 452901553.0, + "step": 18150 + }, + { + "epoch": 1.9933011201405666, + "grad_norm": 2.1351349353790283, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7371965646743774, + "num_tokens": 452926430.0, + "step": 18151 + }, + { + "epoch": 1.9934109378431804, + "grad_norm": 2.136038303375244, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7316716909408569, + "num_tokens": 452951343.0, + "step": 18152 + }, + { + "epoch": 1.993520755545794, + "grad_norm": 1.9252656698226929, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7105349898338318, + "num_tokens": 452983057.0, + "step": 18153 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 2.0278284549713135, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6964900493621826, + "num_tokens": 453013541.0, + "step": 18154 + }, + { + "epoch": 1.9937403909510212, + "grad_norm": 2.403189182281494, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7207088470458984, + "num_tokens": 453037265.0, + "step": 18155 + }, + { + "epoch": 1.993850208653635, + "grad_norm": 2.2737460136413574, + "learning_rate": 1e-06, + "loss": 0.7737, + "mean_token_accuracy": 0.7508671283721924, + "num_tokens": 453059962.0, + "step": 18156 + }, + { + "epoch": 1.9939600263562487, + "grad_norm": 2.174827814102173, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.712380588054657, + "num_tokens": 453088952.0, + "step": 18157 + }, + { + "epoch": 1.9940698440588625, + "grad_norm": 2.584977149963379, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7285611629486084, + "num_tokens": 453109352.0, + "step": 18158 + }, + { + "epoch": 1.994179661761476, + "grad_norm": 2.1553103923797607, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7454520463943481, + "num_tokens": 453134796.0, + "step": 18159 + }, + { + "epoch": 1.9942894794640895, + "grad_norm": 2.1605210304260254, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7233284711837769, + "num_tokens": 453160262.0, + "step": 18160 + }, + { + "epoch": 1.9943992971667033, + "grad_norm": 2.6440844535827637, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.711829423904419, + "num_tokens": 453183055.0, + "step": 18161 + }, + { + "epoch": 1.994509114869317, + "grad_norm": 2.4834272861480713, + "learning_rate": 1e-06, + "loss": 0.7792, + "mean_token_accuracy": 0.7458558082580566, + "num_tokens": 453203889.0, + "step": 18162 + }, + { + "epoch": 1.9946189325719306, + "grad_norm": 2.2529003620147705, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7213910818099976, + "num_tokens": 453228298.0, + "step": 18163 + }, + { + "epoch": 1.994728750274544, + "grad_norm": 2.4329395294189453, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7309185266494751, + "num_tokens": 453250980.0, + "step": 18164 + }, + { + "epoch": 1.9948385679771579, + "grad_norm": 2.505312442779541, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7416889667510986, + "num_tokens": 453272027.0, + "step": 18165 + }, + { + "epoch": 1.9949483856797716, + "grad_norm": 2.514336109161377, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7115738391876221, + "num_tokens": 453293177.0, + "step": 18166 + }, + { + "epoch": 1.9950582033823854, + "grad_norm": 2.170551300048828, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7174162268638611, + "num_tokens": 453319211.0, + "step": 18167 + }, + { + "epoch": 1.995168021084999, + "grad_norm": 2.1486449241638184, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7165759801864624, + "num_tokens": 453346228.0, + "step": 18168 + }, + { + "epoch": 1.9952778387876124, + "grad_norm": 2.357653856277466, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7137512564659119, + "num_tokens": 453371384.0, + "step": 18169 + }, + { + "epoch": 1.9953876564902262, + "grad_norm": 1.9552606344223022, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7015857696533203, + "num_tokens": 453403161.0, + "step": 18170 + }, + { + "epoch": 1.99549747419284, + "grad_norm": 2.237797737121582, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7385026216506958, + "num_tokens": 453428929.0, + "step": 18171 + }, + { + "epoch": 1.9956072918954535, + "grad_norm": 2.410510540008545, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7338931560516357, + "num_tokens": 453450006.0, + "step": 18172 + }, + { + "epoch": 1.9957171095980673, + "grad_norm": 2.57529878616333, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7346605062484741, + "num_tokens": 453471323.0, + "step": 18173 + }, + { + "epoch": 1.9958269273006808, + "grad_norm": 2.1231801509857178, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7162511348724365, + "num_tokens": 453498800.0, + "step": 18174 + }, + { + "epoch": 1.9959367450032945, + "grad_norm": 2.339818239212036, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7172776460647583, + "num_tokens": 453524714.0, + "step": 18175 + }, + { + "epoch": 1.9960465627059083, + "grad_norm": 2.1038949489593506, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7145156860351562, + "num_tokens": 453556128.0, + "step": 18176 + }, + { + "epoch": 1.9961563804085218, + "grad_norm": 2.4294395446777344, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7386541366577148, + "num_tokens": 453577511.0, + "step": 18177 + }, + { + "epoch": 1.9962661981111354, + "grad_norm": 2.185119867324829, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7242513298988342, + "num_tokens": 453601618.0, + "step": 18178 + }, + { + "epoch": 1.9963760158137491, + "grad_norm": 2.061701536178589, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6910005807876587, + "num_tokens": 453630982.0, + "step": 18179 + }, + { + "epoch": 1.9964858335163629, + "grad_norm": 2.222853422164917, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7164148688316345, + "num_tokens": 453655936.0, + "step": 18180 + }, + { + "epoch": 1.9965956512189766, + "grad_norm": 2.1288278102874756, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7004319429397583, + "num_tokens": 453683817.0, + "step": 18181 + }, + { + "epoch": 1.9967054689215902, + "grad_norm": 2.1498141288757324, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7280066013336182, + "num_tokens": 453709095.0, + "step": 18182 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 1.9898680448532104, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7306437492370605, + "num_tokens": 453739008.0, + "step": 18183 + }, + { + "epoch": 1.9969251043268175, + "grad_norm": 2.3172407150268555, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7316172122955322, + "num_tokens": 453761764.0, + "step": 18184 + }, + { + "epoch": 1.9970349220294312, + "grad_norm": 2.4746482372283936, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7139637470245361, + "num_tokens": 453783429.0, + "step": 18185 + }, + { + "epoch": 1.9971447397320448, + "grad_norm": 2.1163060665130615, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7112656831741333, + "num_tokens": 453811309.0, + "step": 18186 + }, + { + "epoch": 1.9972545574346585, + "grad_norm": 2.1430726051330566, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7275019288063049, + "num_tokens": 453840484.0, + "step": 18187 + }, + { + "epoch": 1.997364375137272, + "grad_norm": 2.6005454063415527, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7355536222457886, + "num_tokens": 453861236.0, + "step": 18188 + }, + { + "epoch": 1.9974741928398858, + "grad_norm": 2.4313056468963623, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7330456972122192, + "num_tokens": 453885008.0, + "step": 18189 + }, + { + "epoch": 1.9975840105424996, + "grad_norm": 2.243215322494507, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7276657223701477, + "num_tokens": 453911294.0, + "step": 18190 + }, + { + "epoch": 1.997693828245113, + "grad_norm": 2.108290672302246, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6986594200134277, + "num_tokens": 453940530.0, + "step": 18191 + }, + { + "epoch": 1.9978036459477266, + "grad_norm": 2.5648181438446045, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7197844982147217, + "num_tokens": 453960785.0, + "step": 18192 + }, + { + "epoch": 1.9979134636503404, + "grad_norm": 2.1289913654327393, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7045241594314575, + "num_tokens": 453989336.0, + "step": 18193 + }, + { + "epoch": 1.9980232813529542, + "grad_norm": 2.105644702911377, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6914965510368347, + "num_tokens": 454018949.0, + "step": 18194 + }, + { + "epoch": 1.998133099055568, + "grad_norm": 2.222946882247925, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7370572090148926, + "num_tokens": 454043739.0, + "step": 18195 + }, + { + "epoch": 1.9982429167581814, + "grad_norm": 2.024249792098999, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7095386981964111, + "num_tokens": 454073801.0, + "step": 18196 + }, + { + "epoch": 1.998352734460795, + "grad_norm": 2.4439780712127686, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7151708602905273, + "num_tokens": 454097602.0, + "step": 18197 + }, + { + "epoch": 1.9984625521634087, + "grad_norm": 2.4546303749084473, + "learning_rate": 1e-06, + "loss": 0.7643, + "mean_token_accuracy": 0.7505756616592407, + "num_tokens": 454119620.0, + "step": 18198 + }, + { + "epoch": 1.9985723698660225, + "grad_norm": 2.311671495437622, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7317094802856445, + "num_tokens": 454143030.0, + "step": 18199 + }, + { + "epoch": 1.998682187568636, + "grad_norm": 2.5143911838531494, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7138132452964783, + "num_tokens": 454164189.0, + "step": 18200 + }, + { + "epoch": 1.9987920052712496, + "grad_norm": 2.274723529815674, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.740127682685852, + "num_tokens": 454187591.0, + "step": 18201 + }, + { + "epoch": 1.9989018229738633, + "grad_norm": 2.1904523372650146, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7011858820915222, + "num_tokens": 454217273.0, + "step": 18202 + }, + { + "epoch": 1.999011640676477, + "grad_norm": 2.378593683242798, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7583891153335571, + "num_tokens": 454239280.0, + "step": 18203 + }, + { + "epoch": 1.9991214583790908, + "grad_norm": 2.559762477874756, + "learning_rate": 1e-06, + "loss": 0.786, + "mean_token_accuracy": 0.7497290372848511, + "num_tokens": 454258870.0, + "step": 18204 + }, + { + "epoch": 1.9992312760817044, + "grad_norm": 2.2793455123901367, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.724101722240448, + "num_tokens": 454283205.0, + "step": 18205 + }, + { + "epoch": 1.999341093784318, + "grad_norm": 2.2438719272613525, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.701848030090332, + "num_tokens": 454310130.0, + "step": 18206 + }, + { + "epoch": 1.9994509114869317, + "grad_norm": 2.3916101455688477, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7333701252937317, + "num_tokens": 454333371.0, + "step": 18207 + }, + { + "epoch": 1.9995607291895454, + "grad_norm": 2.213170289993286, + "learning_rate": 1e-06, + "loss": 0.8012, + "mean_token_accuracy": 0.7486034035682678, + "num_tokens": 454358044.0, + "step": 18208 + }, + { + "epoch": 1.9996705468921592, + "grad_norm": 1.8839548826217651, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7154279351234436, + "num_tokens": 454392033.0, + "step": 18209 + }, + { + "epoch": 1.9997803645947727, + "grad_norm": 2.472031354904175, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7336388230323792, + "num_tokens": 454413683.0, + "step": 18210 + }, + { + "epoch": 1.9998901822973862, + "grad_norm": 2.308997392654419, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7235968112945557, + "num_tokens": 454439894.0, + "step": 18211 + }, + { + "epoch": 2.0, + "grad_norm": 2.2079222202301025, + "learning_rate": 1e-06, + "loss": 0.7899, + "mean_token_accuracy": 0.7484481930732727, + "num_tokens": 454463748.0, + "step": 18212 + }, + { + "epoch": 2.0, + "step": 18212, + "total_flos": 2.0464297170303975e+19, + "train_loss": 0.9477986063134819, + "train_runtime": 21240.259, + "train_samples_per_second": 13.719, + "train_steps_per_second": 0.857 + } + ], + "logging_steps": 1, + "max_steps": 18212, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9106, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.0464297170303975e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..2af115f --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:354bf2647ceb30f979d572d79ff3d1e8b517b85484de65729cb2557d69facc7a +size 13329