From 7c24cdf3995ab61c762bfc84daf34fcc389c7053 Mon Sep 17 00:00:00 2001 From: ModelHub XC Date: Sun, 7 Jun 2026 11:31:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E9=A1=B9=E7=9B=AE?= =?UTF-8?q?=EF=BC=8C=E7=94=B1ModelHub=20XC=E7=A4=BE=E5=8C=BA=E6=8F=90?= =?UTF-8?q?=E4=BE=9B=E6=A8=A1=E5=9E=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model: Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.20 Source: Original Platform --- .gitattributes | 36 + README.md | 59 + all_results.json | 8 + chat_template.jinja | 121 + config.json | 35 + generation_config.json | 8 + model-00001-of-00004.safetensors | 3 + model-00002-of-00004.safetensors | 3 + model-00003-of-00004.safetensors | 3 + model-00004-of-00004.safetensors | 3 + model.safetensors.index.json | 299 + special_tokens_map.json | 10 + tokenizer.json | 3 + tokenizer_config.json | 2062 + train_results.json | 8 + trainer_state.json | 163951 ++++++++++++++++++++++++++++ training_args.bin | 3 + 17 files changed, 166615 insertions(+) create mode 100644 .gitattributes create mode 100644 README.md create mode 100644 all_results.json create mode 100644 chat_template.jinja create mode 100644 config.json create mode 100644 generation_config.json create mode 100644 model-00001-of-00004.safetensors create mode 100644 model-00002-of-00004.safetensors create mode 100644 model-00003-of-00004.safetensors create mode 100644 model-00004-of-00004.safetensors create mode 100644 model.safetensors.index.json create mode 100644 special_tokens_map.json create mode 100644 tokenizer.json create mode 100644 tokenizer_config.json create mode 100644 train_results.json create mode 100644 trainer_state.json create mode 100644 training_args.bin diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs 
diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..8bd841d --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/Replay_0.04.MoT_science.wildguardmix.Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_sciencev00.20 +tags: +- generated_from_trainer +- trl +- sft +- open-r1 +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_sciencev00.20 + +This model is a 
fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/Replay_0.04.MoT_science.wildguardmix.Llama3_4096toks](https://huggingface.co/datasets/Neelectric/Replay_0.04.MoT_science.wildguardmix.Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_sciencev00.20", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_science/runs/mar6uhdo) + + +This model was trained with SFT. + +### Framework versions + +- TRL: 0.28.0.dev0 +- Transformers: 4.57.6 +- PyTorch: 2.9.0 +- Datasets: 4.5.0 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..38819ed --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 2.048535820913292e+19, + "train_loss": 0.9530186036766531, + "train_runtime": 21116.7041, + "train_samples": 145693, + "train_samples_per_second": 13.799, + "train_steps_per_second": 0.862 +} \ No newline at end of file diff --git a/chat_template.jinja
b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.'
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and 
tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..e1d9068 --- /dev/null +++ b/config.json @@ -0,0 +1,35 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git 
a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..1996dc1 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,8 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..af042f2 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e13ce20c2f69fd328a81052744eafc2c3a8db9d7ac927e5f1e8298caa2c81f7 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..806d110 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55bafbc3be3b002dc4a3bcd6633eaece4927847adff6c047db6a549204b585a2 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..ea475d5 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84fad7d2f12af8297c6c45a5bf55ee9b3731d0c26674ac12479bc742033f6674 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..79ed43a --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f3382a348bc8e053d0ba80c42aa80051259e901ffb48f99599b3ec3e066b63d +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + 
"model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + 
"model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + 
"model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..e8f05fa --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,10 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json 
@@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..8b0c7c1 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2062 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": 
false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": 
"<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": 
"<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": 
"<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": 
"<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..38819ed --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 
2.048535820913292e+19, + "train_loss": 0.9530186036766531, + "train_runtime": 21116.7041, + "train_samples": 145693, + "train_samples_per_second": 13.799, + "train_steps_per_second": 0.862 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..466f911 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,163951 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 18212, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010981770261366133, + "grad_norm": 7.299972057342529, + "learning_rate": 0.0, + "loss": 1.4841, + "mean_token_accuracy": 0.6132287979125977, + "num_tokens": 25099.0, + "step": 1 + }, + { + "epoch": 0.00021963540522732265, + "grad_norm": 6.5893235206604, + "learning_rate": 5.48847420417124e-10, + "loss": 1.4857, + "mean_token_accuracy": 0.6123285293579102, + "num_tokens": 54691.0, + "step": 2 + }, + { + "epoch": 0.000329453107840984, + "grad_norm": 7.038305282592773, + "learning_rate": 1.097694840834248e-09, + "loss": 1.4808, + "mean_token_accuracy": 0.6056908965110779, + "num_tokens": 79623.0, + "step": 3 + }, + { + "epoch": 0.0004392708104546453, + "grad_norm": 7.590126991271973, + "learning_rate": 1.646542261251372e-09, + "loss": 1.5601, + "mean_token_accuracy": 0.6009494066238403, + "num_tokens": 104738.0, + "step": 4 + }, + { + "epoch": 0.0005490885130683066, + "grad_norm": 7.930464267730713, + "learning_rate": 2.195389681668496e-09, + "loss": 1.5566, + "mean_token_accuracy": 0.6064591407775879, + "num_tokens": 126284.0, + "step": 5 + }, + { + "epoch": 0.000658906215681968, + "grad_norm": 7.766415119171143, + "learning_rate": 2.7442371020856205e-09, + "loss": 1.4749, + "mean_token_accuracy": 0.618071436882019, + "num_tokens": 148808.0, + "step": 6 + }, + { + "epoch": 0.0007687239182956292, + "grad_norm": 
6.990437984466553, + "learning_rate": 3.293084522502744e-09, + "loss": 1.4606, + "mean_token_accuracy": 0.617752194404602, + "num_tokens": 175002.0, + "step": 7 + }, + { + "epoch": 0.0008785416209092906, + "grad_norm": 7.95062255859375, + "learning_rate": 3.841931942919868e-09, + "loss": 1.5508, + "mean_token_accuracy": 0.5963762998580933, + "num_tokens": 197998.0, + "step": 8 + }, + { + "epoch": 0.000988359323522952, + "grad_norm": 6.682225704193115, + "learning_rate": 4.390779363336992e-09, + "loss": 1.3835, + "mean_token_accuracy": 0.6345266699790955, + "num_tokens": 225191.0, + "step": 9 + }, + { + "epoch": 0.0010981770261366132, + "grad_norm": 8.263418197631836, + "learning_rate": 4.939626783754116e-09, + "loss": 1.6399, + "mean_token_accuracy": 0.5833006501197815, + "num_tokens": 246870.0, + "step": 10 + }, + { + "epoch": 0.0012079947287502745, + "grad_norm": 6.226279258728027, + "learning_rate": 5.488474204171241e-09, + "loss": 1.5234, + "mean_token_accuracy": 0.6060298681259155, + "num_tokens": 282613.0, + "step": 11 + }, + { + "epoch": 0.001317812431363936, + "grad_norm": 7.45275354385376, + "learning_rate": 6.0373216245883644e-09, + "loss": 1.5384, + "mean_token_accuracy": 0.60728919506073, + "num_tokens": 305920.0, + "step": 12 + }, + { + "epoch": 0.0014276301339775973, + "grad_norm": 6.905296802520752, + "learning_rate": 6.586169045005488e-09, + "loss": 1.5006, + "mean_token_accuracy": 0.611708402633667, + "num_tokens": 332598.0, + "step": 13 + }, + { + "epoch": 0.0015374478365912585, + "grad_norm": 7.328803539276123, + "learning_rate": 7.135016465422612e-09, + "loss": 1.5263, + "mean_token_accuracy": 0.6062934398651123, + "num_tokens": 358200.0, + "step": 14 + }, + { + "epoch": 0.0016472655392049198, + "grad_norm": 6.296462059020996, + "learning_rate": 7.683863885839736e-09, + "loss": 1.5063, + "mean_token_accuracy": 0.6066451668739319, + "num_tokens": 393276.0, + "step": 15 + }, + { + "epoch": 0.0017570832418185812, + "grad_norm": 7.506828784942627, + 
"learning_rate": 8.23271130625686e-09, + "loss": 1.4109, + "mean_token_accuracy": 0.623050332069397, + "num_tokens": 417328.0, + "step": 16 + }, + { + "epoch": 0.0018669009444322424, + "grad_norm": 7.105743408203125, + "learning_rate": 8.781558726673984e-09, + "loss": 1.4653, + "mean_token_accuracy": 0.6104680895805359, + "num_tokens": 442214.0, + "step": 17 + }, + { + "epoch": 0.001976718647045904, + "grad_norm": 8.595480918884277, + "learning_rate": 9.330406147091108e-09, + "loss": 1.5179, + "mean_token_accuracy": 0.6095225811004639, + "num_tokens": 462349.0, + "step": 18 + }, + { + "epoch": 0.002086536349659565, + "grad_norm": 7.878933429718018, + "learning_rate": 9.879253567508231e-09, + "loss": 1.5062, + "mean_token_accuracy": 0.6071789264678955, + "num_tokens": 482478.0, + "step": 19 + }, + { + "epoch": 0.0021963540522732263, + "grad_norm": 7.99758243560791, + "learning_rate": 1.0428100987925357e-08, + "loss": 1.5379, + "mean_token_accuracy": 0.6065320372581482, + "num_tokens": 505285.0, + "step": 20 + }, + { + "epoch": 0.002306171754886888, + "grad_norm": 7.800711154937744, + "learning_rate": 1.0976948408342482e-08, + "loss": 1.6005, + "mean_token_accuracy": 0.5912305116653442, + "num_tokens": 528309.0, + "step": 21 + }, + { + "epoch": 0.002415989457500549, + "grad_norm": 7.041256904602051, + "learning_rate": 1.1525795828759604e-08, + "loss": 1.403, + "mean_token_accuracy": 0.636406660079956, + "num_tokens": 553432.0, + "step": 22 + }, + { + "epoch": 0.0025258071601142102, + "grad_norm": 6.6265153884887695, + "learning_rate": 1.2074643249176729e-08, + "loss": 1.5046, + "mean_token_accuracy": 0.6121854782104492, + "num_tokens": 582047.0, + "step": 23 + }, + { + "epoch": 0.002635624862727872, + "grad_norm": 7.600011825561523, + "learning_rate": 1.2623490669593852e-08, + "loss": 1.5428, + "mean_token_accuracy": 0.599868893623352, + "num_tokens": 604164.0, + "step": 24 + }, + { + "epoch": 0.002745442565341533, + "grad_norm": 6.344497203826904, + "learning_rate": 
1.3172338090010976e-08, + "loss": 1.362, + "mean_token_accuracy": 0.6442499160766602, + "num_tokens": 632656.0, + "step": 25 + }, + { + "epoch": 0.0028552602679551946, + "grad_norm": 6.3063578605651855, + "learning_rate": 1.3721185510428101e-08, + "loss": 1.5179, + "mean_token_accuracy": 0.6029955744743347, + "num_tokens": 665238.0, + "step": 26 + }, + { + "epoch": 0.0029650779705688557, + "grad_norm": 6.184127330780029, + "learning_rate": 1.4270032930845225e-08, + "loss": 1.5183, + "mean_token_accuracy": 0.6016157865524292, + "num_tokens": 696136.0, + "step": 27 + }, + { + "epoch": 0.003074895673182517, + "grad_norm": 7.031423568725586, + "learning_rate": 1.4818880351262348e-08, + "loss": 1.5013, + "mean_token_accuracy": 0.610823392868042, + "num_tokens": 721902.0, + "step": 28 + }, + { + "epoch": 0.0031847133757961785, + "grad_norm": 8.12850570678711, + "learning_rate": 1.536772777167947e-08, + "loss": 1.5674, + "mean_token_accuracy": 0.5980252623558044, + "num_tokens": 739955.0, + "step": 29 + }, + { + "epoch": 0.0032945310784098397, + "grad_norm": 7.405251502990723, + "learning_rate": 1.5916575192096597e-08, + "loss": 1.5081, + "mean_token_accuracy": 0.6177986860275269, + "num_tokens": 763483.0, + "step": 30 + }, + { + "epoch": 0.003404348781023501, + "grad_norm": 6.982152938842773, + "learning_rate": 1.646542261251372e-08, + "loss": 1.5116, + "mean_token_accuracy": 0.6117333173751831, + "num_tokens": 791501.0, + "step": 31 + }, + { + "epoch": 0.0035141664836371624, + "grad_norm": 6.911052703857422, + "learning_rate": 1.7014270032930844e-08, + "loss": 1.4982, + "mean_token_accuracy": 0.6081861257553101, + "num_tokens": 820501.0, + "step": 32 + }, + { + "epoch": 0.0036239841862508236, + "grad_norm": 7.371986389160156, + "learning_rate": 1.756311745334797e-08, + "loss": 1.4518, + "mean_token_accuracy": 0.6211102604866028, + "num_tokens": 845458.0, + "step": 33 + }, + { + "epoch": 0.0037338018888644848, + "grad_norm": 7.199702262878418, + "learning_rate": 
1.811196487376509e-08, + "loss": 1.4249, + "mean_token_accuracy": 0.625104546546936, + "num_tokens": 867580.0, + "step": 34 + }, + { + "epoch": 0.0038436195914781464, + "grad_norm": 6.396315097808838, + "learning_rate": 1.8660812294182216e-08, + "loss": 1.4573, + "mean_token_accuracy": 0.6155868172645569, + "num_tokens": 897900.0, + "step": 35 + }, + { + "epoch": 0.003953437294091808, + "grad_norm": 6.700680732727051, + "learning_rate": 1.920965971459934e-08, + "loss": 1.4928, + "mean_token_accuracy": 0.6159014701843262, + "num_tokens": 922354.0, + "step": 36 + }, + { + "epoch": 0.004063254996705469, + "grad_norm": 6.852284908294678, + "learning_rate": 1.9758507135016463e-08, + "loss": 1.5077, + "mean_token_accuracy": 0.6084942817687988, + "num_tokens": 950428.0, + "step": 37 + }, + { + "epoch": 0.00417307269931913, + "grad_norm": 8.295633316040039, + "learning_rate": 2.030735455543359e-08, + "loss": 1.5397, + "mean_token_accuracy": 0.6000158786773682, + "num_tokens": 971655.0, + "step": 38 + }, + { + "epoch": 0.004282890401932792, + "grad_norm": 8.010340690612793, + "learning_rate": 2.0856201975850713e-08, + "loss": 1.5025, + "mean_token_accuracy": 0.6166505217552185, + "num_tokens": 993698.0, + "step": 39 + }, + { + "epoch": 0.004392708104546453, + "grad_norm": 7.383689880371094, + "learning_rate": 2.1405049396267835e-08, + "loss": 1.4903, + "mean_token_accuracy": 0.6115071773529053, + "num_tokens": 1017737.0, + "step": 40 + }, + { + "epoch": 0.004502525807160114, + "grad_norm": 6.455544471740723, + "learning_rate": 2.1953896816684964e-08, + "loss": 1.5879, + "mean_token_accuracy": 0.5885008573532104, + "num_tokens": 1049937.0, + "step": 41 + }, + { + "epoch": 0.004612343509773776, + "grad_norm": 8.007906913757324, + "learning_rate": 2.2502744237102085e-08, + "loss": 1.4996, + "mean_token_accuracy": 0.6207454204559326, + "num_tokens": 1071838.0, + "step": 42 + }, + { + "epoch": 0.0047221612123874365, + "grad_norm": 7.144787311553955, + "learning_rate": 
2.3051591657519207e-08, + "loss": 1.397, + "mean_token_accuracy": 0.6404240131378174, + "num_tokens": 1096112.0, + "step": 43 + }, + { + "epoch": 0.004831978915001098, + "grad_norm": 8.051663398742676, + "learning_rate": 2.3600439077936336e-08, + "loss": 1.5032, + "mean_token_accuracy": 0.615178108215332, + "num_tokens": 1117153.0, + "step": 44 + }, + { + "epoch": 0.00494179661761476, + "grad_norm": 8.531699180603027, + "learning_rate": 2.4149286498353458e-08, + "loss": 1.6108, + "mean_token_accuracy": 0.5874541997909546, + "num_tokens": 1136815.0, + "step": 45 + }, + { + "epoch": 0.0050516143202284204, + "grad_norm": 7.009923934936523, + "learning_rate": 2.469813391877058e-08, + "loss": 1.4907, + "mean_token_accuracy": 0.6072732210159302, + "num_tokens": 1164034.0, + "step": 46 + }, + { + "epoch": 0.005161432022842082, + "grad_norm": 7.563737869262695, + "learning_rate": 2.5246981339187705e-08, + "loss": 1.3732, + "mean_token_accuracy": 0.6352757215499878, + "num_tokens": 1185173.0, + "step": 47 + }, + { + "epoch": 0.005271249725455744, + "grad_norm": 7.03902530670166, + "learning_rate": 2.579582875960483e-08, + "loss": 1.516, + "mean_token_accuracy": 0.6035194396972656, + "num_tokens": 1212983.0, + "step": 48 + }, + { + "epoch": 0.005381067428069404, + "grad_norm": 6.290628433227539, + "learning_rate": 2.6344676180021952e-08, + "loss": 1.403, + "mean_token_accuracy": 0.6290041208267212, + "num_tokens": 1243179.0, + "step": 49 + }, + { + "epoch": 0.005490885130683066, + "grad_norm": 6.987545967102051, + "learning_rate": 2.6893523600439077e-08, + "loss": 1.4183, + "mean_token_accuracy": 0.6310019493103027, + "num_tokens": 1270322.0, + "step": 50 + }, + { + "epoch": 0.0056007028332967276, + "grad_norm": 8.444709777832031, + "learning_rate": 2.7442371020856202e-08, + "loss": 1.4849, + "mean_token_accuracy": 0.6092012524604797, + "num_tokens": 1289560.0, + "step": 51 + }, + { + "epoch": 0.005710520535910389, + "grad_norm": 7.2183613777160645, + "learning_rate": 
2.7991218441273324e-08, + "loss": 1.4299, + "mean_token_accuracy": 0.6235958337783813, + "num_tokens": 1312242.0, + "step": 52 + }, + { + "epoch": 0.00582033823852405, + "grad_norm": 7.486569404602051, + "learning_rate": 2.854006586169045e-08, + "loss": 1.4701, + "mean_token_accuracy": 0.623228907585144, + "num_tokens": 1334827.0, + "step": 53 + }, + { + "epoch": 0.0059301559411377115, + "grad_norm": 7.370004653930664, + "learning_rate": 2.9088913282107574e-08, + "loss": 1.5109, + "mean_token_accuracy": 0.6084311008453369, + "num_tokens": 1359220.0, + "step": 54 + }, + { + "epoch": 0.006039973643751373, + "grad_norm": 6.621029853820801, + "learning_rate": 2.9637760702524696e-08, + "loss": 1.4774, + "mean_token_accuracy": 0.6114888191223145, + "num_tokens": 1388245.0, + "step": 55 + }, + { + "epoch": 0.006149791346365034, + "grad_norm": 6.085102081298828, + "learning_rate": 3.018660812294182e-08, + "loss": 1.5642, + "mean_token_accuracy": 0.5980187654495239, + "num_tokens": 1423173.0, + "step": 56 + }, + { + "epoch": 0.006259609048978695, + "grad_norm": 7.896478652954102, + "learning_rate": 3.073545554335894e-08, + "loss": 1.5336, + "mean_token_accuracy": 0.599656343460083, + "num_tokens": 1445063.0, + "step": 57 + }, + { + "epoch": 0.006369426751592357, + "grad_norm": 8.0028715133667, + "learning_rate": 3.128430296377607e-08, + "loss": 1.4891, + "mean_token_accuracy": 0.6124809384346008, + "num_tokens": 1466784.0, + "step": 58 + }, + { + "epoch": 0.006479244454206018, + "grad_norm": 7.523932933807373, + "learning_rate": 3.1833150384193193e-08, + "loss": 1.4888, + "mean_token_accuracy": 0.616681694984436, + "num_tokens": 1491183.0, + "step": 59 + }, + { + "epoch": 0.006589062156819679, + "grad_norm": 6.587186336517334, + "learning_rate": 3.2381997804610315e-08, + "loss": 1.4592, + "mean_token_accuracy": 0.62291020154953, + "num_tokens": 1520307.0, + "step": 60 + }, + { + "epoch": 0.006698879859433341, + "grad_norm": 6.989918231964111, + "learning_rate": 
3.293084522502744e-08, + "loss": 1.4489, + "mean_token_accuracy": 0.6151033639907837, + "num_tokens": 1545695.0, + "step": 61 + }, + { + "epoch": 0.006808697562047002, + "grad_norm": 6.892788410186768, + "learning_rate": 3.3479692645444566e-08, + "loss": 1.5108, + "mean_token_accuracy": 0.6180200576782227, + "num_tokens": 1573437.0, + "step": 62 + }, + { + "epoch": 0.006918515264660663, + "grad_norm": 8.73695182800293, + "learning_rate": 3.402854006586169e-08, + "loss": 1.4638, + "mean_token_accuracy": 0.623632550239563, + "num_tokens": 1592965.0, + "step": 63 + }, + { + "epoch": 0.007028332967274325, + "grad_norm": 7.838877201080322, + "learning_rate": 3.4577387486278816e-08, + "loss": 1.5111, + "mean_token_accuracy": 0.6119486093521118, + "num_tokens": 1614202.0, + "step": 64 + }, + { + "epoch": 0.007138150669887986, + "grad_norm": 7.5137248039245605, + "learning_rate": 3.512623490669594e-08, + "loss": 1.5883, + "mean_token_accuracy": 0.6014807224273682, + "num_tokens": 1638328.0, + "step": 65 + }, + { + "epoch": 0.007247968372501647, + "grad_norm": 6.937337875366211, + "learning_rate": 3.567508232711306e-08, + "loss": 1.4772, + "mean_token_accuracy": 0.6052414774894714, + "num_tokens": 1661274.0, + "step": 66 + }, + { + "epoch": 0.007357786075115309, + "grad_norm": 6.7480692863464355, + "learning_rate": 3.622392974753018e-08, + "loss": 1.4929, + "mean_token_accuracy": 0.610753059387207, + "num_tokens": 1687503.0, + "step": 67 + }, + { + "epoch": 0.0074676037777289695, + "grad_norm": 7.700900077819824, + "learning_rate": 3.677277716794731e-08, + "loss": 1.5554, + "mean_token_accuracy": 0.6019368171691895, + "num_tokens": 1710804.0, + "step": 68 + }, + { + "epoch": 0.007577421480342631, + "grad_norm": 6.841904163360596, + "learning_rate": 3.732162458836443e-08, + "loss": 1.5118, + "mean_token_accuracy": 0.6034024953842163, + "num_tokens": 1738119.0, + "step": 69 + }, + { + "epoch": 0.007687239182956293, + "grad_norm": 6.146218776702881, + "learning_rate": 
3.787047200878156e-08, + "loss": 1.3884, + "mean_token_accuracy": 0.6358658075332642, + "num_tokens": 1769659.0, + "step": 70 + }, + { + "epoch": 0.007797056885569954, + "grad_norm": 7.263650417327881, + "learning_rate": 3.841931942919868e-08, + "loss": 1.5736, + "mean_token_accuracy": 0.6041313409805298, + "num_tokens": 1792757.0, + "step": 71 + }, + { + "epoch": 0.007906874588183616, + "grad_norm": 7.199553489685059, + "learning_rate": 3.8968166849615804e-08, + "loss": 1.5972, + "mean_token_accuracy": 0.5892659425735474, + "num_tokens": 1818788.0, + "step": 72 + }, + { + "epoch": 0.008016692290797276, + "grad_norm": 7.241357326507568, + "learning_rate": 3.9517014270032926e-08, + "loss": 1.5592, + "mean_token_accuracy": 0.5948642492294312, + "num_tokens": 1843095.0, + "step": 73 + }, + { + "epoch": 0.008126509993410937, + "grad_norm": 7.2577128410339355, + "learning_rate": 4.006586169045005e-08, + "loss": 1.4359, + "mean_token_accuracy": 0.617573618888855, + "num_tokens": 1866692.0, + "step": 74 + }, + { + "epoch": 0.008236327696024599, + "grad_norm": 7.47206974029541, + "learning_rate": 4.061470911086718e-08, + "loss": 1.5372, + "mean_token_accuracy": 0.5997949838638306, + "num_tokens": 1889586.0, + "step": 75 + }, + { + "epoch": 0.00834614539863826, + "grad_norm": 6.733101844787598, + "learning_rate": 4.1163556531284305e-08, + "loss": 1.4627, + "mean_token_accuracy": 0.6145527958869934, + "num_tokens": 1917354.0, + "step": 76 + }, + { + "epoch": 0.008455963101251922, + "grad_norm": 6.344648838043213, + "learning_rate": 4.1712403951701427e-08, + "loss": 1.4702, + "mean_token_accuracy": 0.6250356435775757, + "num_tokens": 1945888.0, + "step": 77 + }, + { + "epoch": 0.008565780803865584, + "grad_norm": 7.034523010253906, + "learning_rate": 4.226125137211855e-08, + "loss": 1.4925, + "mean_token_accuracy": 0.613616406917572, + "num_tokens": 1969851.0, + "step": 78 + }, + { + "epoch": 0.008675598506479244, + "grad_norm": 6.394534111022949, + "learning_rate": 
4.281009879253567e-08, + "loss": 1.4998, + "mean_token_accuracy": 0.6027745008468628, + "num_tokens": 1999926.0, + "step": 79 + }, + { + "epoch": 0.008785416209092905, + "grad_norm": 7.6916022300720215, + "learning_rate": 4.335894621295279e-08, + "loss": 1.4295, + "mean_token_accuracy": 0.625682532787323, + "num_tokens": 2023661.0, + "step": 80 + }, + { + "epoch": 0.008895233911706567, + "grad_norm": 6.365603923797607, + "learning_rate": 4.390779363336993e-08, + "loss": 1.5253, + "mean_token_accuracy": 0.6049497723579407, + "num_tokens": 2054338.0, + "step": 81 + }, + { + "epoch": 0.009005051614320228, + "grad_norm": 7.051699161529541, + "learning_rate": 4.445664105378705e-08, + "loss": 1.554, + "mean_token_accuracy": 0.5915554761886597, + "num_tokens": 2079276.0, + "step": 82 + }, + { + "epoch": 0.00911486931693389, + "grad_norm": 7.8966169357299805, + "learning_rate": 4.500548847420417e-08, + "loss": 1.453, + "mean_token_accuracy": 0.6232166290283203, + "num_tokens": 2099847.0, + "step": 83 + }, + { + "epoch": 0.009224687019547552, + "grad_norm": 6.283535957336426, + "learning_rate": 4.555433589462129e-08, + "loss": 1.4367, + "mean_token_accuracy": 0.6254738569259644, + "num_tokens": 2126699.0, + "step": 84 + }, + { + "epoch": 0.009334504722161213, + "grad_norm": 7.167924404144287, + "learning_rate": 4.6103183315038415e-08, + "loss": 1.5845, + "mean_token_accuracy": 0.5867136716842651, + "num_tokens": 2151920.0, + "step": 85 + }, + { + "epoch": 0.009444322424774873, + "grad_norm": 7.6462225914001465, + "learning_rate": 4.6652030735455537e-08, + "loss": 1.4622, + "mean_token_accuracy": 0.6204791069030762, + "num_tokens": 2172907.0, + "step": 86 + }, + { + "epoch": 0.009554140127388535, + "grad_norm": 6.5719146728515625, + "learning_rate": 4.720087815587267e-08, + "loss": 1.4781, + "mean_token_accuracy": 0.6095924973487854, + "num_tokens": 2200639.0, + "step": 87 + }, + { + "epoch": 0.009663957830002196, + "grad_norm": 6.7126145362854, + "learning_rate": 
4.7749725576289793e-08, + "loss": 1.4553, + "mean_token_accuracy": 0.6175500154495239, + "num_tokens": 2223544.0, + "step": 88 + }, + { + "epoch": 0.009773775532615858, + "grad_norm": 7.711470603942871, + "learning_rate": 4.8298572996706915e-08, + "loss": 1.4189, + "mean_token_accuracy": 0.6234169006347656, + "num_tokens": 2244000.0, + "step": 89 + }, + { + "epoch": 0.00988359323522952, + "grad_norm": 5.8070478439331055, + "learning_rate": 4.884742041712404e-08, + "loss": 1.3856, + "mean_token_accuracy": 0.6339553594589233, + "num_tokens": 2272054.0, + "step": 90 + }, + { + "epoch": 0.009993410937843181, + "grad_norm": 6.451523780822754, + "learning_rate": 4.939626783754116e-08, + "loss": 1.4687, + "mean_token_accuracy": 0.6175835132598877, + "num_tokens": 2297541.0, + "step": 91 + }, + { + "epoch": 0.010103228640456841, + "grad_norm": 5.749032497406006, + "learning_rate": 4.994511525795828e-08, + "loss": 1.3997, + "mean_token_accuracy": 0.6301475167274475, + "num_tokens": 2324944.0, + "step": 92 + }, + { + "epoch": 0.010213046343070502, + "grad_norm": 5.985781192779541, + "learning_rate": 5.049396267837541e-08, + "loss": 1.3976, + "mean_token_accuracy": 0.6327058672904968, + "num_tokens": 2353169.0, + "step": 93 + }, + { + "epoch": 0.010322864045684164, + "grad_norm": 6.36305570602417, + "learning_rate": 5.104281009879254e-08, + "loss": 1.4438, + "mean_token_accuracy": 0.6136465668678284, + "num_tokens": 2377900.0, + "step": 94 + }, + { + "epoch": 0.010432681748297826, + "grad_norm": 6.421722412109375, + "learning_rate": 5.159165751920966e-08, + "loss": 1.4352, + "mean_token_accuracy": 0.6179937720298767, + "num_tokens": 2405336.0, + "step": 95 + }, + { + "epoch": 0.010542499450911487, + "grad_norm": 7.130977630615234, + "learning_rate": 5.214050493962678e-08, + "loss": 1.4807, + "mean_token_accuracy": 0.61016845703125, + "num_tokens": 2427947.0, + "step": 96 + }, + { + "epoch": 0.010652317153525149, + "grad_norm": 6.038248062133789, + "learning_rate": 
5.2689352360043903e-08, + "loss": 1.4638, + "mean_token_accuracy": 0.6098105907440186, + "num_tokens": 2457381.0, + "step": 97 + }, + { + "epoch": 0.010762134856138809, + "grad_norm": 7.272004127502441, + "learning_rate": 5.3238199780461025e-08, + "loss": 1.5274, + "mean_token_accuracy": 0.6065617203712463, + "num_tokens": 2477942.0, + "step": 98 + }, + { + "epoch": 0.01087195255875247, + "grad_norm": 6.972545623779297, + "learning_rate": 5.3787047200878154e-08, + "loss": 1.5023, + "mean_token_accuracy": 0.6076018810272217, + "num_tokens": 2501763.0, + "step": 99 + }, + { + "epoch": 0.010981770261366132, + "grad_norm": 6.598283767700195, + "learning_rate": 5.433589462129528e-08, + "loss": 1.5095, + "mean_token_accuracy": 0.6018040180206299, + "num_tokens": 2526441.0, + "step": 100 + }, + { + "epoch": 0.011091587963979794, + "grad_norm": 6.632323741912842, + "learning_rate": 5.4884742041712404e-08, + "loss": 1.4842, + "mean_token_accuracy": 0.6138486266136169, + "num_tokens": 2550203.0, + "step": 101 + }, + { + "epoch": 0.011201405666593455, + "grad_norm": 6.656107425689697, + "learning_rate": 5.5433589462129526e-08, + "loss": 1.4753, + "mean_token_accuracy": 0.6144994497299194, + "num_tokens": 2575381.0, + "step": 102 + }, + { + "epoch": 0.011311223369207117, + "grad_norm": 7.243898391723633, + "learning_rate": 5.598243688254665e-08, + "loss": 1.5866, + "mean_token_accuracy": 0.5946934819221497, + "num_tokens": 2598248.0, + "step": 103 + }, + { + "epoch": 0.011421041071820778, + "grad_norm": 8.08626651763916, + "learning_rate": 5.653128430296377e-08, + "loss": 1.5067, + "mean_token_accuracy": 0.6074979305267334, + "num_tokens": 2617399.0, + "step": 104 + }, + { + "epoch": 0.011530858774434438, + "grad_norm": 6.985254287719727, + "learning_rate": 5.70801317233809e-08, + "loss": 1.5257, + "mean_token_accuracy": 0.6033720970153809, + "num_tokens": 2641807.0, + "step": 105 + }, + { + "epoch": 0.0116406764770481, + "grad_norm": 6.644419193267822, + "learning_rate": 
5.762897914379802e-08, + "loss": 1.3862, + "mean_token_accuracy": 0.6287726163864136, + "num_tokens": 2666165.0, + "step": 106 + }, + { + "epoch": 0.011750494179661761, + "grad_norm": 7.627112865447998, + "learning_rate": 5.817782656421515e-08, + "loss": 1.3795, + "mean_token_accuracy": 0.6352954506874084, + "num_tokens": 2685900.0, + "step": 107 + }, + { + "epoch": 0.011860311882275423, + "grad_norm": 8.15728759765625, + "learning_rate": 5.872667398463227e-08, + "loss": 1.423, + "mean_token_accuracy": 0.6321823596954346, + "num_tokens": 2704663.0, + "step": 108 + }, + { + "epoch": 0.011970129584889085, + "grad_norm": 6.8782477378845215, + "learning_rate": 5.927552140504939e-08, + "loss": 1.43, + "mean_token_accuracy": 0.622089147567749, + "num_tokens": 2729631.0, + "step": 109 + }, + { + "epoch": 0.012079947287502746, + "grad_norm": 7.526661396026611, + "learning_rate": 5.982436882546651e-08, + "loss": 1.4831, + "mean_token_accuracy": 0.6155154705047607, + "num_tokens": 2751736.0, + "step": 110 + }, + { + "epoch": 0.012189764990116406, + "grad_norm": 6.528886795043945, + "learning_rate": 6.037321624588364e-08, + "loss": 1.5075, + "mean_token_accuracy": 0.6079744100570679, + "num_tokens": 2780621.0, + "step": 111 + }, + { + "epoch": 0.012299582692730068, + "grad_norm": 6.4309916496276855, + "learning_rate": 6.092206366630077e-08, + "loss": 1.4382, + "mean_token_accuracy": 0.615140974521637, + "num_tokens": 2808034.0, + "step": 112 + }, + { + "epoch": 0.01240940039534373, + "grad_norm": 7.071214199066162, + "learning_rate": 6.147091108671789e-08, + "loss": 1.4226, + "mean_token_accuracy": 0.6158210039138794, + "num_tokens": 2829385.0, + "step": 113 + }, + { + "epoch": 0.01251921809795739, + "grad_norm": 6.987655162811279, + "learning_rate": 6.201975850713501e-08, + "loss": 1.3672, + "mean_token_accuracy": 0.6378641128540039, + "num_tokens": 2852243.0, + "step": 114 + }, + { + "epoch": 0.012629035800571052, + "grad_norm": 6.304754257202148, + "learning_rate": 
6.256860592755214e-08, + "loss": 1.4267, + "mean_token_accuracy": 0.6222026348114014, + "num_tokens": 2877869.0, + "step": 115 + }, + { + "epoch": 0.012738853503184714, + "grad_norm": 6.059346675872803, + "learning_rate": 6.311745334796927e-08, + "loss": 1.3775, + "mean_token_accuracy": 0.6292554140090942, + "num_tokens": 2906675.0, + "step": 116 + }, + { + "epoch": 0.012848671205798374, + "grad_norm": 7.4788055419921875, + "learning_rate": 6.366630076838639e-08, + "loss": 1.4274, + "mean_token_accuracy": 0.6168055534362793, + "num_tokens": 2928377.0, + "step": 117 + }, + { + "epoch": 0.012958488908412035, + "grad_norm": 6.937248229980469, + "learning_rate": 6.421514818880352e-08, + "loss": 1.4559, + "mean_token_accuracy": 0.6089651584625244, + "num_tokens": 2952265.0, + "step": 118 + }, + { + "epoch": 0.013068306611025697, + "grad_norm": 7.101714611053467, + "learning_rate": 6.476399560922063e-08, + "loss": 1.4122, + "mean_token_accuracy": 0.6272968053817749, + "num_tokens": 2975024.0, + "step": 119 + }, + { + "epoch": 0.013178124313639359, + "grad_norm": 6.343530654907227, + "learning_rate": 6.531284302963776e-08, + "loss": 1.4502, + "mean_token_accuracy": 0.6130717992782593, + "num_tokens": 3002243.0, + "step": 120 + }, + { + "epoch": 0.01328794201625302, + "grad_norm": 5.5539703369140625, + "learning_rate": 6.586169045005487e-08, + "loss": 1.2746, + "mean_token_accuracy": 0.6517714262008667, + "num_tokens": 3029303.0, + "step": 121 + }, + { + "epoch": 0.013397759718866682, + "grad_norm": 5.926384925842285, + "learning_rate": 6.6410537870472e-08, + "loss": 1.5242, + "mean_token_accuracy": 0.5982208847999573, + "num_tokens": 3055207.0, + "step": 122 + }, + { + "epoch": 0.013507577421480343, + "grad_norm": 5.944589138031006, + "learning_rate": 6.695938529088913e-08, + "loss": 1.3924, + "mean_token_accuracy": 0.6278924942016602, + "num_tokens": 3080154.0, + "step": 123 + }, + { + "epoch": 0.013617395124094003, + "grad_norm": 6.1666154861450195, + "learning_rate": 
6.750823271130625e-08, + "loss": 1.3772, + "mean_token_accuracy": 0.6289368867874146, + "num_tokens": 3103871.0, + "step": 124 + }, + { + "epoch": 0.013727212826707665, + "grad_norm": 5.565657138824463, + "learning_rate": 6.805708013172338e-08, + "loss": 1.4082, + "mean_token_accuracy": 0.6168312430381775, + "num_tokens": 3132318.0, + "step": 125 + }, + { + "epoch": 0.013837030529321327, + "grad_norm": 5.270543098449707, + "learning_rate": 6.86059275521405e-08, + "loss": 1.3586, + "mean_token_accuracy": 0.6298952102661133, + "num_tokens": 3159386.0, + "step": 126 + }, + { + "epoch": 0.013946848231934988, + "grad_norm": 5.890013217926025, + "learning_rate": 6.915477497255763e-08, + "loss": 1.3163, + "mean_token_accuracy": 0.6465879678726196, + "num_tokens": 3181141.0, + "step": 127 + }, + { + "epoch": 0.01405666593454865, + "grad_norm": 5.76528263092041, + "learning_rate": 6.970362239297475e-08, + "loss": 1.4306, + "mean_token_accuracy": 0.6196578741073608, + "num_tokens": 3203880.0, + "step": 128 + }, + { + "epoch": 0.014166483637162311, + "grad_norm": 5.6659088134765625, + "learning_rate": 7.025246981339188e-08, + "loss": 1.4834, + "mean_token_accuracy": 0.6054244637489319, + "num_tokens": 3227612.0, + "step": 129 + }, + { + "epoch": 0.014276301339775971, + "grad_norm": 5.322044372558594, + "learning_rate": 7.0801317233809e-08, + "loss": 1.3392, + "mean_token_accuracy": 0.6467907428741455, + "num_tokens": 3254016.0, + "step": 130 + }, + { + "epoch": 0.014386119042389633, + "grad_norm": 4.834613800048828, + "learning_rate": 7.135016465422612e-08, + "loss": 1.4007, + "mean_token_accuracy": 0.6323322653770447, + "num_tokens": 3283824.0, + "step": 131 + }, + { + "epoch": 0.014495936745003294, + "grad_norm": 5.275305271148682, + "learning_rate": 7.189901207464325e-08, + "loss": 1.465, + "mean_token_accuracy": 0.6098832488059998, + "num_tokens": 3310412.0, + "step": 132 + }, + { + "epoch": 0.014605754447616956, + "grad_norm": 5.35302209854126, + "learning_rate": 
7.244785949506036e-08, + "loss": 1.4444, + "mean_token_accuracy": 0.5994284749031067, + "num_tokens": 3334376.0, + "step": 133 + }, + { + "epoch": 0.014715572150230618, + "grad_norm": 4.54243278503418, + "learning_rate": 7.299670691547749e-08, + "loss": 1.4287, + "mean_token_accuracy": 0.6165257096290588, + "num_tokens": 3368104.0, + "step": 134 + }, + { + "epoch": 0.01482538985284428, + "grad_norm": 5.151076793670654, + "learning_rate": 7.354555433589462e-08, + "loss": 1.3732, + "mean_token_accuracy": 0.6255090236663818, + "num_tokens": 3394743.0, + "step": 135 + }, + { + "epoch": 0.014935207555457939, + "grad_norm": 5.712588310241699, + "learning_rate": 7.409440175631174e-08, + "loss": 1.4258, + "mean_token_accuracy": 0.6110680103302002, + "num_tokens": 3417886.0, + "step": 136 + }, + { + "epoch": 0.0150450252580716, + "grad_norm": 5.468960285186768, + "learning_rate": 7.464324917672886e-08, + "loss": 1.4562, + "mean_token_accuracy": 0.6087392568588257, + "num_tokens": 3441494.0, + "step": 137 + }, + { + "epoch": 0.015154842960685262, + "grad_norm": 5.089019298553467, + "learning_rate": 7.519209659714599e-08, + "loss": 1.4164, + "mean_token_accuracy": 0.6104951500892639, + "num_tokens": 3468987.0, + "step": 138 + }, + { + "epoch": 0.015264660663298924, + "grad_norm": 4.545821666717529, + "learning_rate": 7.574094401756312e-08, + "loss": 1.4196, + "mean_token_accuracy": 0.6212930679321289, + "num_tokens": 3499335.0, + "step": 139 + }, + { + "epoch": 0.015374478365912585, + "grad_norm": 5.033543586730957, + "learning_rate": 7.628979143798024e-08, + "loss": 1.2947, + "mean_token_accuracy": 0.6441490054130554, + "num_tokens": 3521998.0, + "step": 140 + }, + { + "epoch": 0.015484296068526247, + "grad_norm": 5.430815696716309, + "learning_rate": 7.683863885839736e-08, + "loss": 1.3038, + "mean_token_accuracy": 0.6433429718017578, + "num_tokens": 3543040.0, + "step": 141 + }, + { + "epoch": 0.015594113771139909, + "grad_norm": 5.5661516189575195, + "learning_rate": 
7.738748627881449e-08, + "loss": 1.3287, + "mean_token_accuracy": 0.6307182312011719, + "num_tokens": 3565958.0, + "step": 142 + }, + { + "epoch": 0.01570393147375357, + "grad_norm": 5.279961109161377, + "learning_rate": 7.793633369923161e-08, + "loss": 1.3643, + "mean_token_accuracy": 0.6397133469581604, + "num_tokens": 3589318.0, + "step": 143 + }, + { + "epoch": 0.015813749176367232, + "grad_norm": 5.072293758392334, + "learning_rate": 7.848518111964874e-08, + "loss": 1.3019, + "mean_token_accuracy": 0.635187566280365, + "num_tokens": 3612825.0, + "step": 144 + }, + { + "epoch": 0.01592356687898089, + "grad_norm": 5.1605377197265625, + "learning_rate": 7.903402854006585e-08, + "loss": 1.4038, + "mean_token_accuracy": 0.6197243928909302, + "num_tokens": 3636491.0, + "step": 145 + }, + { + "epoch": 0.01603338458159455, + "grad_norm": 5.135138988494873, + "learning_rate": 7.958287596048298e-08, + "loss": 1.4117, + "mean_token_accuracy": 0.6116099953651428, + "num_tokens": 3656184.0, + "step": 146 + }, + { + "epoch": 0.016143202284208215, + "grad_norm": 5.331326484680176, + "learning_rate": 8.01317233809001e-08, + "loss": 1.3388, + "mean_token_accuracy": 0.6378070116043091, + "num_tokens": 3677246.0, + "step": 147 + }, + { + "epoch": 0.016253019986821875, + "grad_norm": 4.247134208679199, + "learning_rate": 8.068057080131722e-08, + "loss": 1.359, + "mean_token_accuracy": 0.6255144476890564, + "num_tokens": 3708661.0, + "step": 148 + }, + { + "epoch": 0.016362837689435538, + "grad_norm": 5.301745891571045, + "learning_rate": 8.122941822173437e-08, + "loss": 1.3418, + "mean_token_accuracy": 0.638133704662323, + "num_tokens": 3730872.0, + "step": 149 + }, + { + "epoch": 0.016472655392049198, + "grad_norm": 4.549897193908691, + "learning_rate": 8.177826564215148e-08, + "loss": 1.3653, + "mean_token_accuracy": 0.6332069039344788, + "num_tokens": 3758260.0, + "step": 150 + }, + { + "epoch": 0.01658247309466286, + "grad_norm": 4.58854866027832, + "learning_rate": 
8.232711306256861e-08, + "loss": 1.4308, + "mean_token_accuracy": 0.6072310209274292, + "num_tokens": 3787000.0, + "step": 151 + }, + { + "epoch": 0.01669229079727652, + "grad_norm": 4.850391864776611, + "learning_rate": 8.287596048298572e-08, + "loss": 1.2886, + "mean_token_accuracy": 0.6492581367492676, + "num_tokens": 3811338.0, + "step": 152 + }, + { + "epoch": 0.01680210849989018, + "grad_norm": 4.820602893829346, + "learning_rate": 8.342480790340285e-08, + "loss": 1.3437, + "mean_token_accuracy": 0.6387158632278442, + "num_tokens": 3836161.0, + "step": 153 + }, + { + "epoch": 0.016911926202503844, + "grad_norm": 4.45029878616333, + "learning_rate": 8.397365532381998e-08, + "loss": 1.3549, + "mean_token_accuracy": 0.6306380033493042, + "num_tokens": 3863487.0, + "step": 154 + }, + { + "epoch": 0.017021743905117504, + "grad_norm": 4.1964006423950195, + "learning_rate": 8.45225027442371e-08, + "loss": 1.406, + "mean_token_accuracy": 0.6179303526878357, + "num_tokens": 3894528.0, + "step": 155 + }, + { + "epoch": 0.017131561607731167, + "grad_norm": 4.647528171539307, + "learning_rate": 8.507135016465423e-08, + "loss": 1.2967, + "mean_token_accuracy": 0.6439356803894043, + "num_tokens": 3917211.0, + "step": 156 + }, + { + "epoch": 0.017241379310344827, + "grad_norm": 4.570157051086426, + "learning_rate": 8.562019758507134e-08, + "loss": 1.3906, + "mean_token_accuracy": 0.6256130933761597, + "num_tokens": 3945010.0, + "step": 157 + }, + { + "epoch": 0.017351197012958487, + "grad_norm": 4.843707084655762, + "learning_rate": 8.616904500548847e-08, + "loss": 1.3419, + "mean_token_accuracy": 0.6348270177841187, + "num_tokens": 3966707.0, + "step": 158 + }, + { + "epoch": 0.01746101471557215, + "grad_norm": 4.709655284881592, + "learning_rate": 8.671789242590558e-08, + "loss": 1.3804, + "mean_token_accuracy": 0.6231428980827332, + "num_tokens": 3991687.0, + "step": 159 + }, + { + "epoch": 0.01757083241818581, + "grad_norm": 5.205449104309082, + "learning_rate": 
8.726673984632271e-08, + "loss": 1.4051, + "mean_token_accuracy": 0.6195495128631592, + "num_tokens": 4014894.0, + "step": 160 + }, + { + "epoch": 0.017680650120799474, + "grad_norm": 4.267292499542236, + "learning_rate": 8.781558726673985e-08, + "loss": 1.293, + "mean_token_accuracy": 0.648850679397583, + "num_tokens": 4043529.0, + "step": 161 + }, + { + "epoch": 0.017790467823413134, + "grad_norm": 4.402661323547363, + "learning_rate": 8.836443468715697e-08, + "loss": 1.3854, + "mean_token_accuracy": 0.6233797073364258, + "num_tokens": 4069827.0, + "step": 162 + }, + { + "epoch": 0.017900285526026797, + "grad_norm": 4.655051231384277, + "learning_rate": 8.89132821075741e-08, + "loss": 1.3753, + "mean_token_accuracy": 0.6224860548973083, + "num_tokens": 4094360.0, + "step": 163 + }, + { + "epoch": 0.018010103228640457, + "grad_norm": 4.403567314147949, + "learning_rate": 8.946212952799121e-08, + "loss": 1.3205, + "mean_token_accuracy": 0.6473718285560608, + "num_tokens": 4121234.0, + "step": 164 + }, + { + "epoch": 0.018119920931254117, + "grad_norm": 4.114132404327393, + "learning_rate": 9.001097694840834e-08, + "loss": 1.3218, + "mean_token_accuracy": 0.6399441361427307, + "num_tokens": 4150824.0, + "step": 165 + }, + { + "epoch": 0.01822973863386778, + "grad_norm": 4.104740619659424, + "learning_rate": 9.055982436882546e-08, + "loss": 1.4125, + "mean_token_accuracy": 0.6168966889381409, + "num_tokens": 4180598.0, + "step": 166 + }, + { + "epoch": 0.01833955633648144, + "grad_norm": 4.581058025360107, + "learning_rate": 9.110867178924259e-08, + "loss": 1.4266, + "mean_token_accuracy": 0.6123561263084412, + "num_tokens": 4208548.0, + "step": 167 + }, + { + "epoch": 0.018449374039095103, + "grad_norm": 5.739989757537842, + "learning_rate": 9.165751920965971e-08, + "loss": 1.2811, + "mean_token_accuracy": 0.6390488147735596, + "num_tokens": 4228346.0, + "step": 168 + }, + { + "epoch": 0.018559191741708763, + "grad_norm": 4.765919208526611, + "learning_rate": 
9.220636663007683e-08, + "loss": 1.3874, + "mean_token_accuracy": 0.6118271350860596, + "num_tokens": 4249667.0, + "step": 169 + }, + { + "epoch": 0.018669009444322426, + "grad_norm": 6.014593124389648, + "learning_rate": 9.275521405049396e-08, + "loss": 1.2781, + "mean_token_accuracy": 0.6539883017539978, + "num_tokens": 4265325.0, + "step": 170 + }, + { + "epoch": 0.018778827146936086, + "grad_norm": 5.2555975914001465, + "learning_rate": 9.330406147091107e-08, + "loss": 1.3345, + "mean_token_accuracy": 0.6344895362854004, + "num_tokens": 4284031.0, + "step": 171 + }, + { + "epoch": 0.018888644849549746, + "grad_norm": 4.322195053100586, + "learning_rate": 9.38529088913282e-08, + "loss": 1.3723, + "mean_token_accuracy": 0.6189368367195129, + "num_tokens": 4313088.0, + "step": 172 + }, + { + "epoch": 0.01899846255216341, + "grad_norm": 4.351803302764893, + "learning_rate": 9.440175631174534e-08, + "loss": 1.4079, + "mean_token_accuracy": 0.6153702735900879, + "num_tokens": 4337492.0, + "step": 173 + }, + { + "epoch": 0.01910828025477707, + "grad_norm": 4.0171403884887695, + "learning_rate": 9.495060373216246e-08, + "loss": 1.2387, + "mean_token_accuracy": 0.651781439781189, + "num_tokens": 4362488.0, + "step": 174 + }, + { + "epoch": 0.019218097957390733, + "grad_norm": 4.824237823486328, + "learning_rate": 9.549945115257959e-08, + "loss": 1.3182, + "mean_token_accuracy": 0.6380966901779175, + "num_tokens": 4382418.0, + "step": 175 + }, + { + "epoch": 0.019327915660004392, + "grad_norm": 3.979499340057373, + "learning_rate": 9.60482985729967e-08, + "loss": 1.336, + "mean_token_accuracy": 0.6369630694389343, + "num_tokens": 4411629.0, + "step": 176 + }, + { + "epoch": 0.019437733362618052, + "grad_norm": 4.414623737335205, + "learning_rate": 9.659714599341383e-08, + "loss": 1.3312, + "mean_token_accuracy": 0.6367157697677612, + "num_tokens": 4434854.0, + "step": 177 + }, + { + "epoch": 0.019547551065231716, + "grad_norm": 3.8410346508026123, + "learning_rate": 
9.714599341383095e-08, + "loss": 1.359, + "mean_token_accuracy": 0.6267642378807068, + "num_tokens": 4463562.0, + "step": 178 + }, + { + "epoch": 0.019657368767845376, + "grad_norm": 3.7551653385162354, + "learning_rate": 9.769484083424807e-08, + "loss": 1.2945, + "mean_token_accuracy": 0.6473474502563477, + "num_tokens": 4491139.0, + "step": 179 + }, + { + "epoch": 0.01976718647045904, + "grad_norm": 5.008821487426758, + "learning_rate": 9.82436882546652e-08, + "loss": 1.3598, + "mean_token_accuracy": 0.6244102120399475, + "num_tokens": 4513585.0, + "step": 180 + }, + { + "epoch": 0.0198770041730727, + "grad_norm": 3.6220271587371826, + "learning_rate": 9.879253567508232e-08, + "loss": 1.2966, + "mean_token_accuracy": 0.6487101316452026, + "num_tokens": 4544250.0, + "step": 181 + }, + { + "epoch": 0.019986821875686362, + "grad_norm": 4.249271869659424, + "learning_rate": 9.934138309549945e-08, + "loss": 1.2861, + "mean_token_accuracy": 0.6438025236129761, + "num_tokens": 4575190.0, + "step": 182 + }, + { + "epoch": 0.020096639578300022, + "grad_norm": 4.034646511077881, + "learning_rate": 9.989023051591656e-08, + "loss": 1.3173, + "mean_token_accuracy": 0.6344480514526367, + "num_tokens": 4604769.0, + "step": 183 + }, + { + "epoch": 0.020206457280913682, + "grad_norm": 3.901573419570923, + "learning_rate": 1.0043907793633369e-07, + "loss": 1.3252, + "mean_token_accuracy": 0.6349245309829712, + "num_tokens": 4635566.0, + "step": 184 + }, + { + "epoch": 0.020316274983527345, + "grad_norm": 4.439236164093018, + "learning_rate": 1.0098792535675082e-07, + "loss": 1.2951, + "mean_token_accuracy": 0.6462608575820923, + "num_tokens": 4657817.0, + "step": 185 + }, + { + "epoch": 0.020426092686141005, + "grad_norm": 5.641676425933838, + "learning_rate": 1.0153677277716795e-07, + "loss": 1.2949, + "mean_token_accuracy": 0.6456860303878784, + "num_tokens": 4674967.0, + "step": 186 + }, + { + "epoch": 0.02053591038875467, + "grad_norm": 4.159131050109863, + "learning_rate": 
1.0208562019758508e-07, + "loss": 1.2906, + "mean_token_accuracy": 0.6419962644577026, + "num_tokens": 4697999.0, + "step": 187 + }, + { + "epoch": 0.020645728091368328, + "grad_norm": 4.485246658325195, + "learning_rate": 1.0263446761800219e-07, + "loss": 1.3141, + "mean_token_accuracy": 0.639224112033844, + "num_tokens": 4722451.0, + "step": 188 + }, + { + "epoch": 0.02075554579398199, + "grad_norm": 4.031458377838135, + "learning_rate": 1.0318331503841932e-07, + "loss": 1.3074, + "mean_token_accuracy": 0.6422643661499023, + "num_tokens": 4748648.0, + "step": 189 + }, + { + "epoch": 0.02086536349659565, + "grad_norm": 4.858506679534912, + "learning_rate": 1.0373216245883643e-07, + "loss": 1.4788, + "mean_token_accuracy": 0.6143099069595337, + "num_tokens": 4770005.0, + "step": 190 + }, + { + "epoch": 0.02097518119920931, + "grad_norm": 3.669945240020752, + "learning_rate": 1.0428100987925356e-07, + "loss": 1.3238, + "mean_token_accuracy": 0.6325024366378784, + "num_tokens": 4799750.0, + "step": 191 + }, + { + "epoch": 0.021084998901822975, + "grad_norm": 4.455148696899414, + "learning_rate": 1.0482985729967068e-07, + "loss": 1.3316, + "mean_token_accuracy": 0.632487416267395, + "num_tokens": 4828060.0, + "step": 192 + }, + { + "epoch": 0.021194816604436634, + "grad_norm": 3.6638827323913574, + "learning_rate": 1.0537870472008781e-07, + "loss": 1.3458, + "mean_token_accuracy": 0.6317083239555359, + "num_tokens": 4860372.0, + "step": 193 + }, + { + "epoch": 0.021304634307050298, + "grad_norm": 4.96550989151001, + "learning_rate": 1.0592755214050494e-07, + "loss": 1.4161, + "mean_token_accuracy": 0.6144741773605347, + "num_tokens": 4883782.0, + "step": 194 + }, + { + "epoch": 0.021414452009663958, + "grad_norm": 4.566275119781494, + "learning_rate": 1.0647639956092205e-07, + "loss": 1.2824, + "mean_token_accuracy": 0.6450486183166504, + "num_tokens": 4905513.0, + "step": 195 + }, + { + "epoch": 0.021524269712277617, + "grad_norm": 5.260120391845703, + 
"learning_rate": 1.0702524698133918e-07, + "loss": 1.3099, + "mean_token_accuracy": 0.644229531288147, + "num_tokens": 4928482.0, + "step": 196 + }, + { + "epoch": 0.02163408741489128, + "grad_norm": 4.85931396484375, + "learning_rate": 1.0757409440175631e-07, + "loss": 1.2872, + "mean_token_accuracy": 0.6435239315032959, + "num_tokens": 4955790.0, + "step": 197 + }, + { + "epoch": 0.02174390511750494, + "grad_norm": 5.612387657165527, + "learning_rate": 1.0812294182217344e-07, + "loss": 1.2921, + "mean_token_accuracy": 0.6425173282623291, + "num_tokens": 4980146.0, + "step": 198 + }, + { + "epoch": 0.021853722820118604, + "grad_norm": 6.243117332458496, + "learning_rate": 1.0867178924259056e-07, + "loss": 1.2231, + "mean_token_accuracy": 0.6532638072967529, + "num_tokens": 4996580.0, + "step": 199 + }, + { + "epoch": 0.021963540522732264, + "grad_norm": 5.184607982635498, + "learning_rate": 1.0922063666300768e-07, + "loss": 1.3396, + "mean_token_accuracy": 0.6276638507843018, + "num_tokens": 5023799.0, + "step": 200 + }, + { + "epoch": 0.022073358225345927, + "grad_norm": 5.152313709259033, + "learning_rate": 1.0976948408342481e-07, + "loss": 1.3298, + "mean_token_accuracy": 0.6260155439376831, + "num_tokens": 5050285.0, + "step": 201 + }, + { + "epoch": 0.022183175927959587, + "grad_norm": 4.887985706329346, + "learning_rate": 1.1031833150384192e-07, + "loss": 1.3733, + "mean_token_accuracy": 0.6182245016098022, + "num_tokens": 5079688.0, + "step": 202 + }, + { + "epoch": 0.022292993630573247, + "grad_norm": 4.503468036651611, + "learning_rate": 1.1086717892425905e-07, + "loss": 1.1462, + "mean_token_accuracy": 0.6697584390640259, + "num_tokens": 5103964.0, + "step": 203 + }, + { + "epoch": 0.02240281133318691, + "grad_norm": 6.045025825500488, + "learning_rate": 1.1141602634467617e-07, + "loss": 1.2849, + "mean_token_accuracy": 0.6420315504074097, + "num_tokens": 5124358.0, + "step": 204 + }, + { + "epoch": 0.02251262903580057, + "grad_norm": 4.984360218048096, 
+ "learning_rate": 1.119648737650933e-07, + "loss": 1.3424, + "mean_token_accuracy": 0.6324026584625244, + "num_tokens": 5151651.0, + "step": 205 + }, + { + "epoch": 0.022622446738414233, + "grad_norm": 5.216551303863525, + "learning_rate": 1.1251372118551042e-07, + "loss": 1.3107, + "mean_token_accuracy": 0.6350452899932861, + "num_tokens": 5178583.0, + "step": 206 + }, + { + "epoch": 0.022732264441027893, + "grad_norm": 4.531165599822998, + "learning_rate": 1.1306256860592754e-07, + "loss": 1.3263, + "mean_token_accuracy": 0.6360670924186707, + "num_tokens": 5204539.0, + "step": 207 + }, + { + "epoch": 0.022842082143641557, + "grad_norm": 4.648014545440674, + "learning_rate": 1.1361141602634467e-07, + "loss": 1.3035, + "mean_token_accuracy": 0.6428747177124023, + "num_tokens": 5231901.0, + "step": 208 + }, + { + "epoch": 0.022951899846255217, + "grad_norm": 5.354877471923828, + "learning_rate": 1.141602634467618e-07, + "loss": 1.3154, + "mean_token_accuracy": 0.6374552249908447, + "num_tokens": 5260149.0, + "step": 209 + }, + { + "epoch": 0.023061717548868876, + "grad_norm": 4.2014851570129395, + "learning_rate": 1.1470911086717892e-07, + "loss": 1.3653, + "mean_token_accuracy": 0.626345694065094, + "num_tokens": 5286562.0, + "step": 210 + }, + { + "epoch": 0.02317153525148254, + "grad_norm": 4.432364463806152, + "learning_rate": 1.1525795828759604e-07, + "loss": 1.2379, + "mean_token_accuracy": 0.6536908149719238, + "num_tokens": 5312772.0, + "step": 211 + }, + { + "epoch": 0.0232813529540962, + "grad_norm": 5.050896644592285, + "learning_rate": 1.1580680570801317e-07, + "loss": 1.3326, + "mean_token_accuracy": 0.6285216808319092, + "num_tokens": 5341731.0, + "step": 212 + }, + { + "epoch": 0.023391170656709863, + "grad_norm": 5.040684700012207, + "learning_rate": 1.163556531284303e-07, + "loss": 1.3585, + "mean_token_accuracy": 0.637597918510437, + "num_tokens": 5369718.0, + "step": 213 + }, + { + "epoch": 0.023500988359323523, + "grad_norm": 7.908745765686035, 
+ "learning_rate": 1.1690450054884741e-07, + "loss": 1.2441, + "mean_token_accuracy": 0.6547060608863831, + "num_tokens": 5391922.0, + "step": 214 + }, + { + "epoch": 0.023610806061937183, + "grad_norm": 4.533623218536377, + "learning_rate": 1.1745334796926454e-07, + "loss": 1.2849, + "mean_token_accuracy": 0.6441908478736877, + "num_tokens": 5422592.0, + "step": 215 + }, + { + "epoch": 0.023720623764550846, + "grad_norm": 7.344688892364502, + "learning_rate": 1.1800219538968166e-07, + "loss": 1.2332, + "mean_token_accuracy": 0.6533765196800232, + "num_tokens": 5442435.0, + "step": 216 + }, + { + "epoch": 0.023830441467164506, + "grad_norm": 5.254312515258789, + "learning_rate": 1.1855104281009878e-07, + "loss": 1.3506, + "mean_token_accuracy": 0.6317564845085144, + "num_tokens": 5474288.0, + "step": 217 + }, + { + "epoch": 0.02394025916977817, + "grad_norm": 3.9236907958984375, + "learning_rate": 1.1909989023051591e-07, + "loss": 1.3036, + "mean_token_accuracy": 0.6436545848846436, + "num_tokens": 5503361.0, + "step": 218 + }, + { + "epoch": 0.02405007687239183, + "grad_norm": 5.243902683258057, + "learning_rate": 1.1964873765093303e-07, + "loss": 1.2868, + "mean_token_accuracy": 0.6432110071182251, + "num_tokens": 5531337.0, + "step": 219 + }, + { + "epoch": 0.024159894575005492, + "grad_norm": 6.560851097106934, + "learning_rate": 1.2019758507135017e-07, + "loss": 1.3171, + "mean_token_accuracy": 0.6292932033538818, + "num_tokens": 5551376.0, + "step": 220 + }, + { + "epoch": 0.024269712277619152, + "grad_norm": 5.078478813171387, + "learning_rate": 1.2074643249176729e-07, + "loss": 1.2486, + "mean_token_accuracy": 0.6527093648910522, + "num_tokens": 5582755.0, + "step": 221 + }, + { + "epoch": 0.024379529980232812, + "grad_norm": 4.881256103515625, + "learning_rate": 1.212952799121844e-07, + "loss": 1.2921, + "mean_token_accuracy": 0.6494003534317017, + "num_tokens": 5607579.0, + "step": 222 + }, + { + "epoch": 0.024489347682846475, + "grad_norm": 
7.327303886413574, + "learning_rate": 1.2184412733260154e-07, + "loss": 1.2442, + "mean_token_accuracy": 0.6590165495872498, + "num_tokens": 5626078.0, + "step": 223 + }, + { + "epoch": 0.024599165385460135, + "grad_norm": 6.076222896575928, + "learning_rate": 1.2239297475301866e-07, + "loss": 1.1607, + "mean_token_accuracy": 0.6659821271896362, + "num_tokens": 5647938.0, + "step": 224 + }, + { + "epoch": 0.0247089830880738, + "grad_norm": 7.580443859100342, + "learning_rate": 1.2294182217343577e-07, + "loss": 1.2501, + "mean_token_accuracy": 0.641894519329071, + "num_tokens": 5667038.0, + "step": 225 + }, + { + "epoch": 0.02481880079068746, + "grad_norm": 6.209723949432373, + "learning_rate": 1.2349066959385291e-07, + "loss": 1.2344, + "mean_token_accuracy": 0.6526659727096558, + "num_tokens": 5694863.0, + "step": 226 + }, + { + "epoch": 0.024928618493301122, + "grad_norm": 5.758725166320801, + "learning_rate": 1.2403951701427003e-07, + "loss": 1.3173, + "mean_token_accuracy": 0.6205673217773438, + "num_tokens": 5723393.0, + "step": 227 + }, + { + "epoch": 0.02503843619591478, + "grad_norm": 6.875593185424805, + "learning_rate": 1.2458836443468714e-07, + "loss": 1.2176, + "mean_token_accuracy": 0.6588950753211975, + "num_tokens": 5745091.0, + "step": 228 + }, + { + "epoch": 0.02514825389852844, + "grad_norm": 5.787696838378906, + "learning_rate": 1.2513721185510429e-07, + "loss": 1.176, + "mean_token_accuracy": 0.6700103282928467, + "num_tokens": 5771592.0, + "step": 229 + }, + { + "epoch": 0.025258071601142105, + "grad_norm": 5.4587860107421875, + "learning_rate": 1.256860592755214e-07, + "loss": 1.219, + "mean_token_accuracy": 0.6618016958236694, + "num_tokens": 5800246.0, + "step": 230 + }, + { + "epoch": 0.025367889303755765, + "grad_norm": 4.656665325164795, + "learning_rate": 1.2623490669593854e-07, + "loss": 1.2598, + "mean_token_accuracy": 0.6502858400344849, + "num_tokens": 5829168.0, + "step": 231 + }, + { + "epoch": 0.025477707006369428, + "grad_norm": 
6.502309322357178, + "learning_rate": 1.2678375411635563e-07, + "loss": 1.3394, + "mean_token_accuracy": 0.6409846544265747, + "num_tokens": 5850995.0, + "step": 232 + }, + { + "epoch": 0.025587524708983088, + "grad_norm": 5.779999256134033, + "learning_rate": 1.2733260153677277e-07, + "loss": 1.3098, + "mean_token_accuracy": 0.6289563775062561, + "num_tokens": 5872951.0, + "step": 233 + }, + { + "epoch": 0.025697342411596748, + "grad_norm": 5.482884407043457, + "learning_rate": 1.278814489571899e-07, + "loss": 1.2966, + "mean_token_accuracy": 0.6360750794410706, + "num_tokens": 5895948.0, + "step": 234 + }, + { + "epoch": 0.02580716011421041, + "grad_norm": 4.6891326904296875, + "learning_rate": 1.2843029637760703e-07, + "loss": 1.1982, + "mean_token_accuracy": 0.6735863089561462, + "num_tokens": 5925269.0, + "step": 235 + }, + { + "epoch": 0.02591697781682407, + "grad_norm": 5.302430152893066, + "learning_rate": 1.2897914379802412e-07, + "loss": 1.242, + "mean_token_accuracy": 0.6489583253860474, + "num_tokens": 5950977.0, + "step": 236 + }, + { + "epoch": 0.026026795519437734, + "grad_norm": 4.85866117477417, + "learning_rate": 1.2952799121844126e-07, + "loss": 1.1494, + "mean_token_accuracy": 0.6769401431083679, + "num_tokens": 5976613.0, + "step": 237 + }, + { + "epoch": 0.026136613222051394, + "grad_norm": 5.875120639801025, + "learning_rate": 1.300768386388584e-07, + "loss": 1.2027, + "mean_token_accuracy": 0.6626912355422974, + "num_tokens": 6002204.0, + "step": 238 + }, + { + "epoch": 0.026246430924665057, + "grad_norm": 6.447970867156982, + "learning_rate": 1.3062568605927552e-07, + "loss": 1.2827, + "mean_token_accuracy": 0.6419177055358887, + "num_tokens": 6028084.0, + "step": 239 + }, + { + "epoch": 0.026356248627278717, + "grad_norm": 5.800168514251709, + "learning_rate": 1.3117453347969266e-07, + "loss": 1.2578, + "mean_token_accuracy": 0.6439934968948364, + "num_tokens": 6049610.0, + "step": 240 + }, + { + "epoch": 0.026466066329892377, + 
"grad_norm": 7.570160388946533, + "learning_rate": 1.3172338090010975e-07, + "loss": 1.2247, + "mean_token_accuracy": 0.6580437421798706, + "num_tokens": 6070716.0, + "step": 241 + }, + { + "epoch": 0.02657588403250604, + "grad_norm": 5.538049697875977, + "learning_rate": 1.322722283205269e-07, + "loss": 1.3243, + "mean_token_accuracy": 0.628494143486023, + "num_tokens": 6096469.0, + "step": 242 + }, + { + "epoch": 0.0266857017351197, + "grad_norm": 4.717562198638916, + "learning_rate": 1.32821075740944e-07, + "loss": 1.2531, + "mean_token_accuracy": 0.6544532775878906, + "num_tokens": 6125245.0, + "step": 243 + }, + { + "epoch": 0.026795519437733364, + "grad_norm": 8.334376335144043, + "learning_rate": 1.3336992316136115e-07, + "loss": 1.3844, + "mean_token_accuracy": 0.6140117049217224, + "num_tokens": 6151579.0, + "step": 244 + }, + { + "epoch": 0.026905337140347024, + "grad_norm": 6.346831798553467, + "learning_rate": 1.3391877058177826e-07, + "loss": 1.2322, + "mean_token_accuracy": 0.6748461723327637, + "num_tokens": 6174868.0, + "step": 245 + }, + { + "epoch": 0.027015154842960687, + "grad_norm": 5.855744361877441, + "learning_rate": 1.3446761800219538e-07, + "loss": 1.3533, + "mean_token_accuracy": 0.628860354423523, + "num_tokens": 6199265.0, + "step": 246 + }, + { + "epoch": 0.027124972545574347, + "grad_norm": 5.873830795288086, + "learning_rate": 1.350164654226125e-07, + "loss": 1.2312, + "mean_token_accuracy": 0.6500726342201233, + "num_tokens": 6224144.0, + "step": 247 + }, + { + "epoch": 0.027234790248188007, + "grad_norm": 8.996659278869629, + "learning_rate": 1.3556531284302963e-07, + "loss": 1.1981, + "mean_token_accuracy": 0.6607932448387146, + "num_tokens": 6244930.0, + "step": 248 + }, + { + "epoch": 0.02734460795080167, + "grad_norm": 6.71850061416626, + "learning_rate": 1.3611416026344675e-07, + "loss": 1.2148, + "mean_token_accuracy": 0.6568950414657593, + "num_tokens": 6265642.0, + "step": 249 + }, + { + "epoch": 0.02745442565341533, + 
"grad_norm": 5.2848944664001465, + "learning_rate": 1.366630076838639e-07, + "loss": 1.302, + "mean_token_accuracy": 0.6407535076141357, + "num_tokens": 6293869.0, + "step": 250 + }, + { + "epoch": 0.027564243356028993, + "grad_norm": 5.45893669128418, + "learning_rate": 1.37211855104281e-07, + "loss": 1.2018, + "mean_token_accuracy": 0.6558647155761719, + "num_tokens": 6317574.0, + "step": 251 + }, + { + "epoch": 0.027674061058642653, + "grad_norm": 5.341687202453613, + "learning_rate": 1.3776070252469812e-07, + "loss": 1.223, + "mean_token_accuracy": 0.6485062837600708, + "num_tokens": 6340755.0, + "step": 252 + }, + { + "epoch": 0.027783878761256313, + "grad_norm": 5.928886413574219, + "learning_rate": 1.3830954994511526e-07, + "loss": 1.2781, + "mean_token_accuracy": 0.6388115882873535, + "num_tokens": 6361534.0, + "step": 253 + }, + { + "epoch": 0.027893696463869976, + "grad_norm": 7.206986904144287, + "learning_rate": 1.3885839736553238e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6791270971298218, + "num_tokens": 6382613.0, + "step": 254 + }, + { + "epoch": 0.028003514166483636, + "grad_norm": 5.43381404876709, + "learning_rate": 1.394072447859495e-07, + "loss": 1.2174, + "mean_token_accuracy": 0.6546769142150879, + "num_tokens": 6408858.0, + "step": 255 + }, + { + "epoch": 0.0281133318690973, + "grad_norm": 6.722110271453857, + "learning_rate": 1.399560922063666e-07, + "loss": 1.1951, + "mean_token_accuracy": 0.663407564163208, + "num_tokens": 6431513.0, + "step": 256 + }, + { + "epoch": 0.02822314957171096, + "grad_norm": 5.124061584472656, + "learning_rate": 1.4050493962678375e-07, + "loss": 1.3004, + "mean_token_accuracy": 0.6368230581283569, + "num_tokens": 6457654.0, + "step": 257 + }, + { + "epoch": 0.028332967274324623, + "grad_norm": 4.903400897979736, + "learning_rate": 1.4105378704720087e-07, + "loss": 1.22, + "mean_token_accuracy": 0.663671612739563, + "num_tokens": 6481937.0, + "step": 258 + }, + { + "epoch": 0.028442784976938282, + 
"grad_norm": 6.537776947021484, + "learning_rate": 1.41602634467618e-07, + "loss": 1.2654, + "mean_token_accuracy": 0.6628532409667969, + "num_tokens": 6503551.0, + "step": 259 + }, + { + "epoch": 0.028552602679551942, + "grad_norm": 6.090762138366699, + "learning_rate": 1.421514818880351e-07, + "loss": 1.2726, + "mean_token_accuracy": 0.6458427906036377, + "num_tokens": 6524125.0, + "step": 260 + }, + { + "epoch": 0.028662420382165606, + "grad_norm": 4.408134937286377, + "learning_rate": 1.4270032930845224e-07, + "loss": 1.2109, + "mean_token_accuracy": 0.6582942008972168, + "num_tokens": 6554473.0, + "step": 261 + }, + { + "epoch": 0.028772238084779266, + "grad_norm": 5.400392532348633, + "learning_rate": 1.4324917672886938e-07, + "loss": 1.2809, + "mean_token_accuracy": 0.6386619806289673, + "num_tokens": 6579601.0, + "step": 262 + }, + { + "epoch": 0.02888205578739293, + "grad_norm": 5.390255928039551, + "learning_rate": 1.437980241492865e-07, + "loss": 1.1891, + "mean_token_accuracy": 0.6638251543045044, + "num_tokens": 6604706.0, + "step": 263 + }, + { + "epoch": 0.02899187349000659, + "grad_norm": 5.172589302062988, + "learning_rate": 1.4434687156970364e-07, + "loss": 1.2709, + "mean_token_accuracy": 0.6414644718170166, + "num_tokens": 6629926.0, + "step": 264 + }, + { + "epoch": 0.029101691192620252, + "grad_norm": 5.975812911987305, + "learning_rate": 1.4489571899012073e-07, + "loss": 1.3279, + "mean_token_accuracy": 0.63127201795578, + "num_tokens": 6650895.0, + "step": 265 + }, + { + "epoch": 0.029211508895233912, + "grad_norm": 4.736063003540039, + "learning_rate": 1.4544456641053787e-07, + "loss": 1.1859, + "mean_token_accuracy": 0.6675900220870972, + "num_tokens": 6677187.0, + "step": 266 + }, + { + "epoch": 0.029321326597847572, + "grad_norm": 5.616148948669434, + "learning_rate": 1.4599341383095498e-07, + "loss": 1.3273, + "mean_token_accuracy": 0.6390213370323181, + "num_tokens": 6702597.0, + "step": 267 + }, + { + "epoch": 0.029431144300461235, + 
"grad_norm": 5.003540515899658, + "learning_rate": 1.4654226125137212e-07, + "loss": 1.2503, + "mean_token_accuracy": 0.6547166705131531, + "num_tokens": 6728880.0, + "step": 268 + }, + { + "epoch": 0.029540962003074895, + "grad_norm": 6.331881046295166, + "learning_rate": 1.4709110867178924e-07, + "loss": 1.2327, + "mean_token_accuracy": 0.653976559638977, + "num_tokens": 6755174.0, + "step": 269 + }, + { + "epoch": 0.02965077970568856, + "grad_norm": 4.852304458618164, + "learning_rate": 1.4763995609220636e-07, + "loss": 1.2677, + "mean_token_accuracy": 0.6473736763000488, + "num_tokens": 6785392.0, + "step": 270 + }, + { + "epoch": 0.029760597408302218, + "grad_norm": 6.781867980957031, + "learning_rate": 1.4818880351262347e-07, + "loss": 1.293, + "mean_token_accuracy": 0.6438112258911133, + "num_tokens": 6811249.0, + "step": 271 + }, + { + "epoch": 0.029870415110915878, + "grad_norm": 4.710999011993408, + "learning_rate": 1.487376509330406e-07, + "loss": 1.1746, + "mean_token_accuracy": 0.6723989844322205, + "num_tokens": 6835471.0, + "step": 272 + }, + { + "epoch": 0.02998023281352954, + "grad_norm": 6.787010669708252, + "learning_rate": 1.4928649835345773e-07, + "loss": 1.3145, + "mean_token_accuracy": 0.6522566080093384, + "num_tokens": 6859695.0, + "step": 273 + }, + { + "epoch": 0.0300900505161432, + "grad_norm": 4.490139007568359, + "learning_rate": 1.4983534577387484e-07, + "loss": 1.1889, + "mean_token_accuracy": 0.666710615158081, + "num_tokens": 6886432.0, + "step": 274 + }, + { + "epoch": 0.030199868218756865, + "grad_norm": 4.994210720062256, + "learning_rate": 1.5038419319429198e-07, + "loss": 1.2165, + "mean_token_accuracy": 0.6470868587493896, + "num_tokens": 6912143.0, + "step": 275 + }, + { + "epoch": 0.030309685921370524, + "grad_norm": 4.809266567230225, + "learning_rate": 1.509330406147091e-07, + "loss": 1.113, + "mean_token_accuracy": 0.6764994859695435, + "num_tokens": 6935639.0, + "step": 276 + }, + { + "epoch": 0.030419503623984188, + 
"grad_norm": 5.502148151397705, + "learning_rate": 1.5148188803512624e-07, + "loss": 1.266, + "mean_token_accuracy": 0.6444622874259949, + "num_tokens": 6960650.0, + "step": 277 + }, + { + "epoch": 0.030529321326597848, + "grad_norm": 4.67022180557251, + "learning_rate": 1.5203073545554336e-07, + "loss": 1.2511, + "mean_token_accuracy": 0.6503620743751526, + "num_tokens": 6986882.0, + "step": 278 + }, + { + "epoch": 0.030639139029211507, + "grad_norm": 5.976946830749512, + "learning_rate": 1.5257958287596047e-07, + "loss": 1.1974, + "mean_token_accuracy": 0.6616934537887573, + "num_tokens": 7010534.0, + "step": 279 + }, + { + "epoch": 0.03074895673182517, + "grad_norm": 5.81409215927124, + "learning_rate": 1.531284302963776e-07, + "loss": 1.1972, + "mean_token_accuracy": 0.6654359102249146, + "num_tokens": 7034220.0, + "step": 280 + }, + { + "epoch": 0.03085877443443883, + "grad_norm": 4.712026119232178, + "learning_rate": 1.5367727771679473e-07, + "loss": 1.3194, + "mean_token_accuracy": 0.6304787397384644, + "num_tokens": 7065135.0, + "step": 281 + }, + { + "epoch": 0.030968592137052494, + "grad_norm": 5.029903888702393, + "learning_rate": 1.5422612513721184e-07, + "loss": 1.2642, + "mean_token_accuracy": 0.6474486589431763, + "num_tokens": 7094768.0, + "step": 282 + }, + { + "epoch": 0.031078409839666154, + "grad_norm": 4.394229888916016, + "learning_rate": 1.5477497255762899e-07, + "loss": 1.2329, + "mean_token_accuracy": 0.6520168781280518, + "num_tokens": 7122168.0, + "step": 283 + }, + { + "epoch": 0.031188227542279817, + "grad_norm": 4.283806800842285, + "learning_rate": 1.5532381997804607e-07, + "loss": 1.2603, + "mean_token_accuracy": 0.6470001935958862, + "num_tokens": 7148740.0, + "step": 284 + }, + { + "epoch": 0.03129804524489348, + "grad_norm": 5.7851643562316895, + "learning_rate": 1.5587266739846322e-07, + "loss": 1.3106, + "mean_token_accuracy": 0.6454190015792847, + "num_tokens": 7172222.0, + "step": 285 + }, + { + "epoch": 0.03140786294750714, + 
"grad_norm": 5.779552459716797, + "learning_rate": 1.5642151481888036e-07, + "loss": 1.135, + "mean_token_accuracy": 0.6734734177589417, + "num_tokens": 7192055.0, + "step": 286 + }, + { + "epoch": 0.0315176806501208, + "grad_norm": 5.860560894012451, + "learning_rate": 1.5697036223929747e-07, + "loss": 1.1896, + "mean_token_accuracy": 0.6647806167602539, + "num_tokens": 7212513.0, + "step": 287 + }, + { + "epoch": 0.031627498352734464, + "grad_norm": 4.72133731842041, + "learning_rate": 1.5751920965971461e-07, + "loss": 1.246, + "mean_token_accuracy": 0.6520521640777588, + "num_tokens": 7245568.0, + "step": 288 + }, + { + "epoch": 0.03173731605534812, + "grad_norm": 5.204545497894287, + "learning_rate": 1.580680570801317e-07, + "loss": 1.1919, + "mean_token_accuracy": 0.6490986347198486, + "num_tokens": 7268983.0, + "step": 289 + }, + { + "epoch": 0.03184713375796178, + "grad_norm": 5.912210464477539, + "learning_rate": 1.5861690450054885e-07, + "loss": 1.1859, + "mean_token_accuracy": 0.662480354309082, + "num_tokens": 7298221.0, + "step": 290 + }, + { + "epoch": 0.03195695146057544, + "grad_norm": 7.519872665405273, + "learning_rate": 1.5916575192096596e-07, + "loss": 1.2753, + "mean_token_accuracy": 0.6485783457756042, + "num_tokens": 7321839.0, + "step": 291 + }, + { + "epoch": 0.0320667691631891, + "grad_norm": 6.516519069671631, + "learning_rate": 1.597145993413831e-07, + "loss": 1.3355, + "mean_token_accuracy": 0.6270827054977417, + "num_tokens": 7354341.0, + "step": 292 + }, + { + "epoch": 0.03217658686580277, + "grad_norm": 5.34217643737793, + "learning_rate": 1.602634467618002e-07, + "loss": 1.2434, + "mean_token_accuracy": 0.6561932563781738, + "num_tokens": 7380163.0, + "step": 293 + }, + { + "epoch": 0.03228640456841643, + "grad_norm": 5.140142917633057, + "learning_rate": 1.6081229418221733e-07, + "loss": 1.2655, + "mean_token_accuracy": 0.6441688537597656, + "num_tokens": 7406133.0, + "step": 294 + }, + { + "epoch": 0.03239622227103009, + 
"grad_norm": 4.774118900299072, + "learning_rate": 1.6136114160263445e-07, + "loss": 1.1592, + "mean_token_accuracy": 0.6640952229499817, + "num_tokens": 7432260.0, + "step": 295 + }, + { + "epoch": 0.03250603997364375, + "grad_norm": 4.737305641174316, + "learning_rate": 1.619099890230516e-07, + "loss": 1.1657, + "mean_token_accuracy": 0.6616276502609253, + "num_tokens": 7460803.0, + "step": 296 + }, + { + "epoch": 0.03261585767625741, + "grad_norm": 6.1501946449279785, + "learning_rate": 1.6245883644346873e-07, + "loss": 1.2729, + "mean_token_accuracy": 0.6355389356613159, + "num_tokens": 7483789.0, + "step": 297 + }, + { + "epoch": 0.032725675378871076, + "grad_norm": 5.283253192901611, + "learning_rate": 1.6300768386388582e-07, + "loss": 1.1259, + "mean_token_accuracy": 0.6783849000930786, + "num_tokens": 7506593.0, + "step": 298 + }, + { + "epoch": 0.032835493081484736, + "grad_norm": 5.3632354736328125, + "learning_rate": 1.6355653128430296e-07, + "loss": 1.2857, + "mean_token_accuracy": 0.6460005044937134, + "num_tokens": 7528906.0, + "step": 299 + }, + { + "epoch": 0.032945310784098396, + "grad_norm": 4.539299011230469, + "learning_rate": 1.6410537870472008e-07, + "loss": 1.1815, + "mean_token_accuracy": 0.663701057434082, + "num_tokens": 7559074.0, + "step": 300 + }, + { + "epoch": 0.033055128486712056, + "grad_norm": 4.293272972106934, + "learning_rate": 1.6465422612513722e-07, + "loss": 1.1541, + "mean_token_accuracy": 0.6665412783622742, + "num_tokens": 7581153.0, + "step": 301 + }, + { + "epoch": 0.03316494618932572, + "grad_norm": 6.181020259857178, + "learning_rate": 1.6520307354555433e-07, + "loss": 1.1789, + "mean_token_accuracy": 0.6581469774246216, + "num_tokens": 7601069.0, + "step": 302 + }, + { + "epoch": 0.03327476389193938, + "grad_norm": 5.510436058044434, + "learning_rate": 1.6575192096597145e-07, + "loss": 1.2486, + "mean_token_accuracy": 0.655129075050354, + "num_tokens": 7624497.0, + "step": 303 + }, + { + "epoch": 0.03338458159455304, 
+ "grad_norm": 4.399679183959961, + "learning_rate": 1.6630076838638856e-07, + "loss": 1.2314, + "mean_token_accuracy": 0.6531139016151428, + "num_tokens": 7657499.0, + "step": 304 + }, + { + "epoch": 0.0334943992971667, + "grad_norm": 5.11839485168457, + "learning_rate": 1.668496158068057e-07, + "loss": 1.2137, + "mean_token_accuracy": 0.6530515551567078, + "num_tokens": 7683754.0, + "step": 305 + }, + { + "epoch": 0.03360421699978036, + "grad_norm": 4.464770317077637, + "learning_rate": 1.6739846322722282e-07, + "loss": 1.1569, + "mean_token_accuracy": 0.6724011898040771, + "num_tokens": 7712866.0, + "step": 306 + }, + { + "epoch": 0.03371403470239403, + "grad_norm": 4.905570030212402, + "learning_rate": 1.6794731064763996e-07, + "loss": 1.2559, + "mean_token_accuracy": 0.6495568752288818, + "num_tokens": 7739940.0, + "step": 307 + }, + { + "epoch": 0.03382385240500769, + "grad_norm": 5.283228397369385, + "learning_rate": 1.6849615806805705e-07, + "loss": 1.2424, + "mean_token_accuracy": 0.6465278267860413, + "num_tokens": 7763581.0, + "step": 308 + }, + { + "epoch": 0.03393367010762135, + "grad_norm": 5.620240688323975, + "learning_rate": 1.690450054884742e-07, + "loss": 1.193, + "mean_token_accuracy": 0.6624026894569397, + "num_tokens": 7787302.0, + "step": 309 + }, + { + "epoch": 0.03404348781023501, + "grad_norm": 4.320696830749512, + "learning_rate": 1.6959385290889134e-07, + "loss": 1.2446, + "mean_token_accuracy": 0.6538248062133789, + "num_tokens": 7817342.0, + "step": 310 + }, + { + "epoch": 0.03415330551284867, + "grad_norm": 4.0141096115112305, + "learning_rate": 1.7014270032930845e-07, + "loss": 1.2417, + "mean_token_accuracy": 0.6519490480422974, + "num_tokens": 7850043.0, + "step": 311 + }, + { + "epoch": 0.034263123215462335, + "grad_norm": 4.230241298675537, + "learning_rate": 1.7069154774972557e-07, + "loss": 1.2566, + "mean_token_accuracy": 0.6433922052383423, + "num_tokens": 7877683.0, + "step": 312 + }, + { + "epoch": 0.034372940918075995, + 
"grad_norm": 5.589524269104004, + "learning_rate": 1.7124039517014268e-07, + "loss": 1.1479, + "mean_token_accuracy": 0.6784481406211853, + "num_tokens": 7901791.0, + "step": 313 + }, + { + "epoch": 0.034482758620689655, + "grad_norm": 4.522602081298828, + "learning_rate": 1.7178924259055982e-07, + "loss": 1.2443, + "mean_token_accuracy": 0.6493811011314392, + "num_tokens": 7930280.0, + "step": 314 + }, + { + "epoch": 0.034592576323303315, + "grad_norm": 5.099110126495361, + "learning_rate": 1.7233809001097694e-07, + "loss": 1.1394, + "mean_token_accuracy": 0.6771368980407715, + "num_tokens": 7954375.0, + "step": 315 + }, + { + "epoch": 0.034702394025916974, + "grad_norm": 5.16107702255249, + "learning_rate": 1.7288693743139408e-07, + "loss": 1.1208, + "mean_token_accuracy": 0.6745137572288513, + "num_tokens": 7979824.0, + "step": 316 + }, + { + "epoch": 0.03481221172853064, + "grad_norm": 4.629313945770264, + "learning_rate": 1.7343578485181117e-07, + "loss": 1.159, + "mean_token_accuracy": 0.6655483245849609, + "num_tokens": 8002141.0, + "step": 317 + }, + { + "epoch": 0.0349220294311443, + "grad_norm": 5.081248760223389, + "learning_rate": 1.739846322722283e-07, + "loss": 1.2035, + "mean_token_accuracy": 0.6651709675788879, + "num_tokens": 8029093.0, + "step": 318 + }, + { + "epoch": 0.03503184713375796, + "grad_norm": 4.893130302429199, + "learning_rate": 1.7453347969264543e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6739946603775024, + "num_tokens": 8054585.0, + "step": 319 + }, + { + "epoch": 0.03514166483637162, + "grad_norm": 5.0424580574035645, + "learning_rate": 1.7508232711306257e-07, + "loss": 1.1746, + "mean_token_accuracy": 0.6684246063232422, + "num_tokens": 8075524.0, + "step": 320 + }, + { + "epoch": 0.03525148253898529, + "grad_norm": 6.86494255065918, + "learning_rate": 1.756311745334797e-07, + "loss": 1.2107, + "mean_token_accuracy": 0.6636511087417603, + "num_tokens": 8094197.0, + "step": 321 + }, + { + "epoch": 0.03536130024159895, + 
"grad_norm": 3.7916924953460693, + "learning_rate": 1.761800219538968e-07, + "loss": 1.2055, + "mean_token_accuracy": 0.6648112535476685, + "num_tokens": 8122090.0, + "step": 322 + }, + { + "epoch": 0.03547111794421261, + "grad_norm": 5.0024895668029785, + "learning_rate": 1.7672886937431394e-07, + "loss": 1.1318, + "mean_token_accuracy": 0.6655910015106201, + "num_tokens": 8139597.0, + "step": 323 + }, + { + "epoch": 0.03558093564682627, + "grad_norm": 5.298346519470215, + "learning_rate": 1.7727771679473105e-07, + "loss": 1.2789, + "mean_token_accuracy": 0.6343103647232056, + "num_tokens": 8166560.0, + "step": 324 + }, + { + "epoch": 0.03569075334943993, + "grad_norm": 4.836138725280762, + "learning_rate": 1.778265642151482e-07, + "loss": 1.2904, + "mean_token_accuracy": 0.6426906585693359, + "num_tokens": 8191173.0, + "step": 325 + }, + { + "epoch": 0.035800571052053594, + "grad_norm": 5.6014628410339355, + "learning_rate": 1.7837541163556529e-07, + "loss": 1.1183, + "mean_token_accuracy": 0.6755226850509644, + "num_tokens": 8213117.0, + "step": 326 + }, + { + "epoch": 0.035910388754667254, + "grad_norm": 4.387700080871582, + "learning_rate": 1.7892425905598243e-07, + "loss": 1.2667, + "mean_token_accuracy": 0.6448032855987549, + "num_tokens": 8240513.0, + "step": 327 + }, + { + "epoch": 0.036020206457280914, + "grad_norm": 5.331836700439453, + "learning_rate": 1.7947310647639954e-07, + "loss": 1.1495, + "mean_token_accuracy": 0.6756325960159302, + "num_tokens": 8266885.0, + "step": 328 + }, + { + "epoch": 0.03613002415989457, + "grad_norm": 4.06910514831543, + "learning_rate": 1.8002195389681668e-07, + "loss": 1.2693, + "mean_token_accuracy": 0.6397185325622559, + "num_tokens": 8293173.0, + "step": 329 + }, + { + "epoch": 0.03623984186250823, + "grad_norm": 5.312028884887695, + "learning_rate": 1.805708013172338e-07, + "loss": 1.3204, + "mean_token_accuracy": 0.6394043564796448, + "num_tokens": 8320346.0, + "step": 330 + }, + { + "epoch": 0.0363496595651219, + 
"grad_norm": 4.558038711547852, + "learning_rate": 1.8111964873765091e-07, + "loss": 1.308, + "mean_token_accuracy": 0.6321897506713867, + "num_tokens": 8345155.0, + "step": 331 + }, + { + "epoch": 0.03645947726773556, + "grad_norm": 5.029248237609863, + "learning_rate": 1.8166849615806803e-07, + "loss": 1.2277, + "mean_token_accuracy": 0.64702969789505, + "num_tokens": 8370401.0, + "step": 332 + }, + { + "epoch": 0.03656929497034922, + "grad_norm": 4.165885925292969, + "learning_rate": 1.8221734357848517e-07, + "loss": 1.1464, + "mean_token_accuracy": 0.6742668747901917, + "num_tokens": 8393837.0, + "step": 333 + }, + { + "epoch": 0.03667911267296288, + "grad_norm": 3.593191146850586, + "learning_rate": 1.827661909989023e-07, + "loss": 1.2167, + "mean_token_accuracy": 0.6571366786956787, + "num_tokens": 8425157.0, + "step": 334 + }, + { + "epoch": 0.03678893037557654, + "grad_norm": 5.0703043937683105, + "learning_rate": 1.8331503841931943e-07, + "loss": 1.129, + "mean_token_accuracy": 0.6781293749809265, + "num_tokens": 8445755.0, + "step": 335 + }, + { + "epoch": 0.036898748078190206, + "grad_norm": 4.716860771179199, + "learning_rate": 1.8386388583973654e-07, + "loss": 1.166, + "mean_token_accuracy": 0.6641435623168945, + "num_tokens": 8464691.0, + "step": 336 + }, + { + "epoch": 0.037008565780803866, + "grad_norm": 4.495270252227783, + "learning_rate": 1.8441273326015366e-07, + "loss": 1.159, + "mean_token_accuracy": 0.6715354919433594, + "num_tokens": 8488219.0, + "step": 337 + }, + { + "epoch": 0.037118383483417526, + "grad_norm": 4.431127548217773, + "learning_rate": 1.849615806805708e-07, + "loss": 1.1707, + "mean_token_accuracy": 0.6594254374504089, + "num_tokens": 8513210.0, + "step": 338 + }, + { + "epoch": 0.037228201186031186, + "grad_norm": 4.327627658843994, + "learning_rate": 1.8551042810098792e-07, + "loss": 1.1494, + "mean_token_accuracy": 0.670987606048584, + "num_tokens": 8537726.0, + "step": 339 + }, + { + "epoch": 0.03733801888864485, + 
"grad_norm": 5.710991382598877, + "learning_rate": 1.8605927552140506e-07, + "loss": 1.1085, + "mean_token_accuracy": 0.6842859983444214, + "num_tokens": 8553507.0, + "step": 340 + }, + { + "epoch": 0.03744783659125851, + "grad_norm": 4.6186909675598145, + "learning_rate": 1.8660812294182215e-07, + "loss": 1.241, + "mean_token_accuracy": 0.6548902988433838, + "num_tokens": 8578714.0, + "step": 341 + }, + { + "epoch": 0.03755765429387217, + "grad_norm": 3.967904567718506, + "learning_rate": 1.871569703622393e-07, + "loss": 1.2629, + "mean_token_accuracy": 0.6401352882385254, + "num_tokens": 8608764.0, + "step": 342 + }, + { + "epoch": 0.03766747199648583, + "grad_norm": 4.328205108642578, + "learning_rate": 1.877058177826564e-07, + "loss": 1.1903, + "mean_token_accuracy": 0.6620467901229858, + "num_tokens": 8631382.0, + "step": 343 + }, + { + "epoch": 0.03777728969909949, + "grad_norm": 4.822125434875488, + "learning_rate": 1.8825466520307354e-07, + "loss": 1.2441, + "mean_token_accuracy": 0.646818220615387, + "num_tokens": 8655692.0, + "step": 344 + }, + { + "epoch": 0.03788710740171316, + "grad_norm": 5.360873699188232, + "learning_rate": 1.8880351262349069e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6763237714767456, + "num_tokens": 8672506.0, + "step": 345 + }, + { + "epoch": 0.03799692510432682, + "grad_norm": 4.624794006347656, + "learning_rate": 1.8935236004390778e-07, + "loss": 1.2694, + "mean_token_accuracy": 0.6407361030578613, + "num_tokens": 8700942.0, + "step": 346 + }, + { + "epoch": 0.03810674280694048, + "grad_norm": 4.51175594329834, + "learning_rate": 1.8990120746432492e-07, + "loss": 1.2699, + "mean_token_accuracy": 0.6366171836853027, + "num_tokens": 8730237.0, + "step": 347 + }, + { + "epoch": 0.03821656050955414, + "grad_norm": 4.345934867858887, + "learning_rate": 1.9045005488474203e-07, + "loss": 1.2329, + "mean_token_accuracy": 0.6533514261245728, + "num_tokens": 8762288.0, + "step": 348 + }, + { + "epoch": 0.0383263782121678, + 
"grad_norm": 5.244883060455322, + "learning_rate": 1.9099890230515917e-07, + "loss": 1.1721, + "mean_token_accuracy": 0.6597553491592407, + "num_tokens": 8784907.0, + "step": 349 + }, + { + "epoch": 0.038436195914781465, + "grad_norm": 6.383604049682617, + "learning_rate": 1.9154774972557626e-07, + "loss": 1.0937, + "mean_token_accuracy": 0.6758692264556885, + "num_tokens": 8803780.0, + "step": 350 + }, + { + "epoch": 0.038546013617395125, + "grad_norm": 5.371850967407227, + "learning_rate": 1.920965971459934e-07, + "loss": 1.2145, + "mean_token_accuracy": 0.6690839529037476, + "num_tokens": 8828468.0, + "step": 351 + }, + { + "epoch": 0.038655831320008785, + "grad_norm": 4.82059907913208, + "learning_rate": 1.9264544456641052e-07, + "loss": 1.1156, + "mean_token_accuracy": 0.6746271848678589, + "num_tokens": 8851714.0, + "step": 352 + }, + { + "epoch": 0.038765649022622445, + "grad_norm": 4.699011325836182, + "learning_rate": 1.9319429198682766e-07, + "loss": 1.2979, + "mean_token_accuracy": 0.6380293965339661, + "num_tokens": 8881106.0, + "step": 353 + }, + { + "epoch": 0.038875466725236105, + "grad_norm": 5.130366802215576, + "learning_rate": 1.9374313940724478e-07, + "loss": 1.1746, + "mean_token_accuracy": 0.6635655164718628, + "num_tokens": 8903470.0, + "step": 354 + }, + { + "epoch": 0.03898528442784977, + "grad_norm": 4.314210414886475, + "learning_rate": 1.942919868276619e-07, + "loss": 1.2383, + "mean_token_accuracy": 0.6496376991271973, + "num_tokens": 8925428.0, + "step": 355 + }, + { + "epoch": 0.03909510213046343, + "grad_norm": 4.550247669219971, + "learning_rate": 1.9484083424807903e-07, + "loss": 1.1749, + "mean_token_accuracy": 0.6667396426200867, + "num_tokens": 8952137.0, + "step": 356 + }, + { + "epoch": 0.03920491983307709, + "grad_norm": 5.116442680358887, + "learning_rate": 1.9538968166849615e-07, + "loss": 1.2637, + "mean_token_accuracy": 0.6522982716560364, + "num_tokens": 8978167.0, + "step": 357 + }, + { + "epoch": 0.03931473753569075, + 
"grad_norm": 4.425562858581543, + "learning_rate": 1.959385290889133e-07, + "loss": 1.1651, + "mean_token_accuracy": 0.6693230271339417, + "num_tokens": 9002315.0, + "step": 358 + }, + { + "epoch": 0.03942455523830442, + "grad_norm": 5.482595443725586, + "learning_rate": 1.964873765093304e-07, + "loss": 1.1437, + "mean_token_accuracy": 0.6671539545059204, + "num_tokens": 9025591.0, + "step": 359 + }, + { + "epoch": 0.03953437294091808, + "grad_norm": 4.566300392150879, + "learning_rate": 1.9703622392974752e-07, + "loss": 1.183, + "mean_token_accuracy": 0.6734629273414612, + "num_tokens": 9047881.0, + "step": 360 + }, + { + "epoch": 0.03964419064353174, + "grad_norm": 3.8446145057678223, + "learning_rate": 1.9758507135016464e-07, + "loss": 1.1898, + "mean_token_accuracy": 0.6546288728713989, + "num_tokens": 9072708.0, + "step": 361 + }, + { + "epoch": 0.0397540083461454, + "grad_norm": 3.962151288986206, + "learning_rate": 1.9813391877058178e-07, + "loss": 1.2028, + "mean_token_accuracy": 0.653793454170227, + "num_tokens": 9100951.0, + "step": 362 + }, + { + "epoch": 0.03986382604875906, + "grad_norm": 3.860605001449585, + "learning_rate": 1.986827661909989e-07, + "loss": 1.2111, + "mean_token_accuracy": 0.6550585627555847, + "num_tokens": 9131036.0, + "step": 363 + }, + { + "epoch": 0.039973643751372724, + "grad_norm": 4.333126544952393, + "learning_rate": 1.9923161361141603e-07, + "loss": 1.2258, + "mean_token_accuracy": 0.6633358597755432, + "num_tokens": 9154982.0, + "step": 364 + }, + { + "epoch": 0.040083461453986384, + "grad_norm": 3.8857336044311523, + "learning_rate": 1.9978046103183312e-07, + "loss": 1.2605, + "mean_token_accuracy": 0.6429277658462524, + "num_tokens": 9182108.0, + "step": 365 + }, + { + "epoch": 0.040193279156600044, + "grad_norm": 5.176736354827881, + "learning_rate": 2.0032930845225027e-07, + "loss": 1.2268, + "mean_token_accuracy": 0.6436806917190552, + "num_tokens": 9203015.0, + "step": 366 + }, + { + "epoch": 0.040303096859213704, + 
"grad_norm": 3.584615468978882, + "learning_rate": 2.0087815587266738e-07, + "loss": 1.1777, + "mean_token_accuracy": 0.6748425960540771, + "num_tokens": 9232147.0, + "step": 367 + }, + { + "epoch": 0.040412914561827364, + "grad_norm": 3.8257038593292236, + "learning_rate": 2.0142700329308452e-07, + "loss": 1.1755, + "mean_token_accuracy": 0.6601058840751648, + "num_tokens": 9257910.0, + "step": 368 + }, + { + "epoch": 0.04052273226444103, + "grad_norm": 3.9028618335723877, + "learning_rate": 2.0197585071350164e-07, + "loss": 1.1842, + "mean_token_accuracy": 0.6821008920669556, + "num_tokens": 9281277.0, + "step": 369 + }, + { + "epoch": 0.04063254996705469, + "grad_norm": 4.739362716674805, + "learning_rate": 2.0252469813391875e-07, + "loss": 1.0664, + "mean_token_accuracy": 0.6952624320983887, + "num_tokens": 9305540.0, + "step": 370 + }, + { + "epoch": 0.04074236766966835, + "grad_norm": 4.322873592376709, + "learning_rate": 2.030735455543359e-07, + "loss": 1.1244, + "mean_token_accuracy": 0.6925644874572754, + "num_tokens": 9329330.0, + "step": 371 + }, + { + "epoch": 0.04085218537228201, + "grad_norm": 3.441701889038086, + "learning_rate": 2.03622392974753e-07, + "loss": 1.1523, + "mean_token_accuracy": 0.6672924160957336, + "num_tokens": 9361184.0, + "step": 372 + }, + { + "epoch": 0.04096200307489567, + "grad_norm": 4.373020172119141, + "learning_rate": 2.0417124039517015e-07, + "loss": 1.1864, + "mean_token_accuracy": 0.6677829027175903, + "num_tokens": 9384628.0, + "step": 373 + }, + { + "epoch": 0.04107182077750934, + "grad_norm": 3.1595635414123535, + "learning_rate": 2.0472008781558724e-07, + "loss": 1.1872, + "mean_token_accuracy": 0.6610921621322632, + "num_tokens": 9419365.0, + "step": 374 + }, + { + "epoch": 0.041181638480122996, + "grad_norm": 4.12789249420166, + "learning_rate": 2.0526893523600438e-07, + "loss": 1.2613, + "mean_token_accuracy": 0.6421530246734619, + "num_tokens": 9448921.0, + "step": 375 + }, + { + "epoch": 0.041291456182736656, + 
"grad_norm": 3.721283435821533, + "learning_rate": 2.058177826564215e-07, + "loss": 1.2111, + "mean_token_accuracy": 0.6579893827438354, + "num_tokens": 9478653.0, + "step": 376 + }, + { + "epoch": 0.041401273885350316, + "grad_norm": 5.998361587524414, + "learning_rate": 2.0636663007683864e-07, + "loss": 1.2314, + "mean_token_accuracy": 0.6507468819618225, + "num_tokens": 9497038.0, + "step": 377 + }, + { + "epoch": 0.04151109158796398, + "grad_norm": 4.628074645996094, + "learning_rate": 2.0691547749725575e-07, + "loss": 1.2324, + "mean_token_accuracy": 0.6456447243690491, + "num_tokens": 9522529.0, + "step": 378 + }, + { + "epoch": 0.04162090929057764, + "grad_norm": 3.6651904582977295, + "learning_rate": 2.0746432491767287e-07, + "loss": 1.2539, + "mean_token_accuracy": 0.6342935562133789, + "num_tokens": 9547911.0, + "step": 379 + }, + { + "epoch": 0.0417307269931913, + "grad_norm": 3.601516008377075, + "learning_rate": 2.0801317233809e-07, + "loss": 1.2462, + "mean_token_accuracy": 0.647515058517456, + "num_tokens": 9574836.0, + "step": 380 + }, + { + "epoch": 0.04184054469580496, + "grad_norm": 3.4699840545654297, + "learning_rate": 2.0856201975850713e-07, + "loss": 1.1961, + "mean_token_accuracy": 0.6642818450927734, + "num_tokens": 9606432.0, + "step": 381 + }, + { + "epoch": 0.04195036239841862, + "grad_norm": 3.8179264068603516, + "learning_rate": 2.0911086717892427e-07, + "loss": 1.15, + "mean_token_accuracy": 0.6649212837219238, + "num_tokens": 9631753.0, + "step": 382 + }, + { + "epoch": 0.04206018010103229, + "grad_norm": 3.8801636695861816, + "learning_rate": 2.0965971459934136e-07, + "loss": 1.2673, + "mean_token_accuracy": 0.6443725824356079, + "num_tokens": 9659011.0, + "step": 383 + }, + { + "epoch": 0.04216999780364595, + "grad_norm": 3.9628701210021973, + "learning_rate": 2.102085620197585e-07, + "loss": 1.1576, + "mean_token_accuracy": 0.6668222546577454, + "num_tokens": 9685262.0, + "step": 384 + }, + { + "epoch": 0.04227981550625961, + 
"grad_norm": 4.488767147064209, + "learning_rate": 2.1075740944017561e-07, + "loss": 1.1788, + "mean_token_accuracy": 0.6597780585289001, + "num_tokens": 9709619.0, + "step": 385 + }, + { + "epoch": 0.04238963320887327, + "grad_norm": 3.8246052265167236, + "learning_rate": 2.1130625686059276e-07, + "loss": 1.2116, + "mean_token_accuracy": 0.6523967981338501, + "num_tokens": 9733198.0, + "step": 386 + }, + { + "epoch": 0.04249945091148693, + "grad_norm": 3.2429428100585938, + "learning_rate": 2.1185510428100987e-07, + "loss": 1.199, + "mean_token_accuracy": 0.6591994166374207, + "num_tokens": 9760197.0, + "step": 387 + }, + { + "epoch": 0.042609268614100596, + "grad_norm": 3.835602283477783, + "learning_rate": 2.1240395170142699e-07, + "loss": 1.1697, + "mean_token_accuracy": 0.6642253398895264, + "num_tokens": 9786737.0, + "step": 388 + }, + { + "epoch": 0.042719086316714255, + "grad_norm": 4.102005958557129, + "learning_rate": 2.129527991218441e-07, + "loss": 1.1397, + "mean_token_accuracy": 0.6737978458404541, + "num_tokens": 9808825.0, + "step": 389 + }, + { + "epoch": 0.042828904019327915, + "grad_norm": 4.9301276206970215, + "learning_rate": 2.1350164654226124e-07, + "loss": 1.1511, + "mean_token_accuracy": 0.6640348434448242, + "num_tokens": 9827890.0, + "step": 390 + }, + { + "epoch": 0.042938721721941575, + "grad_norm": 3.9114503860473633, + "learning_rate": 2.1405049396267836e-07, + "loss": 1.149, + "mean_token_accuracy": 0.669329047203064, + "num_tokens": 9855221.0, + "step": 391 + }, + { + "epoch": 0.043048539424555235, + "grad_norm": 4.232679843902588, + "learning_rate": 2.145993413830955e-07, + "loss": 1.1249, + "mean_token_accuracy": 0.6723724007606506, + "num_tokens": 9880478.0, + "step": 392 + }, + { + "epoch": 0.0431583571271689, + "grad_norm": 4.036497116088867, + "learning_rate": 2.1514818880351262e-07, + "loss": 1.1197, + "mean_token_accuracy": 0.679579496383667, + "num_tokens": 9902753.0, + "step": 393 + }, + { + "epoch": 0.04326817482978256, + 
"grad_norm": 3.4415714740753174, + "learning_rate": 2.1569703622392973e-07, + "loss": 1.2354, + "mean_token_accuracy": 0.6450338363647461, + "num_tokens": 9927766.0, + "step": 394 + }, + { + "epoch": 0.04337799253239622, + "grad_norm": 3.647519826889038, + "learning_rate": 2.1624588364434687e-07, + "loss": 1.2132, + "mean_token_accuracy": 0.6686776280403137, + "num_tokens": 9955679.0, + "step": 395 + }, + { + "epoch": 0.04348781023500988, + "grad_norm": 3.561011791229248, + "learning_rate": 2.16794731064764e-07, + "loss": 1.2802, + "mean_token_accuracy": 0.6482245326042175, + "num_tokens": 9981818.0, + "step": 396 + }, + { + "epoch": 0.04359762793762355, + "grad_norm": 5.4556145668029785, + "learning_rate": 2.1734357848518113e-07, + "loss": 1.1417, + "mean_token_accuracy": 0.6709260940551758, + "num_tokens": 9999172.0, + "step": 397 + }, + { + "epoch": 0.04370744564023721, + "grad_norm": 3.376741647720337, + "learning_rate": 2.1789242590559822e-07, + "loss": 1.259, + "mean_token_accuracy": 0.6403700113296509, + "num_tokens": 10026561.0, + "step": 398 + }, + { + "epoch": 0.04381726334285087, + "grad_norm": 2.923739194869995, + "learning_rate": 2.1844127332601536e-07, + "loss": 1.219, + "mean_token_accuracy": 0.6509903073310852, + "num_tokens": 10059378.0, + "step": 399 + }, + { + "epoch": 0.04392708104546453, + "grad_norm": 3.4434945583343506, + "learning_rate": 2.1899012074643247e-07, + "loss": 1.1322, + "mean_token_accuracy": 0.6726398468017578, + "num_tokens": 10088764.0, + "step": 400 + }, + { + "epoch": 0.04403689874807819, + "grad_norm": 4.227515697479248, + "learning_rate": 2.1953896816684962e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6874892711639404, + "num_tokens": 10111751.0, + "step": 401 + }, + { + "epoch": 0.044146716450691854, + "grad_norm": 3.0054030418395996, + "learning_rate": 2.200878155872667e-07, + "loss": 1.1978, + "mean_token_accuracy": 0.6599588394165039, + "num_tokens": 10143151.0, + "step": 402 + }, + { + "epoch": 
0.044256534153305514, + "grad_norm": 3.462881088256836, + "learning_rate": 2.2063666300768385e-07, + "loss": 1.2415, + "mean_token_accuracy": 0.6424514055252075, + "num_tokens": 10168411.0, + "step": 403 + }, + { + "epoch": 0.044366351855919174, + "grad_norm": 3.970703363418579, + "learning_rate": 2.21185510428101e-07, + "loss": 1.3091, + "mean_token_accuracy": 0.6398603320121765, + "num_tokens": 10193163.0, + "step": 404 + }, + { + "epoch": 0.044476169558532834, + "grad_norm": 3.112436532974243, + "learning_rate": 2.217343578485181e-07, + "loss": 1.2228, + "mean_token_accuracy": 0.6523829698562622, + "num_tokens": 10223173.0, + "step": 405 + }, + { + "epoch": 0.044585987261146494, + "grad_norm": 4.217040061950684, + "learning_rate": 2.2228320526893525e-07, + "loss": 1.2437, + "mean_token_accuracy": 0.6442314386367798, + "num_tokens": 10247552.0, + "step": 406 + }, + { + "epoch": 0.04469580496376016, + "grad_norm": 2.897552490234375, + "learning_rate": 2.2283205268935233e-07, + "loss": 1.1726, + "mean_token_accuracy": 0.6709779500961304, + "num_tokens": 10275555.0, + "step": 407 + }, + { + "epoch": 0.04480562266637382, + "grad_norm": 3.6059670448303223, + "learning_rate": 2.2338090010976948e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.6854841709136963, + "num_tokens": 10299295.0, + "step": 408 + }, + { + "epoch": 0.04491544036898748, + "grad_norm": 3.2000386714935303, + "learning_rate": 2.239297475301866e-07, + "loss": 1.128, + "mean_token_accuracy": 0.6722471714019775, + "num_tokens": 10324466.0, + "step": 409 + }, + { + "epoch": 0.04502525807160114, + "grad_norm": 3.456547737121582, + "learning_rate": 2.2447859495060373e-07, + "loss": 1.1918, + "mean_token_accuracy": 0.6680697202682495, + "num_tokens": 10349479.0, + "step": 410 + }, + { + "epoch": 0.0451350757742148, + "grad_norm": 3.5689690113067627, + "learning_rate": 2.2502744237102085e-07, + "loss": 1.1862, + "mean_token_accuracy": 0.6678423881530762, + "num_tokens": 10377612.0, + "step": 411 + }, + { + 
"epoch": 0.04524489347682847, + "grad_norm": 3.629929304122925, + "learning_rate": 2.2557628979143796e-07, + "loss": 1.1612, + "mean_token_accuracy": 0.6776440143585205, + "num_tokens": 10399913.0, + "step": 412 + }, + { + "epoch": 0.04535471117944213, + "grad_norm": 3.5139596462249756, + "learning_rate": 2.2612513721185508e-07, + "loss": 1.1568, + "mean_token_accuracy": 0.6628978252410889, + "num_tokens": 10424885.0, + "step": 413 + }, + { + "epoch": 0.04546452888205579, + "grad_norm": 4.432156085968018, + "learning_rate": 2.2667398463227222e-07, + "loss": 1.141, + "mean_token_accuracy": 0.6747143864631653, + "num_tokens": 10442928.0, + "step": 414 + }, + { + "epoch": 0.045574346584669446, + "grad_norm": 3.627002000808716, + "learning_rate": 2.2722283205268934e-07, + "loss": 1.1891, + "mean_token_accuracy": 0.6558240652084351, + "num_tokens": 10469086.0, + "step": 415 + }, + { + "epoch": 0.04568416428728311, + "grad_norm": 3.3724966049194336, + "learning_rate": 2.2777167947310648e-07, + "loss": 1.1947, + "mean_token_accuracy": 0.6591870784759521, + "num_tokens": 10499340.0, + "step": 416 + }, + { + "epoch": 0.04579398198989677, + "grad_norm": 4.3481059074401855, + "learning_rate": 2.283205268935236e-07, + "loss": 1.205, + "mean_token_accuracy": 0.6602190732955933, + "num_tokens": 10517354.0, + "step": 417 + }, + { + "epoch": 0.04590379969251043, + "grad_norm": 3.2963504791259766, + "learning_rate": 2.288693743139407e-07, + "loss": 1.1808, + "mean_token_accuracy": 0.6704314351081848, + "num_tokens": 10547378.0, + "step": 418 + }, + { + "epoch": 0.04601361739512409, + "grad_norm": 3.3958661556243896, + "learning_rate": 2.2941822173435785e-07, + "loss": 1.1571, + "mean_token_accuracy": 0.6667879819869995, + "num_tokens": 10575154.0, + "step": 419 + }, + { + "epoch": 0.04612343509773775, + "grad_norm": 3.589769124984741, + "learning_rate": 2.2996706915477496e-07, + "loss": 1.1458, + "mean_token_accuracy": 0.671122133731842, + "num_tokens": 10597583.0, + "step": 420 + 
}, + { + "epoch": 0.04623325280035142, + "grad_norm": 3.8520004749298096, + "learning_rate": 2.3051591657519208e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.6762583255767822, + "num_tokens": 10622114.0, + "step": 421 + }, + { + "epoch": 0.04634307050296508, + "grad_norm": 4.289045810699463, + "learning_rate": 2.310647639956092e-07, + "loss": 1.2902, + "mean_token_accuracy": 0.6332734823226929, + "num_tokens": 10646187.0, + "step": 422 + }, + { + "epoch": 0.04645288820557874, + "grad_norm": 3.6707096099853516, + "learning_rate": 2.3161361141602634e-07, + "loss": 1.201, + "mean_token_accuracy": 0.661088228225708, + "num_tokens": 10669772.0, + "step": 423 + }, + { + "epoch": 0.0465627059081924, + "grad_norm": 3.8293371200561523, + "learning_rate": 2.3216245883644345e-07, + "loss": 1.1151, + "mean_token_accuracy": 0.6772782802581787, + "num_tokens": 10689913.0, + "step": 424 + }, + { + "epoch": 0.04667252361080606, + "grad_norm": 3.9286084175109863, + "learning_rate": 2.327113062568606e-07, + "loss": 1.1891, + "mean_token_accuracy": 0.657008945941925, + "num_tokens": 10711812.0, + "step": 425 + }, + { + "epoch": 0.046782341313419726, + "grad_norm": 3.41076922416687, + "learning_rate": 2.3326015367727768e-07, + "loss": 1.0908, + "mean_token_accuracy": 0.686318039894104, + "num_tokens": 10736373.0, + "step": 426 + }, + { + "epoch": 0.046892159016033386, + "grad_norm": 3.5543036460876465, + "learning_rate": 2.3380900109769482e-07, + "loss": 1.2446, + "mean_token_accuracy": 0.6591947078704834, + "num_tokens": 10761002.0, + "step": 427 + }, + { + "epoch": 0.047001976718647046, + "grad_norm": 3.3853533267974854, + "learning_rate": 2.3435784851811197e-07, + "loss": 1.1275, + "mean_token_accuracy": 0.6693648099899292, + "num_tokens": 10787627.0, + "step": 428 + }, + { + "epoch": 0.047111794421260705, + "grad_norm": 3.026378631591797, + "learning_rate": 2.3490669593852908e-07, + "loss": 1.1433, + "mean_token_accuracy": 0.6692010164260864, + "num_tokens": 10815837.0, + 
"step": 429 + }, + { + "epoch": 0.047221612123874365, + "grad_norm": 3.0493292808532715, + "learning_rate": 2.3545554335894622e-07, + "loss": 1.2601, + "mean_token_accuracy": 0.6394587159156799, + "num_tokens": 10847573.0, + "step": 430 + }, + { + "epoch": 0.04733142982648803, + "grad_norm": 3.337815284729004, + "learning_rate": 2.360043907793633e-07, + "loss": 1.1442, + "mean_token_accuracy": 0.6731640100479126, + "num_tokens": 10871611.0, + "step": 431 + }, + { + "epoch": 0.04744124752910169, + "grad_norm": 2.9721689224243164, + "learning_rate": 2.3655323819978045e-07, + "loss": 1.1973, + "mean_token_accuracy": 0.652218222618103, + "num_tokens": 10899434.0, + "step": 432 + }, + { + "epoch": 0.04755106523171535, + "grad_norm": 3.5036394596099854, + "learning_rate": 2.3710208562019757e-07, + "loss": 1.1379, + "mean_token_accuracy": 0.674240231513977, + "num_tokens": 10924833.0, + "step": 433 + }, + { + "epoch": 0.04766088293432901, + "grad_norm": 2.853243112564087, + "learning_rate": 2.376509330406147e-07, + "loss": 1.2232, + "mean_token_accuracy": 0.6481903791427612, + "num_tokens": 10956146.0, + "step": 434 + }, + { + "epoch": 0.04777070063694268, + "grad_norm": 3.499692916870117, + "learning_rate": 2.3819978046103183e-07, + "loss": 1.1717, + "mean_token_accuracy": 0.6601950526237488, + "num_tokens": 10976126.0, + "step": 435 + }, + { + "epoch": 0.04788051833955634, + "grad_norm": 3.499450206756592, + "learning_rate": 2.3874862788144894e-07, + "loss": 1.1193, + "mean_token_accuracy": 0.6770179271697998, + "num_tokens": 10996437.0, + "step": 436 + }, + { + "epoch": 0.04799033604217, + "grad_norm": 3.307541847229004, + "learning_rate": 2.3929747530186606e-07, + "loss": 1.0951, + "mean_token_accuracy": 0.6842434406280518, + "num_tokens": 11025169.0, + "step": 437 + }, + { + "epoch": 0.04810015374478366, + "grad_norm": 4.199154376983643, + "learning_rate": 2.3984632272228317e-07, + "loss": 1.2092, + "mean_token_accuracy": 0.6526225805282593, + "num_tokens": 
11045599.0, + "step": 438 + }, + { + "epoch": 0.04820997144739732, + "grad_norm": 3.0220718383789062, + "learning_rate": 2.4039517014270034e-07, + "loss": 1.169, + "mean_token_accuracy": 0.6563690900802612, + "num_tokens": 11074368.0, + "step": 439 + }, + { + "epoch": 0.048319789150010985, + "grad_norm": 4.352552890777588, + "learning_rate": 2.4094401756311745e-07, + "loss": 1.2182, + "mean_token_accuracy": 0.6503441333770752, + "num_tokens": 11093569.0, + "step": 440 + }, + { + "epoch": 0.048429606852624645, + "grad_norm": 2.993140935897827, + "learning_rate": 2.4149286498353457e-07, + "loss": 1.1828, + "mean_token_accuracy": 0.6562097072601318, + "num_tokens": 11123112.0, + "step": 441 + }, + { + "epoch": 0.048539424555238304, + "grad_norm": 3.327059030532837, + "learning_rate": 2.420417124039517e-07, + "loss": 1.1204, + "mean_token_accuracy": 0.6752980947494507, + "num_tokens": 11144895.0, + "step": 442 + }, + { + "epoch": 0.048649242257851964, + "grad_norm": 2.874176025390625, + "learning_rate": 2.425905598243688e-07, + "loss": 1.2258, + "mean_token_accuracy": 0.6493383646011353, + "num_tokens": 11175502.0, + "step": 443 + }, + { + "epoch": 0.048759059960465624, + "grad_norm": 2.847654342651367, + "learning_rate": 2.431394072447859e-07, + "loss": 1.1444, + "mean_token_accuracy": 0.6743741035461426, + "num_tokens": 11203328.0, + "step": 444 + }, + { + "epoch": 0.04886887766307929, + "grad_norm": 3.3366801738739014, + "learning_rate": 2.436882546652031e-07, + "loss": 1.149, + "mean_token_accuracy": 0.6721182465553284, + "num_tokens": 11226765.0, + "step": 445 + }, + { + "epoch": 0.04897869536569295, + "grad_norm": 3.458024024963379, + "learning_rate": 2.442371020856202e-07, + "loss": 1.0684, + "mean_token_accuracy": 0.6882966756820679, + "num_tokens": 11249838.0, + "step": 446 + }, + { + "epoch": 0.04908851306830661, + "grad_norm": 3.5404717922210693, + "learning_rate": 2.447859495060373e-07, + "loss": 1.1681, + "mean_token_accuracy": 0.6613035798072815, + 
"num_tokens": 11269586.0, + "step": 447 + }, + { + "epoch": 0.04919833077092027, + "grad_norm": 3.4303770065307617, + "learning_rate": 2.4533479692645443e-07, + "loss": 1.195, + "mean_token_accuracy": 0.6511849164962769, + "num_tokens": 11290103.0, + "step": 448 + }, + { + "epoch": 0.04930814847353393, + "grad_norm": 3.3879871368408203, + "learning_rate": 2.4588364434687154e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6696126461029053, + "num_tokens": 11314846.0, + "step": 449 + }, + { + "epoch": 0.0494179661761476, + "grad_norm": 3.2180089950561523, + "learning_rate": 2.4643249176728866e-07, + "loss": 1.1321, + "mean_token_accuracy": 0.6784703731536865, + "num_tokens": 11341642.0, + "step": 450 + }, + { + "epoch": 0.04952778387876126, + "grad_norm": 3.5016472339630127, + "learning_rate": 2.4698133918770583e-07, + "loss": 1.2116, + "mean_token_accuracy": 0.6535888314247131, + "num_tokens": 11363490.0, + "step": 451 + }, + { + "epoch": 0.04963760158137492, + "grad_norm": 3.1381497383117676, + "learning_rate": 2.4753018660812294e-07, + "loss": 1.1335, + "mean_token_accuracy": 0.6710525751113892, + "num_tokens": 11387112.0, + "step": 452 + }, + { + "epoch": 0.04974741928398858, + "grad_norm": 3.370199203491211, + "learning_rate": 2.4807903402854006e-07, + "loss": 1.1902, + "mean_token_accuracy": 0.6545066237449646, + "num_tokens": 11411810.0, + "step": 453 + }, + { + "epoch": 0.049857236986602244, + "grad_norm": 4.229959011077881, + "learning_rate": 2.486278814489572e-07, + "loss": 1.1479, + "mean_token_accuracy": 0.6783998012542725, + "num_tokens": 11430461.0, + "step": 454 + }, + { + "epoch": 0.0499670546892159, + "grad_norm": 3.9024810791015625, + "learning_rate": 2.491767288693743e-07, + "loss": 1.1179, + "mean_token_accuracy": 0.6710408329963684, + "num_tokens": 11450031.0, + "step": 455 + }, + { + "epoch": 0.05007687239182956, + "grad_norm": 3.292757034301758, + "learning_rate": 2.4972557628979146e-07, + "loss": 1.2501, + "mean_token_accuracy": 
0.6430089473724365, + "num_tokens": 11480036.0, + "step": 456 + }, + { + "epoch": 0.05018669009444322, + "grad_norm": 3.0414881706237793, + "learning_rate": 2.5027442371020857e-07, + "loss": 1.1427, + "mean_token_accuracy": 0.6864325404167175, + "num_tokens": 11507324.0, + "step": 457 + }, + { + "epoch": 0.05029650779705688, + "grad_norm": 3.1660163402557373, + "learning_rate": 2.508232711306257e-07, + "loss": 1.1847, + "mean_token_accuracy": 0.6602901220321655, + "num_tokens": 11532471.0, + "step": 458 + }, + { + "epoch": 0.05040632549967055, + "grad_norm": 3.0841269493103027, + "learning_rate": 2.513721185510428e-07, + "loss": 1.1698, + "mean_token_accuracy": 0.6711302995681763, + "num_tokens": 11559747.0, + "step": 459 + }, + { + "epoch": 0.05051614320228421, + "grad_norm": 2.95182204246521, + "learning_rate": 2.519209659714599e-07, + "loss": 1.2158, + "mean_token_accuracy": 0.6517921686172485, + "num_tokens": 11594386.0, + "step": 460 + }, + { + "epoch": 0.05062596090489787, + "grad_norm": 2.9328596591949463, + "learning_rate": 2.524698133918771e-07, + "loss": 1.1066, + "mean_token_accuracy": 0.6776368618011475, + "num_tokens": 11623480.0, + "step": 461 + }, + { + "epoch": 0.05073577860751153, + "grad_norm": 3.4215247631073, + "learning_rate": 2.5301866081229415e-07, + "loss": 1.1859, + "mean_token_accuracy": 0.658714771270752, + "num_tokens": 11649528.0, + "step": 462 + }, + { + "epoch": 0.05084559631012519, + "grad_norm": 3.0658490657806396, + "learning_rate": 2.5356750823271126e-07, + "loss": 1.1482, + "mean_token_accuracy": 0.6708585619926453, + "num_tokens": 11676433.0, + "step": 463 + }, + { + "epoch": 0.050955414012738856, + "grad_norm": 2.9640700817108154, + "learning_rate": 2.5411635565312843e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.6642121076583862, + "num_tokens": 11700851.0, + "step": 464 + }, + { + "epoch": 0.051065231715352516, + "grad_norm": 2.8411171436309814, + "learning_rate": 2.5466520307354555e-07, + "loss": 1.1186, + 
"mean_token_accuracy": 0.6815978288650513, + "num_tokens": 11731709.0, + "step": 465 + }, + { + "epoch": 0.051175049417966176, + "grad_norm": 3.6872799396514893, + "learning_rate": 2.5521405049396266e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.674654483795166, + "num_tokens": 11752640.0, + "step": 466 + }, + { + "epoch": 0.051284867120579836, + "grad_norm": 3.614946126937866, + "learning_rate": 2.557628979143798e-07, + "loss": 1.1244, + "mean_token_accuracy": 0.6739754676818848, + "num_tokens": 11775395.0, + "step": 467 + }, + { + "epoch": 0.051394684823193496, + "grad_norm": 3.103177785873413, + "learning_rate": 2.5631174533479695e-07, + "loss": 1.0259, + "mean_token_accuracy": 0.6971845626831055, + "num_tokens": 11799061.0, + "step": 468 + }, + { + "epoch": 0.05150450252580716, + "grad_norm": 3.1902060508728027, + "learning_rate": 2.5686059275521406e-07, + "loss": 1.1671, + "mean_token_accuracy": 0.666542649269104, + "num_tokens": 11823711.0, + "step": 469 + }, + { + "epoch": 0.05161432022842082, + "grad_norm": 2.9770164489746094, + "learning_rate": 2.574094401756312e-07, + "loss": 1.1655, + "mean_token_accuracy": 0.6689152717590332, + "num_tokens": 11848398.0, + "step": 470 + }, + { + "epoch": 0.05172413793103448, + "grad_norm": 3.0504519939422607, + "learning_rate": 2.5795828759604824e-07, + "loss": 1.313, + "mean_token_accuracy": 0.6290080547332764, + "num_tokens": 11878880.0, + "step": 471 + }, + { + "epoch": 0.05183395563364814, + "grad_norm": 3.177133321762085, + "learning_rate": 2.585071350164654e-07, + "loss": 1.1441, + "mean_token_accuracy": 0.6643580198287964, + "num_tokens": 11900527.0, + "step": 472 + }, + { + "epoch": 0.05194377333626181, + "grad_norm": 2.876521587371826, + "learning_rate": 2.590559824368825e-07, + "loss": 1.1134, + "mean_token_accuracy": 0.6743433475494385, + "num_tokens": 11924386.0, + "step": 473 + }, + { + "epoch": 0.05205359103887547, + "grad_norm": 3.120854377746582, + "learning_rate": 2.5960482985729964e-07, + "loss": 
1.1197, + "mean_token_accuracy": 0.6757641434669495, + "num_tokens": 11946253.0, + "step": 474 + }, + { + "epoch": 0.05216340874148913, + "grad_norm": 2.643712043762207, + "learning_rate": 2.601536772777168e-07, + "loss": 1.1518, + "mean_token_accuracy": 0.6742304563522339, + "num_tokens": 11974876.0, + "step": 475 + }, + { + "epoch": 0.05227322644410279, + "grad_norm": 3.1415610313415527, + "learning_rate": 2.607025246981339e-07, + "loss": 1.1634, + "mean_token_accuracy": 0.6669130921363831, + "num_tokens": 12001791.0, + "step": 476 + }, + { + "epoch": 0.05238304414671645, + "grad_norm": 3.0240604877471924, + "learning_rate": 2.6125137211855104e-07, + "loss": 1.2261, + "mean_token_accuracy": 0.6597251892089844, + "num_tokens": 12027163.0, + "step": 477 + }, + { + "epoch": 0.052492861849330115, + "grad_norm": 3.0369744300842285, + "learning_rate": 2.6180021953896815e-07, + "loss": 1.1005, + "mean_token_accuracy": 0.6829776763916016, + "num_tokens": 12048740.0, + "step": 478 + }, + { + "epoch": 0.052602679551943775, + "grad_norm": 2.978372097015381, + "learning_rate": 2.623490669593853e-07, + "loss": 1.235, + "mean_token_accuracy": 0.6463698744773865, + "num_tokens": 12073303.0, + "step": 479 + }, + { + "epoch": 0.052712497254557435, + "grad_norm": 2.9675395488739014, + "learning_rate": 2.6289791437980244e-07, + "loss": 1.066, + "mean_token_accuracy": 0.6885344386100769, + "num_tokens": 12097124.0, + "step": 480 + }, + { + "epoch": 0.052822314957171095, + "grad_norm": 2.885652780532837, + "learning_rate": 2.634467618002195e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6864824295043945, + "num_tokens": 12121108.0, + "step": 481 + }, + { + "epoch": 0.052932132659784754, + "grad_norm": 2.801570415496826, + "learning_rate": 2.639956092206366e-07, + "loss": 1.0402, + "mean_token_accuracy": 0.6933528184890747, + "num_tokens": 12147027.0, + "step": 482 + }, + { + "epoch": 0.05304195036239842, + "grad_norm": 2.5892670154571533, + "learning_rate": 2.645444566410538e-07, 
+ "loss": 1.0712, + "mean_token_accuracy": 0.6826642751693726, + "num_tokens": 12177368.0, + "step": 483 + }, + { + "epoch": 0.05315176806501208, + "grad_norm": 2.7438409328460693, + "learning_rate": 2.650933040614709e-07, + "loss": 1.2853, + "mean_token_accuracy": 0.6387002468109131, + "num_tokens": 12206385.0, + "step": 484 + }, + { + "epoch": 0.05326158576762574, + "grad_norm": 3.4478306770324707, + "learning_rate": 2.65642151481888e-07, + "loss": 1.1718, + "mean_token_accuracy": 0.6617858409881592, + "num_tokens": 12226379.0, + "step": 485 + }, + { + "epoch": 0.0533714034702394, + "grad_norm": 3.3155224323272705, + "learning_rate": 2.6619099890230513e-07, + "loss": 1.236, + "mean_token_accuracy": 0.6501975059509277, + "num_tokens": 12251220.0, + "step": 486 + }, + { + "epoch": 0.05348122117285306, + "grad_norm": 3.185854196548462, + "learning_rate": 2.667398463227223e-07, + "loss": 1.0721, + "mean_token_accuracy": 0.6817905902862549, + "num_tokens": 12273802.0, + "step": 487 + }, + { + "epoch": 0.05359103887546673, + "grad_norm": 2.77070951461792, + "learning_rate": 2.672886937431394e-07, + "loss": 1.121, + "mean_token_accuracy": 0.6780343055725098, + "num_tokens": 12302901.0, + "step": 488 + }, + { + "epoch": 0.05370085657808039, + "grad_norm": 2.95831561088562, + "learning_rate": 2.678375411635565e-07, + "loss": 1.0912, + "mean_token_accuracy": 0.6772794723510742, + "num_tokens": 12326859.0, + "step": 489 + }, + { + "epoch": 0.05381067428069405, + "grad_norm": 2.9761202335357666, + "learning_rate": 2.6838638858397364e-07, + "loss": 1.1028, + "mean_token_accuracy": 0.676504909992218, + "num_tokens": 12349191.0, + "step": 490 + }, + { + "epoch": 0.05392049198330771, + "grad_norm": 2.952533006668091, + "learning_rate": 2.6893523600439076e-07, + "loss": 1.146, + "mean_token_accuracy": 0.6689636707305908, + "num_tokens": 12373816.0, + "step": 491 + }, + { + "epoch": 0.054030309685921374, + "grad_norm": 3.3667240142822266, + "learning_rate": 2.6948408342480787e-07, 
+ "loss": 1.1159, + "mean_token_accuracy": 0.6719892024993896, + "num_tokens": 12397005.0, + "step": 492 + }, + { + "epoch": 0.054140127388535034, + "grad_norm": 2.930985927581787, + "learning_rate": 2.70032930845225e-07, + "loss": 1.086, + "mean_token_accuracy": 0.6800886392593384, + "num_tokens": 12423758.0, + "step": 493 + }, + { + "epoch": 0.054249945091148694, + "grad_norm": 2.878262758255005, + "learning_rate": 2.7058177826564215e-07, + "loss": 1.1869, + "mean_token_accuracy": 0.6640263199806213, + "num_tokens": 12450726.0, + "step": 494 + }, + { + "epoch": 0.05435976279376235, + "grad_norm": 3.326131582260132, + "learning_rate": 2.7113062568605927e-07, + "loss": 1.1161, + "mean_token_accuracy": 0.6761621236801147, + "num_tokens": 12473573.0, + "step": 495 + }, + { + "epoch": 0.05446958049637601, + "grad_norm": 2.6973445415496826, + "learning_rate": 2.716794731064764e-07, + "loss": 1.1577, + "mean_token_accuracy": 0.6674363613128662, + "num_tokens": 12500861.0, + "step": 496 + }, + { + "epoch": 0.05457939819898968, + "grad_norm": 3.1415019035339355, + "learning_rate": 2.722283205268935e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6862953901290894, + "num_tokens": 12521383.0, + "step": 497 + }, + { + "epoch": 0.05468921590160334, + "grad_norm": 2.6061716079711914, + "learning_rate": 2.7277716794731067e-07, + "loss": 1.235, + "mean_token_accuracy": 0.6453698873519897, + "num_tokens": 12549374.0, + "step": 498 + }, + { + "epoch": 0.054799033604217, + "grad_norm": 2.7829294204711914, + "learning_rate": 2.733260153677278e-07, + "loss": 1.1934, + "mean_token_accuracy": 0.6563996076583862, + "num_tokens": 12576184.0, + "step": 499 + }, + { + "epoch": 0.05490885130683066, + "grad_norm": 3.0592434406280518, + "learning_rate": 2.7387486278814485e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6727941632270813, + "num_tokens": 12597662.0, + "step": 500 + }, + { + "epoch": 0.05501866900944432, + "grad_norm": 2.778776168823242, + "learning_rate": 
2.74423710208562e-07, + "loss": 1.1171, + "mean_token_accuracy": 0.6752678155899048, + "num_tokens": 12622241.0, + "step": 501 + }, + { + "epoch": 0.055128486712057986, + "grad_norm": 2.616973876953125, + "learning_rate": 2.7497255762897913e-07, + "loss": 1.1858, + "mean_token_accuracy": 0.6538116335868835, + "num_tokens": 12650202.0, + "step": 502 + }, + { + "epoch": 0.055238304414671646, + "grad_norm": 3.2142152786254883, + "learning_rate": 2.7552140504939624e-07, + "loss": 1.1718, + "mean_token_accuracy": 0.6512656807899475, + "num_tokens": 12670739.0, + "step": 503 + }, + { + "epoch": 0.055348122117285306, + "grad_norm": 2.654853582382202, + "learning_rate": 2.7607025246981336e-07, + "loss": 1.1162, + "mean_token_accuracy": 0.6756888628005981, + "num_tokens": 12698044.0, + "step": 504 + }, + { + "epoch": 0.055457939819898966, + "grad_norm": 3.172919750213623, + "learning_rate": 2.7661909989023053e-07, + "loss": 1.0284, + "mean_token_accuracy": 0.6961695551872253, + "num_tokens": 12719254.0, + "step": 505 + }, + { + "epoch": 0.055567757522512626, + "grad_norm": 2.7904627323150635, + "learning_rate": 2.7716794731064764e-07, + "loss": 1.1843, + "mean_token_accuracy": 0.6639755368232727, + "num_tokens": 12746794.0, + "step": 506 + }, + { + "epoch": 0.05567757522512629, + "grad_norm": 3.1791176795959473, + "learning_rate": 2.7771679473106476e-07, + "loss": 1.0887, + "mean_token_accuracy": 0.6816771626472473, + "num_tokens": 12769441.0, + "step": 507 + }, + { + "epoch": 0.05578739292773995, + "grad_norm": 3.069209098815918, + "learning_rate": 2.7826564215148187e-07, + "loss": 1.1529, + "mean_token_accuracy": 0.6674942374229431, + "num_tokens": 12790823.0, + "step": 508 + }, + { + "epoch": 0.05589721063035361, + "grad_norm": 2.8213326930999756, + "learning_rate": 2.78814489571899e-07, + "loss": 1.0208, + "mean_token_accuracy": 0.6915856599807739, + "num_tokens": 12812284.0, + "step": 509 + }, + { + "epoch": 0.05600702833296727, + "grad_norm": 3.0216495990753174, + 
"learning_rate": 2.793633369923161e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.6964659690856934, + "num_tokens": 12831786.0, + "step": 510 + }, + { + "epoch": 0.05611684603558094, + "grad_norm": 2.7780985832214355, + "learning_rate": 2.799121844127332e-07, + "loss": 1.1726, + "mean_token_accuracy": 0.6622322797775269, + "num_tokens": 12857234.0, + "step": 511 + }, + { + "epoch": 0.0562266637381946, + "grad_norm": 2.940384864807129, + "learning_rate": 2.804610318331504e-07, + "loss": 1.1259, + "mean_token_accuracy": 0.6761329174041748, + "num_tokens": 12879430.0, + "step": 512 + }, + { + "epoch": 0.05633648144080826, + "grad_norm": 3.098926544189453, + "learning_rate": 2.810098792535675e-07, + "loss": 1.0505, + "mean_token_accuracy": 0.6924781799316406, + "num_tokens": 12900637.0, + "step": 513 + }, + { + "epoch": 0.05644629914342192, + "grad_norm": 3.5314488410949707, + "learning_rate": 2.815587266739846e-07, + "loss": 1.0998, + "mean_token_accuracy": 0.6772013306617737, + "num_tokens": 12918137.0, + "step": 514 + }, + { + "epoch": 0.05655611684603558, + "grad_norm": 3.2067737579345703, + "learning_rate": 2.8210757409440173e-07, + "loss": 1.116, + "mean_token_accuracy": 0.67569899559021, + "num_tokens": 12939691.0, + "step": 515 + }, + { + "epoch": 0.056665934548649245, + "grad_norm": 2.6719796657562256, + "learning_rate": 2.826564215148189e-07, + "loss": 1.2032, + "mean_token_accuracy": 0.649361252784729, + "num_tokens": 12966541.0, + "step": 516 + }, + { + "epoch": 0.056775752251262905, + "grad_norm": 2.8783822059631348, + "learning_rate": 2.83205268935236e-07, + "loss": 1.0724, + "mean_token_accuracy": 0.6953704357147217, + "num_tokens": 12989315.0, + "step": 517 + }, + { + "epoch": 0.056885569953876565, + "grad_norm": 2.4668161869049072, + "learning_rate": 2.8375411635565313e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.675737738609314, + "num_tokens": 13020711.0, + "step": 518 + }, + { + "epoch": 0.056995387656490225, + "grad_norm": 
2.807339906692505, + "learning_rate": 2.843029637760702e-07, + "loss": 1.1909, + "mean_token_accuracy": 0.6583257913589478, + "num_tokens": 13046494.0, + "step": 519 + }, + { + "epoch": 0.057105205359103885, + "grad_norm": 2.8503973484039307, + "learning_rate": 2.8485181119648736e-07, + "loss": 1.0212, + "mean_token_accuracy": 0.6974999308586121, + "num_tokens": 13072883.0, + "step": 520 + }, + { + "epoch": 0.05721502306171755, + "grad_norm": 2.8236775398254395, + "learning_rate": 2.854006586169045e-07, + "loss": 1.1135, + "mean_token_accuracy": 0.6780767440795898, + "num_tokens": 13097635.0, + "step": 521 + }, + { + "epoch": 0.05732484076433121, + "grad_norm": 2.4560625553131104, + "learning_rate": 2.859495060373216e-07, + "loss": 1.2112, + "mean_token_accuracy": 0.6578030586242676, + "num_tokens": 13127026.0, + "step": 522 + }, + { + "epoch": 0.05743465846694487, + "grad_norm": 2.6766185760498047, + "learning_rate": 2.8649835345773876e-07, + "loss": 1.2051, + "mean_token_accuracy": 0.6622928380966187, + "num_tokens": 13151505.0, + "step": 523 + }, + { + "epoch": 0.05754447616955853, + "grad_norm": 2.8793318271636963, + "learning_rate": 2.870472008781559e-07, + "loss": 1.1593, + "mean_token_accuracy": 0.6589547991752625, + "num_tokens": 13173188.0, + "step": 524 + }, + { + "epoch": 0.05765429387217219, + "grad_norm": 2.806520462036133, + "learning_rate": 2.87596048298573e-07, + "loss": 1.1812, + "mean_token_accuracy": 0.6545251607894897, + "num_tokens": 13198271.0, + "step": 525 + }, + { + "epoch": 0.05776411157478586, + "grad_norm": 3.099924087524414, + "learning_rate": 2.881448957189901e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.6630576252937317, + "num_tokens": 13218073.0, + "step": 526 + }, + { + "epoch": 0.05787392927739952, + "grad_norm": 2.824410915374756, + "learning_rate": 2.886937431394073e-07, + "loss": 1.1353, + "mean_token_accuracy": 0.6654239892959595, + "num_tokens": 13241269.0, + "step": 527 + }, + { + "epoch": 0.05798374698001318, + 
"grad_norm": 2.7979612350463867, + "learning_rate": 2.8924259055982434e-07, + "loss": 1.1578, + "mean_token_accuracy": 0.6765269041061401, + "num_tokens": 13265775.0, + "step": 528 + }, + { + "epoch": 0.05809356468262684, + "grad_norm": 3.2727127075195312, + "learning_rate": 2.8979143798024145e-07, + "loss": 1.1114, + "mean_token_accuracy": 0.6793792247772217, + "num_tokens": 13284220.0, + "step": 529 + }, + { + "epoch": 0.058203382385240504, + "grad_norm": 2.7986044883728027, + "learning_rate": 2.9034028540065857e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6705853939056396, + "num_tokens": 13308751.0, + "step": 530 + }, + { + "epoch": 0.058313200087854164, + "grad_norm": 2.897871255874634, + "learning_rate": 2.9088913282107574e-07, + "loss": 1.0913, + "mean_token_accuracy": 0.6806923151016235, + "num_tokens": 13333529.0, + "step": 531 + }, + { + "epoch": 0.058423017790467824, + "grad_norm": 2.7098028659820557, + "learning_rate": 2.9143798024149285e-07, + "loss": 1.1567, + "mean_token_accuracy": 0.6723824739456177, + "num_tokens": 13359157.0, + "step": 532 + }, + { + "epoch": 0.058532835493081484, + "grad_norm": 2.5971827507019043, + "learning_rate": 2.9198682766190997e-07, + "loss": 1.0544, + "mean_token_accuracy": 0.6860266923904419, + "num_tokens": 13385309.0, + "step": 533 + }, + { + "epoch": 0.058642653195695144, + "grad_norm": 2.966479778289795, + "learning_rate": 2.9253567508232713e-07, + "loss": 1.0038, + "mean_token_accuracy": 0.703565776348114, + "num_tokens": 13404936.0, + "step": 534 + }, + { + "epoch": 0.05875247089830881, + "grad_norm": 3.277047872543335, + "learning_rate": 2.9308452250274425e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6763636469841003, + "num_tokens": 13423046.0, + "step": 535 + }, + { + "epoch": 0.05886228860092247, + "grad_norm": 2.9443979263305664, + "learning_rate": 2.9363336992316136e-07, + "loss": 1.117, + "mean_token_accuracy": 0.6762725114822388, + "num_tokens": 13445394.0, + "step": 536 + }, + { + "epoch": 
0.05897210630353613, + "grad_norm": 2.941073179244995, + "learning_rate": 2.941822173435785e-07, + "loss": 1.0561, + "mean_token_accuracy": 0.6880275011062622, + "num_tokens": 13466013.0, + "step": 537 + }, + { + "epoch": 0.05908192400614979, + "grad_norm": 2.3978137969970703, + "learning_rate": 2.947310647639956e-07, + "loss": 1.1101, + "mean_token_accuracy": 0.6799939870834351, + "num_tokens": 13496045.0, + "step": 538 + }, + { + "epoch": 0.05919174170876345, + "grad_norm": 2.323853015899658, + "learning_rate": 2.952799121844127e-07, + "loss": 1.1775, + "mean_token_accuracy": 0.6553791761398315, + "num_tokens": 13527045.0, + "step": 539 + }, + { + "epoch": 0.05930155941137712, + "grad_norm": 2.5114169120788574, + "learning_rate": 2.958287596048298e-07, + "loss": 1.2061, + "mean_token_accuracy": 0.653729259967804, + "num_tokens": 13556085.0, + "step": 540 + }, + { + "epoch": 0.059411377113990776, + "grad_norm": 2.874176025390625, + "learning_rate": 2.9637760702524694e-07, + "loss": 1.184, + "mean_token_accuracy": 0.6613447666168213, + "num_tokens": 13579785.0, + "step": 541 + }, + { + "epoch": 0.059521194816604436, + "grad_norm": 2.4828665256500244, + "learning_rate": 2.969264544456641e-07, + "loss": 1.0965, + "mean_token_accuracy": 0.6838924288749695, + "num_tokens": 13606907.0, + "step": 542 + }, + { + "epoch": 0.059631012519218096, + "grad_norm": 2.6365292072296143, + "learning_rate": 2.974753018660812e-07, + "loss": 1.1285, + "mean_token_accuracy": 0.6809840798377991, + "num_tokens": 13632431.0, + "step": 543 + }, + { + "epoch": 0.059740830221831756, + "grad_norm": 2.476796865463257, + "learning_rate": 2.9802414928649834e-07, + "loss": 1.1789, + "mean_token_accuracy": 0.6601662635803223, + "num_tokens": 13662160.0, + "step": 544 + }, + { + "epoch": 0.05985064792444542, + "grad_norm": 2.6444709300994873, + "learning_rate": 2.9857299670691546e-07, + "loss": 1.1905, + "mean_token_accuracy": 0.6559277772903442, + "num_tokens": 13687544.0, + "step": 545 + }, + { + 
"epoch": 0.05996046562705908, + "grad_norm": 2.5461642742156982, + "learning_rate": 2.991218441273326e-07, + "loss": 1.1016, + "mean_token_accuracy": 0.6759538650512695, + "num_tokens": 13716813.0, + "step": 546 + }, + { + "epoch": 0.06007028332967274, + "grad_norm": 2.4353854656219482, + "learning_rate": 2.996706915477497e-07, + "loss": 1.1175, + "mean_token_accuracy": 0.6744880676269531, + "num_tokens": 13745846.0, + "step": 547 + }, + { + "epoch": 0.0601801010322864, + "grad_norm": 2.243669271469116, + "learning_rate": 3.002195389681668e-07, + "loss": 1.2459, + "mean_token_accuracy": 0.6384430527687073, + "num_tokens": 13782913.0, + "step": 548 + }, + { + "epoch": 0.06028991873490007, + "grad_norm": 2.651186466217041, + "learning_rate": 3.0076838638858397e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.678183913230896, + "num_tokens": 13807528.0, + "step": 549 + }, + { + "epoch": 0.06039973643751373, + "grad_norm": 2.7499074935913086, + "learning_rate": 3.013172338090011e-07, + "loss": 1.1475, + "mean_token_accuracy": 0.667216420173645, + "num_tokens": 13835691.0, + "step": 550 + }, + { + "epoch": 0.06050955414012739, + "grad_norm": 2.4920880794525146, + "learning_rate": 3.018660812294182e-07, + "loss": 1.0851, + "mean_token_accuracy": 0.6796188950538635, + "num_tokens": 13863619.0, + "step": 551 + }, + { + "epoch": 0.06061937184274105, + "grad_norm": 2.645073413848877, + "learning_rate": 3.024149286498353e-07, + "loss": 1.1805, + "mean_token_accuracy": 0.6637253165245056, + "num_tokens": 13887734.0, + "step": 552 + }, + { + "epoch": 0.06072918954535471, + "grad_norm": 2.436328649520874, + "learning_rate": 3.029637760702525e-07, + "loss": 1.1037, + "mean_token_accuracy": 0.6829124093055725, + "num_tokens": 13918838.0, + "step": 553 + }, + { + "epoch": 0.060839007247968376, + "grad_norm": 2.345961570739746, + "learning_rate": 3.035126234906696e-07, + "loss": 1.2301, + "mean_token_accuracy": 0.6505126953125, + "num_tokens": 13951706.0, + "step": 554 + }, + { + 
"epoch": 0.060948824950582035, + "grad_norm": 2.787062168121338, + "learning_rate": 3.040614709110867e-07, + "loss": 1.1453, + "mean_token_accuracy": 0.6716105937957764, + "num_tokens": 13976340.0, + "step": 555 + }, + { + "epoch": 0.061058642653195695, + "grad_norm": 3.086503028869629, + "learning_rate": 3.046103183315038e-07, + "loss": 1.1583, + "mean_token_accuracy": 0.6700878143310547, + "num_tokens": 13995581.0, + "step": 556 + }, + { + "epoch": 0.061168460355809355, + "grad_norm": 2.721364974975586, + "learning_rate": 3.0515916575192094e-07, + "loss": 1.1202, + "mean_token_accuracy": 0.6778144836425781, + "num_tokens": 14018538.0, + "step": 557 + }, + { + "epoch": 0.061278278058423015, + "grad_norm": 2.4531643390655518, + "learning_rate": 3.0570801317233806e-07, + "loss": 1.1427, + "mean_token_accuracy": 0.6703611016273499, + "num_tokens": 14046644.0, + "step": 558 + }, + { + "epoch": 0.06138809576103668, + "grad_norm": 2.472886562347412, + "learning_rate": 3.062568605927552e-07, + "loss": 1.0586, + "mean_token_accuracy": 0.6883119344711304, + "num_tokens": 14073350.0, + "step": 559 + }, + { + "epoch": 0.06149791346365034, + "grad_norm": 3.07132887840271, + "learning_rate": 3.0680570801317234e-07, + "loss": 1.1347, + "mean_token_accuracy": 0.6671514511108398, + "num_tokens": 14096505.0, + "step": 560 + }, + { + "epoch": 0.061607731166264, + "grad_norm": 2.672004461288452, + "learning_rate": 3.0735455543358946e-07, + "loss": 1.2015, + "mean_token_accuracy": 0.6660730838775635, + "num_tokens": 14121400.0, + "step": 561 + }, + { + "epoch": 0.06171754886887766, + "grad_norm": 2.1511788368225098, + "learning_rate": 3.0790340285400657e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6816846132278442, + "num_tokens": 14158228.0, + "step": 562 + }, + { + "epoch": 0.06182736657149132, + "grad_norm": 2.8138344287872314, + "learning_rate": 3.084522502744237e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6750808358192444, + "num_tokens": 14180735.0, + "step": 563 + 
}, + { + "epoch": 0.06193718427410499, + "grad_norm": 2.328840970993042, + "learning_rate": 3.0900109769484086e-07, + "loss": 1.2436, + "mean_token_accuracy": 0.6421109437942505, + "num_tokens": 14215936.0, + "step": 564 + }, + { + "epoch": 0.06204700197671865, + "grad_norm": 2.72223162651062, + "learning_rate": 3.0954994511525797e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6731175780296326, + "num_tokens": 14240923.0, + "step": 565 + }, + { + "epoch": 0.06215681967933231, + "grad_norm": 2.9312214851379395, + "learning_rate": 3.1009879253567503e-07, + "loss": 1.1534, + "mean_token_accuracy": 0.6696109771728516, + "num_tokens": 14261778.0, + "step": 566 + }, + { + "epoch": 0.06226663738194597, + "grad_norm": 2.212632894515991, + "learning_rate": 3.1064763995609215e-07, + "loss": 1.169, + "mean_token_accuracy": 0.6677085161209106, + "num_tokens": 14297115.0, + "step": 567 + }, + { + "epoch": 0.062376455084559634, + "grad_norm": 2.298412322998047, + "learning_rate": 3.111964873765093e-07, + "loss": 1.1548, + "mean_token_accuracy": 0.66495680809021, + "num_tokens": 14327446.0, + "step": 568 + }, + { + "epoch": 0.062486272787173294, + "grad_norm": 2.7101733684539795, + "learning_rate": 3.1174533479692643e-07, + "loss": 1.1004, + "mean_token_accuracy": 0.6803160905838013, + "num_tokens": 14350281.0, + "step": 569 + }, + { + "epoch": 0.06259609048978695, + "grad_norm": 2.23431134223938, + "learning_rate": 3.1229418221734355e-07, + "loss": 1.2197, + "mean_token_accuracy": 0.6521090269088745, + "num_tokens": 14384683.0, + "step": 570 + }, + { + "epoch": 0.06270590819240061, + "grad_norm": 3.3279101848602295, + "learning_rate": 3.128430296377607e-07, + "loss": 0.9655, + "mean_token_accuracy": 0.7071746587753296, + "num_tokens": 14402578.0, + "step": 571 + }, + { + "epoch": 0.06281572589501427, + "grad_norm": 2.7231826782226562, + "learning_rate": 3.1339187705817783e-07, + "loss": 1.1553, + "mean_token_accuracy": 0.654304027557373, + "num_tokens": 14428295.0, + "step": 
572 + }, + { + "epoch": 0.06292554359762793, + "grad_norm": 3.029651165008545, + "learning_rate": 3.1394072447859495e-07, + "loss": 1.0667, + "mean_token_accuracy": 0.6838836073875427, + "num_tokens": 14449619.0, + "step": 573 + }, + { + "epoch": 0.0630353613002416, + "grad_norm": 2.65346097946167, + "learning_rate": 3.1448957189901206e-07, + "loss": 1.134, + "mean_token_accuracy": 0.6725438237190247, + "num_tokens": 14477004.0, + "step": 574 + }, + { + "epoch": 0.06314517900285525, + "grad_norm": 2.513810396194458, + "learning_rate": 3.1503841931942923e-07, + "loss": 1.1343, + "mean_token_accuracy": 0.6692912578582764, + "num_tokens": 14505974.0, + "step": 575 + }, + { + "epoch": 0.06325499670546893, + "grad_norm": 2.909149408340454, + "learning_rate": 3.155872667398463e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6780856847763062, + "num_tokens": 14527113.0, + "step": 576 + }, + { + "epoch": 0.06336481440808259, + "grad_norm": 2.1114578247070312, + "learning_rate": 3.161361141602634e-07, + "loss": 1.3003, + "mean_token_accuracy": 0.6280856132507324, + "num_tokens": 14563630.0, + "step": 577 + }, + { + "epoch": 0.06347463211069625, + "grad_norm": 2.6579153537750244, + "learning_rate": 3.166849615806805e-07, + "loss": 1.0913, + "mean_token_accuracy": 0.675217866897583, + "num_tokens": 14588482.0, + "step": 578 + }, + { + "epoch": 0.0635844498133099, + "grad_norm": 2.6560630798339844, + "learning_rate": 3.172338090010977e-07, + "loss": 1.1834, + "mean_token_accuracy": 0.6619194149971008, + "num_tokens": 14616055.0, + "step": 579 + }, + { + "epoch": 0.06369426751592357, + "grad_norm": 2.6803152561187744, + "learning_rate": 3.177826564215148e-07, + "loss": 1.0956, + "mean_token_accuracy": 0.6811650991439819, + "num_tokens": 14640356.0, + "step": 580 + }, + { + "epoch": 0.06380408521853723, + "grad_norm": 3.021843433380127, + "learning_rate": 3.183315038419319e-07, + "loss": 1.1243, + "mean_token_accuracy": 0.6706340909004211, + "num_tokens": 14660683.0, + "step": 
581 + }, + { + "epoch": 0.06391390292115089, + "grad_norm": 2.5750837326049805, + "learning_rate": 3.188803512623491e-07, + "loss": 1.228, + "mean_token_accuracy": 0.6429418325424194, + "num_tokens": 14687330.0, + "step": 582 + }, + { + "epoch": 0.06402372062376455, + "grad_norm": 2.8907644748687744, + "learning_rate": 3.194291986827662e-07, + "loss": 1.1292, + "mean_token_accuracy": 0.6709950566291809, + "num_tokens": 14709786.0, + "step": 583 + }, + { + "epoch": 0.0641335383263782, + "grad_norm": 3.225160598754883, + "learning_rate": 3.199780461031833e-07, + "loss": 1.1785, + "mean_token_accuracy": 0.6794615983963013, + "num_tokens": 14730223.0, + "step": 584 + }, + { + "epoch": 0.06424335602899188, + "grad_norm": 2.643960475921631, + "learning_rate": 3.205268935236004e-07, + "loss": 1.2148, + "mean_token_accuracy": 0.644565999507904, + "num_tokens": 14757130.0, + "step": 585 + }, + { + "epoch": 0.06435317373160554, + "grad_norm": 3.0609288215637207, + "learning_rate": 3.2107574094401755e-07, + "loss": 1.1672, + "mean_token_accuracy": 0.6672347784042358, + "num_tokens": 14777051.0, + "step": 586 + }, + { + "epoch": 0.0644629914342192, + "grad_norm": 2.5193378925323486, + "learning_rate": 3.2162458836443467e-07, + "loss": 1.0308, + "mean_token_accuracy": 0.7023967504501343, + "num_tokens": 14803907.0, + "step": 587 + }, + { + "epoch": 0.06457280913683286, + "grad_norm": 2.5019190311431885, + "learning_rate": 3.221734357848518e-07, + "loss": 1.1913, + "mean_token_accuracy": 0.6645336151123047, + "num_tokens": 14831853.0, + "step": 588 + }, + { + "epoch": 0.06468262683944652, + "grad_norm": 2.661386013031006, + "learning_rate": 3.227222832052689e-07, + "loss": 1.1268, + "mean_token_accuracy": 0.6737356185913086, + "num_tokens": 14858050.0, + "step": 589 + }, + { + "epoch": 0.06479244454206018, + "grad_norm": 2.218308210372925, + "learning_rate": 3.2327113062568606e-07, + "loss": 1.0869, + "mean_token_accuracy": 0.6915162205696106, + "num_tokens": 14888661.0, + 
"step": 590 + }, + { + "epoch": 0.06490226224467384, + "grad_norm": 2.855579376220703, + "learning_rate": 3.238199780461032e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.687799870967865, + "num_tokens": 14909491.0, + "step": 591 + }, + { + "epoch": 0.0650120799472875, + "grad_norm": 2.5212056636810303, + "learning_rate": 3.243688254665203e-07, + "loss": 1.1061, + "mean_token_accuracy": 0.6789067983627319, + "num_tokens": 14937028.0, + "step": 592 + }, + { + "epoch": 0.06512189764990116, + "grad_norm": 3.016812562942505, + "learning_rate": 3.2491767288693746e-07, + "loss": 0.9152, + "mean_token_accuracy": 0.7243346571922302, + "num_tokens": 14954674.0, + "step": 593 + }, + { + "epoch": 0.06523171535251482, + "grad_norm": 2.511186122894287, + "learning_rate": 3.254665203073546e-07, + "loss": 1.0016, + "mean_token_accuracy": 0.7041577696800232, + "num_tokens": 14979777.0, + "step": 594 + }, + { + "epoch": 0.06534153305512849, + "grad_norm": 2.6996915340423584, + "learning_rate": 3.2601536772777164e-07, + "loss": 1.1122, + "mean_token_accuracy": 0.679176390171051, + "num_tokens": 15003191.0, + "step": 595 + }, + { + "epoch": 0.06545135075774215, + "grad_norm": 2.415316343307495, + "learning_rate": 3.2656421514818876e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6908979415893555, + "num_tokens": 15034847.0, + "step": 596 + }, + { + "epoch": 0.06556116846035581, + "grad_norm": 2.421203136444092, + "learning_rate": 3.271130625686059e-07, + "loss": 1.1391, + "mean_token_accuracy": 0.6725011467933655, + "num_tokens": 15062183.0, + "step": 597 + }, + { + "epoch": 0.06567098616296947, + "grad_norm": 2.36167311668396, + "learning_rate": 3.2766190998902304e-07, + "loss": 1.1118, + "mean_token_accuracy": 0.677710771560669, + "num_tokens": 15092946.0, + "step": 598 + }, + { + "epoch": 0.06578080386558313, + "grad_norm": 2.7881486415863037, + "learning_rate": 3.2821075740944015e-07, + "loss": 1.1382, + "mean_token_accuracy": 0.6747989654541016, + "num_tokens": 15115090.0, 
+ "step": 599 + }, + { + "epoch": 0.06589062156819679, + "grad_norm": 2.725412607192993, + "learning_rate": 3.2875960482985727e-07, + "loss": 1.0954, + "mean_token_accuracy": 0.6795216798782349, + "num_tokens": 15137046.0, + "step": 600 + }, + { + "epoch": 0.06600043927081045, + "grad_norm": 2.4272141456604004, + "learning_rate": 3.2930845225027444e-07, + "loss": 1.2093, + "mean_token_accuracy": 0.6556698083877563, + "num_tokens": 15165734.0, + "step": 601 + }, + { + "epoch": 0.06611025697342411, + "grad_norm": 2.504462480545044, + "learning_rate": 3.2985729967069155e-07, + "loss": 1.0363, + "mean_token_accuracy": 0.6977566480636597, + "num_tokens": 15190645.0, + "step": 602 + }, + { + "epoch": 0.06622007467603777, + "grad_norm": 2.726003408432007, + "learning_rate": 3.3040614709110867e-07, + "loss": 1.1295, + "mean_token_accuracy": 0.6718578934669495, + "num_tokens": 15212229.0, + "step": 603 + }, + { + "epoch": 0.06632989237865144, + "grad_norm": 2.1873505115509033, + "learning_rate": 3.3095499451152573e-07, + "loss": 1.1166, + "mean_token_accuracy": 0.6746806502342224, + "num_tokens": 15245089.0, + "step": 604 + }, + { + "epoch": 0.0664397100812651, + "grad_norm": 2.8752427101135254, + "learning_rate": 3.315038419319429e-07, + "loss": 1.0836, + "mean_token_accuracy": 0.6770894527435303, + "num_tokens": 15264036.0, + "step": 605 + }, + { + "epoch": 0.06654952778387876, + "grad_norm": 2.548799753189087, + "learning_rate": 3.3205268935236e-07, + "loss": 1.0435, + "mean_token_accuracy": 0.7036620378494263, + "num_tokens": 15287352.0, + "step": 606 + }, + { + "epoch": 0.06665934548649242, + "grad_norm": 2.323837995529175, + "learning_rate": 3.3260153677277713e-07, + "loss": 1.16, + "mean_token_accuracy": 0.6593409180641174, + "num_tokens": 15315830.0, + "step": 607 + }, + { + "epoch": 0.06676916318910608, + "grad_norm": 2.7458791732788086, + "learning_rate": 3.331503841931943e-07, + "loss": 1.1843, + "mean_token_accuracy": 0.6600192785263062, + "num_tokens": 
15337925.0, + "step": 608 + }, + { + "epoch": 0.06687898089171974, + "grad_norm": 2.949211597442627, + "learning_rate": 3.336992316136114e-07, + "loss": 1.0613, + "mean_token_accuracy": 0.6847716569900513, + "num_tokens": 15361542.0, + "step": 609 + }, + { + "epoch": 0.0669887985943334, + "grad_norm": 2.20949387550354, + "learning_rate": 3.3424807903402853e-07, + "loss": 1.1341, + "mean_token_accuracy": 0.6646608114242554, + "num_tokens": 15392740.0, + "step": 610 + }, + { + "epoch": 0.06709861629694706, + "grad_norm": 2.780973434448242, + "learning_rate": 3.3479692645444564e-07, + "loss": 1.0623, + "mean_token_accuracy": 0.6944647431373596, + "num_tokens": 15418729.0, + "step": 611 + }, + { + "epoch": 0.06720843399956072, + "grad_norm": 2.712317705154419, + "learning_rate": 3.353457738748628e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6902891397476196, + "num_tokens": 15441531.0, + "step": 612 + }, + { + "epoch": 0.06731825170217438, + "grad_norm": 2.5896294116973877, + "learning_rate": 3.3589462129527993e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.6737771034240723, + "num_tokens": 15466208.0, + "step": 613 + }, + { + "epoch": 0.06742806940478806, + "grad_norm": 2.3053863048553467, + "learning_rate": 3.36443468715697e-07, + "loss": 1.0861, + "mean_token_accuracy": 0.6928853988647461, + "num_tokens": 15496447.0, + "step": 614 + }, + { + "epoch": 0.06753788710740172, + "grad_norm": 2.921971321105957, + "learning_rate": 3.369923161361141e-07, + "loss": 1.13, + "mean_token_accuracy": 0.6809245347976685, + "num_tokens": 15516121.0, + "step": 615 + }, + { + "epoch": 0.06764770481001538, + "grad_norm": 2.503389835357666, + "learning_rate": 3.3754116355653127e-07, + "loss": 1.0625, + "mean_token_accuracy": 0.6932612657546997, + "num_tokens": 15540109.0, + "step": 616 + }, + { + "epoch": 0.06775752251262904, + "grad_norm": 3.102799654006958, + "learning_rate": 3.380900109769484e-07, + "loss": 1.1218, + "mean_token_accuracy": 0.6773762702941895, + "num_tokens": 
15557475.0, + "step": 617 + }, + { + "epoch": 0.0678673402152427, + "grad_norm": 2.5481526851654053, + "learning_rate": 3.386388583973655e-07, + "loss": 1.0938, + "mean_token_accuracy": 0.6731961369514465, + "num_tokens": 15581341.0, + "step": 618 + }, + { + "epoch": 0.06797715791785636, + "grad_norm": 2.2121741771698, + "learning_rate": 3.3918770581778267e-07, + "loss": 1.1228, + "mean_token_accuracy": 0.6661986112594604, + "num_tokens": 15613965.0, + "step": 619 + }, + { + "epoch": 0.06808697562047002, + "grad_norm": 2.561338186264038, + "learning_rate": 3.397365532381998e-07, + "loss": 1.0954, + "mean_token_accuracy": 0.6798969507217407, + "num_tokens": 15640732.0, + "step": 620 + }, + { + "epoch": 0.06819679332308368, + "grad_norm": 2.4357123374938965, + "learning_rate": 3.402854006586169e-07, + "loss": 1.1796, + "mean_token_accuracy": 0.6518585085868835, + "num_tokens": 15669327.0, + "step": 621 + }, + { + "epoch": 0.06830661102569734, + "grad_norm": 2.5744853019714355, + "learning_rate": 3.40834248079034e-07, + "loss": 1.1509, + "mean_token_accuracy": 0.6645584106445312, + "num_tokens": 15693428.0, + "step": 622 + }, + { + "epoch": 0.06841642872831101, + "grad_norm": 2.803546905517578, + "learning_rate": 3.4138309549945113e-07, + "loss": 1.0792, + "mean_token_accuracy": 0.6834772825241089, + "num_tokens": 15716339.0, + "step": 623 + }, + { + "epoch": 0.06852624643092467, + "grad_norm": 2.8019912242889404, + "learning_rate": 3.4193194291986825e-07, + "loss": 1.1222, + "mean_token_accuracy": 0.6812372803688049, + "num_tokens": 15738138.0, + "step": 624 + }, + { + "epoch": 0.06863606413353833, + "grad_norm": 2.6573383808135986, + "learning_rate": 3.4248079034028536e-07, + "loss": 1.0258, + "mean_token_accuracy": 0.6901243329048157, + "num_tokens": 15760150.0, + "step": 625 + }, + { + "epoch": 0.06874588183615199, + "grad_norm": 2.543210983276367, + "learning_rate": 3.430296377607025e-07, + "loss": 1.1128, + "mean_token_accuracy": 0.6782667636871338, + 
"num_tokens": 15784763.0, + "step": 626 + }, + { + "epoch": 0.06885569953876565, + "grad_norm": 2.5912694931030273, + "learning_rate": 3.4357848518111965e-07, + "loss": 1.0663, + "mean_token_accuracy": 0.6881307363510132, + "num_tokens": 15809578.0, + "step": 627 + }, + { + "epoch": 0.06896551724137931, + "grad_norm": 2.9924683570861816, + "learning_rate": 3.4412733260153676e-07, + "loss": 1.1524, + "mean_token_accuracy": 0.669642984867096, + "num_tokens": 15827800.0, + "step": 628 + }, + { + "epoch": 0.06907533494399297, + "grad_norm": 2.228060722351074, + "learning_rate": 3.446761800219539e-07, + "loss": 1.1044, + "mean_token_accuracy": 0.6898642778396606, + "num_tokens": 15858349.0, + "step": 629 + }, + { + "epoch": 0.06918515264660663, + "grad_norm": 2.9577338695526123, + "learning_rate": 3.4522502744237104e-07, + "loss": 1.0556, + "mean_token_accuracy": 0.6938169598579407, + "num_tokens": 15878648.0, + "step": 630 + }, + { + "epoch": 0.06929497034922029, + "grad_norm": 2.099459409713745, + "learning_rate": 3.4577387486278816e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6751818060874939, + "num_tokens": 15913188.0, + "step": 631 + }, + { + "epoch": 0.06940478805183395, + "grad_norm": 2.3543081283569336, + "learning_rate": 3.463227222832053e-07, + "loss": 1.0971, + "mean_token_accuracy": 0.676201343536377, + "num_tokens": 15941265.0, + "step": 632 + }, + { + "epoch": 0.06951460575444762, + "grad_norm": 2.434943199157715, + "learning_rate": 3.4687156970362234e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6927269697189331, + "num_tokens": 15966808.0, + "step": 633 + }, + { + "epoch": 0.06962442345706128, + "grad_norm": 2.8955342769622803, + "learning_rate": 3.474204171240395e-07, + "loss": 1.2445, + "mean_token_accuracy": 0.6602693796157837, + "num_tokens": 15987588.0, + "step": 634 + }, + { + "epoch": 0.06973424115967494, + "grad_norm": 2.784283399581909, + "learning_rate": 3.479692645444566e-07, + "loss": 1.0694, + "mean_token_accuracy": 
0.6825205087661743, + "num_tokens": 16010635.0, + "step": 635 + }, + { + "epoch": 0.0698440588622886, + "grad_norm": 2.465923309326172, + "learning_rate": 3.4851811196487374e-07, + "loss": 1.1851, + "mean_token_accuracy": 0.6603357195854187, + "num_tokens": 16036039.0, + "step": 636 + }, + { + "epoch": 0.06995387656490226, + "grad_norm": 2.954533815383911, + "learning_rate": 3.4906695938529085e-07, + "loss": 1.1476, + "mean_token_accuracy": 0.667350709438324, + "num_tokens": 16057501.0, + "step": 637 + }, + { + "epoch": 0.07006369426751592, + "grad_norm": 2.2490530014038086, + "learning_rate": 3.49615806805708e-07, + "loss": 1.0592, + "mean_token_accuracy": 0.6917130947113037, + "num_tokens": 16085122.0, + "step": 638 + }, + { + "epoch": 0.07017351197012958, + "grad_norm": 2.940509080886841, + "learning_rate": 3.5016465422612513e-07, + "loss": 1.048, + "mean_token_accuracy": 0.6907490491867065, + "num_tokens": 16104666.0, + "step": 639 + }, + { + "epoch": 0.07028332967274324, + "grad_norm": 3.155121326446533, + "learning_rate": 3.5071350164654225e-07, + "loss": 1.0081, + "mean_token_accuracy": 0.7010363340377808, + "num_tokens": 16123868.0, + "step": 640 + }, + { + "epoch": 0.0703931473753569, + "grad_norm": 2.274219274520874, + "learning_rate": 3.512623490669594e-07, + "loss": 1.2057, + "mean_token_accuracy": 0.6515191793441772, + "num_tokens": 16151798.0, + "step": 641 + }, + { + "epoch": 0.07050296507797058, + "grad_norm": 2.821948289871216, + "learning_rate": 3.518111964873765e-07, + "loss": 1.0672, + "mean_token_accuracy": 0.6812224388122559, + "num_tokens": 16171441.0, + "step": 642 + }, + { + "epoch": 0.07061278278058424, + "grad_norm": 3.080331563949585, + "learning_rate": 3.523600439077936e-07, + "loss": 1.073, + "mean_token_accuracy": 0.6840553283691406, + "num_tokens": 16189200.0, + "step": 643 + }, + { + "epoch": 0.0707226004831979, + "grad_norm": 2.447563886642456, + "learning_rate": 3.529088913282107e-07, + "loss": 1.1295, + "mean_token_accuracy": 
0.6698682904243469, + "num_tokens": 16214788.0, + "step": 644 + }, + { + "epoch": 0.07083241818581155, + "grad_norm": 2.5515148639678955, + "learning_rate": 3.534577387486279e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6752299666404724, + "num_tokens": 16238649.0, + "step": 645 + }, + { + "epoch": 0.07094223588842521, + "grad_norm": 2.2883987426757812, + "learning_rate": 3.54006586169045e-07, + "loss": 1.1792, + "mean_token_accuracy": 0.6564618349075317, + "num_tokens": 16268313.0, + "step": 646 + }, + { + "epoch": 0.07105205359103887, + "grad_norm": 2.4037864208221436, + "learning_rate": 3.545554335894621e-07, + "loss": 1.2043, + "mean_token_accuracy": 0.6598304510116577, + "num_tokens": 16297984.0, + "step": 647 + }, + { + "epoch": 0.07116187129365253, + "grad_norm": 3.025705575942993, + "learning_rate": 3.551042810098792e-07, + "loss": 1.1379, + "mean_token_accuracy": 0.6688076257705688, + "num_tokens": 16316489.0, + "step": 648 + }, + { + "epoch": 0.0712716889962662, + "grad_norm": 2.520049571990967, + "learning_rate": 3.556531284302964e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.6998509168624878, + "num_tokens": 16339467.0, + "step": 649 + }, + { + "epoch": 0.07138150669887985, + "grad_norm": 2.5324127674102783, + "learning_rate": 3.562019758507135e-07, + "loss": 1.0754, + "mean_token_accuracy": 0.6844525337219238, + "num_tokens": 16364667.0, + "step": 650 + }, + { + "epoch": 0.07149132440149351, + "grad_norm": 2.8728091716766357, + "learning_rate": 3.5675082327113057e-07, + "loss": 1.1498, + "mean_token_accuracy": 0.670398473739624, + "num_tokens": 16384161.0, + "step": 651 + }, + { + "epoch": 0.07160114210410719, + "grad_norm": 2.4250998497009277, + "learning_rate": 3.5729967069154774e-07, + "loss": 1.0598, + "mean_token_accuracy": 0.6868730783462524, + "num_tokens": 16411278.0, + "step": 652 + }, + { + "epoch": 0.07171095980672085, + "grad_norm": 2.316423177719116, + "learning_rate": 3.5784851811196485e-07, + "loss": 1.0958, + 
"mean_token_accuracy": 0.6793264150619507, + "num_tokens": 16439753.0, + "step": 653 + }, + { + "epoch": 0.07182077750933451, + "grad_norm": 2.641401529312134, + "learning_rate": 3.5839736553238197e-07, + "loss": 1.12, + "mean_token_accuracy": 0.6761783361434937, + "num_tokens": 16463478.0, + "step": 654 + }, + { + "epoch": 0.07193059521194817, + "grad_norm": 2.8458735942840576, + "learning_rate": 3.589462129527991e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.692963719367981, + "num_tokens": 16485221.0, + "step": 655 + }, + { + "epoch": 0.07204041291456183, + "grad_norm": 2.521883964538574, + "learning_rate": 3.5949506037321625e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.6827625036239624, + "num_tokens": 16510910.0, + "step": 656 + }, + { + "epoch": 0.07215023061717549, + "grad_norm": 2.6919686794281006, + "learning_rate": 3.6004390779363337e-07, + "loss": 1.187, + "mean_token_accuracy": 0.654962956905365, + "num_tokens": 16534550.0, + "step": 657 + }, + { + "epoch": 0.07226004831978915, + "grad_norm": 2.3077337741851807, + "learning_rate": 3.605927552140505e-07, + "loss": 1.1692, + "mean_token_accuracy": 0.6651220321655273, + "num_tokens": 16564410.0, + "step": 658 + }, + { + "epoch": 0.0723698660224028, + "grad_norm": 2.7832677364349365, + "learning_rate": 3.611416026344676e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.6686447858810425, + "num_tokens": 16586475.0, + "step": 659 + }, + { + "epoch": 0.07247968372501647, + "grad_norm": 2.4799225330352783, + "learning_rate": 3.6169045005488477e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6771341562271118, + "num_tokens": 16610927.0, + "step": 660 + }, + { + "epoch": 0.07258950142763014, + "grad_norm": 2.3315248489379883, + "learning_rate": 3.6223929747530183e-07, + "loss": 1.0975, + "mean_token_accuracy": 0.6855183243751526, + "num_tokens": 16641015.0, + "step": 661 + }, + { + "epoch": 0.0726993191302438, + "grad_norm": 2.3845293521881104, + "learning_rate": 3.6278814489571894e-07, + "loss": 
1.1187, + "mean_token_accuracy": 0.6727142930030823, + "num_tokens": 16668171.0, + "step": 662 + }, + { + "epoch": 0.07280913683285746, + "grad_norm": 2.427612066268921, + "learning_rate": 3.6333699231613606e-07, + "loss": 1.182, + "mean_token_accuracy": 0.6547329425811768, + "num_tokens": 16696778.0, + "step": 663 + }, + { + "epoch": 0.07291895453547112, + "grad_norm": 2.5028722286224365, + "learning_rate": 3.6388583973655323e-07, + "loss": 1.1704, + "mean_token_accuracy": 0.6579713225364685, + "num_tokens": 16722899.0, + "step": 664 + }, + { + "epoch": 0.07302877223808478, + "grad_norm": 2.512525796890259, + "learning_rate": 3.6443468715697034e-07, + "loss": 1.1047, + "mean_token_accuracy": 0.6763297319412231, + "num_tokens": 16748794.0, + "step": 665 + }, + { + "epoch": 0.07313858994069844, + "grad_norm": 2.3384547233581543, + "learning_rate": 3.6498353457738746e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6789442896842957, + "num_tokens": 16775544.0, + "step": 666 + }, + { + "epoch": 0.0732484076433121, + "grad_norm": 2.518455982208252, + "learning_rate": 3.655323819978046e-07, + "loss": 1.0667, + "mean_token_accuracy": 0.6848827600479126, + "num_tokens": 16799451.0, + "step": 667 + }, + { + "epoch": 0.07335822534592576, + "grad_norm": 2.6328928470611572, + "learning_rate": 3.6608122941822174e-07, + "loss": 1.1776, + "mean_token_accuracy": 0.6689873933792114, + "num_tokens": 16823199.0, + "step": 668 + }, + { + "epoch": 0.07346804304853942, + "grad_norm": 2.678807258605957, + "learning_rate": 3.6663007683863886e-07, + "loss": 1.0015, + "mean_token_accuracy": 0.7032730579376221, + "num_tokens": 16842983.0, + "step": 669 + }, + { + "epoch": 0.07357786075115308, + "grad_norm": 2.418389320373535, + "learning_rate": 3.671789242590559e-07, + "loss": 1.1165, + "mean_token_accuracy": 0.6766065359115601, + "num_tokens": 16868324.0, + "step": 670 + }, + { + "epoch": 0.07368767845376675, + "grad_norm": 2.2491986751556396, + "learning_rate": 3.677277716794731e-07, + 
"loss": 1.1074, + "mean_token_accuracy": 0.6806511282920837, + "num_tokens": 16896825.0, + "step": 671 + }, + { + "epoch": 0.07379749615638041, + "grad_norm": 2.4752562046051025, + "learning_rate": 3.682766190998902e-07, + "loss": 1.1313, + "mean_token_accuracy": 0.6731595396995544, + "num_tokens": 16922948.0, + "step": 672 + }, + { + "epoch": 0.07390731385899407, + "grad_norm": 2.5673305988311768, + "learning_rate": 3.688254665203073e-07, + "loss": 1.0083, + "mean_token_accuracy": 0.704251229763031, + "num_tokens": 16946686.0, + "step": 673 + }, + { + "epoch": 0.07401713156160773, + "grad_norm": 2.422200918197632, + "learning_rate": 3.6937431394072443e-07, + "loss": 1.1532, + "mean_token_accuracy": 0.6603802442550659, + "num_tokens": 16973620.0, + "step": 674 + }, + { + "epoch": 0.07412694926422139, + "grad_norm": 2.398324966430664, + "learning_rate": 3.699231613611416e-07, + "loss": 1.1353, + "mean_token_accuracy": 0.6673444509506226, + "num_tokens": 17002468.0, + "step": 675 + }, + { + "epoch": 0.07423676696683505, + "grad_norm": 2.758178949356079, + "learning_rate": 3.704720087815587e-07, + "loss": 1.0101, + "mean_token_accuracy": 0.7029598355293274, + "num_tokens": 17023172.0, + "step": 676 + }, + { + "epoch": 0.07434658466944871, + "grad_norm": 2.8412113189697266, + "learning_rate": 3.7102085620197583e-07, + "loss": 1.1477, + "mean_token_accuracy": 0.6593257188796997, + "num_tokens": 17044492.0, + "step": 677 + }, + { + "epoch": 0.07445640237206237, + "grad_norm": 3.178053140640259, + "learning_rate": 3.71569703622393e-07, + "loss": 1.0952, + "mean_token_accuracy": 0.6770972013473511, + "num_tokens": 17064646.0, + "step": 678 + }, + { + "epoch": 0.07456622007467603, + "grad_norm": 2.441101551055908, + "learning_rate": 3.721185510428101e-07, + "loss": 1.0784, + "mean_token_accuracy": 0.6774493455886841, + "num_tokens": 17089846.0, + "step": 679 + }, + { + "epoch": 0.0746760377772897, + "grad_norm": 2.7096619606018066, + "learning_rate": 3.726673984632272e-07, 
+ "loss": 1.0454, + "mean_token_accuracy": 0.7023369669914246, + "num_tokens": 17112343.0, + "step": 680 + }, + { + "epoch": 0.07478585547990337, + "grad_norm": 2.505997657775879, + "learning_rate": 3.732162458836443e-07, + "loss": 1.1231, + "mean_token_accuracy": 0.6689850687980652, + "num_tokens": 17137375.0, + "step": 681 + }, + { + "epoch": 0.07489567318251703, + "grad_norm": 2.4121267795562744, + "learning_rate": 3.7376509330406146e-07, + "loss": 1.1907, + "mean_token_accuracy": 0.6532436013221741, + "num_tokens": 17165117.0, + "step": 682 + }, + { + "epoch": 0.07500549088513069, + "grad_norm": 2.543029308319092, + "learning_rate": 3.743139407244786e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6821057796478271, + "num_tokens": 17189854.0, + "step": 683 + }, + { + "epoch": 0.07511530858774434, + "grad_norm": 2.433607578277588, + "learning_rate": 3.748627881448957e-07, + "loss": 1.106, + "mean_token_accuracy": 0.6705437898635864, + "num_tokens": 17216493.0, + "step": 684 + }, + { + "epoch": 0.075225126290358, + "grad_norm": 2.6939077377319336, + "learning_rate": 3.754116355653128e-07, + "loss": 1.0791, + "mean_token_accuracy": 0.6801088452339172, + "num_tokens": 17237723.0, + "step": 685 + }, + { + "epoch": 0.07533494399297166, + "grad_norm": 2.424144744873047, + "learning_rate": 3.7596048298573e-07, + "loss": 1.2339, + "mean_token_accuracy": 0.646138072013855, + "num_tokens": 17264832.0, + "step": 686 + }, + { + "epoch": 0.07544476169558532, + "grad_norm": 2.848402261734009, + "learning_rate": 3.765093304061471e-07, + "loss": 1.1, + "mean_token_accuracy": 0.6771562695503235, + "num_tokens": 17284780.0, + "step": 687 + }, + { + "epoch": 0.07555457939819898, + "grad_norm": 2.3519837856292725, + "learning_rate": 3.770581778265642e-07, + "loss": 1.0312, + "mean_token_accuracy": 0.6968759894371033, + "num_tokens": 17311272.0, + "step": 688 + }, + { + "epoch": 0.07566439710081264, + "grad_norm": 2.5342864990234375, + "learning_rate": 3.7760702524698137e-07, + 
"loss": 1.0345, + "mean_token_accuracy": 0.6916899681091309, + "num_tokens": 17335438.0, + "step": 689 + }, + { + "epoch": 0.07577421480342632, + "grad_norm": 2.2425856590270996, + "learning_rate": 3.7815587266739844e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.6834409236907959, + "num_tokens": 17364141.0, + "step": 690 + }, + { + "epoch": 0.07588403250603998, + "grad_norm": 2.5208358764648438, + "learning_rate": 3.7870472008781555e-07, + "loss": 1.1376, + "mean_token_accuracy": 0.6711099743843079, + "num_tokens": 17387840.0, + "step": 691 + }, + { + "epoch": 0.07599385020865364, + "grad_norm": 2.5545687675476074, + "learning_rate": 3.7925356750823267e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6760087609291077, + "num_tokens": 17409782.0, + "step": 692 + }, + { + "epoch": 0.0761036679112673, + "grad_norm": 2.052992820739746, + "learning_rate": 3.7980241492864983e-07, + "loss": 1.1968, + "mean_token_accuracy": 0.6533735394477844, + "num_tokens": 17446292.0, + "step": 693 + }, + { + "epoch": 0.07621348561388096, + "grad_norm": 2.383561849594116, + "learning_rate": 3.8035126234906695e-07, + "loss": 1.0833, + "mean_token_accuracy": 0.6849239468574524, + "num_tokens": 17476261.0, + "step": 694 + }, + { + "epoch": 0.07632330331649462, + "grad_norm": 3.159374952316284, + "learning_rate": 3.8090010976948406e-07, + "loss": 0.9894, + "mean_token_accuracy": 0.6986459493637085, + "num_tokens": 17493154.0, + "step": 695 + }, + { + "epoch": 0.07643312101910828, + "grad_norm": 2.3847992420196533, + "learning_rate": 3.814489571899012e-07, + "loss": 1.1007, + "mean_token_accuracy": 0.6756787300109863, + "num_tokens": 17519863.0, + "step": 696 + }, + { + "epoch": 0.07654293872172194, + "grad_norm": 2.5451784133911133, + "learning_rate": 3.8199780461031835e-07, + "loss": 1.1191, + "mean_token_accuracy": 0.6761246919631958, + "num_tokens": 17543120.0, + "step": 697 + }, + { + "epoch": 0.0766527564243356, + "grad_norm": 2.647397041320801, + "learning_rate": 
3.8254665203073546e-07, + "loss": 1.1037, + "mean_token_accuracy": 0.6729086637496948, + "num_tokens": 17565290.0, + "step": 698 + }, + { + "epoch": 0.07676257412694927, + "grad_norm": 2.2732834815979004, + "learning_rate": 3.830954994511525e-07, + "loss": 1.171, + "mean_token_accuracy": 0.661373496055603, + "num_tokens": 17594610.0, + "step": 699 + }, + { + "epoch": 0.07687239182956293, + "grad_norm": 2.7579188346862793, + "learning_rate": 3.836443468715697e-07, + "loss": 1.1144, + "mean_token_accuracy": 0.6755446195602417, + "num_tokens": 17615699.0, + "step": 700 + }, + { + "epoch": 0.07698220953217659, + "grad_norm": 2.757838726043701, + "learning_rate": 3.841931942919868e-07, + "loss": 0.9826, + "mean_token_accuracy": 0.7058974504470825, + "num_tokens": 17634740.0, + "step": 701 + }, + { + "epoch": 0.07709202723479025, + "grad_norm": 3.0257158279418945, + "learning_rate": 3.847420417124039e-07, + "loss": 1.0033, + "mean_token_accuracy": 0.6997252702713013, + "num_tokens": 17651681.0, + "step": 702 + }, + { + "epoch": 0.07720184493740391, + "grad_norm": 2.579807996749878, + "learning_rate": 3.8529088913282104e-07, + "loss": 0.9689, + "mean_token_accuracy": 0.708893895149231, + "num_tokens": 17673088.0, + "step": 703 + }, + { + "epoch": 0.07731166264001757, + "grad_norm": 2.9461865425109863, + "learning_rate": 3.858397365532382e-07, + "loss": 0.9544, + "mean_token_accuracy": 0.7035249471664429, + "num_tokens": 17690613.0, + "step": 704 + }, + { + "epoch": 0.07742148034263123, + "grad_norm": 2.4166176319122314, + "learning_rate": 3.863885839736553e-07, + "loss": 1.1572, + "mean_token_accuracy": 0.6575721502304077, + "num_tokens": 17718003.0, + "step": 705 + }, + { + "epoch": 0.07753129804524489, + "grad_norm": 2.72037672996521, + "learning_rate": 3.8693743139407244e-07, + "loss": 1.0233, + "mean_token_accuracy": 0.6984424591064453, + "num_tokens": 17739415.0, + "step": 706 + }, + { + "epoch": 0.07764111574785855, + "grad_norm": 2.2859017848968506, + 
"learning_rate": 3.8748627881448955e-07, + "loss": 1.1373, + "mean_token_accuracy": 0.6665388941764832, + "num_tokens": 17769209.0, + "step": 707 + }, + { + "epoch": 0.07775093345047221, + "grad_norm": 2.4971730709075928, + "learning_rate": 3.880351262349067e-07, + "loss": 1.0254, + "mean_token_accuracy": 0.6951342821121216, + "num_tokens": 17792788.0, + "step": 708 + }, + { + "epoch": 0.07786075115308588, + "grad_norm": 2.68829345703125, + "learning_rate": 3.885839736553238e-07, + "loss": 1.0319, + "mean_token_accuracy": 0.6904721856117249, + "num_tokens": 17813343.0, + "step": 709 + }, + { + "epoch": 0.07797056885569954, + "grad_norm": 2.7837836742401123, + "learning_rate": 3.891328210757409e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6852169036865234, + "num_tokens": 17837644.0, + "step": 710 + }, + { + "epoch": 0.0780803865583132, + "grad_norm": 2.3211593627929688, + "learning_rate": 3.8968166849615807e-07, + "loss": 1.1212, + "mean_token_accuracy": 0.6747230291366577, + "num_tokens": 17866672.0, + "step": 711 + }, + { + "epoch": 0.07819020426092686, + "grad_norm": 2.1993603706359863, + "learning_rate": 3.902305159165752e-07, + "loss": 1.1354, + "mean_token_accuracy": 0.6758900880813599, + "num_tokens": 17897938.0, + "step": 712 + }, + { + "epoch": 0.07830002196354052, + "grad_norm": 2.645503044128418, + "learning_rate": 3.907793633369923e-07, + "loss": 1.0671, + "mean_token_accuracy": 0.6890725493431091, + "num_tokens": 17920384.0, + "step": 713 + }, + { + "epoch": 0.07840983966615418, + "grad_norm": 2.5443575382232666, + "learning_rate": 3.913282107574094e-07, + "loss": 1.0604, + "mean_token_accuracy": 0.6820321679115295, + "num_tokens": 17941409.0, + "step": 714 + }, + { + "epoch": 0.07851965736876784, + "grad_norm": 2.481534719467163, + "learning_rate": 3.918770581778266e-07, + "loss": 1.0989, + "mean_token_accuracy": 0.6763007640838623, + "num_tokens": 17965710.0, + "step": 715 + }, + { + "epoch": 0.0786294750713815, + "grad_norm": 2.412942409515381, 
+ "learning_rate": 3.924259055982437e-07, + "loss": 1.0465, + "mean_token_accuracy": 0.7149618268013, + "num_tokens": 17992263.0, + "step": 716 + }, + { + "epoch": 0.07873929277399516, + "grad_norm": 2.582756996154785, + "learning_rate": 3.929747530186608e-07, + "loss": 1.0444, + "mean_token_accuracy": 0.6898366212844849, + "num_tokens": 18015196.0, + "step": 717 + }, + { + "epoch": 0.07884911047660884, + "grad_norm": 2.398740530014038, + "learning_rate": 3.935236004390779e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.6887186765670776, + "num_tokens": 18042969.0, + "step": 718 + }, + { + "epoch": 0.0789589281792225, + "grad_norm": 2.2134082317352295, + "learning_rate": 3.9407244785949504e-07, + "loss": 1.18, + "mean_token_accuracy": 0.6783194541931152, + "num_tokens": 18073141.0, + "step": 719 + }, + { + "epoch": 0.07906874588183616, + "grad_norm": 2.3666670322418213, + "learning_rate": 3.9462129527991216e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6819192171096802, + "num_tokens": 18100165.0, + "step": 720 + }, + { + "epoch": 0.07917856358444982, + "grad_norm": 2.3248565196990967, + "learning_rate": 3.9517014270032927e-07, + "loss": 1.0721, + "mean_token_accuracy": 0.6829104423522949, + "num_tokens": 18127649.0, + "step": 721 + }, + { + "epoch": 0.07928838128706348, + "grad_norm": 2.660203456878662, + "learning_rate": 3.957189901207464e-07, + "loss": 1.1275, + "mean_token_accuracy": 0.6679672598838806, + "num_tokens": 18150004.0, + "step": 722 + }, + { + "epoch": 0.07939819898967714, + "grad_norm": 2.058602809906006, + "learning_rate": 3.9626783754116356e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.67735755443573, + "num_tokens": 18180992.0, + "step": 723 + }, + { + "epoch": 0.0795080166922908, + "grad_norm": 2.4096601009368896, + "learning_rate": 3.9681668496158067e-07, + "loss": 1.0642, + "mean_token_accuracy": 0.6818733215332031, + "num_tokens": 18206116.0, + "step": 724 + }, + { + "epoch": 0.07961783439490445, + "grad_norm": 2.284515380859375, + 
"learning_rate": 3.973655323819978e-07, + "loss": 1.0571, + "mean_token_accuracy": 0.7002670764923096, + "num_tokens": 18234955.0, + "step": 725 + }, + { + "epoch": 0.07972765209751811, + "grad_norm": 2.4290642738342285, + "learning_rate": 3.9791437980241495e-07, + "loss": 1.0077, + "mean_token_accuracy": 0.6958414316177368, + "num_tokens": 18259000.0, + "step": 726 + }, + { + "epoch": 0.07983746980013177, + "grad_norm": 2.1704914569854736, + "learning_rate": 3.9846322722283207e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.6900841593742371, + "num_tokens": 18291518.0, + "step": 727 + }, + { + "epoch": 0.07994728750274545, + "grad_norm": 2.3621084690093994, + "learning_rate": 3.9901207464324913e-07, + "loss": 1.042, + "mean_token_accuracy": 0.6964784860610962, + "num_tokens": 18318378.0, + "step": 728 + }, + { + "epoch": 0.08005710520535911, + "grad_norm": 2.6089282035827637, + "learning_rate": 3.9956092206366625e-07, + "loss": 1.1179, + "mean_token_accuracy": 0.6720976829528809, + "num_tokens": 18343084.0, + "step": 729 + }, + { + "epoch": 0.08016692290797277, + "grad_norm": 3.118126153945923, + "learning_rate": 4.001097694840834e-07, + "loss": 1.1589, + "mean_token_accuracy": 0.6829028129577637, + "num_tokens": 18361029.0, + "step": 730 + }, + { + "epoch": 0.08027674061058643, + "grad_norm": 2.3597187995910645, + "learning_rate": 4.0065861690450053e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6823716163635254, + "num_tokens": 18387693.0, + "step": 731 + }, + { + "epoch": 0.08038655831320009, + "grad_norm": 2.466543436050415, + "learning_rate": 4.0120746432491765e-07, + "loss": 1.2166, + "mean_token_accuracy": 0.6486137509346008, + "num_tokens": 18415729.0, + "step": 732 + }, + { + "epoch": 0.08049637601581375, + "grad_norm": 2.3602397441864014, + "learning_rate": 4.0175631174533476e-07, + "loss": 1.0559, + "mean_token_accuracy": 0.682237982749939, + "num_tokens": 18445062.0, + "step": 733 + }, + { + "epoch": 0.08060619371842741, + "grad_norm": 
2.4771063327789307, + "learning_rate": 4.0230515916575193e-07, + "loss": 1.1171, + "mean_token_accuracy": 0.6743090748786926, + "num_tokens": 18467683.0, + "step": 734 + }, + { + "epoch": 0.08071601142104107, + "grad_norm": 2.1384575366973877, + "learning_rate": 4.0285400658616904e-07, + "loss": 1.1031, + "mean_token_accuracy": 0.6886795163154602, + "num_tokens": 18500423.0, + "step": 735 + }, + { + "epoch": 0.08082582912365473, + "grad_norm": 2.699432849884033, + "learning_rate": 4.0340285400658616e-07, + "loss": 1.0826, + "mean_token_accuracy": 0.6813129782676697, + "num_tokens": 18521688.0, + "step": 736 + }, + { + "epoch": 0.0809356468262684, + "grad_norm": 2.6230177879333496, + "learning_rate": 4.039517014270033e-07, + "loss": 0.9746, + "mean_token_accuracy": 0.710568904876709, + "num_tokens": 18543371.0, + "step": 737 + }, + { + "epoch": 0.08104546452888206, + "grad_norm": 2.7020792961120605, + "learning_rate": 4.045005488474204e-07, + "loss": 1.1303, + "mean_token_accuracy": 0.6729695796966553, + "num_tokens": 18563734.0, + "step": 738 + }, + { + "epoch": 0.08115528223149572, + "grad_norm": 2.4188292026519775, + "learning_rate": 4.050493962678375e-07, + "loss": 1.0767, + "mean_token_accuracy": 0.6783480644226074, + "num_tokens": 18588930.0, + "step": 739 + }, + { + "epoch": 0.08126509993410938, + "grad_norm": 2.390404224395752, + "learning_rate": 4.055982436882546e-07, + "loss": 0.9708, + "mean_token_accuracy": 0.7127358913421631, + "num_tokens": 18613099.0, + "step": 740 + }, + { + "epoch": 0.08137491763672304, + "grad_norm": 2.222390651702881, + "learning_rate": 4.061470911086718e-07, + "loss": 1.012, + "mean_token_accuracy": 0.6982486844062805, + "num_tokens": 18640247.0, + "step": 741 + }, + { + "epoch": 0.0814847353393367, + "grad_norm": 2.462271213531494, + "learning_rate": 4.066959385290889e-07, + "loss": 1.1199, + "mean_token_accuracy": 0.6716838479042053, + "num_tokens": 18664894.0, + "step": 742 + }, + { + "epoch": 0.08159455304195036, + 
"grad_norm": 2.20913028717041, + "learning_rate": 4.07244785949506e-07, + "loss": 1.124, + "mean_token_accuracy": 0.674543559551239, + "num_tokens": 18693895.0, + "step": 743 + }, + { + "epoch": 0.08170437074456402, + "grad_norm": 2.2242226600646973, + "learning_rate": 4.0779363336992313e-07, + "loss": 1.0757, + "mean_token_accuracy": 0.6812198758125305, + "num_tokens": 18721402.0, + "step": 744 + }, + { + "epoch": 0.08181418844717768, + "grad_norm": 2.3013033866882324, + "learning_rate": 4.083424807903403e-07, + "loss": 1.1124, + "mean_token_accuracy": 0.6733696460723877, + "num_tokens": 18749927.0, + "step": 745 + }, + { + "epoch": 0.08192400614979134, + "grad_norm": 2.290344715118408, + "learning_rate": 4.0889132821075737e-07, + "loss": 0.9889, + "mean_token_accuracy": 0.7069524526596069, + "num_tokens": 18778435.0, + "step": 746 + }, + { + "epoch": 0.08203382385240501, + "grad_norm": 2.1170763969421387, + "learning_rate": 4.094401756311745e-07, + "loss": 1.0124, + "mean_token_accuracy": 0.6974818706512451, + "num_tokens": 18808844.0, + "step": 747 + }, + { + "epoch": 0.08214364155501867, + "grad_norm": 2.2502031326293945, + "learning_rate": 4.0998902305159165e-07, + "loss": 1.0759, + "mean_token_accuracy": 0.6831977367401123, + "num_tokens": 18837283.0, + "step": 748 + }, + { + "epoch": 0.08225345925763233, + "grad_norm": 2.3968591690063477, + "learning_rate": 4.1053787047200876e-07, + "loss": 1.2041, + "mean_token_accuracy": 0.6521055698394775, + "num_tokens": 18864327.0, + "step": 749 + }, + { + "epoch": 0.08236327696024599, + "grad_norm": 2.6679844856262207, + "learning_rate": 4.110867178924259e-07, + "loss": 1.1163, + "mean_token_accuracy": 0.6753627061843872, + "num_tokens": 18885773.0, + "step": 750 + }, + { + "epoch": 0.08247309466285965, + "grad_norm": 2.105592966079712, + "learning_rate": 4.11635565312843e-07, + "loss": 1.0531, + "mean_token_accuracy": 0.6901881098747253, + "num_tokens": 18917124.0, + "step": 751 + }, + { + "epoch": 
0.08258291236547331, + "grad_norm": 2.124455690383911, + "learning_rate": 4.1218441273326016e-07, + "loss": 1.1526, + "mean_token_accuracy": 0.6673635244369507, + "num_tokens": 18947894.0, + "step": 752 + }, + { + "epoch": 0.08269273006808697, + "grad_norm": 2.392573595046997, + "learning_rate": 4.127332601536773e-07, + "loss": 0.9921, + "mean_token_accuracy": 0.7060288190841675, + "num_tokens": 18972187.0, + "step": 753 + }, + { + "epoch": 0.08280254777070063, + "grad_norm": 2.3994009494781494, + "learning_rate": 4.132821075740944e-07, + "loss": 1.1266, + "mean_token_accuracy": 0.6785954236984253, + "num_tokens": 18999598.0, + "step": 754 + }, + { + "epoch": 0.08291236547331429, + "grad_norm": 2.560957670211792, + "learning_rate": 4.138309549945115e-07, + "loss": 1.0852, + "mean_token_accuracy": 0.6830888986587524, + "num_tokens": 19023500.0, + "step": 755 + }, + { + "epoch": 0.08302218317592797, + "grad_norm": 2.2723746299743652, + "learning_rate": 4.143798024149286e-07, + "loss": 1.1086, + "mean_token_accuracy": 0.6839545369148254, + "num_tokens": 19050274.0, + "step": 756 + }, + { + "epoch": 0.08313200087854163, + "grad_norm": 2.6988141536712646, + "learning_rate": 4.1492864983534574e-07, + "loss": 1.1087, + "mean_token_accuracy": 0.6728480458259583, + "num_tokens": 19070645.0, + "step": 757 + }, + { + "epoch": 0.08324181858115529, + "grad_norm": 2.623339891433716, + "learning_rate": 4.1547749725576285e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.6946727633476257, + "num_tokens": 19091221.0, + "step": 758 + }, + { + "epoch": 0.08335163628376895, + "grad_norm": 2.252521276473999, + "learning_rate": 4.1602634467618e-07, + "loss": 1.0307, + "mean_token_accuracy": 0.6874610185623169, + "num_tokens": 19117490.0, + "step": 759 + }, + { + "epoch": 0.0834614539863826, + "grad_norm": 2.50317645072937, + "learning_rate": 4.1657519209659714e-07, + "loss": 0.9765, + "mean_token_accuracy": 0.7079938650131226, + "num_tokens": 19140385.0, + "step": 760 + }, + { + 
"epoch": 0.08357127168899627, + "grad_norm": 2.49137806892395, + "learning_rate": 4.1712403951701425e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.7013164758682251, + "num_tokens": 19164003.0, + "step": 761 + }, + { + "epoch": 0.08368108939160993, + "grad_norm": 2.179130792617798, + "learning_rate": 4.1767288693743137e-07, + "loss": 1.0712, + "mean_token_accuracy": 0.6900628805160522, + "num_tokens": 19195237.0, + "step": 762 + }, + { + "epoch": 0.08379090709422359, + "grad_norm": 2.511406421661377, + "learning_rate": 4.1822173435784854e-07, + "loss": 1.0916, + "mean_token_accuracy": 0.6866112947463989, + "num_tokens": 19221117.0, + "step": 763 + }, + { + "epoch": 0.08390072479683724, + "grad_norm": 1.910391092300415, + "learning_rate": 4.1877058177826565e-07, + "loss": 1.0829, + "mean_token_accuracy": 0.6775920391082764, + "num_tokens": 19256726.0, + "step": 764 + }, + { + "epoch": 0.0840105424994509, + "grad_norm": 2.0727040767669678, + "learning_rate": 4.193194291986827e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.6764180064201355, + "num_tokens": 19289549.0, + "step": 765 + }, + { + "epoch": 0.08412036020206458, + "grad_norm": 2.497769832611084, + "learning_rate": 4.1986827661909983e-07, + "loss": 1.0589, + "mean_token_accuracy": 0.6824806928634644, + "num_tokens": 19314447.0, + "step": 766 + }, + { + "epoch": 0.08423017790467824, + "grad_norm": 2.2559022903442383, + "learning_rate": 4.20417124039517e-07, + "loss": 1.132, + "mean_token_accuracy": 0.6678884029388428, + "num_tokens": 19341932.0, + "step": 767 + }, + { + "epoch": 0.0843399956072919, + "grad_norm": 2.20353627204895, + "learning_rate": 4.209659714599341e-07, + "loss": 1.1296, + "mean_token_accuracy": 0.6687237620353699, + "num_tokens": 19371601.0, + "step": 768 + }, + { + "epoch": 0.08444981330990556, + "grad_norm": 2.259108543395996, + "learning_rate": 4.2151481888035123e-07, + "loss": 1.0924, + "mean_token_accuracy": 0.6799494028091431, + "num_tokens": 19399936.0, + "step": 769 + }, + { + 
"epoch": 0.08455963101251922, + "grad_norm": 2.491393566131592, + "learning_rate": 4.2206366630076834e-07, + "loss": 1.2088, + "mean_token_accuracy": 0.6537195444107056, + "num_tokens": 19425901.0, + "step": 770 + }, + { + "epoch": 0.08466944871513288, + "grad_norm": 2.2533252239227295, + "learning_rate": 4.226125137211855e-07, + "loss": 1.1647, + "mean_token_accuracy": 0.6706730723381042, + "num_tokens": 19454852.0, + "step": 771 + }, + { + "epoch": 0.08477926641774654, + "grad_norm": 2.9822847843170166, + "learning_rate": 4.231613611416026e-07, + "loss": 1.0199, + "mean_token_accuracy": 0.6936815977096558, + "num_tokens": 19473059.0, + "step": 772 + }, + { + "epoch": 0.0848890841203602, + "grad_norm": 2.134784460067749, + "learning_rate": 4.2371020856201974e-07, + "loss": 1.0674, + "mean_token_accuracy": 0.6858918070793152, + "num_tokens": 19505064.0, + "step": 773 + }, + { + "epoch": 0.08499890182297386, + "grad_norm": 2.537125587463379, + "learning_rate": 4.242590559824369e-07, + "loss": 1.1978, + "mean_token_accuracy": 0.6563242077827454, + "num_tokens": 19531875.0, + "step": 774 + }, + { + "epoch": 0.08510871952558753, + "grad_norm": 2.2267558574676514, + "learning_rate": 4.2480790340285397e-07, + "loss": 1.1968, + "mean_token_accuracy": 0.6586317420005798, + "num_tokens": 19560047.0, + "step": 775 + }, + { + "epoch": 0.08521853722820119, + "grad_norm": 2.546537160873413, + "learning_rate": 4.253567508232711e-07, + "loss": 1.1457, + "mean_token_accuracy": 0.6688651442527771, + "num_tokens": 19583024.0, + "step": 776 + }, + { + "epoch": 0.08532835493081485, + "grad_norm": 2.316696882247925, + "learning_rate": 4.259055982436882e-07, + "loss": 1.1855, + "mean_token_accuracy": 0.659776508808136, + "num_tokens": 19610319.0, + "step": 777 + }, + { + "epoch": 0.08543817263342851, + "grad_norm": 2.36940336227417, + "learning_rate": 4.2645444566410537e-07, + "loss": 1.1253, + "mean_token_accuracy": 0.6763954162597656, + "num_tokens": 19634461.0, + "step": 778 + }, + { 
+ "epoch": 0.08554799033604217, + "grad_norm": 2.2244691848754883, + "learning_rate": 4.270032930845225e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6750435829162598, + "num_tokens": 19663333.0, + "step": 779 + }, + { + "epoch": 0.08565780803865583, + "grad_norm": 2.2690796852111816, + "learning_rate": 4.275521405049396e-07, + "loss": 1.1372, + "mean_token_accuracy": 0.6689225435256958, + "num_tokens": 19691721.0, + "step": 780 + }, + { + "epoch": 0.08576762574126949, + "grad_norm": 2.545482635498047, + "learning_rate": 4.281009879253567e-07, + "loss": 1.014, + "mean_token_accuracy": 0.6981731653213501, + "num_tokens": 19715460.0, + "step": 781 + }, + { + "epoch": 0.08587744344388315, + "grad_norm": 2.3323309421539307, + "learning_rate": 4.286498353457739e-07, + "loss": 1.1092, + "mean_token_accuracy": 0.6728013753890991, + "num_tokens": 19741360.0, + "step": 782 + }, + { + "epoch": 0.08598726114649681, + "grad_norm": 2.332857370376587, + "learning_rate": 4.29198682766191e-07, + "loss": 1.0625, + "mean_token_accuracy": 0.6850797533988953, + "num_tokens": 19768185.0, + "step": 783 + }, + { + "epoch": 0.08609707884911047, + "grad_norm": 2.2006561756134033, + "learning_rate": 4.2974753018660806e-07, + "loss": 1.146, + "mean_token_accuracy": 0.6743675470352173, + "num_tokens": 19796292.0, + "step": 784 + }, + { + "epoch": 0.08620689655172414, + "grad_norm": 2.749635696411133, + "learning_rate": 4.3029637760702523e-07, + "loss": 1.1623, + "mean_token_accuracy": 0.6652998328208923, + "num_tokens": 19818900.0, + "step": 785 + }, + { + "epoch": 0.0863167142543378, + "grad_norm": 2.912391185760498, + "learning_rate": 4.3084522502744235e-07, + "loss": 1.0443, + "mean_token_accuracy": 0.6889141798019409, + "num_tokens": 19840686.0, + "step": 786 + }, + { + "epoch": 0.08642653195695146, + "grad_norm": 2.541618585586548, + "learning_rate": 4.3139407244785946e-07, + "loss": 1.0876, + "mean_token_accuracy": 0.6912481188774109, + "num_tokens": 19863306.0, + "step": 787 + }, + 
{ + "epoch": 0.08653634965956512, + "grad_norm": 2.474992275238037, + "learning_rate": 4.319429198682766e-07, + "loss": 1.1502, + "mean_token_accuracy": 0.6597659587860107, + "num_tokens": 19887496.0, + "step": 788 + }, + { + "epoch": 0.08664616736217878, + "grad_norm": 2.561335802078247, + "learning_rate": 4.3249176728869374e-07, + "loss": 1.1253, + "mean_token_accuracy": 0.6711612939834595, + "num_tokens": 19912478.0, + "step": 789 + }, + { + "epoch": 0.08675598506479244, + "grad_norm": 2.348053455352783, + "learning_rate": 4.3304061470911086e-07, + "loss": 1.0544, + "mean_token_accuracy": 0.6883223056793213, + "num_tokens": 19938532.0, + "step": 790 + }, + { + "epoch": 0.0868658027674061, + "grad_norm": 2.438049793243408, + "learning_rate": 4.33589462129528e-07, + "loss": 1.1078, + "mean_token_accuracy": 0.6708307266235352, + "num_tokens": 19963354.0, + "step": 791 + }, + { + "epoch": 0.08697562047001976, + "grad_norm": 2.4061238765716553, + "learning_rate": 4.341383095499451e-07, + "loss": 1.0832, + "mean_token_accuracy": 0.6804481148719788, + "num_tokens": 19988432.0, + "step": 792 + }, + { + "epoch": 0.08708543817263342, + "grad_norm": 2.615272045135498, + "learning_rate": 4.3468715697036226e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.6937229037284851, + "num_tokens": 20010548.0, + "step": 793 + }, + { + "epoch": 0.0871952558752471, + "grad_norm": 2.270385503768921, + "learning_rate": 4.352360043907793e-07, + "loss": 1.0282, + "mean_token_accuracy": 0.6987624764442444, + "num_tokens": 20036764.0, + "step": 794 + }, + { + "epoch": 0.08730507357786076, + "grad_norm": 2.0728983879089355, + "learning_rate": 4.3578485181119644e-07, + "loss": 1.105, + "mean_token_accuracy": 0.6675371527671814, + "num_tokens": 20069293.0, + "step": 795 + }, + { + "epoch": 0.08741489128047442, + "grad_norm": 2.43455171585083, + "learning_rate": 4.363336992316136e-07, + "loss": 1.0623, + "mean_token_accuracy": 0.6856253147125244, + "num_tokens": 20092486.0, + "step": 796 + }, + 
{ + "epoch": 0.08752470898308808, + "grad_norm": 2.1094353199005127, + "learning_rate": 4.368825466520307e-07, + "loss": 1.1227, + "mean_token_accuracy": 0.6713043451309204, + "num_tokens": 20124091.0, + "step": 797 + }, + { + "epoch": 0.08763452668570174, + "grad_norm": 2.723862648010254, + "learning_rate": 4.3743139407244783e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6752392053604126, + "num_tokens": 20145859.0, + "step": 798 + }, + { + "epoch": 0.0877443443883154, + "grad_norm": 2.475290060043335, + "learning_rate": 4.3798024149286495e-07, + "loss": 1.1422, + "mean_token_accuracy": 0.6655663251876831, + "num_tokens": 20170523.0, + "step": 799 + }, + { + "epoch": 0.08785416209092906, + "grad_norm": 2.670757293701172, + "learning_rate": 4.385290889132821e-07, + "loss": 1.0647, + "mean_token_accuracy": 0.686606228351593, + "num_tokens": 20192950.0, + "step": 800 + }, + { + "epoch": 0.08796397979354272, + "grad_norm": 2.194441318511963, + "learning_rate": 4.3907793633369923e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6933830380439758, + "num_tokens": 20221103.0, + "step": 801 + }, + { + "epoch": 0.08807379749615638, + "grad_norm": 2.396991014480591, + "learning_rate": 4.3962678375411635e-07, + "loss": 1.026, + "mean_token_accuracy": 0.6943655610084534, + "num_tokens": 20246297.0, + "step": 802 + }, + { + "epoch": 0.08818361519877004, + "grad_norm": 2.4246745109558105, + "learning_rate": 4.401756311745334e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6700174808502197, + "num_tokens": 20271232.0, + "step": 803 + }, + { + "epoch": 0.08829343290138371, + "grad_norm": 2.5782408714294434, + "learning_rate": 4.407244785949506e-07, + "loss": 1.0131, + "mean_token_accuracy": 0.6934108734130859, + "num_tokens": 20294574.0, + "step": 804 + }, + { + "epoch": 0.08840325060399737, + "grad_norm": 2.734862804412842, + "learning_rate": 4.412733260153677e-07, + "loss": 1.1389, + "mean_token_accuracy": 0.6693503260612488, + "num_tokens": 20318100.0, + "step": 805 + }, 
+ { + "epoch": 0.08851306830661103, + "grad_norm": 2.726649522781372, + "learning_rate": 4.418221734357848e-07, + "loss": 1.1383, + "mean_token_accuracy": 0.6619410514831543, + "num_tokens": 20339812.0, + "step": 806 + }, + { + "epoch": 0.08862288600922469, + "grad_norm": 2.4363718032836914, + "learning_rate": 4.42371020856202e-07, + "loss": 1.0392, + "mean_token_accuracy": 0.6801987290382385, + "num_tokens": 20364274.0, + "step": 807 + }, + { + "epoch": 0.08873270371183835, + "grad_norm": 2.3139448165893555, + "learning_rate": 4.429198682766191e-07, + "loss": 1.0855, + "mean_token_accuracy": 0.67362380027771, + "num_tokens": 20391576.0, + "step": 808 + }, + { + "epoch": 0.08884252141445201, + "grad_norm": 2.543118476867676, + "learning_rate": 4.434687156970362e-07, + "loss": 0.9317, + "mean_token_accuracy": 0.7140514850616455, + "num_tokens": 20414466.0, + "step": 809 + }, + { + "epoch": 0.08895233911706567, + "grad_norm": 2.4159398078918457, + "learning_rate": 4.440175631174533e-07, + "loss": 1.1213, + "mean_token_accuracy": 0.6736444234848022, + "num_tokens": 20440457.0, + "step": 810 + }, + { + "epoch": 0.08906215681967933, + "grad_norm": 2.463202953338623, + "learning_rate": 4.445664105378705e-07, + "loss": 1.1159, + "mean_token_accuracy": 0.6683136224746704, + "num_tokens": 20466643.0, + "step": 811 + }, + { + "epoch": 0.08917197452229299, + "grad_norm": 2.3967514038085938, + "learning_rate": 4.451152579582876e-07, + "loss": 1.0225, + "mean_token_accuracy": 0.704746663570404, + "num_tokens": 20493265.0, + "step": 812 + }, + { + "epoch": 0.08928179222490666, + "grad_norm": 2.504504442214966, + "learning_rate": 4.4566410537870467e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.6949347853660583, + "num_tokens": 20514573.0, + "step": 813 + }, + { + "epoch": 0.08939160992752032, + "grad_norm": 2.5868425369262695, + "learning_rate": 4.462129527991218e-07, + "loss": 1.1454, + "mean_token_accuracy": 0.673192024230957, + "num_tokens": 20536244.0, + "step": 814 + }, 
+ { + "epoch": 0.08950142763013398, + "grad_norm": 2.3917455673217773, + "learning_rate": 4.4676180021953895e-07, + "loss": 1.1466, + "mean_token_accuracy": 0.6640468239784241, + "num_tokens": 20562473.0, + "step": 815 + }, + { + "epoch": 0.08961124533274764, + "grad_norm": 2.5808498859405518, + "learning_rate": 4.4731064763995607e-07, + "loss": 0.9608, + "mean_token_accuracy": 0.7082681655883789, + "num_tokens": 20583484.0, + "step": 816 + }, + { + "epoch": 0.0897210630353613, + "grad_norm": 2.8625376224517822, + "learning_rate": 4.478594950603732e-07, + "loss": 0.988, + "mean_token_accuracy": 0.7035341262817383, + "num_tokens": 20600813.0, + "step": 817 + }, + { + "epoch": 0.08983088073797496, + "grad_norm": 2.4349050521850586, + "learning_rate": 4.4840834248079035e-07, + "loss": 1.0507, + "mean_token_accuracy": 0.693722665309906, + "num_tokens": 20624944.0, + "step": 818 + }, + { + "epoch": 0.08994069844058862, + "grad_norm": 2.648808717727661, + "learning_rate": 4.4895718990120747e-07, + "loss": 1.0911, + "mean_token_accuracy": 0.6790225505828857, + "num_tokens": 20646439.0, + "step": 819 + }, + { + "epoch": 0.09005051614320228, + "grad_norm": 2.147771120071411, + "learning_rate": 4.495060373216246e-07, + "loss": 1.1086, + "mean_token_accuracy": 0.6733027100563049, + "num_tokens": 20675894.0, + "step": 820 + }, + { + "epoch": 0.09016033384581594, + "grad_norm": 2.499911308288574, + "learning_rate": 4.500548847420417e-07, + "loss": 1.1684, + "mean_token_accuracy": 0.6615160703659058, + "num_tokens": 20700629.0, + "step": 821 + }, + { + "epoch": 0.0902701515484296, + "grad_norm": 2.487645149230957, + "learning_rate": 4.5060373216245886e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.6982757449150085, + "num_tokens": 20722307.0, + "step": 822 + }, + { + "epoch": 0.09037996925104327, + "grad_norm": 2.23895525932312, + "learning_rate": 4.5115257958287593e-07, + "loss": 1.128, + "mean_token_accuracy": 0.6724228858947754, + "num_tokens": 20751371.0, + "step": 823 + 
}, + { + "epoch": 0.09048978695365693, + "grad_norm": 2.4830234050750732, + "learning_rate": 4.5170142700329304e-07, + "loss": 1.081, + "mean_token_accuracy": 0.680242657661438, + "num_tokens": 20776599.0, + "step": 824 + }, + { + "epoch": 0.0905996046562706, + "grad_norm": 2.3895483016967773, + "learning_rate": 4.5225027442371016e-07, + "loss": 1.17, + "mean_token_accuracy": 0.6620667576789856, + "num_tokens": 20802814.0, + "step": 825 + }, + { + "epoch": 0.09070942235888425, + "grad_norm": 2.2536396980285645, + "learning_rate": 4.527991218441273e-07, + "loss": 1.0987, + "mean_token_accuracy": 0.6754958629608154, + "num_tokens": 20830381.0, + "step": 826 + }, + { + "epoch": 0.09081924006149791, + "grad_norm": 2.4142534732818604, + "learning_rate": 4.5334796926454444e-07, + "loss": 1.139, + "mean_token_accuracy": 0.6643408536911011, + "num_tokens": 20854475.0, + "step": 827 + }, + { + "epoch": 0.09092905776411157, + "grad_norm": 2.2028915882110596, + "learning_rate": 4.5389681668496156e-07, + "loss": 1.1916, + "mean_token_accuracy": 0.6613553762435913, + "num_tokens": 20884253.0, + "step": 828 + }, + { + "epoch": 0.09103887546672523, + "grad_norm": 2.586768865585327, + "learning_rate": 4.5444566410537867e-07, + "loss": 1.0822, + "mean_token_accuracy": 0.6782411336898804, + "num_tokens": 20906880.0, + "step": 829 + }, + { + "epoch": 0.09114869316933889, + "grad_norm": 2.469651699066162, + "learning_rate": 4.5499451152579584e-07, + "loss": 1.0869, + "mean_token_accuracy": 0.6864473819732666, + "num_tokens": 20929687.0, + "step": 830 + }, + { + "epoch": 0.09125851087195255, + "grad_norm": 2.4198384284973145, + "learning_rate": 4.5554335894621295e-07, + "loss": 1.0931, + "mean_token_accuracy": 0.6686872243881226, + "num_tokens": 20953391.0, + "step": 831 + }, + { + "epoch": 0.09136832857456623, + "grad_norm": 2.172318458557129, + "learning_rate": 4.5609220636663e-07, + "loss": 1.138, + "mean_token_accuracy": 0.6673009991645813, + "num_tokens": 20983786.0, + "step": 832 
+ }, + { + "epoch": 0.09147814627717989, + "grad_norm": 2.7212331295013428, + "learning_rate": 4.566410537870472e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6843419075012207, + "num_tokens": 21004382.0, + "step": 833 + }, + { + "epoch": 0.09158796397979355, + "grad_norm": 2.4918603897094727, + "learning_rate": 4.571899012074643e-07, + "loss": 1.0219, + "mean_token_accuracy": 0.6928293108940125, + "num_tokens": 21025557.0, + "step": 834 + }, + { + "epoch": 0.0916977816824072, + "grad_norm": 2.296408176422119, + "learning_rate": 4.577387486278814e-07, + "loss": 1.0501, + "mean_token_accuracy": 0.6882469058036804, + "num_tokens": 21052249.0, + "step": 835 + }, + { + "epoch": 0.09180759938502087, + "grad_norm": 2.308629274368286, + "learning_rate": 4.5828759604829853e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.701562762260437, + "num_tokens": 21076416.0, + "step": 836 + }, + { + "epoch": 0.09191741708763453, + "grad_norm": 2.835524320602417, + "learning_rate": 4.588364434687157e-07, + "loss": 1.0303, + "mean_token_accuracy": 0.6943880319595337, + "num_tokens": 21094246.0, + "step": 837 + }, + { + "epoch": 0.09202723479024819, + "grad_norm": 2.6379919052124023, + "learning_rate": 4.593852908891328e-07, + "loss": 1.1253, + "mean_token_accuracy": 0.6670966744422913, + "num_tokens": 21115511.0, + "step": 838 + }, + { + "epoch": 0.09213705249286185, + "grad_norm": 2.4947030544281006, + "learning_rate": 4.5993413830954993e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.6986379623413086, + "num_tokens": 21139076.0, + "step": 839 + }, + { + "epoch": 0.0922468701954755, + "grad_norm": 2.1884162425994873, + "learning_rate": 4.6048298572996704e-07, + "loss": 1.1185, + "mean_token_accuracy": 0.674740731716156, + "num_tokens": 21169529.0, + "step": 840 + }, + { + "epoch": 0.09235668789808917, + "grad_norm": 2.895876407623291, + "learning_rate": 4.6103183315038416e-07, + "loss": 0.9821, + "mean_token_accuracy": 0.7122517824172974, + "num_tokens": 21188395.0, + "step": 
841 + }, + { + "epoch": 0.09246650560070284, + "grad_norm": 2.3310728073120117, + "learning_rate": 4.615806805708013e-07, + "loss": 1.1034, + "mean_token_accuracy": 0.6825312376022339, + "num_tokens": 21214770.0, + "step": 842 + }, + { + "epoch": 0.0925763233033165, + "grad_norm": 2.0835163593292236, + "learning_rate": 4.621295279912184e-07, + "loss": 1.1756, + "mean_token_accuracy": 0.6603489518165588, + "num_tokens": 21246391.0, + "step": 843 + }, + { + "epoch": 0.09268614100593016, + "grad_norm": 2.561908721923828, + "learning_rate": 4.6267837541163556e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6947275400161743, + "num_tokens": 21268571.0, + "step": 844 + }, + { + "epoch": 0.09279595870854382, + "grad_norm": 2.6329355239868164, + "learning_rate": 4.632272228320527e-07, + "loss": 1.0268, + "mean_token_accuracy": 0.6978034973144531, + "num_tokens": 21290327.0, + "step": 845 + }, + { + "epoch": 0.09290577641115748, + "grad_norm": 2.1870503425598145, + "learning_rate": 4.637760702524698e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.698804497718811, + "num_tokens": 21319029.0, + "step": 846 + }, + { + "epoch": 0.09301559411377114, + "grad_norm": 2.649536371231079, + "learning_rate": 4.643249176728869e-07, + "loss": 1.0917, + "mean_token_accuracy": 0.6794653534889221, + "num_tokens": 21340317.0, + "step": 847 + }, + { + "epoch": 0.0931254118163848, + "grad_norm": 2.648329496383667, + "learning_rate": 4.6487376509330407e-07, + "loss": 1.1496, + "mean_token_accuracy": 0.6603056192398071, + "num_tokens": 21362579.0, + "step": 848 + }, + { + "epoch": 0.09323522951899846, + "grad_norm": 2.550477981567383, + "learning_rate": 4.654226125137212e-07, + "loss": 1.0513, + "mean_token_accuracy": 0.6886233687400818, + "num_tokens": 21383859.0, + "step": 849 + }, + { + "epoch": 0.09334504722161212, + "grad_norm": 2.8275132179260254, + "learning_rate": 4.659714599341383e-07, + "loss": 1.0179, + "mean_token_accuracy": 0.6893301010131836, + "num_tokens": 21401779.0, + 
"step": 850 + }, + { + "epoch": 0.09345486492422579, + "grad_norm": 2.3226189613342285, + "learning_rate": 4.6652030735455537e-07, + "loss": 1.0748, + "mean_token_accuracy": 0.6835023164749146, + "num_tokens": 21428490.0, + "step": 851 + }, + { + "epoch": 0.09356468262683945, + "grad_norm": 2.2234890460968018, + "learning_rate": 4.6706915477497253e-07, + "loss": 1.0207, + "mean_token_accuracy": 0.6947211027145386, + "num_tokens": 21459490.0, + "step": 852 + }, + { + "epoch": 0.09367450032945311, + "grad_norm": 2.33908748626709, + "learning_rate": 4.6761800219538965e-07, + "loss": 1.0963, + "mean_token_accuracy": 0.6739757061004639, + "num_tokens": 21485434.0, + "step": 853 + }, + { + "epoch": 0.09378431803206677, + "grad_norm": 2.5606043338775635, + "learning_rate": 4.6816684961580676e-07, + "loss": 1.0779, + "mean_token_accuracy": 0.6887083053588867, + "num_tokens": 21506576.0, + "step": 854 + }, + { + "epoch": 0.09389413573468043, + "grad_norm": 2.4839062690734863, + "learning_rate": 4.6871569703622393e-07, + "loss": 1.027, + "mean_token_accuracy": 0.6921185255050659, + "num_tokens": 21528700.0, + "step": 855 + }, + { + "epoch": 0.09400395343729409, + "grad_norm": 2.3317065238952637, + "learning_rate": 4.6926454445664105e-07, + "loss": 1.0823, + "mean_token_accuracy": 0.6792881488800049, + "num_tokens": 21553801.0, + "step": 856 + }, + { + "epoch": 0.09411377113990775, + "grad_norm": 2.2625226974487305, + "learning_rate": 4.6981339187705816e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6896547079086304, + "num_tokens": 21580035.0, + "step": 857 + }, + { + "epoch": 0.09422358884252141, + "grad_norm": 2.382715940475464, + "learning_rate": 4.703622392974753e-07, + "loss": 1.0329, + "mean_token_accuracy": 0.689548134803772, + "num_tokens": 21604226.0, + "step": 858 + }, + { + "epoch": 0.09433340654513507, + "grad_norm": 2.329456090927124, + "learning_rate": 4.7091108671789245e-07, + "loss": 1.0507, + "mean_token_accuracy": 0.687242865562439, + "num_tokens": 
21628457.0, + "step": 859 + }, + { + "epoch": 0.09444322424774873, + "grad_norm": 2.188361406326294, + "learning_rate": 4.714599341383095e-07, + "loss": 1.1, + "mean_token_accuracy": 0.6762570142745972, + "num_tokens": 21655909.0, + "step": 860 + }, + { + "epoch": 0.0945530419503624, + "grad_norm": 2.4186832904815674, + "learning_rate": 4.720087815587266e-07, + "loss": 1.0932, + "mean_token_accuracy": 0.6734630465507507, + "num_tokens": 21679265.0, + "step": 861 + }, + { + "epoch": 0.09466285965297606, + "grad_norm": 2.283048152923584, + "learning_rate": 4.7255762897914374e-07, + "loss": 1.0516, + "mean_token_accuracy": 0.6937364339828491, + "num_tokens": 21706009.0, + "step": 862 + }, + { + "epoch": 0.09477267735558972, + "grad_norm": 2.5690248012542725, + "learning_rate": 4.731064763995609e-07, + "loss": 1.0945, + "mean_token_accuracy": 0.686625599861145, + "num_tokens": 21728903.0, + "step": 863 + }, + { + "epoch": 0.09488249505820338, + "grad_norm": 2.1780471801757812, + "learning_rate": 4.73655323819978e-07, + "loss": 1.0265, + "mean_token_accuracy": 0.6945939064025879, + "num_tokens": 21757078.0, + "step": 864 + }, + { + "epoch": 0.09499231276081704, + "grad_norm": 1.9154438972473145, + "learning_rate": 4.7420417124039514e-07, + "loss": 1.1219, + "mean_token_accuracy": 0.6671146154403687, + "num_tokens": 21794282.0, + "step": 865 + }, + { + "epoch": 0.0951021304634307, + "grad_norm": 2.565385103225708, + "learning_rate": 4.747530186608123e-07, + "loss": 0.9901, + "mean_token_accuracy": 0.700933039188385, + "num_tokens": 21815952.0, + "step": 866 + }, + { + "epoch": 0.09521194816604436, + "grad_norm": 2.376742124557495, + "learning_rate": 4.753018660812294e-07, + "loss": 0.9872, + "mean_token_accuracy": 0.6991251707077026, + "num_tokens": 21838053.0, + "step": 867 + }, + { + "epoch": 0.09532176586865802, + "grad_norm": 2.4438743591308594, + "learning_rate": 4.7585071350164654e-07, + "loss": 0.9837, + "mean_token_accuracy": 0.7007230520248413, + "num_tokens": 
21862752.0, + "step": 868 + }, + { + "epoch": 0.09543158357127168, + "grad_norm": 2.1187193393707275, + "learning_rate": 4.7639956092206365e-07, + "loss": 1.018, + "mean_token_accuracy": 0.7040569186210632, + "num_tokens": 21891701.0, + "step": 869 + }, + { + "epoch": 0.09554140127388536, + "grad_norm": 2.6080660820007324, + "learning_rate": 4.769484083424808e-07, + "loss": 1.1173, + "mean_token_accuracy": 0.6738392114639282, + "num_tokens": 21914464.0, + "step": 870 + }, + { + "epoch": 0.09565121897649902, + "grad_norm": 2.836865186691284, + "learning_rate": 4.774972557628979e-07, + "loss": 1.0697, + "mean_token_accuracy": 0.6774745583534241, + "num_tokens": 21932044.0, + "step": 871 + }, + { + "epoch": 0.09576103667911268, + "grad_norm": 2.3169662952423096, + "learning_rate": 4.78046103183315e-07, + "loss": 1.1614, + "mean_token_accuracy": 0.6647125482559204, + "num_tokens": 21958235.0, + "step": 872 + }, + { + "epoch": 0.09587085438172634, + "grad_norm": 2.7736096382141113, + "learning_rate": 4.785949506037321e-07, + "loss": 1.0466, + "mean_token_accuracy": 0.6898237466812134, + "num_tokens": 21978714.0, + "step": 873 + }, + { + "epoch": 0.09598067208434, + "grad_norm": 2.655991792678833, + "learning_rate": 4.791437980241493e-07, + "loss": 1.0625, + "mean_token_accuracy": 0.6784647107124329, + "num_tokens": 21998795.0, + "step": 874 + }, + { + "epoch": 0.09609048978695366, + "grad_norm": 2.686549425125122, + "learning_rate": 4.796926454445663e-07, + "loss": 1.0351, + "mean_token_accuracy": 0.690045952796936, + "num_tokens": 22019710.0, + "step": 875 + }, + { + "epoch": 0.09620030748956732, + "grad_norm": 2.5607454776763916, + "learning_rate": 4.802414928649835e-07, + "loss": 1.057, + "mean_token_accuracy": 0.6911852955818176, + "num_tokens": 22041940.0, + "step": 876 + }, + { + "epoch": 0.09631012519218098, + "grad_norm": 2.2242326736450195, + "learning_rate": 4.807903402854007e-07, + "loss": 1.0189, + "mean_token_accuracy": 0.6916821002960205, + "num_tokens": 
22068787.0, + "step": 877 + }, + { + "epoch": 0.09641994289479464, + "grad_norm": 2.5272600650787354, + "learning_rate": 4.813391877058177e-07, + "loss": 1.0139, + "mean_token_accuracy": 0.7010358572006226, + "num_tokens": 22092163.0, + "step": 878 + }, + { + "epoch": 0.0965297605974083, + "grad_norm": 2.431121826171875, + "learning_rate": 4.818880351262349e-07, + "loss": 1.0554, + "mean_token_accuracy": 0.679903507232666, + "num_tokens": 22117163.0, + "step": 879 + }, + { + "epoch": 0.09663957830002197, + "grad_norm": 2.579385280609131, + "learning_rate": 4.82436882546652e-07, + "loss": 1.0869, + "mean_token_accuracy": 0.6820125579833984, + "num_tokens": 22139264.0, + "step": 880 + }, + { + "epoch": 0.09674939600263563, + "grad_norm": 2.7224810123443604, + "learning_rate": 4.829857299670691e-07, + "loss": 1.0879, + "mean_token_accuracy": 0.6775065660476685, + "num_tokens": 22158698.0, + "step": 881 + }, + { + "epoch": 0.09685921370524929, + "grad_norm": 2.2788925170898438, + "learning_rate": 4.835345773874863e-07, + "loss": 1.038, + "mean_token_accuracy": 0.6923207640647888, + "num_tokens": 22187136.0, + "step": 882 + }, + { + "epoch": 0.09696903140786295, + "grad_norm": 2.202669858932495, + "learning_rate": 4.840834248079034e-07, + "loss": 1.0447, + "mean_token_accuracy": 0.6904990673065186, + "num_tokens": 22216675.0, + "step": 883 + }, + { + "epoch": 0.09707884911047661, + "grad_norm": 2.5985429286956787, + "learning_rate": 4.846322722283204e-07, + "loss": 1.0172, + "mean_token_accuracy": 0.7014920711517334, + "num_tokens": 22238187.0, + "step": 884 + }, + { + "epoch": 0.09718866681309027, + "grad_norm": 2.3614017963409424, + "learning_rate": 4.851811196487376e-07, + "loss": 1.0021, + "mean_token_accuracy": 0.699410080909729, + "num_tokens": 22263974.0, + "step": 885 + }, + { + "epoch": 0.09729848451570393, + "grad_norm": 2.137648820877075, + "learning_rate": 4.857299670691548e-07, + "loss": 1.0838, + "mean_token_accuracy": 0.6775752305984497, + "num_tokens": 
22292781.0, + "step": 886 + }, + { + "epoch": 0.09740830221831759, + "grad_norm": 2.85644793510437, + "learning_rate": 4.862788144895718e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.6817159652709961, + "num_tokens": 22312500.0, + "step": 887 + }, + { + "epoch": 0.09751811992093125, + "grad_norm": 2.429286479949951, + "learning_rate": 4.86827661909989e-07, + "loss": 1.0108, + "mean_token_accuracy": 0.6949171423912048, + "num_tokens": 22337345.0, + "step": 888 + }, + { + "epoch": 0.09762793762354492, + "grad_norm": 2.454712390899658, + "learning_rate": 4.873765093304062e-07, + "loss": 0.9648, + "mean_token_accuracy": 0.7174226641654968, + "num_tokens": 22362974.0, + "step": 889 + }, + { + "epoch": 0.09773775532615858, + "grad_norm": 2.4816954135894775, + "learning_rate": 4.879253567508232e-07, + "loss": 0.9768, + "mean_token_accuracy": 0.7001152038574219, + "num_tokens": 22386489.0, + "step": 890 + }, + { + "epoch": 0.09784757302877224, + "grad_norm": 2.3254189491271973, + "learning_rate": 4.884742041712404e-07, + "loss": 1.0871, + "mean_token_accuracy": 0.6885391473770142, + "num_tokens": 22412652.0, + "step": 891 + }, + { + "epoch": 0.0979573907313859, + "grad_norm": 2.235757827758789, + "learning_rate": 4.890230515916576e-07, + "loss": 1.0929, + "mean_token_accuracy": 0.678260326385498, + "num_tokens": 22439482.0, + "step": 892 + }, + { + "epoch": 0.09806720843399956, + "grad_norm": 2.2279927730560303, + "learning_rate": 4.895718990120746e-07, + "loss": 1.2262, + "mean_token_accuracy": 0.6463466882705688, + "num_tokens": 22470336.0, + "step": 893 + }, + { + "epoch": 0.09817702613661322, + "grad_norm": 2.3634278774261475, + "learning_rate": 4.901207464324917e-07, + "loss": 1.0231, + "mean_token_accuracy": 0.6965295076370239, + "num_tokens": 22494867.0, + "step": 894 + }, + { + "epoch": 0.09828684383922688, + "grad_norm": 2.9089534282684326, + "learning_rate": 4.906695938529089e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.6772608757019043, + "num_tokens": 
22514987.0, + "step": 895 + }, + { + "epoch": 0.09839666154184054, + "grad_norm": 2.484205961227417, + "learning_rate": 4.91218441273326e-07, + "loss": 0.9273, + "mean_token_accuracy": 0.7232189178466797, + "num_tokens": 22538001.0, + "step": 896 + }, + { + "epoch": 0.0985064792444542, + "grad_norm": 2.291553497314453, + "learning_rate": 4.917672886937431e-07, + "loss": 1.0574, + "mean_token_accuracy": 0.6856012940406799, + "num_tokens": 22564158.0, + "step": 897 + }, + { + "epoch": 0.09861629694706786, + "grad_norm": 2.33829402923584, + "learning_rate": 4.923161361141603e-07, + "loss": 1.0997, + "mean_token_accuracy": 0.6705226302146912, + "num_tokens": 22588375.0, + "step": 898 + }, + { + "epoch": 0.09872611464968153, + "grad_norm": 2.4893150329589844, + "learning_rate": 4.928649835345773e-07, + "loss": 1.1427, + "mean_token_accuracy": 0.685928463935852, + "num_tokens": 22611638.0, + "step": 899 + }, + { + "epoch": 0.0988359323522952, + "grad_norm": 2.3593523502349854, + "learning_rate": 4.934138309549945e-07, + "loss": 1.1348, + "mean_token_accuracy": 0.6652599573135376, + "num_tokens": 22640109.0, + "step": 900 + }, + { + "epoch": 0.09894575005490885, + "grad_norm": 2.6085503101348877, + "learning_rate": 4.939626783754117e-07, + "loss": 1.1069, + "mean_token_accuracy": 0.6737638711929321, + "num_tokens": 22662470.0, + "step": 901 + }, + { + "epoch": 0.09905556775752251, + "grad_norm": 2.1083505153656006, + "learning_rate": 4.945115257958287e-07, + "loss": 1.1081, + "mean_token_accuracy": 0.6798473596572876, + "num_tokens": 22694600.0, + "step": 902 + }, + { + "epoch": 0.09916538546013617, + "grad_norm": 2.368901014328003, + "learning_rate": 4.950603732162459e-07, + "loss": 0.9583, + "mean_token_accuracy": 0.7121833562850952, + "num_tokens": 22719321.0, + "step": 903 + }, + { + "epoch": 0.09927520316274983, + "grad_norm": 2.4916141033172607, + "learning_rate": 4.95609220636663e-07, + "loss": 1.059, + "mean_token_accuracy": 0.6927052736282349, + "num_tokens": 
22742378.0, + "step": 904 + }, + { + "epoch": 0.0993850208653635, + "grad_norm": 2.1462347507476807, + "learning_rate": 4.961580680570801e-07, + "loss": 1.1226, + "mean_token_accuracy": 0.6731855869293213, + "num_tokens": 22772620.0, + "step": 905 + }, + { + "epoch": 0.09949483856797715, + "grad_norm": 2.399515390396118, + "learning_rate": 4.967069154774972e-07, + "loss": 1.0948, + "mean_token_accuracy": 0.6768900752067566, + "num_tokens": 22798667.0, + "step": 906 + }, + { + "epoch": 0.09960465627059081, + "grad_norm": 2.317406177520752, + "learning_rate": 4.972557628979143e-07, + "loss": 1.1061, + "mean_token_accuracy": 0.6759398579597473, + "num_tokens": 22824627.0, + "step": 907 + }, + { + "epoch": 0.09971447397320449, + "grad_norm": 2.252380132675171, + "learning_rate": 4.978046103183315e-07, + "loss": 1.1055, + "mean_token_accuracy": 0.6759792566299438, + "num_tokens": 22852265.0, + "step": 908 + }, + { + "epoch": 0.09982429167581815, + "grad_norm": 2.322296380996704, + "learning_rate": 4.983534577387486e-07, + "loss": 1.0581, + "mean_token_accuracy": 0.6931321620941162, + "num_tokens": 22878309.0, + "step": 909 + }, + { + "epoch": 0.0999341093784318, + "grad_norm": 2.067913770675659, + "learning_rate": 4.989023051591657e-07, + "loss": 1.1131, + "mean_token_accuracy": 0.6808410286903381, + "num_tokens": 22908218.0, + "step": 910 + }, + { + "epoch": 0.10004392708104547, + "grad_norm": 2.4732894897460938, + "learning_rate": 4.994511525795829e-07, + "loss": 1.0654, + "mean_token_accuracy": 0.6807636022567749, + "num_tokens": 22931902.0, + "step": 911 + }, + { + "epoch": 0.10015374478365913, + "grad_norm": 2.1639838218688965, + "learning_rate": 5e-07, + "loss": 1.1615, + "mean_token_accuracy": 0.6630319952964783, + "num_tokens": 22962227.0, + "step": 912 + }, + { + "epoch": 0.10026356248627279, + "grad_norm": 2.6724934577941895, + "learning_rate": 5.005488474204171e-07, + "loss": 1.1537, + "mean_token_accuracy": 0.6634881496429443, + "num_tokens": 22983086.0, + 
"step": 913 + }, + { + "epoch": 0.10037338018888645, + "grad_norm": 2.507678985595703, + "learning_rate": 5.010976948408342e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6970375180244446, + "num_tokens": 23006086.0, + "step": 914 + }, + { + "epoch": 0.1004831978915001, + "grad_norm": 2.0580272674560547, + "learning_rate": 5.016465422612514e-07, + "loss": 1.114, + "mean_token_accuracy": 0.6743525266647339, + "num_tokens": 23037740.0, + "step": 915 + }, + { + "epoch": 0.10059301559411377, + "grad_norm": 2.402472972869873, + "learning_rate": 5.021953896816685e-07, + "loss": 1.0469, + "mean_token_accuracy": 0.6860793232917786, + "num_tokens": 23061832.0, + "step": 916 + }, + { + "epoch": 0.10070283329672743, + "grad_norm": 2.4654109477996826, + "learning_rate": 5.027442371020856e-07, + "loss": 1.0856, + "mean_token_accuracy": 0.6838884949684143, + "num_tokens": 23086758.0, + "step": 917 + }, + { + "epoch": 0.1008126509993411, + "grad_norm": 2.6181416511535645, + "learning_rate": 5.032930845225028e-07, + "loss": 0.9721, + "mean_token_accuracy": 0.7084866762161255, + "num_tokens": 23109203.0, + "step": 918 + }, + { + "epoch": 0.10092246870195476, + "grad_norm": 2.207584857940674, + "learning_rate": 5.038419319429198e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6876564025878906, + "num_tokens": 23137628.0, + "step": 919 + }, + { + "epoch": 0.10103228640456842, + "grad_norm": 2.177509307861328, + "learning_rate": 5.04390779363337e-07, + "loss": 1.0223, + "mean_token_accuracy": 0.6938190460205078, + "num_tokens": 23165585.0, + "step": 920 + }, + { + "epoch": 0.10114210410718208, + "grad_norm": 2.4214632511138916, + "learning_rate": 5.049396267837542e-07, + "loss": 1.0087, + "mean_token_accuracy": 0.700836181640625, + "num_tokens": 23190050.0, + "step": 921 + }, + { + "epoch": 0.10125192180979574, + "grad_norm": 2.3436355590820312, + "learning_rate": 5.054884742041711e-07, + "loss": 1.0762, + "mean_token_accuracy": 0.6779595017433167, + "num_tokens": 23214418.0, + 
"step": 922 + }, + { + "epoch": 0.1013617395124094, + "grad_norm": 2.3647239208221436, + "learning_rate": 5.060373216245883e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6867666244506836, + "num_tokens": 23239099.0, + "step": 923 + }, + { + "epoch": 0.10147155721502306, + "grad_norm": 2.0646848678588867, + "learning_rate": 5.065861690450055e-07, + "loss": 1.1443, + "mean_token_accuracy": 0.6652857065200806, + "num_tokens": 23270126.0, + "step": 924 + }, + { + "epoch": 0.10158137491763672, + "grad_norm": 2.3180768489837646, + "learning_rate": 5.071350164654225e-07, + "loss": 1.1468, + "mean_token_accuracy": 0.6614596843719482, + "num_tokens": 23298838.0, + "step": 925 + }, + { + "epoch": 0.10169119262025038, + "grad_norm": 2.978895425796509, + "learning_rate": 5.076838638858397e-07, + "loss": 1.0054, + "mean_token_accuracy": 0.7048840522766113, + "num_tokens": 23316684.0, + "step": 926 + }, + { + "epoch": 0.10180101032286405, + "grad_norm": 2.5609285831451416, + "learning_rate": 5.082327113062569e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.6923421621322632, + "num_tokens": 23337045.0, + "step": 927 + }, + { + "epoch": 0.10191082802547771, + "grad_norm": 2.2088537216186523, + "learning_rate": 5.087815587266739e-07, + "loss": 1.0413, + "mean_token_accuracy": 0.6851462721824646, + "num_tokens": 23365447.0, + "step": 928 + }, + { + "epoch": 0.10202064572809137, + "grad_norm": 2.664463758468628, + "learning_rate": 5.093304061470911e-07, + "loss": 1.0968, + "mean_token_accuracy": 0.6698136329650879, + "num_tokens": 23388199.0, + "step": 929 + }, + { + "epoch": 0.10213046343070503, + "grad_norm": 2.535182476043701, + "learning_rate": 5.098792535675082e-07, + "loss": 1.0653, + "mean_token_accuracy": 0.6898008584976196, + "num_tokens": 23412090.0, + "step": 930 + }, + { + "epoch": 0.10224028113331869, + "grad_norm": 2.3036956787109375, + "learning_rate": 5.104281009879253e-07, + "loss": 1.123, + "mean_token_accuracy": 0.6679455041885376, + "num_tokens": 23441788.0, 
+ "step": 931 + }, + { + "epoch": 0.10235009883593235, + "grad_norm": 2.824256420135498, + "learning_rate": 5.109769484083425e-07, + "loss": 1.1141, + "mean_token_accuracy": 0.6708754301071167, + "num_tokens": 23462422.0, + "step": 932 + }, + { + "epoch": 0.10245991653854601, + "grad_norm": 2.4186723232269287, + "learning_rate": 5.115257958287596e-07, + "loss": 1.0503, + "mean_token_accuracy": 0.6880072355270386, + "num_tokens": 23486897.0, + "step": 933 + }, + { + "epoch": 0.10256973424115967, + "grad_norm": 2.421607494354248, + "learning_rate": 5.120746432491767e-07, + "loss": 1.0771, + "mean_token_accuracy": 0.6884269714355469, + "num_tokens": 23511069.0, + "step": 934 + }, + { + "epoch": 0.10267955194377333, + "grad_norm": 2.3209402561187744, + "learning_rate": 5.126234906695939e-07, + "loss": 1.1406, + "mean_token_accuracy": 0.6713506579399109, + "num_tokens": 23537059.0, + "step": 935 + }, + { + "epoch": 0.10278936964638699, + "grad_norm": 2.4552576541900635, + "learning_rate": 5.13172338090011e-07, + "loss": 1.0821, + "mean_token_accuracy": 0.6792397499084473, + "num_tokens": 23562537.0, + "step": 936 + }, + { + "epoch": 0.10289918734900066, + "grad_norm": 2.3304591178894043, + "learning_rate": 5.137211855104281e-07, + "loss": 1.0891, + "mean_token_accuracy": 0.6792335510253906, + "num_tokens": 23589830.0, + "step": 937 + }, + { + "epoch": 0.10300900505161432, + "grad_norm": 2.578233003616333, + "learning_rate": 5.142700329308453e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.6945316195487976, + "num_tokens": 23611028.0, + "step": 938 + }, + { + "epoch": 0.10311882275422798, + "grad_norm": 2.8797974586486816, + "learning_rate": 5.148188803512624e-07, + "loss": 0.9368, + "mean_token_accuracy": 0.7168089151382446, + "num_tokens": 23629122.0, + "step": 939 + }, + { + "epoch": 0.10322864045684164, + "grad_norm": 2.309335231781006, + "learning_rate": 5.153677277716795e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.69370436668396, + "num_tokens": 23654643.0, 
+ "step": 940 + }, + { + "epoch": 0.1033384581594553, + "grad_norm": 2.6776230335235596, + "learning_rate": 5.159165751920965e-07, + "loss": 0.9875, + "mean_token_accuracy": 0.7018595933914185, + "num_tokens": 23676566.0, + "step": 941 + }, + { + "epoch": 0.10344827586206896, + "grad_norm": 2.16076922416687, + "learning_rate": 5.164654226125136e-07, + "loss": 1.1564, + "mean_token_accuracy": 0.6592235565185547, + "num_tokens": 23707782.0, + "step": 942 + }, + { + "epoch": 0.10355809356468262, + "grad_norm": 2.264585494995117, + "learning_rate": 5.170142700329308e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.669295072555542, + "num_tokens": 23735731.0, + "step": 943 + }, + { + "epoch": 0.10366791126729628, + "grad_norm": 2.4211554527282715, + "learning_rate": 5.175631174533479e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6766113042831421, + "num_tokens": 23760093.0, + "step": 944 + }, + { + "epoch": 0.10377772896990994, + "grad_norm": 2.3028721809387207, + "learning_rate": 5.18111964873765e-07, + "loss": 1.0651, + "mean_token_accuracy": 0.6877451539039612, + "num_tokens": 23786463.0, + "step": 945 + }, + { + "epoch": 0.10388754667252362, + "grad_norm": 2.211763858795166, + "learning_rate": 5.186608122941822e-07, + "loss": 0.9708, + "mean_token_accuracy": 0.7038300037384033, + "num_tokens": 23813504.0, + "step": 946 + }, + { + "epoch": 0.10399736437513728, + "grad_norm": 2.4819462299346924, + "learning_rate": 5.192096597145993e-07, + "loss": 1.0749, + "mean_token_accuracy": 0.6812487244606018, + "num_tokens": 23836480.0, + "step": 947 + }, + { + "epoch": 0.10410718207775094, + "grad_norm": 2.6108524799346924, + "learning_rate": 5.197585071350164e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.7005524635314941, + "num_tokens": 23858016.0, + "step": 948 + }, + { + "epoch": 0.1042169997803646, + "grad_norm": 2.5380280017852783, + "learning_rate": 5.203073545554336e-07, + "loss": 1.1298, + "mean_token_accuracy": 0.6612528562545776, + "num_tokens": 23880298.0, 
+ "step": 949 + }, + { + "epoch": 0.10432681748297826, + "grad_norm": 2.1817805767059326, + "learning_rate": 5.208562019758507e-07, + "loss": 1.0974, + "mean_token_accuracy": 0.6734462380409241, + "num_tokens": 23909807.0, + "step": 950 + }, + { + "epoch": 0.10443663518559192, + "grad_norm": 2.3011527061462402, + "learning_rate": 5.214050493962678e-07, + "loss": 1.1134, + "mean_token_accuracy": 0.6743913888931274, + "num_tokens": 23936187.0, + "step": 951 + }, + { + "epoch": 0.10454645288820558, + "grad_norm": 2.776961088180542, + "learning_rate": 5.219538968166849e-07, + "loss": 1.071, + "mean_token_accuracy": 0.6852315068244934, + "num_tokens": 23956451.0, + "step": 952 + }, + { + "epoch": 0.10465627059081924, + "grad_norm": 2.1883385181427, + "learning_rate": 5.225027442371021e-07, + "loss": 1.0709, + "mean_token_accuracy": 0.6924112439155579, + "num_tokens": 23983086.0, + "step": 953 + }, + { + "epoch": 0.1047660882934329, + "grad_norm": 2.4226717948913574, + "learning_rate": 5.230515916575192e-07, + "loss": 0.8929, + "mean_token_accuracy": 0.7185324430465698, + "num_tokens": 24004658.0, + "step": 954 + }, + { + "epoch": 0.10487590599604656, + "grad_norm": 2.719355344772339, + "learning_rate": 5.236004390779363e-07, + "loss": 1.0012, + "mean_token_accuracy": 0.6987016201019287, + "num_tokens": 24024701.0, + "step": 955 + }, + { + "epoch": 0.10498572369866023, + "grad_norm": 2.3394410610198975, + "learning_rate": 5.241492864983535e-07, + "loss": 1.0675, + "mean_token_accuracy": 0.6887273788452148, + "num_tokens": 24049794.0, + "step": 956 + }, + { + "epoch": 0.10509554140127389, + "grad_norm": 2.3828632831573486, + "learning_rate": 5.246981339187706e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6749441623687744, + "num_tokens": 24076328.0, + "step": 957 + }, + { + "epoch": 0.10520535910388755, + "grad_norm": 2.208559989929199, + "learning_rate": 5.252469813391877e-07, + "loss": 1.0352, + "mean_token_accuracy": 0.6904388666152954, + "num_tokens": 24103611.0, 
+ "step": 958 + }, + { + "epoch": 0.10531517680650121, + "grad_norm": 2.5309770107269287, + "learning_rate": 5.257958287596049e-07, + "loss": 1.0576, + "mean_token_accuracy": 0.6869688034057617, + "num_tokens": 24126179.0, + "step": 959 + }, + { + "epoch": 0.10542499450911487, + "grad_norm": 2.237792730331421, + "learning_rate": 5.263446761800219e-07, + "loss": 1.1307, + "mean_token_accuracy": 0.667883038520813, + "num_tokens": 24151894.0, + "step": 960 + }, + { + "epoch": 0.10553481221172853, + "grad_norm": 2.194114923477173, + "learning_rate": 5.26893523600439e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.7002937197685242, + "num_tokens": 24181074.0, + "step": 961 + }, + { + "epoch": 0.10564462991434219, + "grad_norm": 2.299659490585327, + "learning_rate": 5.274423710208562e-07, + "loss": 1.0104, + "mean_token_accuracy": 0.7034606337547302, + "num_tokens": 24207900.0, + "step": 962 + }, + { + "epoch": 0.10575444761695585, + "grad_norm": 2.5020503997802734, + "learning_rate": 5.279912184412732e-07, + "loss": 1.0831, + "mean_token_accuracy": 0.684950590133667, + "num_tokens": 24232435.0, + "step": 963 + }, + { + "epoch": 0.10586426531956951, + "grad_norm": 2.1792352199554443, + "learning_rate": 5.285400658616904e-07, + "loss": 1.0907, + "mean_token_accuracy": 0.6765685677528381, + "num_tokens": 24261793.0, + "step": 964 + }, + { + "epoch": 0.10597408302218318, + "grad_norm": 2.379348039627075, + "learning_rate": 5.290889132821076e-07, + "loss": 1.0637, + "mean_token_accuracy": 0.6939804553985596, + "num_tokens": 24285811.0, + "step": 965 + }, + { + "epoch": 0.10608390072479684, + "grad_norm": 2.4480020999908447, + "learning_rate": 5.296377607025246e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.6990548372268677, + "num_tokens": 24307046.0, + "step": 966 + }, + { + "epoch": 0.1061937184274105, + "grad_norm": 2.0270657539367676, + "learning_rate": 5.301866081229418e-07, + "loss": 1.0642, + "mean_token_accuracy": 0.6791924238204956, + "num_tokens": 24337528.0, 
+ "step": 967 + }, + { + "epoch": 0.10630353613002416, + "grad_norm": 2.3467812538146973, + "learning_rate": 5.30735455543359e-07, + "loss": 1.0404, + "mean_token_accuracy": 0.6922905445098877, + "num_tokens": 24363624.0, + "step": 968 + }, + { + "epoch": 0.10641335383263782, + "grad_norm": 2.576843023300171, + "learning_rate": 5.31284302963776e-07, + "loss": 1.0271, + "mean_token_accuracy": 0.6939077377319336, + "num_tokens": 24384537.0, + "step": 969 + }, + { + "epoch": 0.10652317153525148, + "grad_norm": 2.4227850437164307, + "learning_rate": 5.318331503841932e-07, + "loss": 0.9607, + "mean_token_accuracy": 0.7102237939834595, + "num_tokens": 24408637.0, + "step": 970 + }, + { + "epoch": 0.10663298923786514, + "grad_norm": 2.2098183631896973, + "learning_rate": 5.323819978046103e-07, + "loss": 1.0336, + "mean_token_accuracy": 0.6910318732261658, + "num_tokens": 24438058.0, + "step": 971 + }, + { + "epoch": 0.1067428069404788, + "grad_norm": 2.599820375442505, + "learning_rate": 5.329308452250274e-07, + "loss": 1.0569, + "mean_token_accuracy": 0.6903505325317383, + "num_tokens": 24459235.0, + "step": 972 + }, + { + "epoch": 0.10685262464309246, + "grad_norm": 2.087310314178467, + "learning_rate": 5.334796926454446e-07, + "loss": 1.0105, + "mean_token_accuracy": 0.7010165452957153, + "num_tokens": 24488734.0, + "step": 973 + }, + { + "epoch": 0.10696244234570612, + "grad_norm": 2.1564457416534424, + "learning_rate": 5.340285400658617e-07, + "loss": 1.1036, + "mean_token_accuracy": 0.6828464865684509, + "num_tokens": 24516942.0, + "step": 974 + }, + { + "epoch": 0.1070722600483198, + "grad_norm": 2.6122732162475586, + "learning_rate": 5.345773874862788e-07, + "loss": 1.0155, + "mean_token_accuracy": 0.6886523962020874, + "num_tokens": 24536714.0, + "step": 975 + }, + { + "epoch": 0.10718207775093345, + "grad_norm": 2.53664493560791, + "learning_rate": 5.35126234906696e-07, + "loss": 0.9666, + "mean_token_accuracy": 0.7053484916687012, + "num_tokens": 24558313.0, + 
"step": 976 + }, + { + "epoch": 0.10729189545354711, + "grad_norm": 2.1965434551239014, + "learning_rate": 5.35675082327113e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.6799745559692383, + "num_tokens": 24586884.0, + "step": 977 + }, + { + "epoch": 0.10740171315616077, + "grad_norm": 2.448540687561035, + "learning_rate": 5.362239297475302e-07, + "loss": 1.1034, + "mean_token_accuracy": 0.671379566192627, + "num_tokens": 24609091.0, + "step": 978 + }, + { + "epoch": 0.10751153085877443, + "grad_norm": 2.3704569339752197, + "learning_rate": 5.367727771679473e-07, + "loss": 1.03, + "mean_token_accuracy": 0.6955327391624451, + "num_tokens": 24635217.0, + "step": 979 + }, + { + "epoch": 0.1076213485613881, + "grad_norm": 2.4243345260620117, + "learning_rate": 5.373216245883643e-07, + "loss": 0.986, + "mean_token_accuracy": 0.7015687227249146, + "num_tokens": 24659122.0, + "step": 980 + }, + { + "epoch": 0.10773116626400175, + "grad_norm": 2.179283380508423, + "learning_rate": 5.378704720087815e-07, + "loss": 1.1354, + "mean_token_accuracy": 0.6741594672203064, + "num_tokens": 24689523.0, + "step": 981 + }, + { + "epoch": 0.10784098396661541, + "grad_norm": 2.045374631881714, + "learning_rate": 5.384193194291986e-07, + "loss": 1.0606, + "mean_token_accuracy": 0.6950728297233582, + "num_tokens": 24719448.0, + "step": 982 + }, + { + "epoch": 0.10795080166922907, + "grad_norm": 2.2959699630737305, + "learning_rate": 5.389681668496157e-07, + "loss": 1.1719, + "mean_token_accuracy": 0.6632931232452393, + "num_tokens": 24749041.0, + "step": 983 + }, + { + "epoch": 0.10806061937184275, + "grad_norm": 2.145516872406006, + "learning_rate": 5.395170142700329e-07, + "loss": 1.0696, + "mean_token_accuracy": 0.677993893623352, + "num_tokens": 24777957.0, + "step": 984 + }, + { + "epoch": 0.10817043707445641, + "grad_norm": 2.548588991165161, + "learning_rate": 5.4006586169045e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.698378324508667, + "num_tokens": 24800688.0, + "step": 
985 + }, + { + "epoch": 0.10828025477707007, + "grad_norm": 2.49101185798645, + "learning_rate": 5.406147091108671e-07, + "loss": 1.072, + "mean_token_accuracy": 0.6818203926086426, + "num_tokens": 24823347.0, + "step": 986 + }, + { + "epoch": 0.10839007247968373, + "grad_norm": 2.583812952041626, + "learning_rate": 5.411635565312843e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.6873349547386169, + "num_tokens": 24843707.0, + "step": 987 + }, + { + "epoch": 0.10849989018229739, + "grad_norm": 2.349170446395874, + "learning_rate": 5.417124039517014e-07, + "loss": 1.155, + "mean_token_accuracy": 0.6643915176391602, + "num_tokens": 24870121.0, + "step": 988 + }, + { + "epoch": 0.10860970788491105, + "grad_norm": 2.468423843383789, + "learning_rate": 5.422612513721185e-07, + "loss": 1.0904, + "mean_token_accuracy": 0.6813580989837646, + "num_tokens": 24894283.0, + "step": 989 + }, + { + "epoch": 0.1087195255875247, + "grad_norm": 2.4027974605560303, + "learning_rate": 5.428100987925357e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.6920702457427979, + "num_tokens": 24920248.0, + "step": 990 + }, + { + "epoch": 0.10882934329013837, + "grad_norm": 2.4526565074920654, + "learning_rate": 5.433589462129528e-07, + "loss": 1.0539, + "mean_token_accuracy": 0.7030538320541382, + "num_tokens": 24943645.0, + "step": 991 + }, + { + "epoch": 0.10893916099275203, + "grad_norm": 2.327349901199341, + "learning_rate": 5.439077936333699e-07, + "loss": 1.0048, + "mean_token_accuracy": 0.6966944336891174, + "num_tokens": 24969103.0, + "step": 992 + }, + { + "epoch": 0.10904897869536569, + "grad_norm": 2.837480068206787, + "learning_rate": 5.44456641053787e-07, + "loss": 1.0202, + "mean_token_accuracy": 0.6906304359436035, + "num_tokens": 24987467.0, + "step": 993 + }, + { + "epoch": 0.10915879639797936, + "grad_norm": 2.040834903717041, + "learning_rate": 5.450054884742042e-07, + "loss": 1.1308, + "mean_token_accuracy": 0.6612470746040344, + "num_tokens": 25020289.0, + "step": 994 
+ }, + { + "epoch": 0.10926861410059302, + "grad_norm": 2.514462947845459, + "learning_rate": 5.455543358946213e-07, + "loss": 0.9986, + "mean_token_accuracy": 0.7028346061706543, + "num_tokens": 25042635.0, + "step": 995 + }, + { + "epoch": 0.10937843180320668, + "grad_norm": 2.806882619857788, + "learning_rate": 5.461031833150384e-07, + "loss": 0.9924, + "mean_token_accuracy": 0.7013079524040222, + "num_tokens": 25061437.0, + "step": 996 + }, + { + "epoch": 0.10948824950582034, + "grad_norm": 2.4140946865081787, + "learning_rate": 5.466520307354556e-07, + "loss": 1.0527, + "mean_token_accuracy": 0.6874586939811707, + "num_tokens": 25085742.0, + "step": 997 + }, + { + "epoch": 0.109598067208434, + "grad_norm": 2.453185796737671, + "learning_rate": 5.472008781558726e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6930899620056152, + "num_tokens": 25108279.0, + "step": 998 + }, + { + "epoch": 0.10970788491104766, + "grad_norm": 2.422032594680786, + "learning_rate": 5.477497255762897e-07, + "loss": 1.1726, + "mean_token_accuracy": 0.6698400974273682, + "num_tokens": 25135580.0, + "step": 999 + }, + { + "epoch": 0.10981770261366132, + "grad_norm": 2.119283437728882, + "learning_rate": 5.482985729967069e-07, + "loss": 0.9919, + "mean_token_accuracy": 0.6985162496566772, + "num_tokens": 25162753.0, + "step": 1000 + }, + { + "epoch": 0.10992752031627498, + "grad_norm": 2.3614842891693115, + "learning_rate": 5.48847420417124e-07, + "loss": 1.0429, + "mean_token_accuracy": 0.6844663619995117, + "num_tokens": 25188963.0, + "step": 1001 + }, + { + "epoch": 0.11003733801888864, + "grad_norm": 2.153212308883667, + "learning_rate": 5.493962678375411e-07, + "loss": 1.1574, + "mean_token_accuracy": 0.6642470955848694, + "num_tokens": 25219195.0, + "step": 1002 + }, + { + "epoch": 0.11014715572150231, + "grad_norm": 2.3222222328186035, + "learning_rate": 5.499451152579583e-07, + "loss": 1.1002, + "mean_token_accuracy": 0.6790688037872314, + "num_tokens": 25246276.0, + "step": 
1003 + }, + { + "epoch": 0.11025697342411597, + "grad_norm": 2.2277486324310303, + "learning_rate": 5.504939626783753e-07, + "loss": 1.014, + "mean_token_accuracy": 0.6930123567581177, + "num_tokens": 25273304.0, + "step": 1004 + }, + { + "epoch": 0.11036679112672963, + "grad_norm": 2.3687431812286377, + "learning_rate": 5.510428100987925e-07, + "loss": 1.1563, + "mean_token_accuracy": 0.6567736864089966, + "num_tokens": 25298572.0, + "step": 1005 + }, + { + "epoch": 0.11047660882934329, + "grad_norm": 2.7665627002716064, + "learning_rate": 5.515916575192097e-07, + "loss": 0.9926, + "mean_token_accuracy": 0.6999256610870361, + "num_tokens": 25316709.0, + "step": 1006 + }, + { + "epoch": 0.11058642653195695, + "grad_norm": 2.363571882247925, + "learning_rate": 5.521405049396267e-07, + "loss": 1.075, + "mean_token_accuracy": 0.6853659152984619, + "num_tokens": 25342089.0, + "step": 1007 + }, + { + "epoch": 0.11069624423457061, + "grad_norm": 2.365342140197754, + "learning_rate": 5.526893523600439e-07, + "loss": 0.9662, + "mean_token_accuracy": 0.7062013149261475, + "num_tokens": 25365387.0, + "step": 1008 + }, + { + "epoch": 0.11080606193718427, + "grad_norm": 2.857318639755249, + "learning_rate": 5.532381997804611e-07, + "loss": 0.9611, + "mean_token_accuracy": 0.708354115486145, + "num_tokens": 25382735.0, + "step": 1009 + }, + { + "epoch": 0.11091587963979793, + "grad_norm": 2.3239588737487793, + "learning_rate": 5.537870472008781e-07, + "loss": 1.0892, + "mean_token_accuracy": 0.6767115592956543, + "num_tokens": 25407607.0, + "step": 1010 + }, + { + "epoch": 0.11102569734241159, + "grad_norm": 2.583155632019043, + "learning_rate": 5.543358946212953e-07, + "loss": 0.9989, + "mean_token_accuracy": 0.7025543451309204, + "num_tokens": 25427705.0, + "step": 1011 + }, + { + "epoch": 0.11113551504502525, + "grad_norm": 2.3211302757263184, + "learning_rate": 5.548847420417125e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6719236373901367, + "num_tokens": 25454369.0, 
+ "step": 1012 + }, + { + "epoch": 0.11124533274763893, + "grad_norm": 2.0502383708953857, + "learning_rate": 5.554335894621295e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6930632591247559, + "num_tokens": 25486416.0, + "step": 1013 + }, + { + "epoch": 0.11135515045025259, + "grad_norm": 2.1791515350341797, + "learning_rate": 5.559824368825467e-07, + "loss": 1.1287, + "mean_token_accuracy": 0.6704766154289246, + "num_tokens": 25515852.0, + "step": 1014 + }, + { + "epoch": 0.11146496815286625, + "grad_norm": 2.267302989959717, + "learning_rate": 5.565312843029637e-07, + "loss": 1.0866, + "mean_token_accuracy": 0.6850862503051758, + "num_tokens": 25542067.0, + "step": 1015 + }, + { + "epoch": 0.1115747858554799, + "grad_norm": 3.0133516788482666, + "learning_rate": 5.570801317233809e-07, + "loss": 0.9938, + "mean_token_accuracy": 0.6994720697402954, + "num_tokens": 25557589.0, + "step": 1016 + }, + { + "epoch": 0.11168460355809356, + "grad_norm": 2.2980942726135254, + "learning_rate": 5.57628979143798e-07, + "loss": 1.0599, + "mean_token_accuracy": 0.685702919960022, + "num_tokens": 25584292.0, + "step": 1017 + }, + { + "epoch": 0.11179442126070722, + "grad_norm": 2.3926985263824463, + "learning_rate": 5.58177826564215e-07, + "loss": 0.9943, + "mean_token_accuracy": 0.6950646042823792, + "num_tokens": 25608064.0, + "step": 1018 + }, + { + "epoch": 0.11190423896332088, + "grad_norm": 2.3618311882019043, + "learning_rate": 5.587266739846322e-07, + "loss": 0.9896, + "mean_token_accuracy": 0.7015422582626343, + "num_tokens": 25632356.0, + "step": 1019 + }, + { + "epoch": 0.11201405666593454, + "grad_norm": 2.227701425552368, + "learning_rate": 5.592755214050494e-07, + "loss": 1.1312, + "mean_token_accuracy": 0.6645995378494263, + "num_tokens": 25660949.0, + "step": 1020 + }, + { + "epoch": 0.1121238743685482, + "grad_norm": 2.358907699584961, + "learning_rate": 5.598243688254664e-07, + "loss": 1.0921, + "mean_token_accuracy": 0.6845738887786865, + "num_tokens": 
25687395.0, + "step": 1021 + }, + { + "epoch": 0.11223369207116188, + "grad_norm": 2.722654104232788, + "learning_rate": 5.603732162458836e-07, + "loss": 1.0445, + "mean_token_accuracy": 0.6895926594734192, + "num_tokens": 25707347.0, + "step": 1022 + }, + { + "epoch": 0.11234350977377554, + "grad_norm": 2.187425374984741, + "learning_rate": 5.609220636663008e-07, + "loss": 0.9642, + "mean_token_accuracy": 0.7092350721359253, + "num_tokens": 25733976.0, + "step": 1023 + }, + { + "epoch": 0.1124533274763892, + "grad_norm": 2.4008262157440186, + "learning_rate": 5.614709110867178e-07, + "loss": 1.0734, + "mean_token_accuracy": 0.6802710890769958, + "num_tokens": 25758240.0, + "step": 1024 + }, + { + "epoch": 0.11256314517900286, + "grad_norm": 2.397481918334961, + "learning_rate": 5.62019758507135e-07, + "loss": 1.0327, + "mean_token_accuracy": 0.6903080940246582, + "num_tokens": 25782907.0, + "step": 1025 + }, + { + "epoch": 0.11267296288161652, + "grad_norm": 2.2857003211975098, + "learning_rate": 5.625686059275521e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.6758660674095154, + "num_tokens": 25810062.0, + "step": 1026 + }, + { + "epoch": 0.11278278058423018, + "grad_norm": 2.4243950843811035, + "learning_rate": 5.631174533479692e-07, + "loss": 1.0948, + "mean_token_accuracy": 0.6836026906967163, + "num_tokens": 25833923.0, + "step": 1027 + }, + { + "epoch": 0.11289259828684384, + "grad_norm": 2.4830520153045654, + "learning_rate": 5.636663007683864e-07, + "loss": 1.1228, + "mean_token_accuracy": 0.667061448097229, + "num_tokens": 25856763.0, + "step": 1028 + }, + { + "epoch": 0.1130024159894575, + "grad_norm": 2.3218767642974854, + "learning_rate": 5.642151481888035e-07, + "loss": 1.1778, + "mean_token_accuracy": 0.6555006504058838, + "num_tokens": 25885253.0, + "step": 1029 + }, + { + "epoch": 0.11311223369207116, + "grad_norm": 2.218526601791382, + "learning_rate": 5.647639956092206e-07, + "loss": 1.088, + "mean_token_accuracy": 0.6772266030311584, + 
"num_tokens": 25912545.0, + "step": 1030 + }, + { + "epoch": 0.11322205139468482, + "grad_norm": 2.47224497795105, + "learning_rate": 5.653128430296378e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.699153482913971, + "num_tokens": 25935083.0, + "step": 1031 + }, + { + "epoch": 0.11333186909729849, + "grad_norm": 2.3313310146331787, + "learning_rate": 5.658616904500549e-07, + "loss": 1.0805, + "mean_token_accuracy": 0.6776203513145447, + "num_tokens": 25962127.0, + "step": 1032 + }, + { + "epoch": 0.11344168679991215, + "grad_norm": 2.156822443008423, + "learning_rate": 5.66410537870472e-07, + "loss": 1.1084, + "mean_token_accuracy": 0.6818971633911133, + "num_tokens": 25991364.0, + "step": 1033 + }, + { + "epoch": 0.11355150450252581, + "grad_norm": 2.1437923908233643, + "learning_rate": 5.669593852908892e-07, + "loss": 1.1471, + "mean_token_accuracy": 0.6626564264297485, + "num_tokens": 26020258.0, + "step": 1034 + }, + { + "epoch": 0.11366132220513947, + "grad_norm": 2.3899459838867188, + "learning_rate": 5.675082327113063e-07, + "loss": 1.0111, + "mean_token_accuracy": 0.6953873634338379, + "num_tokens": 26044064.0, + "step": 1035 + }, + { + "epoch": 0.11377113990775313, + "grad_norm": 2.692331552505493, + "learning_rate": 5.680570801317233e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6882129907608032, + "num_tokens": 26064405.0, + "step": 1036 + }, + { + "epoch": 0.11388095761036679, + "grad_norm": 2.3257079124450684, + "learning_rate": 5.686059275521404e-07, + "loss": 1.0328, + "mean_token_accuracy": 0.6914512515068054, + "num_tokens": 26089887.0, + "step": 1037 + }, + { + "epoch": 0.11399077531298045, + "grad_norm": 2.518392324447632, + "learning_rate": 5.691547749725576e-07, + "loss": 1.036, + "mean_token_accuracy": 0.6920803785324097, + "num_tokens": 26112961.0, + "step": 1038 + }, + { + "epoch": 0.11410059301559411, + "grad_norm": 2.3948166370391846, + "learning_rate": 5.697036223929747e-07, + "loss": 1.1652, + "mean_token_accuracy": 
0.6601170301437378, + "num_tokens": 26138162.0, + "step": 1039 + }, + { + "epoch": 0.11421041071820777, + "grad_norm": 2.248443603515625, + "learning_rate": 5.702524698133918e-07, + "loss": 1.0078, + "mean_token_accuracy": 0.6996346712112427, + "num_tokens": 26163089.0, + "step": 1040 + }, + { + "epoch": 0.11432022842082144, + "grad_norm": 2.1062686443328857, + "learning_rate": 5.70801317233809e-07, + "loss": 1.1739, + "mean_token_accuracy": 0.6484674215316772, + "num_tokens": 26196116.0, + "step": 1041 + }, + { + "epoch": 0.1144300461234351, + "grad_norm": 2.2238097190856934, + "learning_rate": 5.713501646542261e-07, + "loss": 1.0945, + "mean_token_accuracy": 0.6708521842956543, + "num_tokens": 26224036.0, + "step": 1042 + }, + { + "epoch": 0.11453986382604876, + "grad_norm": 1.9987611770629883, + "learning_rate": 5.718990120746432e-07, + "loss": 1.0881, + "mean_token_accuracy": 0.6762707233428955, + "num_tokens": 26255507.0, + "step": 1043 + }, + { + "epoch": 0.11464968152866242, + "grad_norm": 2.6893439292907715, + "learning_rate": 5.724478594950604e-07, + "loss": 0.9451, + "mean_token_accuracy": 0.7098459005355835, + "num_tokens": 26273991.0, + "step": 1044 + }, + { + "epoch": 0.11475949923127608, + "grad_norm": 2.535762071609497, + "learning_rate": 5.729967069154775e-07, + "loss": 1.1327, + "mean_token_accuracy": 0.666397213935852, + "num_tokens": 26297552.0, + "step": 1045 + }, + { + "epoch": 0.11486931693388974, + "grad_norm": 2.3154006004333496, + "learning_rate": 5.735455543358946e-07, + "loss": 0.9987, + "mean_token_accuracy": 0.701055645942688, + "num_tokens": 26322251.0, + "step": 1046 + }, + { + "epoch": 0.1149791346365034, + "grad_norm": 2.216594696044922, + "learning_rate": 5.740944017563118e-07, + "loss": 1.0267, + "mean_token_accuracy": 0.6886802315711975, + "num_tokens": 26349185.0, + "step": 1047 + }, + { + "epoch": 0.11508895233911706, + "grad_norm": 2.1203181743621826, + "learning_rate": 5.746432491767288e-07, + "loss": 1.138, + 
"mean_token_accuracy": 0.6611162424087524, + "num_tokens": 26378110.0, + "step": 1048 + }, + { + "epoch": 0.11519877004173072, + "grad_norm": 2.294311046600342, + "learning_rate": 5.75192096597146e-07, + "loss": 1.0981, + "mean_token_accuracy": 0.6735788583755493, + "num_tokens": 26404452.0, + "step": 1049 + }, + { + "epoch": 0.11530858774434438, + "grad_norm": 2.280764579772949, + "learning_rate": 5.757409440175632e-07, + "loss": 1.0254, + "mean_token_accuracy": 0.6889426708221436, + "num_tokens": 26431041.0, + "step": 1050 + }, + { + "epoch": 0.11541840544695806, + "grad_norm": 2.330935001373291, + "learning_rate": 5.762897914379802e-07, + "loss": 1.0588, + "mean_token_accuracy": 0.6952853202819824, + "num_tokens": 26456964.0, + "step": 1051 + }, + { + "epoch": 0.11552822314957172, + "grad_norm": 2.556633949279785, + "learning_rate": 5.768386388583974e-07, + "loss": 1.0069, + "mean_token_accuracy": 0.7020768523216248, + "num_tokens": 26480279.0, + "step": 1052 + }, + { + "epoch": 0.11563804085218538, + "grad_norm": 2.4233624935150146, + "learning_rate": 5.773874862788145e-07, + "loss": 1.1729, + "mean_token_accuracy": 0.6599013209342957, + "num_tokens": 26506261.0, + "step": 1053 + }, + { + "epoch": 0.11574785855479904, + "grad_norm": 2.188551664352417, + "learning_rate": 5.779363336992316e-07, + "loss": 1.1109, + "mean_token_accuracy": 0.6638815402984619, + "num_tokens": 26535758.0, + "step": 1054 + }, + { + "epoch": 0.1158576762574127, + "grad_norm": 2.2125449180603027, + "learning_rate": 5.784851811196487e-07, + "loss": 1.1063, + "mean_token_accuracy": 0.6688412427902222, + "num_tokens": 26565281.0, + "step": 1055 + }, + { + "epoch": 0.11596749396002635, + "grad_norm": 1.9347949028015137, + "learning_rate": 5.790340285400658e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.6876128315925598, + "num_tokens": 26599191.0, + "step": 1056 + }, + { + "epoch": 0.11607731166264001, + "grad_norm": 2.5840368270874023, + "learning_rate": 5.795828759604829e-07, + "loss": 
1.1928, + "mean_token_accuracy": 0.6542754173278809, + "num_tokens": 26620815.0, + "step": 1057 + }, + { + "epoch": 0.11618712936525367, + "grad_norm": 2.3154056072235107, + "learning_rate": 5.801317233809001e-07, + "loss": 1.1075, + "mean_token_accuracy": 0.6853007674217224, + "num_tokens": 26646475.0, + "step": 1058 + }, + { + "epoch": 0.11629694706786733, + "grad_norm": 2.610387086868286, + "learning_rate": 5.806805708013171e-07, + "loss": 1.0603, + "mean_token_accuracy": 0.6851075291633606, + "num_tokens": 26669029.0, + "step": 1059 + }, + { + "epoch": 0.11640676477048101, + "grad_norm": 2.608438014984131, + "learning_rate": 5.812294182217343e-07, + "loss": 1.0097, + "mean_token_accuracy": 0.696455717086792, + "num_tokens": 26691497.0, + "step": 1060 + }, + { + "epoch": 0.11651658247309467, + "grad_norm": 2.2981276512145996, + "learning_rate": 5.817782656421515e-07, + "loss": 1.0624, + "mean_token_accuracy": 0.6874572038650513, + "num_tokens": 26716526.0, + "step": 1061 + }, + { + "epoch": 0.11662640017570833, + "grad_norm": 2.7745282649993896, + "learning_rate": 5.823271130625685e-07, + "loss": 1.0659, + "mean_token_accuracy": 0.683587908744812, + "num_tokens": 26737409.0, + "step": 1062 + }, + { + "epoch": 0.11673621787832199, + "grad_norm": 2.418147325515747, + "learning_rate": 5.828759604829857e-07, + "loss": 1.0329, + "mean_token_accuracy": 0.6911680698394775, + "num_tokens": 26762515.0, + "step": 1063 + }, + { + "epoch": 0.11684603558093565, + "grad_norm": 2.4048876762390137, + "learning_rate": 5.834248079034029e-07, + "loss": 1.0655, + "mean_token_accuracy": 0.6863371729850769, + "num_tokens": 26787151.0, + "step": 1064 + }, + { + "epoch": 0.11695585328354931, + "grad_norm": 2.141209363937378, + "learning_rate": 5.839736553238199e-07, + "loss": 1.1519, + "mean_token_accuracy": 0.6761904954910278, + "num_tokens": 26818091.0, + "step": 1065 + }, + { + "epoch": 0.11706567098616297, + "grad_norm": 2.0967040061950684, + "learning_rate": 5.845225027442371e-07, 
+ "loss": 1.1535, + "mean_token_accuracy": 0.6589199900627136, + "num_tokens": 26848196.0, + "step": 1066 + }, + { + "epoch": 0.11717548868877663, + "grad_norm": 2.6277995109558105, + "learning_rate": 5.850713501646543e-07, + "loss": 1.0094, + "mean_token_accuracy": 0.6959623098373413, + "num_tokens": 26870584.0, + "step": 1067 + }, + { + "epoch": 0.11728530639139029, + "grad_norm": 2.1786365509033203, + "learning_rate": 5.856201975850713e-07, + "loss": 1.1195, + "mean_token_accuracy": 0.6749535799026489, + "num_tokens": 26899000.0, + "step": 1068 + }, + { + "epoch": 0.11739512409400395, + "grad_norm": 2.1004695892333984, + "learning_rate": 5.861690450054885e-07, + "loss": 1.1601, + "mean_token_accuracy": 0.6574848294258118, + "num_tokens": 26931347.0, + "step": 1069 + }, + { + "epoch": 0.11750494179661762, + "grad_norm": 2.2653470039367676, + "learning_rate": 5.867178924259056e-07, + "loss": 1.0398, + "mean_token_accuracy": 0.6950175166130066, + "num_tokens": 26957546.0, + "step": 1070 + }, + { + "epoch": 0.11761475949923128, + "grad_norm": 2.5626847743988037, + "learning_rate": 5.872667398463227e-07, + "loss": 1.0118, + "mean_token_accuracy": 0.6944279670715332, + "num_tokens": 26977980.0, + "step": 1071 + }, + { + "epoch": 0.11772457720184494, + "grad_norm": 2.202253818511963, + "learning_rate": 5.878155872667399e-07, + "loss": 1.0837, + "mean_token_accuracy": 0.679955005645752, + "num_tokens": 27006979.0, + "step": 1072 + }, + { + "epoch": 0.1178343949044586, + "grad_norm": 2.6929619312286377, + "learning_rate": 5.88364434687157e-07, + "loss": 1.0402, + "mean_token_accuracy": 0.6933497190475464, + "num_tokens": 27026828.0, + "step": 1073 + }, + { + "epoch": 0.11794421260707226, + "grad_norm": 2.5225002765655518, + "learning_rate": 5.88913282107574e-07, + "loss": 1.1422, + "mean_token_accuracy": 0.6784918904304504, + "num_tokens": 27049848.0, + "step": 1074 + }, + { + "epoch": 0.11805403030968592, + "grad_norm": 2.233482599258423, + "learning_rate": 
5.894621295279912e-07, + "loss": 1.0381, + "mean_token_accuracy": 0.6912816762924194, + "num_tokens": 27077676.0, + "step": 1075 + }, + { + "epoch": 0.11816384801229958, + "grad_norm": 2.238790512084961, + "learning_rate": 5.900109769484083e-07, + "loss": 1.0541, + "mean_token_accuracy": 0.681969940662384, + "num_tokens": 27103658.0, + "step": 1076 + }, + { + "epoch": 0.11827366571491324, + "grad_norm": 2.394615411758423, + "learning_rate": 5.905598243688254e-07, + "loss": 1.0683, + "mean_token_accuracy": 0.6777940392494202, + "num_tokens": 27129576.0, + "step": 1077 + }, + { + "epoch": 0.1183834834175269, + "grad_norm": 2.623887777328491, + "learning_rate": 5.911086717892426e-07, + "loss": 1.1542, + "mean_token_accuracy": 0.6672322750091553, + "num_tokens": 27152990.0, + "step": 1078 + }, + { + "epoch": 0.11849330112014057, + "grad_norm": 2.4506564140319824, + "learning_rate": 5.916575192096597e-07, + "loss": 1.15, + "mean_token_accuracy": 0.6596304178237915, + "num_tokens": 27177863.0, + "step": 1079 + }, + { + "epoch": 0.11860311882275423, + "grad_norm": 2.064116954803467, + "learning_rate": 5.922063666300768e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6763035655021667, + "num_tokens": 27210343.0, + "step": 1080 + }, + { + "epoch": 0.1187129365253679, + "grad_norm": 2.373749017715454, + "learning_rate": 5.927552140504939e-07, + "loss": 1.0041, + "mean_token_accuracy": 0.7007138729095459, + "num_tokens": 27234000.0, + "step": 1081 + }, + { + "epoch": 0.11882275422798155, + "grad_norm": 2.5617458820343018, + "learning_rate": 5.93304061470911e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6809994578361511, + "num_tokens": 27256813.0, + "step": 1082 + }, + { + "epoch": 0.11893257193059521, + "grad_norm": 2.4649736881256104, + "learning_rate": 5.938529088913282e-07, + "loss": 1.0858, + "mean_token_accuracy": 0.6822892427444458, + "num_tokens": 27280346.0, + "step": 1083 + }, + { + "epoch": 0.11904238963320887, + "grad_norm": 2.261058807373047, + 
"learning_rate": 5.944017563117453e-07, + "loss": 1.0961, + "mean_token_accuracy": 0.6783247590065002, + "num_tokens": 27307673.0, + "step": 1084 + }, + { + "epoch": 0.11915220733582253, + "grad_norm": 2.498589277267456, + "learning_rate": 5.949506037321624e-07, + "loss": 1.1453, + "mean_token_accuracy": 0.67146897315979, + "num_tokens": 27331930.0, + "step": 1085 + }, + { + "epoch": 0.11926202503843619, + "grad_norm": 2.4675991535186768, + "learning_rate": 5.954994511525796e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6905819773674011, + "num_tokens": 27355803.0, + "step": 1086 + }, + { + "epoch": 0.11937184274104985, + "grad_norm": 2.2417144775390625, + "learning_rate": 5.960482985729967e-07, + "loss": 1.1366, + "mean_token_accuracy": 0.6623809337615967, + "num_tokens": 27382666.0, + "step": 1087 + }, + { + "epoch": 0.11948166044366351, + "grad_norm": 2.3097894191741943, + "learning_rate": 5.965971459934138e-07, + "loss": 1.0888, + "mean_token_accuracy": 0.6738661527633667, + "num_tokens": 27409224.0, + "step": 1088 + }, + { + "epoch": 0.11959147814627719, + "grad_norm": 2.6231155395507812, + "learning_rate": 5.971459934138309e-07, + "loss": 0.9891, + "mean_token_accuracy": 0.7008641958236694, + "num_tokens": 27430743.0, + "step": 1089 + }, + { + "epoch": 0.11970129584889085, + "grad_norm": 2.2016351222991943, + "learning_rate": 5.976948408342481e-07, + "loss": 1.0838, + "mean_token_accuracy": 0.6774599552154541, + "num_tokens": 27459897.0, + "step": 1090 + }, + { + "epoch": 0.1198111135515045, + "grad_norm": 2.5100579261779785, + "learning_rate": 5.982436882546652e-07, + "loss": 0.992, + "mean_token_accuracy": 0.6996198296546936, + "num_tokens": 27482772.0, + "step": 1091 + }, + { + "epoch": 0.11992093125411817, + "grad_norm": 2.275480270385742, + "learning_rate": 5.987925356750822e-07, + "loss": 1.1261, + "mean_token_accuracy": 0.6686651706695557, + "num_tokens": 27511932.0, + "step": 1092 + }, + { + "epoch": 0.12003074895673183, + "grad_norm": 
2.357520818710327, + "learning_rate": 5.993413830954994e-07, + "loss": 1.023, + "mean_token_accuracy": 0.6974035501480103, + "num_tokens": 27538295.0, + "step": 1093 + }, + { + "epoch": 0.12014056665934549, + "grad_norm": 2.3122384548187256, + "learning_rate": 5.998902305159165e-07, + "loss": 1.074, + "mean_token_accuracy": 0.6901071667671204, + "num_tokens": 27561392.0, + "step": 1094 + }, + { + "epoch": 0.12025038436195915, + "grad_norm": 2.2116641998291016, + "learning_rate": 6.004390779363336e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6794512271881104, + "num_tokens": 27591078.0, + "step": 1095 + }, + { + "epoch": 0.1203602020645728, + "grad_norm": 2.5335354804992676, + "learning_rate": 6.009879253567508e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.6975153684616089, + "num_tokens": 27614922.0, + "step": 1096 + }, + { + "epoch": 0.12047001976718646, + "grad_norm": 2.409423828125, + "learning_rate": 6.015367727771679e-07, + "loss": 1.0712, + "mean_token_accuracy": 0.6932327151298523, + "num_tokens": 27639792.0, + "step": 1097 + }, + { + "epoch": 0.12057983746980014, + "grad_norm": 2.6124043464660645, + "learning_rate": 6.02085620197585e-07, + "loss": 1.033, + "mean_token_accuracy": 0.6917353868484497, + "num_tokens": 27660755.0, + "step": 1098 + }, + { + "epoch": 0.1206896551724138, + "grad_norm": 2.3391129970550537, + "learning_rate": 6.026344676180022e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6832401156425476, + "num_tokens": 27685963.0, + "step": 1099 + }, + { + "epoch": 0.12079947287502746, + "grad_norm": 2.273353099822998, + "learning_rate": 6.031833150384192e-07, + "loss": 1.0666, + "mean_token_accuracy": 0.689083993434906, + "num_tokens": 27712199.0, + "step": 1100 + }, + { + "epoch": 0.12090929057764112, + "grad_norm": 2.662245035171509, + "learning_rate": 6.037321624588364e-07, + "loss": 1.097, + "mean_token_accuracy": 0.6774407625198364, + "num_tokens": 27733262.0, + "step": 1101 + }, + { + "epoch": 0.12101910828025478, + 
"grad_norm": 2.298452615737915, + "learning_rate": 6.042810098792536e-07, + "loss": 1.0784, + "mean_token_accuracy": 0.6832969784736633, + "num_tokens": 27759900.0, + "step": 1102 + }, + { + "epoch": 0.12112892598286844, + "grad_norm": 2.6642022132873535, + "learning_rate": 6.048298572996706e-07, + "loss": 0.9981, + "mean_token_accuracy": 0.7014501690864563, + "num_tokens": 27779975.0, + "step": 1103 + }, + { + "epoch": 0.1212387436854821, + "grad_norm": 2.504960060119629, + "learning_rate": 6.053787047200878e-07, + "loss": 1.0333, + "mean_token_accuracy": 0.6924421787261963, + "num_tokens": 27801938.0, + "step": 1104 + }, + { + "epoch": 0.12134856138809576, + "grad_norm": 2.6982831954956055, + "learning_rate": 6.05927552140505e-07, + "loss": 0.9704, + "mean_token_accuracy": 0.7148630619049072, + "num_tokens": 27820614.0, + "step": 1105 + }, + { + "epoch": 0.12145837909070942, + "grad_norm": 2.4245994091033936, + "learning_rate": 6.06476399560922e-07, + "loss": 1.0757, + "mean_token_accuracy": 0.681125283241272, + "num_tokens": 27844328.0, + "step": 1106 + }, + { + "epoch": 0.12156819679332308, + "grad_norm": 2.54964280128479, + "learning_rate": 6.070252469813392e-07, + "loss": 1.0546, + "mean_token_accuracy": 0.6808758974075317, + "num_tokens": 27867155.0, + "step": 1107 + }, + { + "epoch": 0.12167801449593675, + "grad_norm": 2.482048511505127, + "learning_rate": 6.075740944017564e-07, + "loss": 1.1068, + "mean_token_accuracy": 0.6816183924674988, + "num_tokens": 27892547.0, + "step": 1108 + }, + { + "epoch": 0.12178783219855041, + "grad_norm": 2.3117117881774902, + "learning_rate": 6.081229418221734e-07, + "loss": 1.0526, + "mean_token_accuracy": 0.6847954988479614, + "num_tokens": 27918023.0, + "step": 1109 + }, + { + "epoch": 0.12189764990116407, + "grad_norm": 2.5104820728302, + "learning_rate": 6.086717892425906e-07, + "loss": 1.0735, + "mean_token_accuracy": 0.6845738291740417, + "num_tokens": 27944378.0, + "step": 1110 + }, + { + "epoch": 
0.12200746760377773, + "grad_norm": 2.4615488052368164, + "learning_rate": 6.092206366630076e-07, + "loss": 1.0123, + "mean_token_accuracy": 0.706108570098877, + "num_tokens": 27965699.0, + "step": 1111 + }, + { + "epoch": 0.12211728530639139, + "grad_norm": 2.488248586654663, + "learning_rate": 6.097694840834247e-07, + "loss": 1.0144, + "mean_token_accuracy": 0.6917130351066589, + "num_tokens": 27987753.0, + "step": 1112 + }, + { + "epoch": 0.12222710300900505, + "grad_norm": 2.2808260917663574, + "learning_rate": 6.103183315038419e-07, + "loss": 0.9973, + "mean_token_accuracy": 0.7067300081253052, + "num_tokens": 28011575.0, + "step": 1113 + }, + { + "epoch": 0.12233692071161871, + "grad_norm": 2.3573668003082275, + "learning_rate": 6.10867178924259e-07, + "loss": 1.1002, + "mean_token_accuracy": 0.6769939064979553, + "num_tokens": 28037803.0, + "step": 1114 + }, + { + "epoch": 0.12244673841423237, + "grad_norm": 2.6017463207244873, + "learning_rate": 6.114160263446761e-07, + "loss": 0.9895, + "mean_token_accuracy": 0.7050862312316895, + "num_tokens": 28058580.0, + "step": 1115 + }, + { + "epoch": 0.12255655611684603, + "grad_norm": 2.2379655838012695, + "learning_rate": 6.119648737650933e-07, + "loss": 1.028, + "mean_token_accuracy": 0.6959649324417114, + "num_tokens": 28085180.0, + "step": 1116 + }, + { + "epoch": 0.1226663738194597, + "grad_norm": 2.41803240776062, + "learning_rate": 6.125137211855103e-07, + "loss": 1.0032, + "mean_token_accuracy": 0.6931769847869873, + "num_tokens": 28108336.0, + "step": 1117 + }, + { + "epoch": 0.12277619152207336, + "grad_norm": 2.7167954444885254, + "learning_rate": 6.130625686059275e-07, + "loss": 1.1203, + "mean_token_accuracy": 0.6683780550956726, + "num_tokens": 28129869.0, + "step": 1118 + }, + { + "epoch": 0.12288600922468702, + "grad_norm": 2.3123435974121094, + "learning_rate": 6.136114160263447e-07, + "loss": 0.9964, + "mean_token_accuracy": 0.7014327049255371, + "num_tokens": 28153561.0, + "step": 1119 + }, + { + 
"epoch": 0.12299582692730068, + "grad_norm": 2.225281238555908, + "learning_rate": 6.141602634467617e-07, + "loss": 1.0297, + "mean_token_accuracy": 0.6938352584838867, + "num_tokens": 28179977.0, + "step": 1120 + }, + { + "epoch": 0.12310564462991434, + "grad_norm": 2.146517753601074, + "learning_rate": 6.147091108671789e-07, + "loss": 1.0868, + "mean_token_accuracy": 0.6785247921943665, + "num_tokens": 28207800.0, + "step": 1121 + }, + { + "epoch": 0.123215462332528, + "grad_norm": 2.5271337032318115, + "learning_rate": 6.15257958287596e-07, + "loss": 1.0414, + "mean_token_accuracy": 0.6872588396072388, + "num_tokens": 28230203.0, + "step": 1122 + }, + { + "epoch": 0.12332528003514166, + "grad_norm": 2.6000542640686035, + "learning_rate": 6.158068057080131e-07, + "loss": 1.0635, + "mean_token_accuracy": 0.6781548857688904, + "num_tokens": 28252873.0, + "step": 1123 + }, + { + "epoch": 0.12343509773775532, + "grad_norm": 2.3555727005004883, + "learning_rate": 6.163556531284303e-07, + "loss": 1.0373, + "mean_token_accuracy": 0.6846871376037598, + "num_tokens": 28277611.0, + "step": 1124 + }, + { + "epoch": 0.12354491544036898, + "grad_norm": 2.2613840103149414, + "learning_rate": 6.169045005488474e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6865450143814087, + "num_tokens": 28304902.0, + "step": 1125 + }, + { + "epoch": 0.12365473314298264, + "grad_norm": 2.5104641914367676, + "learning_rate": 6.174533479692645e-07, + "loss": 1.0145, + "mean_token_accuracy": 0.7001814842224121, + "num_tokens": 28325650.0, + "step": 1126 + }, + { + "epoch": 0.12376455084559632, + "grad_norm": 2.2800028324127197, + "learning_rate": 6.180021953896817e-07, + "loss": 1.0693, + "mean_token_accuracy": 0.6818630695343018, + "num_tokens": 28351664.0, + "step": 1127 + }, + { + "epoch": 0.12387436854820998, + "grad_norm": 2.1405398845672607, + "learning_rate": 6.185510428100988e-07, + "loss": 1.1362, + "mean_token_accuracy": 0.6598080396652222, + "num_tokens": 28381501.0, + "step": 1128 
+ }, + { + "epoch": 0.12398418625082364, + "grad_norm": 2.1610093116760254, + "learning_rate": 6.190998902305159e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6923641562461853, + "num_tokens": 28409473.0, + "step": 1129 + }, + { + "epoch": 0.1240940039534373, + "grad_norm": 2.3505406379699707, + "learning_rate": 6.196487376509331e-07, + "loss": 1.0221, + "mean_token_accuracy": 0.6945385932922363, + "num_tokens": 28432486.0, + "step": 1130 + }, + { + "epoch": 0.12420382165605096, + "grad_norm": 2.411109685897827, + "learning_rate": 6.201975850713501e-07, + "loss": 1.1249, + "mean_token_accuracy": 0.6659008264541626, + "num_tokens": 28456666.0, + "step": 1131 + }, + { + "epoch": 0.12431363935866462, + "grad_norm": 2.5611162185668945, + "learning_rate": 6.207464324917672e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6837670207023621, + "num_tokens": 28481950.0, + "step": 1132 + }, + { + "epoch": 0.12442345706127828, + "grad_norm": 2.371830701828003, + "learning_rate": 6.212952799121843e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6858165264129639, + "num_tokens": 28505899.0, + "step": 1133 + }, + { + "epoch": 0.12453327476389194, + "grad_norm": 2.4346203804016113, + "learning_rate": 6.218441273326015e-07, + "loss": 1.0646, + "mean_token_accuracy": 0.6887593269348145, + "num_tokens": 28530256.0, + "step": 1134 + }, + { + "epoch": 0.1246430924665056, + "grad_norm": 2.4053897857666016, + "learning_rate": 6.223929747530186e-07, + "loss": 1.0721, + "mean_token_accuracy": 0.6802600622177124, + "num_tokens": 28555711.0, + "step": 1135 + }, + { + "epoch": 0.12475291016911927, + "grad_norm": 2.426088333129883, + "learning_rate": 6.229418221734357e-07, + "loss": 0.9878, + "mean_token_accuracy": 0.7001517415046692, + "num_tokens": 28577502.0, + "step": 1136 + }, + { + "epoch": 0.12486272787173293, + "grad_norm": 2.4498326778411865, + "learning_rate": 6.234906695938529e-07, + "loss": 1.1136, + "mean_token_accuracy": 0.66994309425354, + "num_tokens": 28601862.0, + 
"step": 1137 + }, + { + "epoch": 0.12497254557434659, + "grad_norm": 2.513789653778076, + "learning_rate": 6.2403951701427e-07, + "loss": 1.0447, + "mean_token_accuracy": 0.6947736740112305, + "num_tokens": 28625005.0, + "step": 1138 + }, + { + "epoch": 0.12508236327696023, + "grad_norm": 2.3203072547912598, + "learning_rate": 6.245883644346871e-07, + "loss": 1.0024, + "mean_token_accuracy": 0.6982190608978271, + "num_tokens": 28651110.0, + "step": 1139 + }, + { + "epoch": 0.1251921809795739, + "grad_norm": 2.205721378326416, + "learning_rate": 6.251372118551043e-07, + "loss": 1.03, + "mean_token_accuracy": 0.6950889229774475, + "num_tokens": 28677884.0, + "step": 1140 + }, + { + "epoch": 0.12530199868218758, + "grad_norm": 2.368455410003662, + "learning_rate": 6.256860592755214e-07, + "loss": 1.128, + "mean_token_accuracy": 0.671988308429718, + "num_tokens": 28701920.0, + "step": 1141 + }, + { + "epoch": 0.12541181638480123, + "grad_norm": 2.631657123565674, + "learning_rate": 6.262349066959385e-07, + "loss": 1.1083, + "mean_token_accuracy": 0.6793885231018066, + "num_tokens": 28724811.0, + "step": 1142 + }, + { + "epoch": 0.1255216340874149, + "grad_norm": 2.329063892364502, + "learning_rate": 6.267837541163557e-07, + "loss": 1.0355, + "mean_token_accuracy": 0.6816918253898621, + "num_tokens": 28750132.0, + "step": 1143 + }, + { + "epoch": 0.12563145179002855, + "grad_norm": 2.566567897796631, + "learning_rate": 6.273326015367727e-07, + "loss": 1.0325, + "mean_token_accuracy": 0.6909832954406738, + "num_tokens": 28771176.0, + "step": 1144 + }, + { + "epoch": 0.12574126949264222, + "grad_norm": 2.54785418510437, + "learning_rate": 6.278814489571899e-07, + "loss": 1.0442, + "mean_token_accuracy": 0.6959875822067261, + "num_tokens": 28792848.0, + "step": 1145 + }, + { + "epoch": 0.12585108719525587, + "grad_norm": 2.325847625732422, + "learning_rate": 6.284302963776071e-07, + "loss": 1.1215, + "mean_token_accuracy": 0.6730554103851318, + "num_tokens": 28818267.0, + 
"step": 1146 + }, + { + "epoch": 0.12596090489786954, + "grad_norm": 2.268259286880493, + "learning_rate": 6.289791437980241e-07, + "loss": 0.9156, + "mean_token_accuracy": 0.7260578870773315, + "num_tokens": 28845247.0, + "step": 1147 + }, + { + "epoch": 0.1260707226004832, + "grad_norm": 2.4010086059570312, + "learning_rate": 6.295279912184413e-07, + "loss": 1.0928, + "mean_token_accuracy": 0.6745180487632751, + "num_tokens": 28868207.0, + "step": 1148 + }, + { + "epoch": 0.12618054030309686, + "grad_norm": 2.694333791732788, + "learning_rate": 6.300768386388585e-07, + "loss": 1.0397, + "mean_token_accuracy": 0.6896214485168457, + "num_tokens": 28888201.0, + "step": 1149 + }, + { + "epoch": 0.1262903580057105, + "grad_norm": 1.8973075151443481, + "learning_rate": 6.306256860592754e-07, + "loss": 1.0036, + "mean_token_accuracy": 0.6972768306732178, + "num_tokens": 28923861.0, + "step": 1150 + }, + { + "epoch": 0.12640017570832418, + "grad_norm": 2.199314594268799, + "learning_rate": 6.311745334796926e-07, + "loss": 1.1158, + "mean_token_accuracy": 0.6708675622940063, + "num_tokens": 28953543.0, + "step": 1151 + }, + { + "epoch": 0.12650999341093785, + "grad_norm": 2.4260945320129395, + "learning_rate": 6.317233809001098e-07, + "loss": 1.0208, + "mean_token_accuracy": 0.7058711051940918, + "num_tokens": 28978971.0, + "step": 1152 + }, + { + "epoch": 0.1266198111135515, + "grad_norm": 2.2010345458984375, + "learning_rate": 6.322722283205268e-07, + "loss": 1.1164, + "mean_token_accuracy": 0.6696286201477051, + "num_tokens": 29006760.0, + "step": 1153 + }, + { + "epoch": 0.12672962881616517, + "grad_norm": 2.943213939666748, + "learning_rate": 6.32821075740944e-07, + "loss": 0.9968, + "mean_token_accuracy": 0.6968121528625488, + "num_tokens": 29024596.0, + "step": 1154 + }, + { + "epoch": 0.12683944651877882, + "grad_norm": 2.1584339141845703, + "learning_rate": 6.33369923161361e-07, + "loss": 1.0351, + "mean_token_accuracy": 0.6883566379547119, + "num_tokens": 
29053410.0, + "step": 1155 + }, + { + "epoch": 0.1269492642213925, + "grad_norm": 2.116623640060425, + "learning_rate": 6.339187705817782e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6906059980392456, + "num_tokens": 29082186.0, + "step": 1156 + }, + { + "epoch": 0.12705908192400614, + "grad_norm": 2.347776412963867, + "learning_rate": 6.344676180021954e-07, + "loss": 0.9527, + "mean_token_accuracy": 0.7099206447601318, + "num_tokens": 29105627.0, + "step": 1157 + }, + { + "epoch": 0.1271688996266198, + "grad_norm": 2.374603271484375, + "learning_rate": 6.350164654226124e-07, + "loss": 1.1083, + "mean_token_accuracy": 0.6729949712753296, + "num_tokens": 29130006.0, + "step": 1158 + }, + { + "epoch": 0.12727871732923346, + "grad_norm": 2.3608078956604004, + "learning_rate": 6.355653128430296e-07, + "loss": 1.0338, + "mean_token_accuracy": 0.6913377642631531, + "num_tokens": 29153876.0, + "step": 1159 + }, + { + "epoch": 0.12738853503184713, + "grad_norm": 2.2605011463165283, + "learning_rate": 6.361141602634468e-07, + "loss": 1.0255, + "mean_token_accuracy": 0.6922928094863892, + "num_tokens": 29179938.0, + "step": 1160 + }, + { + "epoch": 0.1274983527344608, + "grad_norm": 2.1347193717956543, + "learning_rate": 6.366630076838638e-07, + "loss": 0.9833, + "mean_token_accuracy": 0.7021628618240356, + "num_tokens": 29207965.0, + "step": 1161 + }, + { + "epoch": 0.12760817043707445, + "grad_norm": 2.2484099864959717, + "learning_rate": 6.37211855104281e-07, + "loss": 1.085, + "mean_token_accuracy": 0.6727571487426758, + "num_tokens": 29236365.0, + "step": 1162 + }, + { + "epoch": 0.12771798813968813, + "grad_norm": 2.2931482791900635, + "learning_rate": 6.377607025246982e-07, + "loss": 1.083, + "mean_token_accuracy": 0.6768282055854797, + "num_tokens": 29262411.0, + "step": 1163 + }, + { + "epoch": 0.12782780584230177, + "grad_norm": 2.2822225093841553, + "learning_rate": 6.383095499451152e-07, + "loss": 1.0986, + "mean_token_accuracy": 0.683506965637207, + 
"num_tokens": 29290977.0, + "step": 1164 + }, + { + "epoch": 0.12793762354491545, + "grad_norm": 2.1745831966400146, + "learning_rate": 6.388583973655324e-07, + "loss": 1.0577, + "mean_token_accuracy": 0.6876158714294434, + "num_tokens": 29320402.0, + "step": 1165 + }, + { + "epoch": 0.1280474412475291, + "grad_norm": 2.3083243370056152, + "learning_rate": 6.394072447859495e-07, + "loss": 1.045, + "mean_token_accuracy": 0.6864755153656006, + "num_tokens": 29345397.0, + "step": 1166 + }, + { + "epoch": 0.12815725895014277, + "grad_norm": 2.259389877319336, + "learning_rate": 6.399560922063666e-07, + "loss": 0.9538, + "mean_token_accuracy": 0.7170733213424683, + "num_tokens": 29371408.0, + "step": 1167 + }, + { + "epoch": 0.1282670766527564, + "grad_norm": 2.330214023590088, + "learning_rate": 6.405049396267838e-07, + "loss": 1.005, + "mean_token_accuracy": 0.7013934850692749, + "num_tokens": 29397496.0, + "step": 1168 + }, + { + "epoch": 0.12837689435537009, + "grad_norm": 2.4103283882141113, + "learning_rate": 6.410537870472008e-07, + "loss": 1.0327, + "mean_token_accuracy": 0.6918696165084839, + "num_tokens": 29420846.0, + "step": 1169 + }, + { + "epoch": 0.12848671205798376, + "grad_norm": 2.7252097129821777, + "learning_rate": 6.416026344676179e-07, + "loss": 1.054, + "mean_token_accuracy": 0.6824352741241455, + "num_tokens": 29441079.0, + "step": 1170 + }, + { + "epoch": 0.1285965297605974, + "grad_norm": 2.1295018196105957, + "learning_rate": 6.421514818880351e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.7034111022949219, + "num_tokens": 29469545.0, + "step": 1171 + }, + { + "epoch": 0.12870634746321108, + "grad_norm": 2.474095106124878, + "learning_rate": 6.427003293084522e-07, + "loss": 1.0075, + "mean_token_accuracy": 0.6896706819534302, + "num_tokens": 29491286.0, + "step": 1172 + }, + { + "epoch": 0.12881616516582473, + "grad_norm": 2.200167417526245, + "learning_rate": 6.432491767288693e-07, + "loss": 1.0583, + "mean_token_accuracy": 
0.6859623193740845, + "num_tokens": 29517726.0, + "step": 1173 + }, + { + "epoch": 0.1289259828684384, + "grad_norm": 2.0056841373443604, + "learning_rate": 6.437980241492865e-07, + "loss": 1.0898, + "mean_token_accuracy": 0.6773226857185364, + "num_tokens": 29549392.0, + "step": 1174 + }, + { + "epoch": 0.12903580057105205, + "grad_norm": 2.546793222427368, + "learning_rate": 6.443468715697036e-07, + "loss": 1.0638, + "mean_token_accuracy": 0.6912771463394165, + "num_tokens": 29572166.0, + "step": 1175 + }, + { + "epoch": 0.12914561827366572, + "grad_norm": 2.4042859077453613, + "learning_rate": 6.448957189901207e-07, + "loss": 1.0459, + "mean_token_accuracy": 0.7065327167510986, + "num_tokens": 29595282.0, + "step": 1176 + }, + { + "epoch": 0.12925543597627936, + "grad_norm": 2.524557113647461, + "learning_rate": 6.454445664105378e-07, + "loss": 1.0801, + "mean_token_accuracy": 0.6881028413772583, + "num_tokens": 29619834.0, + "step": 1177 + }, + { + "epoch": 0.12936525367889304, + "grad_norm": 2.484954833984375, + "learning_rate": 6.45993413830955e-07, + "loss": 1.0305, + "mean_token_accuracy": 0.7005075812339783, + "num_tokens": 29642060.0, + "step": 1178 + }, + { + "epoch": 0.1294750713815067, + "grad_norm": 2.664522647857666, + "learning_rate": 6.465422612513721e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6879799365997314, + "num_tokens": 29662687.0, + "step": 1179 + }, + { + "epoch": 0.12958488908412036, + "grad_norm": 2.512347936630249, + "learning_rate": 6.470911086717892e-07, + "loss": 1.0802, + "mean_token_accuracy": 0.681551992893219, + "num_tokens": 29685553.0, + "step": 1180 + }, + { + "epoch": 0.12969470678673403, + "grad_norm": 2.141381025314331, + "learning_rate": 6.476399560922064e-07, + "loss": 1.1461, + "mean_token_accuracy": 0.6661773920059204, + "num_tokens": 29716607.0, + "step": 1181 + }, + { + "epoch": 0.12980452448934768, + "grad_norm": 2.7766590118408203, + "learning_rate": 6.481888035126235e-07, + "loss": 1.0215, + 
"mean_token_accuracy": 0.7013577222824097, + "num_tokens": 29736577.0, + "step": 1182 + }, + { + "epoch": 0.12991434219196135, + "grad_norm": 2.4501190185546875, + "learning_rate": 6.487376509330406e-07, + "loss": 0.9913, + "mean_token_accuracy": 0.7064758539199829, + "num_tokens": 29759506.0, + "step": 1183 + }, + { + "epoch": 0.130024159894575, + "grad_norm": 2.34523868560791, + "learning_rate": 6.492864983534578e-07, + "loss": 1.1789, + "mean_token_accuracy": 0.6678473353385925, + "num_tokens": 29786374.0, + "step": 1184 + }, + { + "epoch": 0.13013397759718867, + "grad_norm": 2.39481520652771, + "learning_rate": 6.498353457738749e-07, + "loss": 0.9947, + "mean_token_accuracy": 0.7034062743186951, + "num_tokens": 29808977.0, + "step": 1185 + }, + { + "epoch": 0.13024379529980232, + "grad_norm": 2.1371586322784424, + "learning_rate": 6.50384193194292e-07, + "loss": 1.0533, + "mean_token_accuracy": 0.6940155625343323, + "num_tokens": 29837292.0, + "step": 1186 + }, + { + "epoch": 0.130353613002416, + "grad_norm": 2.4908924102783203, + "learning_rate": 6.509330406147092e-07, + "loss": 1.0486, + "mean_token_accuracy": 0.6829053163528442, + "num_tokens": 29861418.0, + "step": 1187 + }, + { + "epoch": 0.13046343070502964, + "grad_norm": 3.143057346343994, + "learning_rate": 6.514818880351261e-07, + "loss": 0.9342, + "mean_token_accuracy": 0.7170557975769043, + "num_tokens": 29876953.0, + "step": 1188 + }, + { + "epoch": 0.1305732484076433, + "grad_norm": 2.707318067550659, + "learning_rate": 6.520307354555433e-07, + "loss": 1.1293, + "mean_token_accuracy": 0.6792445778846741, + "num_tokens": 29897226.0, + "step": 1189 + }, + { + "epoch": 0.13068306611025698, + "grad_norm": 2.496107578277588, + "learning_rate": 6.525795828759604e-07, + "loss": 1.0005, + "mean_token_accuracy": 0.6984919309616089, + "num_tokens": 29919518.0, + "step": 1190 + }, + { + "epoch": 0.13079288381287063, + "grad_norm": 2.549346923828125, + "learning_rate": 6.531284302963775e-07, + "loss": 1.0667, 
+ "mean_token_accuracy": 0.6917868852615356, + "num_tokens": 29941926.0, + "step": 1191 + }, + { + "epoch": 0.1309027015154843, + "grad_norm": 2.4953081607818604, + "learning_rate": 6.536772777167947e-07, + "loss": 0.99, + "mean_token_accuracy": 0.7035000324249268, + "num_tokens": 29964743.0, + "step": 1192 + }, + { + "epoch": 0.13101251921809795, + "grad_norm": 2.3521013259887695, + "learning_rate": 6.542261251372118e-07, + "loss": 1.0582, + "mean_token_accuracy": 0.6925902962684631, + "num_tokens": 29989695.0, + "step": 1193 + }, + { + "epoch": 0.13112233692071162, + "grad_norm": 2.3802943229675293, + "learning_rate": 6.547749725576289e-07, + "loss": 1.0402, + "mean_token_accuracy": 0.6974150538444519, + "num_tokens": 30013986.0, + "step": 1194 + }, + { + "epoch": 0.13123215462332527, + "grad_norm": 2.2771799564361572, + "learning_rate": 6.553238199780461e-07, + "loss": 1.1121, + "mean_token_accuracy": 0.6788181066513062, + "num_tokens": 30041239.0, + "step": 1195 + }, + { + "epoch": 0.13134197232593894, + "grad_norm": 2.5468485355377197, + "learning_rate": 6.558726673984632e-07, + "loss": 1.0553, + "mean_token_accuracy": 0.6891566514968872, + "num_tokens": 30064004.0, + "step": 1196 + }, + { + "epoch": 0.1314517900285526, + "grad_norm": 2.3674845695495605, + "learning_rate": 6.564215148188803e-07, + "loss": 1.0491, + "mean_token_accuracy": 0.6887980699539185, + "num_tokens": 30088224.0, + "step": 1197 + }, + { + "epoch": 0.13156160773116626, + "grad_norm": 2.4881234169006348, + "learning_rate": 6.569703622392975e-07, + "loss": 1.0282, + "mean_token_accuracy": 0.6890202760696411, + "num_tokens": 30110183.0, + "step": 1198 + }, + { + "epoch": 0.13167142543377994, + "grad_norm": 2.2966232299804688, + "learning_rate": 6.575192096597145e-07, + "loss": 0.9659, + "mean_token_accuracy": 0.709102988243103, + "num_tokens": 30133618.0, + "step": 1199 + }, + { + "epoch": 0.13178124313639358, + "grad_norm": 2.4324305057525635, + "learning_rate": 6.580680570801317e-07, + 
"loss": 1.0072, + "mean_token_accuracy": 0.6944517493247986, + "num_tokens": 30156671.0, + "step": 1200 + }, + { + "epoch": 0.13189106083900726, + "grad_norm": 2.154249906539917, + "learning_rate": 6.586169045005489e-07, + "loss": 0.9823, + "mean_token_accuracy": 0.7018500566482544, + "num_tokens": 30183599.0, + "step": 1201 + }, + { + "epoch": 0.1320008785416209, + "grad_norm": 2.327256202697754, + "learning_rate": 6.591657519209659e-07, + "loss": 1.005, + "mean_token_accuracy": 0.7068874835968018, + "num_tokens": 30207443.0, + "step": 1202 + }, + { + "epoch": 0.13211069624423458, + "grad_norm": 2.103459358215332, + "learning_rate": 6.597145993413831e-07, + "loss": 1.1283, + "mean_token_accuracy": 0.6653170585632324, + "num_tokens": 30237500.0, + "step": 1203 + }, + { + "epoch": 0.13222051394684822, + "grad_norm": 2.3595798015594482, + "learning_rate": 6.602634467618003e-07, + "loss": 1.0366, + "mean_token_accuracy": 0.6931836009025574, + "num_tokens": 30263145.0, + "step": 1204 + }, + { + "epoch": 0.1323303316494619, + "grad_norm": 1.9127827882766724, + "learning_rate": 6.608122941822173e-07, + "loss": 1.0715, + "mean_token_accuracy": 0.6775957942008972, + "num_tokens": 30300776.0, + "step": 1205 + }, + { + "epoch": 0.13244014935207554, + "grad_norm": 2.3165621757507324, + "learning_rate": 6.613611416026345e-07, + "loss": 1.0313, + "mean_token_accuracy": 0.6849045157432556, + "num_tokens": 30325714.0, + "step": 1206 + }, + { + "epoch": 0.13254996705468922, + "grad_norm": 2.298062562942505, + "learning_rate": 6.619099890230515e-07, + "loss": 1.0431, + "mean_token_accuracy": 0.6871130466461182, + "num_tokens": 30352183.0, + "step": 1207 + }, + { + "epoch": 0.1326597847573029, + "grad_norm": 2.1663661003112793, + "learning_rate": 6.624588364434686e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6783910989761353, + "num_tokens": 30380666.0, + "step": 1208 + }, + { + "epoch": 0.13276960245991654, + "grad_norm": 2.5053977966308594, + "learning_rate": 
6.630076838638858e-07, + "loss": 1.0714, + "mean_token_accuracy": 0.6826249361038208, + "num_tokens": 30403260.0, + "step": 1209 + }, + { + "epoch": 0.1328794201625302, + "grad_norm": 2.1685879230499268, + "learning_rate": 6.635565312843029e-07, + "loss": 1.0719, + "mean_token_accuracy": 0.6948904395103455, + "num_tokens": 30430555.0, + "step": 1210 + }, + { + "epoch": 0.13298923786514386, + "grad_norm": 2.043947219848633, + "learning_rate": 6.6410537870472e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6822085976600647, + "num_tokens": 30462956.0, + "step": 1211 + }, + { + "epoch": 0.13309905556775753, + "grad_norm": 2.0718822479248047, + "learning_rate": 6.646542261251372e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6786828637123108, + "num_tokens": 30494965.0, + "step": 1212 + }, + { + "epoch": 0.13320887327037118, + "grad_norm": 2.3027503490448, + "learning_rate": 6.652030735455543e-07, + "loss": 1.0079, + "mean_token_accuracy": 0.6972672939300537, + "num_tokens": 30520561.0, + "step": 1213 + }, + { + "epoch": 0.13331869097298485, + "grad_norm": 2.396811008453369, + "learning_rate": 6.657519209659714e-07, + "loss": 1.0465, + "mean_token_accuracy": 0.6987401843070984, + "num_tokens": 30544193.0, + "step": 1214 + }, + { + "epoch": 0.1334285086755985, + "grad_norm": 2.569474458694458, + "learning_rate": 6.663007683863886e-07, + "loss": 1.0308, + "mean_token_accuracy": 0.6978989839553833, + "num_tokens": 30565889.0, + "step": 1215 + }, + { + "epoch": 0.13353832637821217, + "grad_norm": 2.6063036918640137, + "learning_rate": 6.668496158068057e-07, + "loss": 1.0703, + "mean_token_accuracy": 0.6817799806594849, + "num_tokens": 30589022.0, + "step": 1216 + }, + { + "epoch": 0.13364814408082584, + "grad_norm": 2.324439287185669, + "learning_rate": 6.673984632272228e-07, + "loss": 0.9758, + "mean_token_accuracy": 0.699937105178833, + "num_tokens": 30612896.0, + "step": 1217 + }, + { + "epoch": 0.1337579617834395, + "grad_norm": 2.4266960620880127, + 
"learning_rate": 6.679473106476399e-07, + "loss": 0.9997, + "mean_token_accuracy": 0.7007068991661072, + "num_tokens": 30635944.0, + "step": 1218 + }, + { + "epoch": 0.13386777948605316, + "grad_norm": 2.2224349975585938, + "learning_rate": 6.684961580680571e-07, + "loss": 1.0743, + "mean_token_accuracy": 0.6754989624023438, + "num_tokens": 30663301.0, + "step": 1219 + }, + { + "epoch": 0.1339775971886668, + "grad_norm": 2.1762146949768066, + "learning_rate": 6.690450054884742e-07, + "loss": 0.9633, + "mean_token_accuracy": 0.710430383682251, + "num_tokens": 30691604.0, + "step": 1220 + }, + { + "epoch": 0.13408741489128048, + "grad_norm": 2.3104677200317383, + "learning_rate": 6.695938529088913e-07, + "loss": 1.0169, + "mean_token_accuracy": 0.6936776638031006, + "num_tokens": 30717236.0, + "step": 1221 + }, + { + "epoch": 0.13419723259389413, + "grad_norm": 2.4716837406158447, + "learning_rate": 6.701427003293085e-07, + "loss": 1.0358, + "mean_token_accuracy": 0.6937057971954346, + "num_tokens": 30740979.0, + "step": 1222 + }, + { + "epoch": 0.1343070502965078, + "grad_norm": 2.302645683288574, + "learning_rate": 6.706915477497256e-07, + "loss": 1.0298, + "mean_token_accuracy": 0.6935442686080933, + "num_tokens": 30767603.0, + "step": 1223 + }, + { + "epoch": 0.13441686799912145, + "grad_norm": 2.467970609664917, + "learning_rate": 6.712403951701427e-07, + "loss": 1.1424, + "mean_token_accuracy": 0.6653188467025757, + "num_tokens": 30791425.0, + "step": 1224 + }, + { + "epoch": 0.13452668570173512, + "grad_norm": 2.187992572784424, + "learning_rate": 6.717892425905599e-07, + "loss": 1.1363, + "mean_token_accuracy": 0.6623034477233887, + "num_tokens": 30820567.0, + "step": 1225 + }, + { + "epoch": 0.13463650340434877, + "grad_norm": 2.686433792114258, + "learning_rate": 6.723380900109769e-07, + "loss": 0.9243, + "mean_token_accuracy": 0.725636899471283, + "num_tokens": 30839411.0, + "step": 1226 + }, + { + "epoch": 0.13474632110696244, + "grad_norm": 
2.6938982009887695, + "learning_rate": 6.72886937431394e-07, + "loss": 1.0529, + "mean_token_accuracy": 0.691637396812439, + "num_tokens": 30862199.0, + "step": 1227 + }, + { + "epoch": 0.13485613880957611, + "grad_norm": 2.0607800483703613, + "learning_rate": 6.734357848518111e-07, + "loss": 1.0547, + "mean_token_accuracy": 0.6892328262329102, + "num_tokens": 30892200.0, + "step": 1228 + }, + { + "epoch": 0.13496595651218976, + "grad_norm": 2.2170941829681396, + "learning_rate": 6.739846322722282e-07, + "loss": 1.0371, + "mean_token_accuracy": 0.6851901412010193, + "num_tokens": 30922050.0, + "step": 1229 + }, + { + "epoch": 0.13507577421480343, + "grad_norm": 2.3036420345306396, + "learning_rate": 6.745334796926454e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6824201345443726, + "num_tokens": 30947823.0, + "step": 1230 + }, + { + "epoch": 0.13518559191741708, + "grad_norm": 2.03676438331604, + "learning_rate": 6.750823271130625e-07, + "loss": 1.1206, + "mean_token_accuracy": 0.6651834845542908, + "num_tokens": 30979637.0, + "step": 1231 + }, + { + "epoch": 0.13529540962003075, + "grad_norm": 2.0230395793914795, + "learning_rate": 6.756311745334796e-07, + "loss": 1.0451, + "mean_token_accuracy": 0.6867144703865051, + "num_tokens": 31010471.0, + "step": 1232 + }, + { + "epoch": 0.1354052273226444, + "grad_norm": 2.4095263481140137, + "learning_rate": 6.761800219538968e-07, + "loss": 1.1149, + "mean_token_accuracy": 0.6791189908981323, + "num_tokens": 31034748.0, + "step": 1233 + }, + { + "epoch": 0.13551504502525807, + "grad_norm": 2.1623098850250244, + "learning_rate": 6.767288693743139e-07, + "loss": 1.0725, + "mean_token_accuracy": 0.6882408857345581, + "num_tokens": 31062349.0, + "step": 1234 + }, + { + "epoch": 0.13562486272787172, + "grad_norm": 2.3916015625, + "learning_rate": 6.77277716794731e-07, + "loss": 0.9766, + "mean_token_accuracy": 0.7119017243385315, + "num_tokens": 31086241.0, + "step": 1235 + }, + { + "epoch": 0.1357346804304854, + 
"grad_norm": 2.39422607421875, + "learning_rate": 6.778265642151482e-07, + "loss": 1.0828, + "mean_token_accuracy": 0.6840260028839111, + "num_tokens": 31110463.0, + "step": 1236 + }, + { + "epoch": 0.13584449813309907, + "grad_norm": 2.170484781265259, + "learning_rate": 6.783754116355653e-07, + "loss": 1.0756, + "mean_token_accuracy": 0.6829025745391846, + "num_tokens": 31139481.0, + "step": 1237 + }, + { + "epoch": 0.1359543158357127, + "grad_norm": 2.319953441619873, + "learning_rate": 6.789242590559824e-07, + "loss": 1.1261, + "mean_token_accuracy": 0.6715601682662964, + "num_tokens": 31166058.0, + "step": 1238 + }, + { + "epoch": 0.1360641335383264, + "grad_norm": 2.441965103149414, + "learning_rate": 6.794731064763996e-07, + "loss": 1.033, + "mean_token_accuracy": 0.6894073486328125, + "num_tokens": 31189628.0, + "step": 1239 + }, + { + "epoch": 0.13617395124094003, + "grad_norm": 2.5459485054016113, + "learning_rate": 6.800219538968166e-07, + "loss": 0.9977, + "mean_token_accuracy": 0.6970227360725403, + "num_tokens": 31211639.0, + "step": 1240 + }, + { + "epoch": 0.1362837689435537, + "grad_norm": 2.1550467014312744, + "learning_rate": 6.805708013172338e-07, + "loss": 0.9792, + "mean_token_accuracy": 0.7014561891555786, + "num_tokens": 31239239.0, + "step": 1241 + }, + { + "epoch": 0.13639358664616735, + "grad_norm": 2.788784980773926, + "learning_rate": 6.81119648737651e-07, + "loss": 1.0225, + "mean_token_accuracy": 0.691809892654419, + "num_tokens": 31257412.0, + "step": 1242 + }, + { + "epoch": 0.13650340434878103, + "grad_norm": 2.295440673828125, + "learning_rate": 6.81668496158068e-07, + "loss": 1.056, + "mean_token_accuracy": 0.6838880777359009, + "num_tokens": 31283991.0, + "step": 1243 + }, + { + "epoch": 0.13661322205139467, + "grad_norm": 2.252479076385498, + "learning_rate": 6.822173435784852e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6891689896583557, + "num_tokens": 31310520.0, + "step": 1244 + }, + { + "epoch": 0.13672303975400835, + 
"grad_norm": 2.3569600582122803, + "learning_rate": 6.827661909989023e-07, + "loss": 1.0206, + "mean_token_accuracy": 0.6939730048179626, + "num_tokens": 31334100.0, + "step": 1245 + }, + { + "epoch": 0.13683285745662202, + "grad_norm": 2.3319590091705322, + "learning_rate": 6.833150384193193e-07, + "loss": 1.1294, + "mean_token_accuracy": 0.6803349256515503, + "num_tokens": 31363210.0, + "step": 1246 + }, + { + "epoch": 0.13694267515923567, + "grad_norm": 2.315112590789795, + "learning_rate": 6.838638858397365e-07, + "loss": 1.1163, + "mean_token_accuracy": 0.6726329326629639, + "num_tokens": 31391343.0, + "step": 1247 + }, + { + "epoch": 0.13705249286184934, + "grad_norm": 2.784914970397949, + "learning_rate": 6.844127332601537e-07, + "loss": 0.9487, + "mean_token_accuracy": 0.7101585865020752, + "num_tokens": 31410173.0, + "step": 1248 + }, + { + "epoch": 0.13716231056446299, + "grad_norm": 2.138333797454834, + "learning_rate": 6.849615806805707e-07, + "loss": 1.1539, + "mean_token_accuracy": 0.6684083342552185, + "num_tokens": 31442322.0, + "step": 1249 + }, + { + "epoch": 0.13727212826707666, + "grad_norm": 2.573148727416992, + "learning_rate": 6.855104281009879e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6927554607391357, + "num_tokens": 31465893.0, + "step": 1250 + }, + { + "epoch": 0.1373819459696903, + "grad_norm": 2.529103994369507, + "learning_rate": 6.86059275521405e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6695661544799805, + "num_tokens": 31488824.0, + "step": 1251 + }, + { + "epoch": 0.13749176367230398, + "grad_norm": 2.3825278282165527, + "learning_rate": 6.866081229418221e-07, + "loss": 1.0711, + "mean_token_accuracy": 0.6828985810279846, + "num_tokens": 31513794.0, + "step": 1252 + }, + { + "epoch": 0.13760158137491763, + "grad_norm": 2.3151726722717285, + "learning_rate": 6.871569703622393e-07, + "loss": 0.9418, + "mean_token_accuracy": 0.7120054960250854, + "num_tokens": 31537392.0, + "step": 1253 + }, + { + "epoch": 
0.1377113990775313, + "grad_norm": 2.0656702518463135, + "learning_rate": 6.877058177826564e-07, + "loss": 1.0884, + "mean_token_accuracy": 0.6834834218025208, + "num_tokens": 31568956.0, + "step": 1254 + }, + { + "epoch": 0.13782121678014497, + "grad_norm": 2.3308680057525635, + "learning_rate": 6.882546652030735e-07, + "loss": 0.9473, + "mean_token_accuracy": 0.7169270515441895, + "num_tokens": 31595126.0, + "step": 1255 + }, + { + "epoch": 0.13793103448275862, + "grad_norm": 2.2290804386138916, + "learning_rate": 6.888035126234907e-07, + "loss": 1.0942, + "mean_token_accuracy": 0.684200644493103, + "num_tokens": 31621290.0, + "step": 1256 + }, + { + "epoch": 0.1380408521853723, + "grad_norm": 1.93596351146698, + "learning_rate": 6.893523600439078e-07, + "loss": 1.0481, + "mean_token_accuracy": 0.6882472038269043, + "num_tokens": 31653826.0, + "step": 1257 + }, + { + "epoch": 0.13815066988798594, + "grad_norm": 2.3601081371307373, + "learning_rate": 6.899012074643249e-07, + "loss": 1.0656, + "mean_token_accuracy": 0.6839304566383362, + "num_tokens": 31680756.0, + "step": 1258 + }, + { + "epoch": 0.1382604875905996, + "grad_norm": 2.6963822841644287, + "learning_rate": 6.904500548847421e-07, + "loss": 1.0676, + "mean_token_accuracy": 0.6871110200881958, + "num_tokens": 31700965.0, + "step": 1259 + }, + { + "epoch": 0.13837030529321326, + "grad_norm": 2.400618553161621, + "learning_rate": 6.909989023051592e-07, + "loss": 1.1157, + "mean_token_accuracy": 0.6666061282157898, + "num_tokens": 31725743.0, + "step": 1260 + }, + { + "epoch": 0.13848012299582693, + "grad_norm": 2.592055559158325, + "learning_rate": 6.915477497255763e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6790921092033386, + "num_tokens": 31748274.0, + "step": 1261 + }, + { + "epoch": 0.13858994069844058, + "grad_norm": 2.834649085998535, + "learning_rate": 6.920965971459934e-07, + "loss": 1.0872, + "mean_token_accuracy": 0.679916262626648, + "num_tokens": 31767896.0, + "step": 1262 + }, + { + 
"epoch": 0.13869975840105425, + "grad_norm": 2.291447877883911, + "learning_rate": 6.926454445664105e-07, + "loss": 1.0155, + "mean_token_accuracy": 0.6959672570228577, + "num_tokens": 31793744.0, + "step": 1263 + }, + { + "epoch": 0.1388095761036679, + "grad_norm": 2.401500701904297, + "learning_rate": 6.931942919868276e-07, + "loss": 0.9887, + "mean_token_accuracy": 0.6975253820419312, + "num_tokens": 31817122.0, + "step": 1264 + }, + { + "epoch": 0.13891939380628157, + "grad_norm": 2.2751965522766113, + "learning_rate": 6.937431394072447e-07, + "loss": 1.0849, + "mean_token_accuracy": 0.6884410381317139, + "num_tokens": 31845216.0, + "step": 1265 + }, + { + "epoch": 0.13902921150889525, + "grad_norm": 2.4175148010253906, + "learning_rate": 6.942919868276618e-07, + "loss": 1.0007, + "mean_token_accuracy": 0.703882098197937, + "num_tokens": 31869108.0, + "step": 1266 + }, + { + "epoch": 0.1391390292115089, + "grad_norm": 2.534092903137207, + "learning_rate": 6.94840834248079e-07, + "loss": 1.0783, + "mean_token_accuracy": 0.6799949407577515, + "num_tokens": 31892244.0, + "step": 1267 + }, + { + "epoch": 0.13924884691412256, + "grad_norm": 2.134504556655884, + "learning_rate": 6.953896816684961e-07, + "loss": 1.0646, + "mean_token_accuracy": 0.6875232458114624, + "num_tokens": 31922571.0, + "step": 1268 + }, + { + "epoch": 0.1393586646167362, + "grad_norm": 2.352311611175537, + "learning_rate": 6.959385290889132e-07, + "loss": 0.9231, + "mean_token_accuracy": 0.720452070236206, + "num_tokens": 31946864.0, + "step": 1269 + }, + { + "epoch": 0.13946848231934988, + "grad_norm": 2.2492175102233887, + "learning_rate": 6.964873765093304e-07, + "loss": 1.1018, + "mean_token_accuracy": 0.6750471591949463, + "num_tokens": 31976203.0, + "step": 1270 + }, + { + "epoch": 0.13957830002196353, + "grad_norm": 2.4411511421203613, + "learning_rate": 6.970362239297475e-07, + "loss": 0.9159, + "mean_token_accuracy": 0.7238627076148987, + "num_tokens": 31997052.0, + "step": 1271 + }, 
+ { + "epoch": 0.1396881177245772, + "grad_norm": 2.3217177391052246, + "learning_rate": 6.975850713501646e-07, + "loss": 1.094, + "mean_token_accuracy": 0.6822695732116699, + "num_tokens": 32024508.0, + "step": 1272 + }, + { + "epoch": 0.13979793542719085, + "grad_norm": 2.168936014175415, + "learning_rate": 6.981339187705817e-07, + "loss": 1.0427, + "mean_token_accuracy": 0.6963130831718445, + "num_tokens": 32052118.0, + "step": 1273 + }, + { + "epoch": 0.13990775312980452, + "grad_norm": 2.020792007446289, + "learning_rate": 6.986827661909989e-07, + "loss": 1.1164, + "mean_token_accuracy": 0.664141833782196, + "num_tokens": 32085641.0, + "step": 1274 + }, + { + "epoch": 0.1400175708324182, + "grad_norm": 2.582760810852051, + "learning_rate": 6.99231613611416e-07, + "loss": 1.025, + "mean_token_accuracy": 0.6976631879806519, + "num_tokens": 32106783.0, + "step": 1275 + }, + { + "epoch": 0.14012738853503184, + "grad_norm": 2.474773406982422, + "learning_rate": 6.997804610318331e-07, + "loss": 1.0608, + "mean_token_accuracy": 0.6988824605941772, + "num_tokens": 32129474.0, + "step": 1276 + }, + { + "epoch": 0.14023720623764552, + "grad_norm": 2.2514257431030273, + "learning_rate": 7.003293084522503e-07, + "loss": 1.1382, + "mean_token_accuracy": 0.6659097671508789, + "num_tokens": 32157476.0, + "step": 1277 + }, + { + "epoch": 0.14034702394025916, + "grad_norm": 2.676520347595215, + "learning_rate": 7.008781558726674e-07, + "loss": 1.0508, + "mean_token_accuracy": 0.6834828853607178, + "num_tokens": 32176897.0, + "step": 1278 + }, + { + "epoch": 0.14045684164287284, + "grad_norm": 2.3157763481140137, + "learning_rate": 7.014270032930845e-07, + "loss": 1.0588, + "mean_token_accuracy": 0.688978374004364, + "num_tokens": 32202512.0, + "step": 1279 + }, + { + "epoch": 0.14056665934548648, + "grad_norm": 2.4802424907684326, + "learning_rate": 7.019758507135017e-07, + "loss": 1.0138, + "mean_token_accuracy": 0.6939882040023804, + "num_tokens": 32223672.0, + "step": 1280 
+ }, + { + "epoch": 0.14067647704810016, + "grad_norm": 2.53257417678833, + "learning_rate": 7.025246981339188e-07, + "loss": 0.9484, + "mean_token_accuracy": 0.713266134262085, + "num_tokens": 32244911.0, + "step": 1281 + }, + { + "epoch": 0.1407862947507138, + "grad_norm": 2.3514888286590576, + "learning_rate": 7.030735455543358e-07, + "loss": 1.0066, + "mean_token_accuracy": 0.7007927894592285, + "num_tokens": 32268623.0, + "step": 1282 + }, + { + "epoch": 0.14089611245332748, + "grad_norm": 2.442167043685913, + "learning_rate": 7.03622392974753e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.6973426342010498, + "num_tokens": 32291533.0, + "step": 1283 + }, + { + "epoch": 0.14100593015594115, + "grad_norm": 2.3872547149658203, + "learning_rate": 7.0417124039517e-07, + "loss": 1.0193, + "mean_token_accuracy": 0.6982361078262329, + "num_tokens": 32314859.0, + "step": 1284 + }, + { + "epoch": 0.1411157478585548, + "grad_norm": 2.614866018295288, + "learning_rate": 7.047200878155872e-07, + "loss": 1.0976, + "mean_token_accuracy": 0.6860049962997437, + "num_tokens": 32336126.0, + "step": 1285 + }, + { + "epoch": 0.14122556556116847, + "grad_norm": 2.5948092937469482, + "learning_rate": 7.052689352360044e-07, + "loss": 0.9975, + "mean_token_accuracy": 0.6983330845832825, + "num_tokens": 32357767.0, + "step": 1286 + }, + { + "epoch": 0.14133538326378212, + "grad_norm": 2.2798287868499756, + "learning_rate": 7.058177826564214e-07, + "loss": 1.0796, + "mean_token_accuracy": 0.6907878518104553, + "num_tokens": 32383821.0, + "step": 1287 + }, + { + "epoch": 0.1414452009663958, + "grad_norm": 2.360030174255371, + "learning_rate": 7.063666300768386e-07, + "loss": 1.1437, + "mean_token_accuracy": 0.6630511283874512, + "num_tokens": 32412690.0, + "step": 1288 + }, + { + "epoch": 0.14155501866900944, + "grad_norm": 2.158608913421631, + "learning_rate": 7.069154774972558e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6780171990394592, + "num_tokens": 32442713.0, + "step": 
1289 + }, + { + "epoch": 0.1416648363716231, + "grad_norm": 2.3119254112243652, + "learning_rate": 7.074643249176728e-07, + "loss": 1.005, + "mean_token_accuracy": 0.6979787349700928, + "num_tokens": 32467157.0, + "step": 1290 + }, + { + "epoch": 0.14177465407423676, + "grad_norm": 2.485530376434326, + "learning_rate": 7.0801317233809e-07, + "loss": 0.9683, + "mean_token_accuracy": 0.7143275737762451, + "num_tokens": 32487930.0, + "step": 1291 + }, + { + "epoch": 0.14188447177685043, + "grad_norm": 2.518498659133911, + "learning_rate": 7.085620197585072e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.675711452960968, + "num_tokens": 32513099.0, + "step": 1292 + }, + { + "epoch": 0.1419942894794641, + "grad_norm": 1.9310868978500366, + "learning_rate": 7.091108671789242e-07, + "loss": 1.106, + "mean_token_accuracy": 0.6743343472480774, + "num_tokens": 32547224.0, + "step": 1293 + }, + { + "epoch": 0.14210410718207775, + "grad_norm": 2.24393892288208, + "learning_rate": 7.096597145993414e-07, + "loss": 1.0136, + "mean_token_accuracy": 0.6929798722267151, + "num_tokens": 32573452.0, + "step": 1294 + }, + { + "epoch": 0.14221392488469142, + "grad_norm": 2.5185697078704834, + "learning_rate": 7.102085620197584e-07, + "loss": 1.0893, + "mean_token_accuracy": 0.6649553775787354, + "num_tokens": 32595135.0, + "step": 1295 + }, + { + "epoch": 0.14232374258730507, + "grad_norm": 2.2026450634002686, + "learning_rate": 7.107574094401756e-07, + "loss": 1.0155, + "mean_token_accuracy": 0.7057381868362427, + "num_tokens": 32619869.0, + "step": 1296 + }, + { + "epoch": 0.14243356028991874, + "grad_norm": 2.568293571472168, + "learning_rate": 7.113062568605928e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.679232120513916, + "num_tokens": 32640477.0, + "step": 1297 + }, + { + "epoch": 0.1425433779925324, + "grad_norm": 2.113682746887207, + "learning_rate": 7.118551042810098e-07, + "loss": 1.0563, + "mean_token_accuracy": 0.6853005886077881, + "num_tokens": 32667320.0, + 
"step": 1298 + }, + { + "epoch": 0.14265319569514606, + "grad_norm": 2.3672478199005127, + "learning_rate": 7.12403951701427e-07, + "loss": 1.1492, + "mean_token_accuracy": 0.6594257354736328, + "num_tokens": 32692425.0, + "step": 1299 + }, + { + "epoch": 0.1427630133977597, + "grad_norm": 2.2157020568847656, + "learning_rate": 7.129527991218442e-07, + "loss": 1.0421, + "mean_token_accuracy": 0.6844804286956787, + "num_tokens": 32717814.0, + "step": 1300 + }, + { + "epoch": 0.14287283110037338, + "grad_norm": 2.3092145919799805, + "learning_rate": 7.135016465422611e-07, + "loss": 1.0365, + "mean_token_accuracy": 0.7016894817352295, + "num_tokens": 32743238.0, + "step": 1301 + }, + { + "epoch": 0.14298264880298703, + "grad_norm": 2.317599058151245, + "learning_rate": 7.140504939626783e-07, + "loss": 0.9109, + "mean_token_accuracy": 0.7239155173301697, + "num_tokens": 32766975.0, + "step": 1302 + }, + { + "epoch": 0.1430924665056007, + "grad_norm": 2.134359836578369, + "learning_rate": 7.145993413830955e-07, + "loss": 1.0301, + "mean_token_accuracy": 0.6981063485145569, + "num_tokens": 32795753.0, + "step": 1303 + }, + { + "epoch": 0.14320228420821438, + "grad_norm": 2.0946125984191895, + "learning_rate": 7.151481888035125e-07, + "loss": 0.9972, + "mean_token_accuracy": 0.702026903629303, + "num_tokens": 32824347.0, + "step": 1304 + }, + { + "epoch": 0.14331210191082802, + "grad_norm": 2.265745162963867, + "learning_rate": 7.156970362239297e-07, + "loss": 0.9856, + "mean_token_accuracy": 0.7062239646911621, + "num_tokens": 32848949.0, + "step": 1305 + }, + { + "epoch": 0.1434219196134417, + "grad_norm": 1.9882642030715942, + "learning_rate": 7.162458836443468e-07, + "loss": 1.0768, + "mean_token_accuracy": 0.6847758889198303, + "num_tokens": 32882427.0, + "step": 1306 + }, + { + "epoch": 0.14353173731605534, + "grad_norm": 2.275052547454834, + "learning_rate": 7.167947310647639e-07, + "loss": 0.975, + "mean_token_accuracy": 0.7278146743774414, + "num_tokens": 
32906396.0, + "step": 1307 + }, + { + "epoch": 0.14364155501866901, + "grad_norm": 1.9888217449188232, + "learning_rate": 7.173435784851811e-07, + "loss": 1.0967, + "mean_token_accuracy": 0.6792854070663452, + "num_tokens": 32937630.0, + "step": 1308 + }, + { + "epoch": 0.14375137272128266, + "grad_norm": 2.4347519874572754, + "learning_rate": 7.178924259055982e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6843504905700684, + "num_tokens": 32960818.0, + "step": 1309 + }, + { + "epoch": 0.14386119042389633, + "grad_norm": 2.2266671657562256, + "learning_rate": 7.184412733260153e-07, + "loss": 0.9261, + "mean_token_accuracy": 0.7174015641212463, + "num_tokens": 32988589.0, + "step": 1310 + }, + { + "epoch": 0.14397100812650998, + "grad_norm": 2.3098127841949463, + "learning_rate": 7.189901207464325e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.6876944303512573, + "num_tokens": 33014730.0, + "step": 1311 + }, + { + "epoch": 0.14408082582912365, + "grad_norm": 2.2721872329711914, + "learning_rate": 7.195389681668496e-07, + "loss": 1.0693, + "mean_token_accuracy": 0.6803593635559082, + "num_tokens": 33039529.0, + "step": 1312 + }, + { + "epoch": 0.14419064353173733, + "grad_norm": 2.6775131225585938, + "learning_rate": 7.200878155872667e-07, + "loss": 0.9279, + "mean_token_accuracy": 0.714279294013977, + "num_tokens": 33057158.0, + "step": 1313 + }, + { + "epoch": 0.14430046123435097, + "grad_norm": 2.4084906578063965, + "learning_rate": 7.206366630076838e-07, + "loss": 1.0536, + "mean_token_accuracy": 0.6849154233932495, + "num_tokens": 33080762.0, + "step": 1314 + }, + { + "epoch": 0.14441027893696465, + "grad_norm": 2.4906928539276123, + "learning_rate": 7.21185510428101e-07, + "loss": 1.0346, + "mean_token_accuracy": 0.6958746314048767, + "num_tokens": 33104622.0, + "step": 1315 + }, + { + "epoch": 0.1445200966395783, + "grad_norm": 2.043611764907837, + "learning_rate": 7.217343578485181e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7017035484313965, + 
"num_tokens": 33135093.0, + "step": 1316 + }, + { + "epoch": 0.14462991434219197, + "grad_norm": 2.178272247314453, + "learning_rate": 7.222832052689352e-07, + "loss": 1.0023, + "mean_token_accuracy": 0.7061833143234253, + "num_tokens": 33161889.0, + "step": 1317 + }, + { + "epoch": 0.1447397320448056, + "grad_norm": 2.3482604026794434, + "learning_rate": 7.228320526893524e-07, + "loss": 1.049, + "mean_token_accuracy": 0.696942925453186, + "num_tokens": 33186858.0, + "step": 1318 + }, + { + "epoch": 0.1448495497474193, + "grad_norm": 2.089491128921509, + "learning_rate": 7.233809001097695e-07, + "loss": 1.078, + "mean_token_accuracy": 0.6798348426818848, + "num_tokens": 33216520.0, + "step": 1319 + }, + { + "epoch": 0.14495936745003293, + "grad_norm": 2.412274122238159, + "learning_rate": 7.239297475301865e-07, + "loss": 1.141, + "mean_token_accuracy": 0.6637119054794312, + "num_tokens": 33240346.0, + "step": 1320 + }, + { + "epoch": 0.1450691851526466, + "grad_norm": 1.8506618738174438, + "learning_rate": 7.244785949506037e-07, + "loss": 1.1333, + "mean_token_accuracy": 0.6667666435241699, + "num_tokens": 33278059.0, + "step": 1321 + }, + { + "epoch": 0.14517900285526028, + "grad_norm": 2.521902084350586, + "learning_rate": 7.250274423710208e-07, + "loss": 1.1071, + "mean_token_accuracy": 0.6756629943847656, + "num_tokens": 33301297.0, + "step": 1322 + }, + { + "epoch": 0.14528882055787393, + "grad_norm": 2.236295223236084, + "learning_rate": 7.255762897914379e-07, + "loss": 1.0662, + "mean_token_accuracy": 0.6852676868438721, + "num_tokens": 33328089.0, + "step": 1323 + }, + { + "epoch": 0.1453986382604876, + "grad_norm": 2.0101633071899414, + "learning_rate": 7.261251372118551e-07, + "loss": 1.1172, + "mean_token_accuracy": 0.6795781254768372, + "num_tokens": 33361766.0, + "step": 1324 + }, + { + "epoch": 0.14550845596310125, + "grad_norm": 2.241321325302124, + "learning_rate": 7.266739846322721e-07, + "loss": 1.1096, + "mean_token_accuracy": 0.6671663522720337, 
+ "num_tokens": 33389508.0, + "step": 1325 + }, + { + "epoch": 0.14561827366571492, + "grad_norm": 2.3291704654693604, + "learning_rate": 7.272228320526893e-07, + "loss": 0.8966, + "mean_token_accuracy": 0.7267590761184692, + "num_tokens": 33413136.0, + "step": 1326 + }, + { + "epoch": 0.14572809136832857, + "grad_norm": 2.535600423812866, + "learning_rate": 7.277716794731065e-07, + "loss": 0.9779, + "mean_token_accuracy": 0.7039250731468201, + "num_tokens": 33434495.0, + "step": 1327 + }, + { + "epoch": 0.14583790907094224, + "grad_norm": 2.1818599700927734, + "learning_rate": 7.283205268935235e-07, + "loss": 1.013, + "mean_token_accuracy": 0.6967664361000061, + "num_tokens": 33461244.0, + "step": 1328 + }, + { + "epoch": 0.14594772677355589, + "grad_norm": 2.508103132247925, + "learning_rate": 7.288693743139407e-07, + "loss": 0.9979, + "mean_token_accuracy": 0.6955915093421936, + "num_tokens": 33483309.0, + "step": 1329 + }, + { + "epoch": 0.14605754447616956, + "grad_norm": 2.357189893722534, + "learning_rate": 7.294182217343579e-07, + "loss": 0.9962, + "mean_token_accuracy": 0.6963033676147461, + "num_tokens": 33507122.0, + "step": 1330 + }, + { + "epoch": 0.14616736217878323, + "grad_norm": 2.276374578475952, + "learning_rate": 7.299670691547749e-07, + "loss": 1.0854, + "mean_token_accuracy": 0.6844006776809692, + "num_tokens": 33531532.0, + "step": 1331 + }, + { + "epoch": 0.14627717988139688, + "grad_norm": 2.5110020637512207, + "learning_rate": 7.305159165751921e-07, + "loss": 1.0164, + "mean_token_accuracy": 0.6972020864486694, + "num_tokens": 33554212.0, + "step": 1332 + }, + { + "epoch": 0.14638699758401055, + "grad_norm": 2.3345043659210205, + "learning_rate": 7.310647639956093e-07, + "loss": 1.0961, + "mean_token_accuracy": 0.6761417388916016, + "num_tokens": 33581495.0, + "step": 1333 + }, + { + "epoch": 0.1464968152866242, + "grad_norm": 2.4277327060699463, + "learning_rate": 7.316136114160263e-07, + "loss": 1.062, + "mean_token_accuracy": 
0.6853793859481812, + "num_tokens": 33606026.0, + "step": 1334 + }, + { + "epoch": 0.14660663298923787, + "grad_norm": 2.148731231689453, + "learning_rate": 7.321624588364435e-07, + "loss": 0.992, + "mean_token_accuracy": 0.7041329741477966, + "num_tokens": 33633836.0, + "step": 1335 + }, + { + "epoch": 0.14671645069185152, + "grad_norm": 2.1938512325286865, + "learning_rate": 7.327113062568605e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.6675140857696533, + "num_tokens": 33662149.0, + "step": 1336 + }, + { + "epoch": 0.1468262683944652, + "grad_norm": 2.6513659954071045, + "learning_rate": 7.332601536772777e-07, + "loss": 1.1033, + "mean_token_accuracy": 0.6806584596633911, + "num_tokens": 33684522.0, + "step": 1337 + }, + { + "epoch": 0.14693608609707884, + "grad_norm": 2.2863121032714844, + "learning_rate": 7.338090010976949e-07, + "loss": 1.038, + "mean_token_accuracy": 0.6917024850845337, + "num_tokens": 33710531.0, + "step": 1338 + }, + { + "epoch": 0.1470459037996925, + "grad_norm": 2.3670883178710938, + "learning_rate": 7.343578485181118e-07, + "loss": 1.0882, + "mean_token_accuracy": 0.6775578260421753, + "num_tokens": 33735020.0, + "step": 1339 + }, + { + "epoch": 0.14715572150230616, + "grad_norm": 2.587850570678711, + "learning_rate": 7.34906695938529e-07, + "loss": 1.0711, + "mean_token_accuracy": 0.6784480214118958, + "num_tokens": 33754245.0, + "step": 1340 + }, + { + "epoch": 0.14726553920491983, + "grad_norm": 2.30283784866333, + "learning_rate": 7.354555433589462e-07, + "loss": 0.9307, + "mean_token_accuracy": 0.7133345603942871, + "num_tokens": 33776179.0, + "step": 1341 + }, + { + "epoch": 0.1473753569075335, + "grad_norm": 2.238913059234619, + "learning_rate": 7.360043907793632e-07, + "loss": 1.0519, + "mean_token_accuracy": 0.6836274862289429, + "num_tokens": 33802193.0, + "step": 1342 + }, + { + "epoch": 0.14748517461014715, + "grad_norm": 2.2291347980499268, + "learning_rate": 7.365532381997804e-07, + "loss": 1.0991, + 
"mean_token_accuracy": 0.6687393188476562, + "num_tokens": 33830415.0, + "step": 1343 + }, + { + "epoch": 0.14759499231276083, + "grad_norm": 2.4375898838043213, + "learning_rate": 7.371020856201976e-07, + "loss": 0.9303, + "mean_token_accuracy": 0.7104696035385132, + "num_tokens": 33852005.0, + "step": 1344 + }, + { + "epoch": 0.14770481001537447, + "grad_norm": 2.2568604946136475, + "learning_rate": 7.376509330406146e-07, + "loss": 1.0186, + "mean_token_accuracy": 0.6996515989303589, + "num_tokens": 33878776.0, + "step": 1345 + }, + { + "epoch": 0.14781462771798815, + "grad_norm": 2.408165693283081, + "learning_rate": 7.381997804610318e-07, + "loss": 1.0706, + "mean_token_accuracy": 0.690880537033081, + "num_tokens": 33901847.0, + "step": 1346 + }, + { + "epoch": 0.1479244454206018, + "grad_norm": 2.3951680660247803, + "learning_rate": 7.387486278814489e-07, + "loss": 1.0463, + "mean_token_accuracy": 0.6908508539199829, + "num_tokens": 33925289.0, + "step": 1347 + }, + { + "epoch": 0.14803426312321546, + "grad_norm": 2.3811535835266113, + "learning_rate": 7.39297475301866e-07, + "loss": 1.1043, + "mean_token_accuracy": 0.6722177267074585, + "num_tokens": 33949933.0, + "step": 1348 + }, + { + "epoch": 0.1481440808258291, + "grad_norm": 2.3400111198425293, + "learning_rate": 7.398463227222832e-07, + "loss": 1.0462, + "mean_token_accuracy": 0.6881300210952759, + "num_tokens": 33975198.0, + "step": 1349 + }, + { + "epoch": 0.14825389852844278, + "grad_norm": 2.5379157066345215, + "learning_rate": 7.403951701427003e-07, + "loss": 1.0154, + "mean_token_accuracy": 0.6929174065589905, + "num_tokens": 33997411.0, + "step": 1350 + }, + { + "epoch": 0.14836371623105646, + "grad_norm": 2.198068141937256, + "learning_rate": 7.409440175631174e-07, + "loss": 1.0978, + "mean_token_accuracy": 0.6778560280799866, + "num_tokens": 34025496.0, + "step": 1351 + }, + { + "epoch": 0.1484735339336701, + "grad_norm": 2.5687854290008545, + "learning_rate": 7.414928649835346e-07, + "loss": 
1.0027, + "mean_token_accuracy": 0.6955655813217163, + "num_tokens": 34045492.0, + "step": 1352 + }, + { + "epoch": 0.14858335163628378, + "grad_norm": 2.331805944442749, + "learning_rate": 7.420417124039517e-07, + "loss": 0.9826, + "mean_token_accuracy": 0.6990622282028198, + "num_tokens": 34069837.0, + "step": 1353 + }, + { + "epoch": 0.14869316933889742, + "grad_norm": 2.366109848022461, + "learning_rate": 7.425905598243688e-07, + "loss": 0.9728, + "mean_token_accuracy": 0.7042813897132874, + "num_tokens": 34091951.0, + "step": 1354 + }, + { + "epoch": 0.1488029870415111, + "grad_norm": 2.4451239109039307, + "learning_rate": 7.43139407244786e-07, + "loss": 1.0767, + "mean_token_accuracy": 0.6880520582199097, + "num_tokens": 34115189.0, + "step": 1355 + }, + { + "epoch": 0.14891280474412474, + "grad_norm": 2.3985724449157715, + "learning_rate": 7.436882546652031e-07, + "loss": 1.0057, + "mean_token_accuracy": 0.6978867650032043, + "num_tokens": 34139270.0, + "step": 1356 + }, + { + "epoch": 0.14902262244673842, + "grad_norm": 2.180330991744995, + "learning_rate": 7.442371020856202e-07, + "loss": 1.0337, + "mean_token_accuracy": 0.691195011138916, + "num_tokens": 34168085.0, + "step": 1357 + }, + { + "epoch": 0.14913244014935206, + "grad_norm": 2.544766426086426, + "learning_rate": 7.447859495060372e-07, + "loss": 1.0512, + "mean_token_accuracy": 0.6876026391983032, + "num_tokens": 34189050.0, + "step": 1358 + }, + { + "epoch": 0.14924225785196574, + "grad_norm": 2.128967046737671, + "learning_rate": 7.453347969264544e-07, + "loss": 0.9964, + "mean_token_accuracy": 0.6945736408233643, + "num_tokens": 34215583.0, + "step": 1359 + }, + { + "epoch": 0.1493520755545794, + "grad_norm": 2.3939499855041504, + "learning_rate": 7.458836443468715e-07, + "loss": 1.0359, + "mean_token_accuracy": 0.6947404742240906, + "num_tokens": 34240107.0, + "step": 1360 + }, + { + "epoch": 0.14946189325719306, + "grad_norm": 2.2366368770599365, + "learning_rate": 7.464324917672886e-07, + 
"loss": 1.0466, + "mean_token_accuracy": 0.6835578680038452, + "num_tokens": 34268634.0, + "step": 1361 + }, + { + "epoch": 0.14957171095980673, + "grad_norm": 2.3490145206451416, + "learning_rate": 7.469813391877058e-07, + "loss": 1.072, + "mean_token_accuracy": 0.6787263751029968, + "num_tokens": 34294787.0, + "step": 1362 + }, + { + "epoch": 0.14968152866242038, + "grad_norm": 2.0980653762817383, + "learning_rate": 7.475301866081229e-07, + "loss": 1.0933, + "mean_token_accuracy": 0.6751233339309692, + "num_tokens": 34324561.0, + "step": 1363 + }, + { + "epoch": 0.14979134636503405, + "grad_norm": 2.1349356174468994, + "learning_rate": 7.4807903402854e-07, + "loss": 0.9872, + "mean_token_accuracy": 0.7030878067016602, + "num_tokens": 34353422.0, + "step": 1364 + }, + { + "epoch": 0.1499011640676477, + "grad_norm": 2.0664050579071045, + "learning_rate": 7.486278814489572e-07, + "loss": 1.0514, + "mean_token_accuracy": 0.6836364269256592, + "num_tokens": 34383961.0, + "step": 1365 + }, + { + "epoch": 0.15001098177026137, + "grad_norm": 2.185267210006714, + "learning_rate": 7.491767288693743e-07, + "loss": 1.1188, + "mean_token_accuracy": 0.6793571710586548, + "num_tokens": 34413113.0, + "step": 1366 + }, + { + "epoch": 0.15012079947287502, + "grad_norm": 2.445864200592041, + "learning_rate": 7.497255762897914e-07, + "loss": 0.9765, + "mean_token_accuracy": 0.7046799659729004, + "num_tokens": 34436137.0, + "step": 1367 + }, + { + "epoch": 0.1502306171754887, + "grad_norm": 2.1955864429473877, + "learning_rate": 7.502744237102086e-07, + "loss": 1.0184, + "mean_token_accuracy": 0.6983675360679626, + "num_tokens": 34463942.0, + "step": 1368 + }, + { + "epoch": 0.15034043487810236, + "grad_norm": 2.410149335861206, + "learning_rate": 7.508232711306256e-07, + "loss": 1.0504, + "mean_token_accuracy": 0.6843950748443604, + "num_tokens": 34487358.0, + "step": 1369 + }, + { + "epoch": 0.150450252580716, + "grad_norm": 2.3682501316070557, + "learning_rate": 
7.513721185510428e-07, + "loss": 1.0717, + "mean_token_accuracy": 0.6790552139282227, + "num_tokens": 34514855.0, + "step": 1370 + }, + { + "epoch": 0.15056007028332968, + "grad_norm": 2.337678909301758, + "learning_rate": 7.5192096597146e-07, + "loss": 1.0067, + "mean_token_accuracy": 0.7029684782028198, + "num_tokens": 34539472.0, + "step": 1371 + }, + { + "epoch": 0.15066988798594333, + "grad_norm": 2.6297760009765625, + "learning_rate": 7.52469813391877e-07, + "loss": 0.9933, + "mean_token_accuracy": 0.7103740572929382, + "num_tokens": 34558932.0, + "step": 1372 + }, + { + "epoch": 0.150779705688557, + "grad_norm": 2.238193988800049, + "learning_rate": 7.530186608122942e-07, + "loss": 1.1528, + "mean_token_accuracy": 0.6656259298324585, + "num_tokens": 34587522.0, + "step": 1373 + }, + { + "epoch": 0.15088952339117065, + "grad_norm": 2.674262523651123, + "learning_rate": 7.535675082327113e-07, + "loss": 1.0377, + "mean_token_accuracy": 0.6883845925331116, + "num_tokens": 34608632.0, + "step": 1374 + }, + { + "epoch": 0.15099934109378432, + "grad_norm": 2.441204786300659, + "learning_rate": 7.541163556531284e-07, + "loss": 0.9992, + "mean_token_accuracy": 0.70115727186203, + "num_tokens": 34631363.0, + "step": 1375 + }, + { + "epoch": 0.15110915879639797, + "grad_norm": 2.4561879634857178, + "learning_rate": 7.546652030735456e-07, + "loss": 1.1153, + "mean_token_accuracy": 0.6856068968772888, + "num_tokens": 34655511.0, + "step": 1376 + }, + { + "epoch": 0.15121897649901164, + "grad_norm": 2.3693652153015137, + "learning_rate": 7.552140504939627e-07, + "loss": 0.9418, + "mean_token_accuracy": 0.7086434364318848, + "num_tokens": 34677616.0, + "step": 1377 + }, + { + "epoch": 0.1513287942016253, + "grad_norm": 2.2015743255615234, + "learning_rate": 7.557628979143797e-07, + "loss": 1.0827, + "mean_token_accuracy": 0.6834437847137451, + "num_tokens": 34703815.0, + "step": 1378 + }, + { + "epoch": 0.15143861190423896, + "grad_norm": 2.4094154834747314, + 
"learning_rate": 7.563117453347969e-07, + "loss": 1.0716, + "mean_token_accuracy": 0.6837234497070312, + "num_tokens": 34727103.0, + "step": 1379 + }, + { + "epoch": 0.15154842960685264, + "grad_norm": 2.108783721923828, + "learning_rate": 7.568605927552139e-07, + "loss": 1.0524, + "mean_token_accuracy": 0.6855013370513916, + "num_tokens": 34754727.0, + "step": 1380 + }, + { + "epoch": 0.15165824730946628, + "grad_norm": 2.4528727531433105, + "learning_rate": 7.574094401756311e-07, + "loss": 0.9732, + "mean_token_accuracy": 0.7060461044311523, + "num_tokens": 34778483.0, + "step": 1381 + }, + { + "epoch": 0.15176806501207996, + "grad_norm": 2.404557466506958, + "learning_rate": 7.579582875960483e-07, + "loss": 1.1342, + "mean_token_accuracy": 0.6601020693778992, + "num_tokens": 34801774.0, + "step": 1382 + }, + { + "epoch": 0.1518778827146936, + "grad_norm": 2.2931599617004395, + "learning_rate": 7.585071350164653e-07, + "loss": 1.0808, + "mean_token_accuracy": 0.6778083443641663, + "num_tokens": 34829566.0, + "step": 1383 + }, + { + "epoch": 0.15198770041730728, + "grad_norm": 2.195823907852173, + "learning_rate": 7.590559824368825e-07, + "loss": 0.9675, + "mean_token_accuracy": 0.7097061276435852, + "num_tokens": 34854096.0, + "step": 1384 + }, + { + "epoch": 0.15209751811992092, + "grad_norm": 2.1574368476867676, + "learning_rate": 7.596048298572997e-07, + "loss": 1.1153, + "mean_token_accuracy": 0.6680634021759033, + "num_tokens": 34883703.0, + "step": 1385 + }, + { + "epoch": 0.1522073358225346, + "grad_norm": 2.488463878631592, + "learning_rate": 7.601536772777167e-07, + "loss": 1.0096, + "mean_token_accuracy": 0.6949572563171387, + "num_tokens": 34907699.0, + "step": 1386 + }, + { + "epoch": 0.15231715352514824, + "grad_norm": 2.019801616668701, + "learning_rate": 7.607025246981339e-07, + "loss": 1.0874, + "mean_token_accuracy": 0.6782500147819519, + "num_tokens": 34938918.0, + "step": 1387 + }, + { + "epoch": 0.15242697122776191, + "grad_norm": 
2.4423649311065674, + "learning_rate": 7.612513721185511e-07, + "loss": 1.0286, + "mean_token_accuracy": 0.7088346481323242, + "num_tokens": 34960574.0, + "step": 1388 + }, + { + "epoch": 0.1525367889303756, + "grad_norm": 2.5140957832336426, + "learning_rate": 7.618002195389681e-07, + "loss": 1.0652, + "mean_token_accuracy": 0.6870326995849609, + "num_tokens": 34984521.0, + "step": 1389 + }, + { + "epoch": 0.15264660663298923, + "grad_norm": 2.2970125675201416, + "learning_rate": 7.623490669593853e-07, + "loss": 1.0548, + "mean_token_accuracy": 0.6868065595626831, + "num_tokens": 35012369.0, + "step": 1390 + }, + { + "epoch": 0.1527564243356029, + "grad_norm": 2.458467960357666, + "learning_rate": 7.628979143798024e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.6928580403327942, + "num_tokens": 35035878.0, + "step": 1391 + }, + { + "epoch": 0.15286624203821655, + "grad_norm": 2.3701202869415283, + "learning_rate": 7.634467618002195e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6874967813491821, + "num_tokens": 35061534.0, + "step": 1392 + }, + { + "epoch": 0.15297605974083023, + "grad_norm": 2.4433753490448, + "learning_rate": 7.639956092206367e-07, + "loss": 1.0672, + "mean_token_accuracy": 0.6827800273895264, + "num_tokens": 35084998.0, + "step": 1393 + }, + { + "epoch": 0.15308587744344387, + "grad_norm": 2.4280803203582764, + "learning_rate": 7.645444566410538e-07, + "loss": 1.1263, + "mean_token_accuracy": 0.6654491424560547, + "num_tokens": 35109435.0, + "step": 1394 + }, + { + "epoch": 0.15319569514605755, + "grad_norm": 2.270047903060913, + "learning_rate": 7.650933040614709e-07, + "loss": 0.9524, + "mean_token_accuracy": 0.7228835821151733, + "num_tokens": 35134320.0, + "step": 1395 + }, + { + "epoch": 0.1533055128486712, + "grad_norm": 2.146667003631592, + "learning_rate": 7.656421514818881e-07, + "loss": 1.0492, + "mean_token_accuracy": 0.6893239617347717, + "num_tokens": 35163096.0, + "step": 1396 + }, + { + "epoch": 0.15341533055128487, + 
"grad_norm": 2.35102915763855, + "learning_rate": 7.66190998902305e-07, + "loss": 1.0242, + "mean_token_accuracy": 0.7002135515213013, + "num_tokens": 35189217.0, + "step": 1397 + }, + { + "epoch": 0.15352514825389854, + "grad_norm": 2.0050880908966064, + "learning_rate": 7.667398463227222e-07, + "loss": 1.0391, + "mean_token_accuracy": 0.6870787143707275, + "num_tokens": 35220635.0, + "step": 1398 + }, + { + "epoch": 0.1536349659565122, + "grad_norm": 2.2199769020080566, + "learning_rate": 7.672886937431394e-07, + "loss": 1.1345, + "mean_token_accuracy": 0.6624047756195068, + "num_tokens": 35248965.0, + "step": 1399 + }, + { + "epoch": 0.15374478365912586, + "grad_norm": 2.52897572517395, + "learning_rate": 7.678375411635564e-07, + "loss": 1.043, + "mean_token_accuracy": 0.6894803643226624, + "num_tokens": 35270995.0, + "step": 1400 + }, + { + "epoch": 0.1538546013617395, + "grad_norm": 2.457124948501587, + "learning_rate": 7.683863885839736e-07, + "loss": 1.0068, + "mean_token_accuracy": 0.7003116011619568, + "num_tokens": 35292953.0, + "step": 1401 + }, + { + "epoch": 0.15396441906435318, + "grad_norm": 2.0999755859375, + "learning_rate": 7.689352360043907e-07, + "loss": 1.0462, + "mean_token_accuracy": 0.684709370136261, + "num_tokens": 35321794.0, + "step": 1402 + }, + { + "epoch": 0.15407423676696683, + "grad_norm": 2.271263599395752, + "learning_rate": 7.694840834248078e-07, + "loss": 1.0985, + "mean_token_accuracy": 0.6858236193656921, + "num_tokens": 35347314.0, + "step": 1403 + }, + { + "epoch": 0.1541840544695805, + "grad_norm": 2.4549200534820557, + "learning_rate": 7.70032930845225e-07, + "loss": 1.1205, + "mean_token_accuracy": 0.6678794622421265, + "num_tokens": 35369889.0, + "step": 1404 + }, + { + "epoch": 0.15429387217219415, + "grad_norm": 2.1776115894317627, + "learning_rate": 7.705817782656421e-07, + "loss": 0.9668, + "mean_token_accuracy": 0.7069455981254578, + "num_tokens": 35396019.0, + "step": 1405 + }, + { + "epoch": 0.15440368987480782, + 
"grad_norm": 2.4986729621887207, + "learning_rate": 7.711306256860592e-07, + "loss": 1.05, + "mean_token_accuracy": 0.6870603561401367, + "num_tokens": 35417648.0, + "step": 1406 + }, + { + "epoch": 0.1545135075774215, + "grad_norm": 2.750579357147217, + "learning_rate": 7.716794731064764e-07, + "loss": 1.041, + "mean_token_accuracy": 0.6859967708587646, + "num_tokens": 35438798.0, + "step": 1407 + }, + { + "epoch": 0.15462332528003514, + "grad_norm": 2.5904014110565186, + "learning_rate": 7.722283205268935e-07, + "loss": 1.0211, + "mean_token_accuracy": 0.6978399753570557, + "num_tokens": 35460450.0, + "step": 1408 + }, + { + "epoch": 0.1547331429826488, + "grad_norm": 2.3888986110687256, + "learning_rate": 7.727771679473106e-07, + "loss": 1.0388, + "mean_token_accuracy": 0.6885098814964294, + "num_tokens": 35484770.0, + "step": 1409 + }, + { + "epoch": 0.15484296068526246, + "grad_norm": 2.2913341522216797, + "learning_rate": 7.733260153677278e-07, + "loss": 1.1238, + "mean_token_accuracy": 0.6776375770568848, + "num_tokens": 35510993.0, + "step": 1410 + }, + { + "epoch": 0.15495277838787613, + "grad_norm": 2.3805580139160156, + "learning_rate": 7.738748627881449e-07, + "loss": 1.0735, + "mean_token_accuracy": 0.6885654330253601, + "num_tokens": 35535543.0, + "step": 1411 + }, + { + "epoch": 0.15506259609048978, + "grad_norm": 2.2488391399383545, + "learning_rate": 7.74423710208562e-07, + "loss": 1.0258, + "mean_token_accuracy": 0.7030251026153564, + "num_tokens": 35561632.0, + "step": 1412 + }, + { + "epoch": 0.15517241379310345, + "grad_norm": 2.1955161094665527, + "learning_rate": 7.749725576289791e-07, + "loss": 1.0624, + "mean_token_accuracy": 0.6782138347625732, + "num_tokens": 35587952.0, + "step": 1413 + }, + { + "epoch": 0.1552822314957171, + "grad_norm": 2.4283971786499023, + "learning_rate": 7.755214050493963e-07, + "loss": 1.0486, + "mean_token_accuracy": 0.6881304979324341, + "num_tokens": 35609821.0, + "step": 1414 + }, + { + "epoch": 
0.15539204919833077, + "grad_norm": 2.1584994792938232, + "learning_rate": 7.760702524698134e-07, + "loss": 0.9429, + "mean_token_accuracy": 0.7122955322265625, + "num_tokens": 35634582.0, + "step": 1415 + }, + { + "epoch": 0.15550186690094442, + "grad_norm": 2.2594926357269287, + "learning_rate": 7.766190998902304e-07, + "loss": 1.004, + "mean_token_accuracy": 0.704609751701355, + "num_tokens": 35659877.0, + "step": 1416 + }, + { + "epoch": 0.1556116846035581, + "grad_norm": 2.2673282623291016, + "learning_rate": 7.771679473106476e-07, + "loss": 0.9761, + "mean_token_accuracy": 0.709669828414917, + "num_tokens": 35683568.0, + "step": 1417 + }, + { + "epoch": 0.15572150230617177, + "grad_norm": 2.2884132862091064, + "learning_rate": 7.777167947310647e-07, + "loss": 1.0842, + "mean_token_accuracy": 0.687865138053894, + "num_tokens": 35709819.0, + "step": 1418 + }, + { + "epoch": 0.1558313200087854, + "grad_norm": 2.39501690864563, + "learning_rate": 7.782656421514818e-07, + "loss": 1.0001, + "mean_token_accuracy": 0.7057916522026062, + "num_tokens": 35732186.0, + "step": 1419 + }, + { + "epoch": 0.15594113771139909, + "grad_norm": 2.0291249752044678, + "learning_rate": 7.78814489571899e-07, + "loss": 1.1108, + "mean_token_accuracy": 0.6731569170951843, + "num_tokens": 35766893.0, + "step": 1420 + }, + { + "epoch": 0.15605095541401273, + "grad_norm": 2.1943397521972656, + "learning_rate": 7.793633369923161e-07, + "loss": 1.0757, + "mean_token_accuracy": 0.6794184446334839, + "num_tokens": 35796396.0, + "step": 1421 + }, + { + "epoch": 0.1561607731166264, + "grad_norm": 2.546156167984009, + "learning_rate": 7.799121844127332e-07, + "loss": 1.1316, + "mean_token_accuracy": 0.6658259034156799, + "num_tokens": 35819417.0, + "step": 1422 + }, + { + "epoch": 0.15627059081924005, + "grad_norm": 2.1877317428588867, + "learning_rate": 7.804610318331504e-07, + "loss": 1.1372, + "mean_token_accuracy": 0.6620248556137085, + "num_tokens": 35849314.0, + "step": 1423 + }, + { + 
"epoch": 0.15638040852185373, + "grad_norm": 2.4686670303344727, + "learning_rate": 7.810098792535674e-07, + "loss": 1.0171, + "mean_token_accuracy": 0.6942978501319885, + "num_tokens": 35872864.0, + "step": 1424 + }, + { + "epoch": 0.15649022622446737, + "grad_norm": 2.3738067150115967, + "learning_rate": 7.815587266739846e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6966773271560669, + "num_tokens": 35896982.0, + "step": 1425 + }, + { + "epoch": 0.15660004392708105, + "grad_norm": 2.32277774810791, + "learning_rate": 7.821075740944018e-07, + "loss": 1.0372, + "mean_token_accuracy": 0.6944177150726318, + "num_tokens": 35921212.0, + "step": 1426 + }, + { + "epoch": 0.15670986162969472, + "grad_norm": 2.053898572921753, + "learning_rate": 7.826564215148188e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6806981563568115, + "num_tokens": 35953187.0, + "step": 1427 + }, + { + "epoch": 0.15681967933230836, + "grad_norm": 2.3897273540496826, + "learning_rate": 7.83205268935236e-07, + "loss": 1.0902, + "mean_token_accuracy": 0.6803723573684692, + "num_tokens": 35976715.0, + "step": 1428 + }, + { + "epoch": 0.15692949703492204, + "grad_norm": 2.174727439880371, + "learning_rate": 7.837541163556532e-07, + "loss": 1.0071, + "mean_token_accuracy": 0.6981343030929565, + "num_tokens": 36004202.0, + "step": 1429 + }, + { + "epoch": 0.15703931473753568, + "grad_norm": 2.177177906036377, + "learning_rate": 7.843029637760702e-07, + "loss": 0.9733, + "mean_token_accuracy": 0.7189179062843323, + "num_tokens": 36030475.0, + "step": 1430 + }, + { + "epoch": 0.15714913244014936, + "grad_norm": 2.4272797107696533, + "learning_rate": 7.848518111964874e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7173193693161011, + "num_tokens": 36051382.0, + "step": 1431 + }, + { + "epoch": 0.157258950142763, + "grad_norm": 2.5477404594421387, + "learning_rate": 7.854006586169045e-07, + "loss": 1.0442, + "mean_token_accuracy": 0.6863096952438354, + "num_tokens": 36072155.0, + "step": 1432 + 
}, + { + "epoch": 0.15736876784537668, + "grad_norm": 2.6025946140289307, + "learning_rate": 7.859495060373216e-07, + "loss": 1.1117, + "mean_token_accuracy": 0.6736195087432861, + "num_tokens": 36096682.0, + "step": 1433 + }, + { + "epoch": 0.15747858554799032, + "grad_norm": 2.261437177658081, + "learning_rate": 7.864983534577388e-07, + "loss": 0.969, + "mean_token_accuracy": 0.7078895568847656, + "num_tokens": 36121989.0, + "step": 1434 + }, + { + "epoch": 0.157588403250604, + "grad_norm": 2.535767078399658, + "learning_rate": 7.870472008781557e-07, + "loss": 1.0329, + "mean_token_accuracy": 0.6972866058349609, + "num_tokens": 36146040.0, + "step": 1435 + }, + { + "epoch": 0.15769822095321767, + "grad_norm": 2.339907646179199, + "learning_rate": 7.875960482985729e-07, + "loss": 0.9427, + "mean_token_accuracy": 0.7200837135314941, + "num_tokens": 36169764.0, + "step": 1436 + }, + { + "epoch": 0.15780803865583132, + "grad_norm": 2.218914270401001, + "learning_rate": 7.881448957189901e-07, + "loss": 1.0946, + "mean_token_accuracy": 0.6710537672042847, + "num_tokens": 36198184.0, + "step": 1437 + }, + { + "epoch": 0.157917856358445, + "grad_norm": 2.1956288814544678, + "learning_rate": 7.886937431394071e-07, + "loss": 1.0825, + "mean_token_accuracy": 0.6737610101699829, + "num_tokens": 36227117.0, + "step": 1438 + }, + { + "epoch": 0.15802767406105864, + "grad_norm": 2.3622653484344482, + "learning_rate": 7.892425905598243e-07, + "loss": 0.9912, + "mean_token_accuracy": 0.6982098817825317, + "num_tokens": 36250863.0, + "step": 1439 + }, + { + "epoch": 0.1581374917636723, + "grad_norm": 2.4948010444641113, + "learning_rate": 7.897914379802415e-07, + "loss": 0.9389, + "mean_token_accuracy": 0.7141672968864441, + "num_tokens": 36274183.0, + "step": 1440 + }, + { + "epoch": 0.15824730946628596, + "grad_norm": 2.3584625720977783, + "learning_rate": 7.903402854006585e-07, + "loss": 0.9803, + "mean_token_accuracy": 0.7059836983680725, + "num_tokens": 36298298.0, + "step": 
1441 + }, + { + "epoch": 0.15835712716889963, + "grad_norm": 2.530271291732788, + "learning_rate": 7.908891328210757e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.6879582405090332, + "num_tokens": 36322074.0, + "step": 1442 + }, + { + "epoch": 0.15846694487151328, + "grad_norm": 2.538677215576172, + "learning_rate": 7.914379802414928e-07, + "loss": 0.9199, + "mean_token_accuracy": 0.7183574438095093, + "num_tokens": 36343712.0, + "step": 1443 + }, + { + "epoch": 0.15857676257412695, + "grad_norm": 2.1037116050720215, + "learning_rate": 7.919868276619099e-07, + "loss": 0.9429, + "mean_token_accuracy": 0.710242748260498, + "num_tokens": 36372857.0, + "step": 1444 + }, + { + "epoch": 0.15868658027674062, + "grad_norm": 2.2819223403930664, + "learning_rate": 7.925356750823271e-07, + "loss": 1.0412, + "mean_token_accuracy": 0.6969594955444336, + "num_tokens": 36397220.0, + "step": 1445 + }, + { + "epoch": 0.15879639797935427, + "grad_norm": 1.9236485958099365, + "learning_rate": 7.930845225027442e-07, + "loss": 0.9815, + "mean_token_accuracy": 0.6996583342552185, + "num_tokens": 36428272.0, + "step": 1446 + }, + { + "epoch": 0.15890621568196794, + "grad_norm": 2.218705892562866, + "learning_rate": 7.936333699231613e-07, + "loss": 1.0947, + "mean_token_accuracy": 0.6749618053436279, + "num_tokens": 36456053.0, + "step": 1447 + }, + { + "epoch": 0.1590160333845816, + "grad_norm": 2.5348057746887207, + "learning_rate": 7.941822173435785e-07, + "loss": 0.9983, + "mean_token_accuracy": 0.7008190751075745, + "num_tokens": 36478012.0, + "step": 1448 + }, + { + "epoch": 0.15912585108719526, + "grad_norm": 2.338566541671753, + "learning_rate": 7.947310647639956e-07, + "loss": 1.0467, + "mean_token_accuracy": 0.6891171932220459, + "num_tokens": 36502243.0, + "step": 1449 + }, + { + "epoch": 0.1592356687898089, + "grad_norm": 2.2888524532318115, + "learning_rate": 7.952799121844127e-07, + "loss": 0.9664, + "mean_token_accuracy": 0.7085438966751099, + "num_tokens": 36526824.0, 
+ "step": 1450 + }, + { + "epoch": 0.15934548649242258, + "grad_norm": 2.437925100326538, + "learning_rate": 7.958287596048299e-07, + "loss": 1.03, + "mean_token_accuracy": 0.6868795156478882, + "num_tokens": 36549136.0, + "step": 1451 + }, + { + "epoch": 0.15945530419503623, + "grad_norm": 2.104757785797119, + "learning_rate": 7.96377607025247e-07, + "loss": 0.9656, + "mean_token_accuracy": 0.7106059789657593, + "num_tokens": 36577375.0, + "step": 1452 + }, + { + "epoch": 0.1595651218976499, + "grad_norm": 2.2844722270965576, + "learning_rate": 7.969264544456641e-07, + "loss": 1.0567, + "mean_token_accuracy": 0.6894733905792236, + "num_tokens": 36600926.0, + "step": 1453 + }, + { + "epoch": 0.15967493960026355, + "grad_norm": 2.6173555850982666, + "learning_rate": 7.974753018660811e-07, + "loss": 1.1294, + "mean_token_accuracy": 0.664783775806427, + "num_tokens": 36623745.0, + "step": 1454 + }, + { + "epoch": 0.15978475730287722, + "grad_norm": 2.1013381481170654, + "learning_rate": 7.980241492864983e-07, + "loss": 1.0507, + "mean_token_accuracy": 0.6928362250328064, + "num_tokens": 36651806.0, + "step": 1455 + }, + { + "epoch": 0.1598945750054909, + "grad_norm": 2.3071627616882324, + "learning_rate": 7.985729967069154e-07, + "loss": 1.0317, + "mean_token_accuracy": 0.6877004504203796, + "num_tokens": 36679732.0, + "step": 1456 + }, + { + "epoch": 0.16000439270810454, + "grad_norm": 2.531834363937378, + "learning_rate": 7.991218441273325e-07, + "loss": 1.004, + "mean_token_accuracy": 0.6933279633522034, + "num_tokens": 36702501.0, + "step": 1457 + }, + { + "epoch": 0.16011421041071822, + "grad_norm": 2.553989887237549, + "learning_rate": 7.996706915477497e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.7015655040740967, + "num_tokens": 36723545.0, + "step": 1458 + }, + { + "epoch": 0.16022402811333186, + "grad_norm": 2.204103469848633, + "learning_rate": 8.002195389681668e-07, + "loss": 1.0121, + "mean_token_accuracy": 0.6966826915740967, + "num_tokens": 
36751701.0, + "step": 1459 + }, + { + "epoch": 0.16033384581594554, + "grad_norm": 2.3304924964904785, + "learning_rate": 8.007683863885839e-07, + "loss": 1.0699, + "mean_token_accuracy": 0.688819169998169, + "num_tokens": 36777794.0, + "step": 1460 + }, + { + "epoch": 0.16044366351855918, + "grad_norm": 2.166259288787842, + "learning_rate": 8.013172338090011e-07, + "loss": 0.9811, + "mean_token_accuracy": 0.7092448472976685, + "num_tokens": 36804562.0, + "step": 1461 + }, + { + "epoch": 0.16055348122117286, + "grad_norm": 2.539764881134033, + "learning_rate": 8.018660812294182e-07, + "loss": 1.0964, + "mean_token_accuracy": 0.6831907629966736, + "num_tokens": 36826761.0, + "step": 1462 + }, + { + "epoch": 0.1606632989237865, + "grad_norm": 2.183666467666626, + "learning_rate": 8.024149286498353e-07, + "loss": 0.9905, + "mean_token_accuracy": 0.7003799676895142, + "num_tokens": 36855448.0, + "step": 1463 + }, + { + "epoch": 0.16077311662640018, + "grad_norm": 2.2307138442993164, + "learning_rate": 8.029637760702525e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6934924125671387, + "num_tokens": 36882287.0, + "step": 1464 + }, + { + "epoch": 0.16088293432901385, + "grad_norm": 2.1852471828460693, + "learning_rate": 8.035126234906695e-07, + "loss": 1.035, + "mean_token_accuracy": 0.6865507364273071, + "num_tokens": 36910206.0, + "step": 1465 + }, + { + "epoch": 0.1609927520316275, + "grad_norm": 2.1335673332214355, + "learning_rate": 8.040614709110867e-07, + "loss": 0.9982, + "mean_token_accuracy": 0.7082875967025757, + "num_tokens": 36939129.0, + "step": 1466 + }, + { + "epoch": 0.16110256973424117, + "grad_norm": 2.3002803325653076, + "learning_rate": 8.046103183315039e-07, + "loss": 0.9663, + "mean_token_accuracy": 0.7071545720100403, + "num_tokens": 36965031.0, + "step": 1467 + }, + { + "epoch": 0.16121238743685481, + "grad_norm": 2.3345751762390137, + "learning_rate": 8.051591657519209e-07, + "loss": 0.9924, + "mean_token_accuracy": 0.7013881206512451, + 
"num_tokens": 36989319.0, + "step": 1468 + }, + { + "epoch": 0.1613222051394685, + "grad_norm": 2.2270405292510986, + "learning_rate": 8.057080131723381e-07, + "loss": 1.1439, + "mean_token_accuracy": 0.6582419872283936, + "num_tokens": 37018781.0, + "step": 1469 + }, + { + "epoch": 0.16143202284208213, + "grad_norm": 2.448970079421997, + "learning_rate": 8.062568605927553e-07, + "loss": 1.0088, + "mean_token_accuracy": 0.6967966556549072, + "num_tokens": 37041281.0, + "step": 1470 + }, + { + "epoch": 0.1615418405446958, + "grad_norm": 2.4777402877807617, + "learning_rate": 8.068057080131723e-07, + "loss": 0.982, + "mean_token_accuracy": 0.7031992673873901, + "num_tokens": 37062195.0, + "step": 1471 + }, + { + "epoch": 0.16165165824730945, + "grad_norm": 2.544407844543457, + "learning_rate": 8.073545554335894e-07, + "loss": 0.9936, + "mean_token_accuracy": 0.7033355236053467, + "num_tokens": 37082678.0, + "step": 1472 + }, + { + "epoch": 0.16176147594992313, + "grad_norm": 2.1783676147460938, + "learning_rate": 8.079034028540066e-07, + "loss": 1.1223, + "mean_token_accuracy": 0.6673139929771423, + "num_tokens": 37113916.0, + "step": 1473 + }, + { + "epoch": 0.1618712936525368, + "grad_norm": 2.28197979927063, + "learning_rate": 8.084522502744236e-07, + "loss": 0.9946, + "mean_token_accuracy": 0.6999179124832153, + "num_tokens": 37142048.0, + "step": 1474 + }, + { + "epoch": 0.16198111135515045, + "grad_norm": 2.0631887912750244, + "learning_rate": 8.090010976948408e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6793774366378784, + "num_tokens": 37173394.0, + "step": 1475 + }, + { + "epoch": 0.16209092905776412, + "grad_norm": 2.3865883350372314, + "learning_rate": 8.095499451152578e-07, + "loss": 1.1089, + "mean_token_accuracy": 0.6745956540107727, + "num_tokens": 37197426.0, + "step": 1476 + }, + { + "epoch": 0.16220074676037777, + "grad_norm": 2.7158284187316895, + "learning_rate": 8.10098792535675e-07, + "loss": 0.9657, + "mean_token_accuracy": 
0.7080771923065186, + "num_tokens": 37216778.0, + "step": 1477 + }, + { + "epoch": 0.16231056446299144, + "grad_norm": 2.6787822246551514, + "learning_rate": 8.106476399560922e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.6927733421325684, + "num_tokens": 37237245.0, + "step": 1478 + }, + { + "epoch": 0.1624203821656051, + "grad_norm": 2.2955806255340576, + "learning_rate": 8.111964873765092e-07, + "loss": 1.0268, + "mean_token_accuracy": 0.6934144496917725, + "num_tokens": 37261921.0, + "step": 1479 + }, + { + "epoch": 0.16253019986821876, + "grad_norm": 2.6954028606414795, + "learning_rate": 8.117453347969264e-07, + "loss": 0.9545, + "mean_token_accuracy": 0.7099308371543884, + "num_tokens": 37281767.0, + "step": 1480 + }, + { + "epoch": 0.1626400175708324, + "grad_norm": 2.3589353561401367, + "learning_rate": 8.122941822173436e-07, + "loss": 1.0083, + "mean_token_accuracy": 0.6945200562477112, + "num_tokens": 37305803.0, + "step": 1481 + }, + { + "epoch": 0.16274983527344608, + "grad_norm": 2.091693639755249, + "learning_rate": 8.128430296377606e-07, + "loss": 1.1103, + "mean_token_accuracy": 0.6684775948524475, + "num_tokens": 37338150.0, + "step": 1482 + }, + { + "epoch": 0.16285965297605975, + "grad_norm": 2.2507100105285645, + "learning_rate": 8.133918770581778e-07, + "loss": 0.9614, + "mean_token_accuracy": 0.7088044285774231, + "num_tokens": 37364491.0, + "step": 1483 + }, + { + "epoch": 0.1629694706786734, + "grad_norm": 2.3084373474121094, + "learning_rate": 8.13940724478595e-07, + "loss": 0.9869, + "mean_token_accuracy": 0.699738621711731, + "num_tokens": 37389428.0, + "step": 1484 + }, + { + "epoch": 0.16307928838128707, + "grad_norm": 2.42425799369812, + "learning_rate": 8.14489571899012e-07, + "loss": 1.0462, + "mean_token_accuracy": 0.6924953460693359, + "num_tokens": 37412881.0, + "step": 1485 + }, + { + "epoch": 0.16318910608390072, + "grad_norm": 2.4293932914733887, + "learning_rate": 8.150384193194292e-07, + "loss": 1.0964, + 
"mean_token_accuracy": 0.6827256679534912, + "num_tokens": 37437380.0, + "step": 1486 + }, + { + "epoch": 0.1632989237865144, + "grad_norm": 2.2330379486083984, + "learning_rate": 8.155872667398463e-07, + "loss": 1.1356, + "mean_token_accuracy": 0.6811542510986328, + "num_tokens": 37464447.0, + "step": 1487 + }, + { + "epoch": 0.16340874148912804, + "grad_norm": 2.9543890953063965, + "learning_rate": 8.161361141602634e-07, + "loss": 0.8688, + "mean_token_accuracy": 0.732179582118988, + "num_tokens": 37481365.0, + "step": 1488 + }, + { + "epoch": 0.1635185591917417, + "grad_norm": 2.468417167663574, + "learning_rate": 8.166849615806806e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6818228363990784, + "num_tokens": 37505950.0, + "step": 1489 + }, + { + "epoch": 0.16362837689435536, + "grad_norm": 2.244245767593384, + "learning_rate": 8.172338090010977e-07, + "loss": 1.0968, + "mean_token_accuracy": 0.6786543130874634, + "num_tokens": 37536882.0, + "step": 1490 + }, + { + "epoch": 0.16373819459696903, + "grad_norm": 2.5080726146698, + "learning_rate": 8.177826564215147e-07, + "loss": 1.0973, + "mean_token_accuracy": 0.6862996220588684, + "num_tokens": 37558732.0, + "step": 1491 + }, + { + "epoch": 0.16384801229958268, + "grad_norm": 2.2192471027374268, + "learning_rate": 8.183315038419319e-07, + "loss": 1.0106, + "mean_token_accuracy": 0.7009302377700806, + "num_tokens": 37585048.0, + "step": 1492 + }, + { + "epoch": 0.16395783000219635, + "grad_norm": 2.604973793029785, + "learning_rate": 8.18880351262349e-07, + "loss": 0.9348, + "mean_token_accuracy": 0.7098878622055054, + "num_tokens": 37604211.0, + "step": 1493 + }, + { + "epoch": 0.16406764770481003, + "grad_norm": 2.428649663925171, + "learning_rate": 8.194291986827661e-07, + "loss": 1.0562, + "mean_token_accuracy": 0.6880694627761841, + "num_tokens": 37629078.0, + "step": 1494 + }, + { + "epoch": 0.16417746540742367, + "grad_norm": 2.6439552307128906, + "learning_rate": 8.199780461031833e-07, + "loss": 
0.9528, + "mean_token_accuracy": 0.7121002674102783, + "num_tokens": 37651389.0, + "step": 1495 + }, + { + "epoch": 0.16428728311003735, + "grad_norm": 2.1465001106262207, + "learning_rate": 8.205268935236004e-07, + "loss": 1.0107, + "mean_token_accuracy": 0.6947400569915771, + "num_tokens": 37679332.0, + "step": 1496 + }, + { + "epoch": 0.164397100812651, + "grad_norm": 2.749006509780884, + "learning_rate": 8.210757409440175e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6752563714981079, + "num_tokens": 37700408.0, + "step": 1497 + }, + { + "epoch": 0.16450691851526467, + "grad_norm": 2.489149332046509, + "learning_rate": 8.216245883644346e-07, + "loss": 1.0575, + "mean_token_accuracy": 0.6872660517692566, + "num_tokens": 37723673.0, + "step": 1498 + }, + { + "epoch": 0.1646167362178783, + "grad_norm": 2.3147425651550293, + "learning_rate": 8.221734357848518e-07, + "loss": 1.0796, + "mean_token_accuracy": 0.6851990222930908, + "num_tokens": 37748654.0, + "step": 1499 + }, + { + "epoch": 0.16472655392049199, + "grad_norm": 2.303367853164673, + "learning_rate": 8.227222832052689e-07, + "loss": 0.9577, + "mean_token_accuracy": 0.7173028588294983, + "num_tokens": 37771804.0, + "step": 1500 + }, + { + "epoch": 0.16483637162310563, + "grad_norm": 2.0976078510284424, + "learning_rate": 8.23271130625686e-07, + "loss": 1.0443, + "mean_token_accuracy": 0.6912973523139954, + "num_tokens": 37800679.0, + "step": 1501 + }, + { + "epoch": 0.1649461893257193, + "grad_norm": 2.369554042816162, + "learning_rate": 8.238199780461032e-07, + "loss": 0.9545, + "mean_token_accuracy": 0.7162089943885803, + "num_tokens": 37823209.0, + "step": 1502 + }, + { + "epoch": 0.16505600702833298, + "grad_norm": 2.3441340923309326, + "learning_rate": 8.243688254665203e-07, + "loss": 0.8397, + "mean_token_accuracy": 0.736594557762146, + "num_tokens": 37843575.0, + "step": 1503 + }, + { + "epoch": 0.16516582473094663, + "grad_norm": 2.4318795204162598, + "learning_rate": 8.249176728869374e-07, + 
"loss": 0.9635, + "mean_token_accuracy": 0.7207014560699463, + "num_tokens": 37865187.0, + "step": 1504 + }, + { + "epoch": 0.1652756424335603, + "grad_norm": 2.4540483951568604, + "learning_rate": 8.254665203073546e-07, + "loss": 0.9389, + "mean_token_accuracy": 0.7172505855560303, + "num_tokens": 37887521.0, + "step": 1505 + }, + { + "epoch": 0.16538546013617395, + "grad_norm": 2.433631181716919, + "learning_rate": 8.260153677277717e-07, + "loss": 0.9632, + "mean_token_accuracy": 0.7080819606781006, + "num_tokens": 37909216.0, + "step": 1506 + }, + { + "epoch": 0.16549527783878762, + "grad_norm": 2.5609993934631348, + "learning_rate": 8.265642151481888e-07, + "loss": 0.9967, + "mean_token_accuracy": 0.6971263885498047, + "num_tokens": 37930150.0, + "step": 1507 + }, + { + "epoch": 0.16560509554140126, + "grad_norm": 2.4285202026367188, + "learning_rate": 8.27113062568606e-07, + "loss": 0.9519, + "mean_token_accuracy": 0.7114240527153015, + "num_tokens": 37952636.0, + "step": 1508 + }, + { + "epoch": 0.16571491324401494, + "grad_norm": 2.501049280166626, + "learning_rate": 8.27661909989023e-07, + "loss": 1.0435, + "mean_token_accuracy": 0.6991865634918213, + "num_tokens": 37975008.0, + "step": 1509 + }, + { + "epoch": 0.16582473094662858, + "grad_norm": 2.425978422164917, + "learning_rate": 8.282107574094401e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.6884909868240356, + "num_tokens": 37997968.0, + "step": 1510 + }, + { + "epoch": 0.16593454864924226, + "grad_norm": 2.4084689617156982, + "learning_rate": 8.287596048298572e-07, + "loss": 1.0078, + "mean_token_accuracy": 0.6950936317443848, + "num_tokens": 38021514.0, + "step": 1511 + }, + { + "epoch": 0.16604436635185593, + "grad_norm": 2.2430033683776855, + "learning_rate": 8.293084522502743e-07, + "loss": 1.0488, + "mean_token_accuracy": 0.6898123025894165, + "num_tokens": 38045808.0, + "step": 1512 + }, + { + "epoch": 0.16615418405446958, + "grad_norm": 2.4274144172668457, + "learning_rate": 
8.298572996706915e-07, + "loss": 1.0223, + "mean_token_accuracy": 0.7047265768051147, + "num_tokens": 38069573.0, + "step": 1513 + }, + { + "epoch": 0.16626400175708325, + "grad_norm": 2.2409474849700928, + "learning_rate": 8.304061470911086e-07, + "loss": 1.1049, + "mean_token_accuracy": 0.6708707809448242, + "num_tokens": 38095722.0, + "step": 1514 + }, + { + "epoch": 0.1663738194596969, + "grad_norm": 2.491971015930176, + "learning_rate": 8.309549945115257e-07, + "loss": 1.0137, + "mean_token_accuracy": 0.699353039264679, + "num_tokens": 38118425.0, + "step": 1515 + }, + { + "epoch": 0.16648363716231057, + "grad_norm": 2.2140345573425293, + "learning_rate": 8.315038419319429e-07, + "loss": 0.9128, + "mean_token_accuracy": 0.7297497987747192, + "num_tokens": 38142578.0, + "step": 1516 + }, + { + "epoch": 0.16659345486492422, + "grad_norm": 2.0694124698638916, + "learning_rate": 8.3205268935236e-07, + "loss": 1.0251, + "mean_token_accuracy": 0.6984779834747314, + "num_tokens": 38171464.0, + "step": 1517 + }, + { + "epoch": 0.1667032725675379, + "grad_norm": 2.6872658729553223, + "learning_rate": 8.326015367727771e-07, + "loss": 1.1079, + "mean_token_accuracy": 0.6713797450065613, + "num_tokens": 38191383.0, + "step": 1518 + }, + { + "epoch": 0.16681309027015154, + "grad_norm": 2.3671770095825195, + "learning_rate": 8.331503841931943e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6807858943939209, + "num_tokens": 38215180.0, + "step": 1519 + }, + { + "epoch": 0.1669229079727652, + "grad_norm": 2.3112447261810303, + "learning_rate": 8.336992316136113e-07, + "loss": 1.0367, + "mean_token_accuracy": 0.6929733753204346, + "num_tokens": 38240221.0, + "step": 1520 + }, + { + "epoch": 0.16703272567537888, + "grad_norm": 2.757188320159912, + "learning_rate": 8.342480790340285e-07, + "loss": 0.9807, + "mean_token_accuracy": 0.7084041237831116, + "num_tokens": 38258608.0, + "step": 1521 + }, + { + "epoch": 0.16714254337799253, + "grad_norm": 2.2516376972198486, + 
"learning_rate": 8.347969264544457e-07, + "loss": 1.0523, + "mean_token_accuracy": 0.685731053352356, + "num_tokens": 38285251.0, + "step": 1522 + }, + { + "epoch": 0.1672523610806062, + "grad_norm": 2.3279356956481934, + "learning_rate": 8.353457738748627e-07, + "loss": 1.0376, + "mean_token_accuracy": 0.692534327507019, + "num_tokens": 38309001.0, + "step": 1523 + }, + { + "epoch": 0.16736217878321985, + "grad_norm": 2.209322929382324, + "learning_rate": 8.358946212952799e-07, + "loss": 0.9968, + "mean_token_accuracy": 0.7101652026176453, + "num_tokens": 38335471.0, + "step": 1524 + }, + { + "epoch": 0.16747199648583352, + "grad_norm": 2.0535383224487305, + "learning_rate": 8.364434687156971e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.6844151020050049, + "num_tokens": 38366956.0, + "step": 1525 + }, + { + "epoch": 0.16758181418844717, + "grad_norm": 2.233168601989746, + "learning_rate": 8.369923161361141e-07, + "loss": 1.0134, + "mean_token_accuracy": 0.6976171731948853, + "num_tokens": 38391834.0, + "step": 1526 + }, + { + "epoch": 0.16769163189106084, + "grad_norm": 2.2285187244415283, + "learning_rate": 8.375411635565313e-07, + "loss": 1.0114, + "mean_token_accuracy": 0.6961938738822937, + "num_tokens": 38418477.0, + "step": 1527 + }, + { + "epoch": 0.1678014495936745, + "grad_norm": 2.3232932090759277, + "learning_rate": 8.380900109769485e-07, + "loss": 0.9889, + "mean_token_accuracy": 0.7058116793632507, + "num_tokens": 38443223.0, + "step": 1528 + }, + { + "epoch": 0.16791126729628816, + "grad_norm": 2.31758713722229, + "learning_rate": 8.386388583973654e-07, + "loss": 1.0439, + "mean_token_accuracy": 0.6989888548851013, + "num_tokens": 38469403.0, + "step": 1529 + }, + { + "epoch": 0.1680210849989018, + "grad_norm": 2.269879102706909, + "learning_rate": 8.391877058177826e-07, + "loss": 1.1093, + "mean_token_accuracy": 0.6722866296768188, + "num_tokens": 38496672.0, + "step": 1530 + }, + { + "epoch": 0.16813090270151548, + "grad_norm": 
2.1214449405670166, + "learning_rate": 8.397365532381997e-07, + "loss": 0.9198, + "mean_token_accuracy": 0.7295029759407043, + "num_tokens": 38523113.0, + "step": 1531 + }, + { + "epoch": 0.16824072040412916, + "grad_norm": 2.6189284324645996, + "learning_rate": 8.402854006586168e-07, + "loss": 1.0629, + "mean_token_accuracy": 0.6819158792495728, + "num_tokens": 38543368.0, + "step": 1532 + }, + { + "epoch": 0.1683505381067428, + "grad_norm": 2.1321628093719482, + "learning_rate": 8.40834248079034e-07, + "loss": 1.0745, + "mean_token_accuracy": 0.6785415410995483, + "num_tokens": 38573123.0, + "step": 1533 + }, + { + "epoch": 0.16846035580935648, + "grad_norm": 2.5334601402282715, + "learning_rate": 8.413830954994511e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6836090683937073, + "num_tokens": 38596848.0, + "step": 1534 + }, + { + "epoch": 0.16857017351197012, + "grad_norm": 2.5905823707580566, + "learning_rate": 8.419319429198682e-07, + "loss": 1.0447, + "mean_token_accuracy": 0.6877564191818237, + "num_tokens": 38619199.0, + "step": 1535 + }, + { + "epoch": 0.1686799912145838, + "grad_norm": 2.3792591094970703, + "learning_rate": 8.424807903402854e-07, + "loss": 0.9856, + "mean_token_accuracy": 0.7058150172233582, + "num_tokens": 38643250.0, + "step": 1536 + }, + { + "epoch": 0.16878980891719744, + "grad_norm": 2.3336644172668457, + "learning_rate": 8.430296377607025e-07, + "loss": 1.0235, + "mean_token_accuracy": 0.6937547922134399, + "num_tokens": 38668811.0, + "step": 1537 + }, + { + "epoch": 0.16889962661981112, + "grad_norm": 2.5702860355377197, + "learning_rate": 8.435784851811196e-07, + "loss": 1.0686, + "mean_token_accuracy": 0.6785888671875, + "num_tokens": 38690823.0, + "step": 1538 + }, + { + "epoch": 0.16900944432242476, + "grad_norm": 2.4252676963806152, + "learning_rate": 8.441273326015367e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6940010786056519, + "num_tokens": 38715571.0, + "step": 1539 + }, + { + "epoch": 0.16911926202503844, + 
"grad_norm": 2.0990352630615234, + "learning_rate": 8.446761800219539e-07, + "loss": 1.1205, + "mean_token_accuracy": 0.6637523770332336, + "num_tokens": 38745992.0, + "step": 1540 + }, + { + "epoch": 0.1692290797276521, + "grad_norm": 2.264538288116455, + "learning_rate": 8.45225027442371e-07, + "loss": 1.0935, + "mean_token_accuracy": 0.6755550503730774, + "num_tokens": 38771763.0, + "step": 1541 + }, + { + "epoch": 0.16933889743026576, + "grad_norm": 2.321589469909668, + "learning_rate": 8.457738748627881e-07, + "loss": 1.0374, + "mean_token_accuracy": 0.6910505890846252, + "num_tokens": 38795602.0, + "step": 1542 + }, + { + "epoch": 0.16944871513287943, + "grad_norm": 2.577491283416748, + "learning_rate": 8.463227222832053e-07, + "loss": 0.9922, + "mean_token_accuracy": 0.6999373435974121, + "num_tokens": 38816732.0, + "step": 1543 + }, + { + "epoch": 0.16955853283549308, + "grad_norm": 2.4251742362976074, + "learning_rate": 8.468715697036224e-07, + "loss": 1.0146, + "mean_token_accuracy": 0.6955253481864929, + "num_tokens": 38838314.0, + "step": 1544 + }, + { + "epoch": 0.16966835053810675, + "grad_norm": 2.4395995140075684, + "learning_rate": 8.474204171240395e-07, + "loss": 1.0194, + "mean_token_accuracy": 0.69737708568573, + "num_tokens": 38861312.0, + "step": 1545 + }, + { + "epoch": 0.1697781682407204, + "grad_norm": 2.5825154781341553, + "learning_rate": 8.479692645444567e-07, + "loss": 0.9488, + "mean_token_accuracy": 0.7069733142852783, + "num_tokens": 38880588.0, + "step": 1546 + }, + { + "epoch": 0.16988798594333407, + "grad_norm": 2.603083610534668, + "learning_rate": 8.485181119648738e-07, + "loss": 1.0685, + "mean_token_accuracy": 0.6869107484817505, + "num_tokens": 38902929.0, + "step": 1547 + }, + { + "epoch": 0.16999780364594771, + "grad_norm": 2.188109874725342, + "learning_rate": 8.490669593852908e-07, + "loss": 0.9935, + "mean_token_accuracy": 0.7017354369163513, + "num_tokens": 38929260.0, + "step": 1548 + }, + { + "epoch": 
0.1701076213485614, + "grad_norm": 2.2421164512634277, + "learning_rate": 8.496158068057079e-07, + "loss": 1.0247, + "mean_token_accuracy": 0.7006188631057739, + "num_tokens": 38955230.0, + "step": 1549 + }, + { + "epoch": 0.17021743905117506, + "grad_norm": 2.326949119567871, + "learning_rate": 8.50164654226125e-07, + "loss": 0.9846, + "mean_token_accuracy": 0.7017822265625, + "num_tokens": 38979913.0, + "step": 1550 + }, + { + "epoch": 0.1703272567537887, + "grad_norm": 2.679081439971924, + "learning_rate": 8.507135016465422e-07, + "loss": 1.0231, + "mean_token_accuracy": 0.6967377662658691, + "num_tokens": 38999337.0, + "step": 1551 + }, + { + "epoch": 0.17043707445640238, + "grad_norm": 2.3872592449188232, + "learning_rate": 8.512623490669593e-07, + "loss": 1.0253, + "mean_token_accuracy": 0.6994940042495728, + "num_tokens": 39022568.0, + "step": 1552 + }, + { + "epoch": 0.17054689215901603, + "grad_norm": 2.2522966861724854, + "learning_rate": 8.518111964873764e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.6816990375518799, + "num_tokens": 39049116.0, + "step": 1553 + }, + { + "epoch": 0.1706567098616297, + "grad_norm": 2.0815818309783936, + "learning_rate": 8.523600439077936e-07, + "loss": 1.0027, + "mean_token_accuracy": 0.6928470134735107, + "num_tokens": 39076982.0, + "step": 1554 + }, + { + "epoch": 0.17076652756424335, + "grad_norm": 1.9967398643493652, + "learning_rate": 8.529088913282107e-07, + "loss": 1.0905, + "mean_token_accuracy": 0.6734042167663574, + "num_tokens": 39110107.0, + "step": 1555 + }, + { + "epoch": 0.17087634526685702, + "grad_norm": 2.1174511909484863, + "learning_rate": 8.534577387486278e-07, + "loss": 1.085, + "mean_token_accuracy": 0.6700443029403687, + "num_tokens": 39139999.0, + "step": 1556 + }, + { + "epoch": 0.17098616296947067, + "grad_norm": 2.3426718711853027, + "learning_rate": 8.54006586169045e-07, + "loss": 0.9881, + "mean_token_accuracy": 0.6988055109977722, + "num_tokens": 39168042.0, + "step": 1557 + }, + { + 
"epoch": 0.17109598067208434, + "grad_norm": 2.1991629600524902, + "learning_rate": 8.545554335894621e-07, + "loss": 1.1232, + "mean_token_accuracy": 0.6692344546318054, + "num_tokens": 39197358.0, + "step": 1558 + }, + { + "epoch": 0.17120579837469801, + "grad_norm": 2.1780405044555664, + "learning_rate": 8.551042810098792e-07, + "loss": 1.0702, + "mean_token_accuracy": 0.6825194358825684, + "num_tokens": 39226347.0, + "step": 1559 + }, + { + "epoch": 0.17131561607731166, + "grad_norm": 2.3296103477478027, + "learning_rate": 8.556531284302964e-07, + "loss": 1.142, + "mean_token_accuracy": 0.6661136150360107, + "num_tokens": 39253638.0, + "step": 1560 + }, + { + "epoch": 0.17142543377992533, + "grad_norm": 2.4570953845977783, + "learning_rate": 8.562019758507134e-07, + "loss": 1.042, + "mean_token_accuracy": 0.6953758597373962, + "num_tokens": 39275396.0, + "step": 1561 + }, + { + "epoch": 0.17153525148253898, + "grad_norm": 2.2317771911621094, + "learning_rate": 8.567508232711306e-07, + "loss": 1.0642, + "mean_token_accuracy": 0.6828696131706238, + "num_tokens": 39304861.0, + "step": 1562 + }, + { + "epoch": 0.17164506918515265, + "grad_norm": 2.357196092605591, + "learning_rate": 8.572996706915478e-07, + "loss": 0.9629, + "mean_token_accuracy": 0.7113439440727234, + "num_tokens": 39328462.0, + "step": 1563 + }, + { + "epoch": 0.1717548868877663, + "grad_norm": 2.646122455596924, + "learning_rate": 8.578485181119648e-07, + "loss": 0.9336, + "mean_token_accuracy": 0.7146787047386169, + "num_tokens": 39347044.0, + "step": 1564 + }, + { + "epoch": 0.17186470459037997, + "grad_norm": 2.1566977500915527, + "learning_rate": 8.58397365532382e-07, + "loss": 1.0028, + "mean_token_accuracy": 0.6977794170379639, + "num_tokens": 39377108.0, + "step": 1565 + }, + { + "epoch": 0.17197452229299362, + "grad_norm": 2.246332883834839, + "learning_rate": 8.589462129527992e-07, + "loss": 1.0769, + "mean_token_accuracy": 0.6753919124603271, + "num_tokens": 39405354.0, + "step": 1566 + 
}, + { + "epoch": 0.1720843399956073, + "grad_norm": 2.0526132583618164, + "learning_rate": 8.594950603732161e-07, + "loss": 1.0353, + "mean_token_accuracy": 0.6922034025192261, + "num_tokens": 39437113.0, + "step": 1567 + }, + { + "epoch": 0.17219415769822094, + "grad_norm": 2.5325632095336914, + "learning_rate": 8.600439077936333e-07, + "loss": 0.9567, + "mean_token_accuracy": 0.7087897062301636, + "num_tokens": 39459400.0, + "step": 1568 + }, + { + "epoch": 0.1723039754008346, + "grad_norm": 2.428067207336426, + "learning_rate": 8.605927552140505e-07, + "loss": 0.994, + "mean_token_accuracy": 0.7018665671348572, + "num_tokens": 39481742.0, + "step": 1569 + }, + { + "epoch": 0.1724137931034483, + "grad_norm": 2.6352972984313965, + "learning_rate": 8.611416026344675e-07, + "loss": 0.967, + "mean_token_accuracy": 0.7038784027099609, + "num_tokens": 39500592.0, + "step": 1570 + }, + { + "epoch": 0.17252361080606193, + "grad_norm": 2.3403682708740234, + "learning_rate": 8.616904500548847e-07, + "loss": 1.0308, + "mean_token_accuracy": 0.6894703507423401, + "num_tokens": 39524200.0, + "step": 1571 + }, + { + "epoch": 0.1726334285086756, + "grad_norm": 2.1914730072021484, + "learning_rate": 8.622392974753018e-07, + "loss": 0.983, + "mean_token_accuracy": 0.7038480639457703, + "num_tokens": 39550329.0, + "step": 1572 + }, + { + "epoch": 0.17274324621128925, + "grad_norm": 2.1834216117858887, + "learning_rate": 8.627881448957189e-07, + "loss": 1.1112, + "mean_token_accuracy": 0.6705420017242432, + "num_tokens": 39581004.0, + "step": 1573 + }, + { + "epoch": 0.17285306391390293, + "grad_norm": 2.4166462421417236, + "learning_rate": 8.633369923161361e-07, + "loss": 1.0035, + "mean_token_accuracy": 0.6943192481994629, + "num_tokens": 39602800.0, + "step": 1574 + }, + { + "epoch": 0.17296288161651657, + "grad_norm": 2.046778678894043, + "learning_rate": 8.638858397365532e-07, + "loss": 0.9896, + "mean_token_accuracy": 0.7030194401741028, + "num_tokens": 39632394.0, + "step": 
1575 + }, + { + "epoch": 0.17307269931913025, + "grad_norm": 2.788365125656128, + "learning_rate": 8.644346871569703e-07, + "loss": 1.0079, + "mean_token_accuracy": 0.69388747215271, + "num_tokens": 39651547.0, + "step": 1576 + }, + { + "epoch": 0.1731825170217439, + "grad_norm": 2.3181824684143066, + "learning_rate": 8.649835345773875e-07, + "loss": 1.0206, + "mean_token_accuracy": 0.6942815780639648, + "num_tokens": 39675610.0, + "step": 1577 + }, + { + "epoch": 0.17329233472435757, + "grad_norm": 2.3381340503692627, + "learning_rate": 8.655323819978046e-07, + "loss": 1.0145, + "mean_token_accuracy": 0.7045254707336426, + "num_tokens": 39703173.0, + "step": 1578 + }, + { + "epoch": 0.17340215242697124, + "grad_norm": 2.244187355041504, + "learning_rate": 8.660812294182217e-07, + "loss": 1.0225, + "mean_token_accuracy": 0.6848686933517456, + "num_tokens": 39729729.0, + "step": 1579 + }, + { + "epoch": 0.17351197012958489, + "grad_norm": 2.3855772018432617, + "learning_rate": 8.666300768386389e-07, + "loss": 0.9371, + "mean_token_accuracy": 0.7160488367080688, + "num_tokens": 39752624.0, + "step": 1580 + }, + { + "epoch": 0.17362178783219856, + "grad_norm": 2.3321049213409424, + "learning_rate": 8.67178924259056e-07, + "loss": 1.0595, + "mean_token_accuracy": 0.6836932897567749, + "num_tokens": 39776898.0, + "step": 1581 + }, + { + "epoch": 0.1737316055348122, + "grad_norm": 2.489208936691284, + "learning_rate": 8.677277716794731e-07, + "loss": 0.993, + "mean_token_accuracy": 0.6948264837265015, + "num_tokens": 39798887.0, + "step": 1582 + }, + { + "epoch": 0.17384142323742588, + "grad_norm": 2.172022819519043, + "learning_rate": 8.682766190998902e-07, + "loss": 1.1186, + "mean_token_accuracy": 0.6739425659179688, + "num_tokens": 39831149.0, + "step": 1583 + }, + { + "epoch": 0.17395124094003953, + "grad_norm": 2.0439271926879883, + "learning_rate": 8.688254665203073e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6859376430511475, + "num_tokens": 39864550.0, + 
"step": 1584 + }, + { + "epoch": 0.1740610586426532, + "grad_norm": 2.3263907432556152, + "learning_rate": 8.693743139407245e-07, + "loss": 1.1369, + "mean_token_accuracy": 0.6659492254257202, + "num_tokens": 39892352.0, + "step": 1585 + }, + { + "epoch": 0.17417087634526685, + "grad_norm": 2.3849713802337646, + "learning_rate": 8.699231613611415e-07, + "loss": 1.0143, + "mean_token_accuracy": 0.6976796388626099, + "num_tokens": 39915854.0, + "step": 1586 + }, + { + "epoch": 0.17428069404788052, + "grad_norm": 2.252122163772583, + "learning_rate": 8.704720087815586e-07, + "loss": 1.0751, + "mean_token_accuracy": 0.6894117593765259, + "num_tokens": 39941998.0, + "step": 1587 + }, + { + "epoch": 0.1743905117504942, + "grad_norm": 2.4421584606170654, + "learning_rate": 8.710208562019758e-07, + "loss": 1.0565, + "mean_token_accuracy": 0.6995408535003662, + "num_tokens": 39966196.0, + "step": 1588 + }, + { + "epoch": 0.17450032945310784, + "grad_norm": 2.1599485874176025, + "learning_rate": 8.715697036223929e-07, + "loss": 1.0288, + "mean_token_accuracy": 0.6928494572639465, + "num_tokens": 39993154.0, + "step": 1589 + }, + { + "epoch": 0.1746101471557215, + "grad_norm": 2.299010753631592, + "learning_rate": 8.7211855104281e-07, + "loss": 1.0187, + "mean_token_accuracy": 0.7006489038467407, + "num_tokens": 40017927.0, + "step": 1590 + }, + { + "epoch": 0.17471996485833516, + "grad_norm": 2.361821174621582, + "learning_rate": 8.726673984632272e-07, + "loss": 0.9921, + "mean_token_accuracy": 0.7026834487915039, + "num_tokens": 40043123.0, + "step": 1591 + }, + { + "epoch": 0.17482978256094883, + "grad_norm": 2.1929311752319336, + "learning_rate": 8.732162458836443e-07, + "loss": 1.1032, + "mean_token_accuracy": 0.6771199107170105, + "num_tokens": 40071027.0, + "step": 1592 + }, + { + "epoch": 0.17493960026356248, + "grad_norm": 2.208176374435425, + "learning_rate": 8.737650933040614e-07, + "loss": 1.0296, + "mean_token_accuracy": 0.6951472163200378, + "num_tokens": 
40099834.0, + "step": 1593 + }, + { + "epoch": 0.17504941796617615, + "grad_norm": 2.4041411876678467, + "learning_rate": 8.743139407244785e-07, + "loss": 0.9039, + "mean_token_accuracy": 0.7188951969146729, + "num_tokens": 40120640.0, + "step": 1594 + }, + { + "epoch": 0.1751592356687898, + "grad_norm": 2.2699904441833496, + "learning_rate": 8.748627881448957e-07, + "loss": 1.0419, + "mean_token_accuracy": 0.6901317238807678, + "num_tokens": 40145997.0, + "step": 1595 + }, + { + "epoch": 0.17526905337140347, + "grad_norm": 2.5849833488464355, + "learning_rate": 8.754116355653128e-07, + "loss": 1.0422, + "mean_token_accuracy": 0.6840749979019165, + "num_tokens": 40167669.0, + "step": 1596 + }, + { + "epoch": 0.17537887107401715, + "grad_norm": 2.4956929683685303, + "learning_rate": 8.759604829857299e-07, + "loss": 0.9672, + "mean_token_accuracy": 0.7066905498504639, + "num_tokens": 40189948.0, + "step": 1597 + }, + { + "epoch": 0.1754886887766308, + "grad_norm": 2.548163414001465, + "learning_rate": 8.765093304061471e-07, + "loss": 1.0022, + "mean_token_accuracy": 0.6998872756958008, + "num_tokens": 40215336.0, + "step": 1598 + }, + { + "epoch": 0.17559850647924446, + "grad_norm": 2.0839881896972656, + "learning_rate": 8.770581778265642e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.6875724792480469, + "num_tokens": 40244405.0, + "step": 1599 + }, + { + "epoch": 0.1757083241818581, + "grad_norm": 2.1118826866149902, + "learning_rate": 8.776070252469813e-07, + "loss": 0.9567, + "mean_token_accuracy": 0.7091645002365112, + "num_tokens": 40269748.0, + "step": 1600 + }, + { + "epoch": 0.17581814188447178, + "grad_norm": 2.362896680831909, + "learning_rate": 8.781558726673985e-07, + "loss": 1.0311, + "mean_token_accuracy": 0.6967780590057373, + "num_tokens": 40292751.0, + "step": 1601 + }, + { + "epoch": 0.17592795958708543, + "grad_norm": 2.7508082389831543, + "learning_rate": 8.787047200878156e-07, + "loss": 0.9754, + "mean_token_accuracy": 0.7053436636924744, + 
"num_tokens": 40311210.0, + "step": 1602 + }, + { + "epoch": 0.1760377772896991, + "grad_norm": 2.17173171043396, + "learning_rate": 8.792535675082327e-07, + "loss": 1.1434, + "mean_token_accuracy": 0.6660941243171692, + "num_tokens": 40339013.0, + "step": 1603 + }, + { + "epoch": 0.17614759499231275, + "grad_norm": 2.2650625705718994, + "learning_rate": 8.798024149286499e-07, + "loss": 1.0318, + "mean_token_accuracy": 0.6937055587768555, + "num_tokens": 40364815.0, + "step": 1604 + }, + { + "epoch": 0.17625741269492642, + "grad_norm": 2.36623215675354, + "learning_rate": 8.803512623490668e-07, + "loss": 0.9312, + "mean_token_accuracy": 0.7180112600326538, + "num_tokens": 40387753.0, + "step": 1605 + }, + { + "epoch": 0.17636723039754007, + "grad_norm": 2.209036111831665, + "learning_rate": 8.80900109769484e-07, + "loss": 1.0257, + "mean_token_accuracy": 0.7037844061851501, + "num_tokens": 40413973.0, + "step": 1606 + }, + { + "epoch": 0.17647704810015374, + "grad_norm": 2.6328811645507812, + "learning_rate": 8.814489571899012e-07, + "loss": 1.0316, + "mean_token_accuracy": 0.6969114542007446, + "num_tokens": 40435088.0, + "step": 1607 + }, + { + "epoch": 0.17658686580276742, + "grad_norm": 2.5856151580810547, + "learning_rate": 8.819978046103182e-07, + "loss": 1.0076, + "mean_token_accuracy": 0.6966661214828491, + "num_tokens": 40456668.0, + "step": 1608 + }, + { + "epoch": 0.17669668350538106, + "grad_norm": 2.2958033084869385, + "learning_rate": 8.825466520307354e-07, + "loss": 1.0962, + "mean_token_accuracy": 0.6828885078430176, + "num_tokens": 40482075.0, + "step": 1609 + }, + { + "epoch": 0.17680650120799474, + "grad_norm": 2.785207748413086, + "learning_rate": 8.830954994511526e-07, + "loss": 0.9985, + "mean_token_accuracy": 0.7008024454116821, + "num_tokens": 40499626.0, + "step": 1610 + }, + { + "epoch": 0.17691631891060838, + "grad_norm": 2.1375465393066406, + "learning_rate": 8.836443468715696e-07, + "loss": 1.0549, + "mean_token_accuracy": 
0.6813335418701172, + "num_tokens": 40528134.0, + "step": 1611 + }, + { + "epoch": 0.17702613661322206, + "grad_norm": 2.0887112617492676, + "learning_rate": 8.841931942919868e-07, + "loss": 0.979, + "mean_token_accuracy": 0.7011784911155701, + "num_tokens": 40555612.0, + "step": 1612 + }, + { + "epoch": 0.1771359543158357, + "grad_norm": 2.5327646732330322, + "learning_rate": 8.84742041712404e-07, + "loss": 0.9522, + "mean_token_accuracy": 0.7146562933921814, + "num_tokens": 40576473.0, + "step": 1613 + }, + { + "epoch": 0.17724577201844938, + "grad_norm": 2.3600704669952393, + "learning_rate": 8.85290889132821e-07, + "loss": 1.0574, + "mean_token_accuracy": 0.6847891807556152, + "num_tokens": 40602590.0, + "step": 1614 + }, + { + "epoch": 0.17735558972106302, + "grad_norm": 2.4285593032836914, + "learning_rate": 8.858397365532382e-07, + "loss": 1.013, + "mean_token_accuracy": 0.6981154680252075, + "num_tokens": 40624882.0, + "step": 1615 + }, + { + "epoch": 0.1774654074236767, + "grad_norm": 2.46370267868042, + "learning_rate": 8.863885839736552e-07, + "loss": 1.0803, + "mean_token_accuracy": 0.6806105375289917, + "num_tokens": 40648530.0, + "step": 1616 + }, + { + "epoch": 0.17757522512629037, + "grad_norm": 2.4101455211639404, + "learning_rate": 8.869374313940724e-07, + "loss": 0.9338, + "mean_token_accuracy": 0.7141934037208557, + "num_tokens": 40669472.0, + "step": 1617 + }, + { + "epoch": 0.17768504282890402, + "grad_norm": 2.953660249710083, + "learning_rate": 8.874862788144896e-07, + "loss": 0.8992, + "mean_token_accuracy": 0.7230643630027771, + "num_tokens": 40686497.0, + "step": 1618 + }, + { + "epoch": 0.1777948605315177, + "grad_norm": 2.3830714225769043, + "learning_rate": 8.880351262349066e-07, + "loss": 1.0545, + "mean_token_accuracy": 0.6905080080032349, + "num_tokens": 40710544.0, + "step": 1619 + }, + { + "epoch": 0.17790467823413134, + "grad_norm": 2.4998068809509277, + "learning_rate": 8.885839736553238e-07, + "loss": 1.0236, + 
"mean_token_accuracy": 0.6982550621032715, + "num_tokens": 40733736.0, + "step": 1620 + }, + { + "epoch": 0.178014495936745, + "grad_norm": 2.2286269664764404, + "learning_rate": 8.89132821075741e-07, + "loss": 1.0636, + "mean_token_accuracy": 0.6820900440216064, + "num_tokens": 40761768.0, + "step": 1621 + }, + { + "epoch": 0.17812431363935866, + "grad_norm": 2.3622183799743652, + "learning_rate": 8.89681668496158e-07, + "loss": 1.0886, + "mean_token_accuracy": 0.6735475063323975, + "num_tokens": 40786459.0, + "step": 1622 + }, + { + "epoch": 0.17823413134197233, + "grad_norm": 2.548768997192383, + "learning_rate": 8.902305159165752e-07, + "loss": 0.9967, + "mean_token_accuracy": 0.6967915296554565, + "num_tokens": 40805929.0, + "step": 1623 + }, + { + "epoch": 0.17834394904458598, + "grad_norm": 2.187037706375122, + "learning_rate": 8.907793633369924e-07, + "loss": 1.0695, + "mean_token_accuracy": 0.6865702271461487, + "num_tokens": 40834174.0, + "step": 1624 + }, + { + "epoch": 0.17845376674719965, + "grad_norm": 2.1947927474975586, + "learning_rate": 8.913282107574093e-07, + "loss": 1.0297, + "mean_token_accuracy": 0.6963163614273071, + "num_tokens": 40861990.0, + "step": 1625 + }, + { + "epoch": 0.17856358444981332, + "grad_norm": 2.8386647701263428, + "learning_rate": 8.918770581778265e-07, + "loss": 1.007, + "mean_token_accuracy": 0.6941222548484802, + "num_tokens": 40880941.0, + "step": 1626 + }, + { + "epoch": 0.17867340215242697, + "grad_norm": 2.46989369392395, + "learning_rate": 8.924259055982436e-07, + "loss": 1.0824, + "mean_token_accuracy": 0.6816776990890503, + "num_tokens": 40904072.0, + "step": 1627 + }, + { + "epoch": 0.17878321985504064, + "grad_norm": 2.706192970275879, + "learning_rate": 8.929747530186607e-07, + "loss": 0.9715, + "mean_token_accuracy": 0.7054144740104675, + "num_tokens": 40922599.0, + "step": 1628 + }, + { + "epoch": 0.1788930375576543, + "grad_norm": 2.266831398010254, + "learning_rate": 8.935236004390779e-07, + "loss": 
1.0719, + "mean_token_accuracy": 0.6838179230690002, + "num_tokens": 40946926.0, + "step": 1629 + }, + { + "epoch": 0.17900285526026796, + "grad_norm": 2.2452993392944336, + "learning_rate": 8.94072447859495e-07, + "loss": 1.046, + "mean_token_accuracy": 0.692147970199585, + "num_tokens": 40975920.0, + "step": 1630 + }, + { + "epoch": 0.1791126729628816, + "grad_norm": 2.3121085166931152, + "learning_rate": 8.946212952799121e-07, + "loss": 1.0716, + "mean_token_accuracy": 0.6971217393875122, + "num_tokens": 41000874.0, + "step": 1631 + }, + { + "epoch": 0.17922249066549528, + "grad_norm": 2.67434024810791, + "learning_rate": 8.951701427003293e-07, + "loss": 1.0172, + "mean_token_accuracy": 0.6929565668106079, + "num_tokens": 41020025.0, + "step": 1632 + }, + { + "epoch": 0.17933230836810893, + "grad_norm": 2.102504014968872, + "learning_rate": 8.957189901207464e-07, + "loss": 1.0075, + "mean_token_accuracy": 0.699639081954956, + "num_tokens": 41047217.0, + "step": 1633 + }, + { + "epoch": 0.1794421260707226, + "grad_norm": 2.3209526538848877, + "learning_rate": 8.962678375411635e-07, + "loss": 1.0585, + "mean_token_accuracy": 0.6801285743713379, + "num_tokens": 41072367.0, + "step": 1634 + }, + { + "epoch": 0.17955194377333628, + "grad_norm": 2.383206605911255, + "learning_rate": 8.968166849615807e-07, + "loss": 0.9538, + "mean_token_accuracy": 0.712317943572998, + "num_tokens": 41095231.0, + "step": 1635 + }, + { + "epoch": 0.17966176147594992, + "grad_norm": 2.2953951358795166, + "learning_rate": 8.973655323819978e-07, + "loss": 1.0058, + "mean_token_accuracy": 0.7010247111320496, + "num_tokens": 41120415.0, + "step": 1636 + }, + { + "epoch": 0.1797715791785636, + "grad_norm": 2.3284618854522705, + "learning_rate": 8.979143798024149e-07, + "loss": 0.929, + "mean_token_accuracy": 0.7176580429077148, + "num_tokens": 41144072.0, + "step": 1637 + }, + { + "epoch": 0.17988139688117724, + "grad_norm": 2.307684898376465, + "learning_rate": 8.98463227222832e-07, + 
"loss": 1.0034, + "mean_token_accuracy": 0.7007387280464172, + "num_tokens": 41167452.0, + "step": 1638 + }, + { + "epoch": 0.17999121458379091, + "grad_norm": 2.3451144695281982, + "learning_rate": 8.990120746432492e-07, + "loss": 1.037, + "mean_token_accuracy": 0.6956196427345276, + "num_tokens": 41192939.0, + "step": 1639 + }, + { + "epoch": 0.18010103228640456, + "grad_norm": 2.37884521484375, + "learning_rate": 8.995609220636663e-07, + "loss": 0.9428, + "mean_token_accuracy": 0.7134151458740234, + "num_tokens": 41217036.0, + "step": 1640 + }, + { + "epoch": 0.18021084998901823, + "grad_norm": 2.2564008235931396, + "learning_rate": 9.001097694840834e-07, + "loss": 1.0551, + "mean_token_accuracy": 0.6850335001945496, + "num_tokens": 41247061.0, + "step": 1641 + }, + { + "epoch": 0.18032066769163188, + "grad_norm": 2.5649020671844482, + "learning_rate": 9.006586169045006e-07, + "loss": 0.8878, + "mean_token_accuracy": 0.7234976291656494, + "num_tokens": 41267474.0, + "step": 1642 + }, + { + "epoch": 0.18043048539424555, + "grad_norm": 2.0734288692474365, + "learning_rate": 9.012074643249177e-07, + "loss": 0.972, + "mean_token_accuracy": 0.7078663110733032, + "num_tokens": 41298081.0, + "step": 1643 + }, + { + "epoch": 0.1805403030968592, + "grad_norm": 2.132082223892212, + "learning_rate": 9.017563117453347e-07, + "loss": 1.1191, + "mean_token_accuracy": 0.6688825488090515, + "num_tokens": 41329108.0, + "step": 1644 + }, + { + "epoch": 0.18065012079947287, + "grad_norm": 2.293675661087036, + "learning_rate": 9.023051591657519e-07, + "loss": 1.0631, + "mean_token_accuracy": 0.6787682771682739, + "num_tokens": 41353147.0, + "step": 1645 + }, + { + "epoch": 0.18075993850208655, + "grad_norm": 2.263488531112671, + "learning_rate": 9.02854006586169e-07, + "loss": 1.0461, + "mean_token_accuracy": 0.6961615085601807, + "num_tokens": 41378626.0, + "step": 1646 + }, + { + "epoch": 0.1808697562047002, + "grad_norm": 2.4564647674560547, + "learning_rate": 
9.034028540065861e-07, + "loss": 1.0895, + "mean_token_accuracy": 0.6721789836883545, + "num_tokens": 41402301.0, + "step": 1647 + }, + { + "epoch": 0.18097957390731387, + "grad_norm": 2.4251134395599365, + "learning_rate": 9.039517014270033e-07, + "loss": 1.1463, + "mean_token_accuracy": 0.6614187359809875, + "num_tokens": 41426486.0, + "step": 1648 + }, + { + "epoch": 0.1810893916099275, + "grad_norm": 2.5507285594940186, + "learning_rate": 9.045005488474203e-07, + "loss": 1.0268, + "mean_token_accuracy": 0.691493034362793, + "num_tokens": 41446192.0, + "step": 1649 + }, + { + "epoch": 0.1811992093125412, + "grad_norm": 2.0388131141662598, + "learning_rate": 9.050493962678375e-07, + "loss": 1.0103, + "mean_token_accuracy": 0.6957193613052368, + "num_tokens": 41475802.0, + "step": 1650 + }, + { + "epoch": 0.18130902701515483, + "grad_norm": 2.6619467735290527, + "learning_rate": 9.055982436882547e-07, + "loss": 1.0063, + "mean_token_accuracy": 0.690918505191803, + "num_tokens": 41496423.0, + "step": 1651 + }, + { + "epoch": 0.1814188447177685, + "grad_norm": 2.347534656524658, + "learning_rate": 9.061470911086717e-07, + "loss": 1.0747, + "mean_token_accuracy": 0.6827312707901001, + "num_tokens": 41520607.0, + "step": 1652 + }, + { + "epoch": 0.18152866242038215, + "grad_norm": 2.3709492683410645, + "learning_rate": 9.066959385290889e-07, + "loss": 1.0897, + "mean_token_accuracy": 0.6807558536529541, + "num_tokens": 41545670.0, + "step": 1653 + }, + { + "epoch": 0.18163848012299583, + "grad_norm": 2.1308035850524902, + "learning_rate": 9.07244785949506e-07, + "loss": 1.0695, + "mean_token_accuracy": 0.6834465265274048, + "num_tokens": 41574912.0, + "step": 1654 + }, + { + "epoch": 0.1817482978256095, + "grad_norm": 2.244210958480835, + "learning_rate": 9.077936333699231e-07, + "loss": 0.9884, + "mean_token_accuracy": 0.7005391120910645, + "num_tokens": 41602028.0, + "step": 1655 + }, + { + "epoch": 0.18185811552822315, + "grad_norm": 2.233112096786499, + 
"learning_rate": 9.083424807903403e-07, + "loss": 0.9391, + "mean_token_accuracy": 0.7164239883422852, + "num_tokens": 41627949.0, + "step": 1656 + }, + { + "epoch": 0.18196793323083682, + "grad_norm": 2.4897541999816895, + "learning_rate": 9.088913282107573e-07, + "loss": 1.0899, + "mean_token_accuracy": 0.6743900775909424, + "num_tokens": 41653883.0, + "step": 1657 + }, + { + "epoch": 0.18207775093345047, + "grad_norm": 2.394615650177002, + "learning_rate": 9.094401756311745e-07, + "loss": 1.0299, + "mean_token_accuracy": 0.6853995323181152, + "num_tokens": 41676293.0, + "step": 1658 + }, + { + "epoch": 0.18218756863606414, + "grad_norm": 2.517061471939087, + "learning_rate": 9.099890230515917e-07, + "loss": 1.0872, + "mean_token_accuracy": 0.6716680526733398, + "num_tokens": 41698806.0, + "step": 1659 + }, + { + "epoch": 0.18229738633867779, + "grad_norm": 2.7378673553466797, + "learning_rate": 9.105378704720087e-07, + "loss": 0.9961, + "mean_token_accuracy": 0.7077842354774475, + "num_tokens": 41721047.0, + "step": 1660 + }, + { + "epoch": 0.18240720404129146, + "grad_norm": 2.3421790599823, + "learning_rate": 9.110867178924259e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6828655004501343, + "num_tokens": 41744184.0, + "step": 1661 + }, + { + "epoch": 0.1825170217439051, + "grad_norm": 2.0105979442596436, + "learning_rate": 9.11635565312843e-07, + "loss": 1.0038, + "mean_token_accuracy": 0.694951057434082, + "num_tokens": 41773370.0, + "step": 1662 + }, + { + "epoch": 0.18262683944651878, + "grad_norm": 2.6756439208984375, + "learning_rate": 9.1218441273326e-07, + "loss": 0.9873, + "mean_token_accuracy": 0.7038904428482056, + "num_tokens": 41792808.0, + "step": 1663 + }, + { + "epoch": 0.18273665714913245, + "grad_norm": 2.324375867843628, + "learning_rate": 9.127332601536772e-07, + "loss": 0.9803, + "mean_token_accuracy": 0.7039439678192139, + "num_tokens": 41815825.0, + "step": 1664 + }, + { + "epoch": 0.1828464748517461, + "grad_norm": 
2.4131598472595215, + "learning_rate": 9.132821075740944e-07, + "loss": 1.0552, + "mean_token_accuracy": 0.6937316060066223, + "num_tokens": 41840469.0, + "step": 1665 + }, + { + "epoch": 0.18295629255435977, + "grad_norm": 2.5282042026519775, + "learning_rate": 9.138309549945114e-07, + "loss": 1.0458, + "mean_token_accuracy": 0.691335916519165, + "num_tokens": 41861731.0, + "step": 1666 + }, + { + "epoch": 0.18306611025697342, + "grad_norm": 2.243509292602539, + "learning_rate": 9.143798024149286e-07, + "loss": 0.9648, + "mean_token_accuracy": 0.707316517829895, + "num_tokens": 41887515.0, + "step": 1667 + }, + { + "epoch": 0.1831759279595871, + "grad_norm": 2.225672960281372, + "learning_rate": 9.149286498353457e-07, + "loss": 1.0371, + "mean_token_accuracy": 0.7045959830284119, + "num_tokens": 41915354.0, + "step": 1668 + }, + { + "epoch": 0.18328574566220074, + "grad_norm": 2.4826786518096924, + "learning_rate": 9.154774972557628e-07, + "loss": 1.0237, + "mean_token_accuracy": 0.6900805234909058, + "num_tokens": 41938567.0, + "step": 1669 + }, + { + "epoch": 0.1833955633648144, + "grad_norm": 2.356130599975586, + "learning_rate": 9.1602634467618e-07, + "loss": 1.0543, + "mean_token_accuracy": 0.6892344951629639, + "num_tokens": 41963826.0, + "step": 1670 + }, + { + "epoch": 0.18350538106742806, + "grad_norm": 2.4373154640197754, + "learning_rate": 9.165751920965971e-07, + "loss": 0.9668, + "mean_token_accuracy": 0.7000976800918579, + "num_tokens": 41986175.0, + "step": 1671 + }, + { + "epoch": 0.18361519877004173, + "grad_norm": 2.5305261611938477, + "learning_rate": 9.171240395170142e-07, + "loss": 0.9849, + "mean_token_accuracy": 0.7055095434188843, + "num_tokens": 42006951.0, + "step": 1672 + }, + { + "epoch": 0.1837250164726554, + "grad_norm": 2.2348885536193848, + "learning_rate": 9.176728869374314e-07, + "loss": 1.0799, + "mean_token_accuracy": 0.6775513887405396, + "num_tokens": 42035500.0, + "step": 1673 + }, + { + "epoch": 0.18383483417526905, + 
"grad_norm": 2.603083372116089, + "learning_rate": 9.182217343578485e-07, + "loss": 1.0752, + "mean_token_accuracy": 0.6908601522445679, + "num_tokens": 42056290.0, + "step": 1674 + }, + { + "epoch": 0.18394465187788273, + "grad_norm": 2.2409920692443848, + "learning_rate": 9.187705817782656e-07, + "loss": 1.015, + "mean_token_accuracy": 0.7032297253608704, + "num_tokens": 42081516.0, + "step": 1675 + }, + { + "epoch": 0.18405446958049637, + "grad_norm": 2.522563934326172, + "learning_rate": 9.193194291986828e-07, + "loss": 1.0332, + "mean_token_accuracy": 0.689172625541687, + "num_tokens": 42102523.0, + "step": 1676 + }, + { + "epoch": 0.18416428728311005, + "grad_norm": 2.700805425643921, + "learning_rate": 9.198682766190999e-07, + "loss": 1.0238, + "mean_token_accuracy": 0.6953843832015991, + "num_tokens": 42122507.0, + "step": 1677 + }, + { + "epoch": 0.1842741049857237, + "grad_norm": 2.162395715713501, + "learning_rate": 9.20417124039517e-07, + "loss": 1.1299, + "mean_token_accuracy": 0.6686025857925415, + "num_tokens": 42151529.0, + "step": 1678 + }, + { + "epoch": 0.18438392268833736, + "grad_norm": 2.281336545944214, + "learning_rate": 9.209659714599341e-07, + "loss": 1.0781, + "mean_token_accuracy": 0.6823152303695679, + "num_tokens": 42177211.0, + "step": 1679 + }, + { + "epoch": 0.184493740390951, + "grad_norm": 1.9851341247558594, + "learning_rate": 9.215148188803513e-07, + "loss": 1.0887, + "mean_token_accuracy": 0.6747193336486816, + "num_tokens": 42208758.0, + "step": 1680 + }, + { + "epoch": 0.18460355809356468, + "grad_norm": 2.0004959106445312, + "learning_rate": 9.220636663007683e-07, + "loss": 1.0448, + "mean_token_accuracy": 0.6948162913322449, + "num_tokens": 42240765.0, + "step": 1681 + }, + { + "epoch": 0.18471337579617833, + "grad_norm": 2.4555070400238037, + "learning_rate": 9.226125137211854e-07, + "loss": 1.0759, + "mean_token_accuracy": 0.6818782687187195, + "num_tokens": 42264453.0, + "step": 1682 + }, + { + "epoch": 
0.184823193498792, + "grad_norm": 2.1183836460113525, + "learning_rate": 9.231613611416026e-07, + "loss": 1.0557, + "mean_token_accuracy": 0.6885294318199158, + "num_tokens": 42292672.0, + "step": 1683 + }, + { + "epoch": 0.18493301120140568, + "grad_norm": 2.34151554107666, + "learning_rate": 9.237102085620197e-07, + "loss": 0.9789, + "mean_token_accuracy": 0.6984686851501465, + "num_tokens": 42315861.0, + "step": 1684 + }, + { + "epoch": 0.18504282890401932, + "grad_norm": 2.251174211502075, + "learning_rate": 9.242590559824368e-07, + "loss": 0.9405, + "mean_token_accuracy": 0.7162308692932129, + "num_tokens": 42340986.0, + "step": 1685 + }, + { + "epoch": 0.185152646606633, + "grad_norm": 2.5267934799194336, + "learning_rate": 9.24807903402854e-07, + "loss": 1.0209, + "mean_token_accuracy": 0.7018711566925049, + "num_tokens": 42362806.0, + "step": 1686 + }, + { + "epoch": 0.18526246430924664, + "grad_norm": 2.1134464740753174, + "learning_rate": 9.253567508232711e-07, + "loss": 1.0499, + "mean_token_accuracy": 0.689340353012085, + "num_tokens": 42392034.0, + "step": 1687 + }, + { + "epoch": 0.18537228201186032, + "grad_norm": 2.342625617980957, + "learning_rate": 9.259055982436882e-07, + "loss": 1.0426, + "mean_token_accuracy": 0.6885315775871277, + "num_tokens": 42416151.0, + "step": 1688 + }, + { + "epoch": 0.18548209971447396, + "grad_norm": 2.378732204437256, + "learning_rate": 9.264544456641053e-07, + "loss": 0.8883, + "mean_token_accuracy": 0.7295328974723816, + "num_tokens": 42438189.0, + "step": 1689 + }, + { + "epoch": 0.18559191741708764, + "grad_norm": 2.613184690475464, + "learning_rate": 9.270032930845224e-07, + "loss": 1.008, + "mean_token_accuracy": 0.6971128582954407, + "num_tokens": 42458663.0, + "step": 1690 + }, + { + "epoch": 0.18570173511970128, + "grad_norm": 2.3278417587280273, + "learning_rate": 9.275521405049396e-07, + "loss": 1.0281, + "mean_token_accuracy": 0.6961514353752136, + "num_tokens": 42482779.0, + "step": 1691 + }, + { + 
"epoch": 0.18581155282231496, + "grad_norm": 2.6119627952575684, + "learning_rate": 9.281009879253567e-07, + "loss": 1.0175, + "mean_token_accuracy": 0.6931432485580444, + "num_tokens": 42504423.0, + "step": 1692 + }, + { + "epoch": 0.18592137052492863, + "grad_norm": 2.0649921894073486, + "learning_rate": 9.286498353457738e-07, + "loss": 0.9644, + "mean_token_accuracy": 0.7137577533721924, + "num_tokens": 42533883.0, + "step": 1693 + }, + { + "epoch": 0.18603118822754228, + "grad_norm": 2.0980021953582764, + "learning_rate": 9.29198682766191e-07, + "loss": 1.0228, + "mean_token_accuracy": 0.6926316022872925, + "num_tokens": 42562132.0, + "step": 1694 + }, + { + "epoch": 0.18614100593015595, + "grad_norm": 2.2059035301208496, + "learning_rate": 9.297475301866081e-07, + "loss": 1.0457, + "mean_token_accuracy": 0.6859303712844849, + "num_tokens": 42589856.0, + "step": 1695 + }, + { + "epoch": 0.1862508236327696, + "grad_norm": 2.5438010692596436, + "learning_rate": 9.302963776070252e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6863548159599304, + "num_tokens": 42611933.0, + "step": 1696 + }, + { + "epoch": 0.18636064133538327, + "grad_norm": 2.4991326332092285, + "learning_rate": 9.308452250274424e-07, + "loss": 1.061, + "mean_token_accuracy": 0.6883302330970764, + "num_tokens": 42633033.0, + "step": 1697 + }, + { + "epoch": 0.18647045903799692, + "grad_norm": 2.550699234008789, + "learning_rate": 9.313940724478595e-07, + "loss": 1.0667, + "mean_token_accuracy": 0.6768031120300293, + "num_tokens": 42653758.0, + "step": 1698 + }, + { + "epoch": 0.1865802767406106, + "grad_norm": 2.493412971496582, + "learning_rate": 9.319429198682766e-07, + "loss": 0.9836, + "mean_token_accuracy": 0.702778160572052, + "num_tokens": 42674466.0, + "step": 1699 + }, + { + "epoch": 0.18669009444322424, + "grad_norm": 2.119508981704712, + "learning_rate": 9.324917672886937e-07, + "loss": 1.0903, + "mean_token_accuracy": 0.6708070635795593, + "num_tokens": 42702928.0, + "step": 1700 + 
}, + { + "epoch": 0.1867999121458379, + "grad_norm": 2.3304343223571777, + "learning_rate": 9.330406147091107e-07, + "loss": 1.047, + "mean_token_accuracy": 0.6818470358848572, + "num_tokens": 42727850.0, + "step": 1701 + }, + { + "epoch": 0.18690972984845158, + "grad_norm": 2.336064577102661, + "learning_rate": 9.335894621295279e-07, + "loss": 1.0489, + "mean_token_accuracy": 0.6980429887771606, + "num_tokens": 42752617.0, + "step": 1702 + }, + { + "epoch": 0.18701954755106523, + "grad_norm": 2.2047135829925537, + "learning_rate": 9.341383095499451e-07, + "loss": 1.0231, + "mean_token_accuracy": 0.6954317092895508, + "num_tokens": 42779464.0, + "step": 1703 + }, + { + "epoch": 0.1871293652536789, + "grad_norm": 2.4711174964904785, + "learning_rate": 9.346871569703621e-07, + "loss": 0.9774, + "mean_token_accuracy": 0.7019391059875488, + "num_tokens": 42801426.0, + "step": 1704 + }, + { + "epoch": 0.18723918295629255, + "grad_norm": 2.4357504844665527, + "learning_rate": 9.352360043907793e-07, + "loss": 1.1024, + "mean_token_accuracy": 0.678626298904419, + "num_tokens": 42823563.0, + "step": 1705 + }, + { + "epoch": 0.18734900065890622, + "grad_norm": 2.443608283996582, + "learning_rate": 9.357848518111965e-07, + "loss": 0.9386, + "mean_token_accuracy": 0.7098392248153687, + "num_tokens": 42848931.0, + "step": 1706 + }, + { + "epoch": 0.18745881836151987, + "grad_norm": 2.1900956630706787, + "learning_rate": 9.363336992316135e-07, + "loss": 1.0348, + "mean_token_accuracy": 0.6846051812171936, + "num_tokens": 42878261.0, + "step": 1707 + }, + { + "epoch": 0.18756863606413354, + "grad_norm": 2.0622429847717285, + "learning_rate": 9.368825466520307e-07, + "loss": 0.9697, + "mean_token_accuracy": 0.7102546691894531, + "num_tokens": 42908601.0, + "step": 1708 + }, + { + "epoch": 0.1876784537667472, + "grad_norm": 2.3043994903564453, + "learning_rate": 9.374313940724479e-07, + "loss": 1.1018, + "mean_token_accuracy": 0.6726185083389282, + "num_tokens": 42933875.0, + 
"step": 1709 + }, + { + "epoch": 0.18778827146936086, + "grad_norm": 3.024651288986206, + "learning_rate": 9.379802414928649e-07, + "loss": 0.9872, + "mean_token_accuracy": 0.699235200881958, + "num_tokens": 42951786.0, + "step": 1710 + }, + { + "epoch": 0.18789808917197454, + "grad_norm": 2.2644197940826416, + "learning_rate": 9.385290889132821e-07, + "loss": 1.1105, + "mean_token_accuracy": 0.6679924726486206, + "num_tokens": 42981908.0, + "step": 1711 + }, + { + "epoch": 0.18800790687458818, + "grad_norm": 2.2944962978363037, + "learning_rate": 9.390779363336992e-07, + "loss": 1.0561, + "mean_token_accuracy": 0.6843709945678711, + "num_tokens": 43007961.0, + "step": 1712 + }, + { + "epoch": 0.18811772457720186, + "grad_norm": 2.2998626232147217, + "learning_rate": 9.396267837541163e-07, + "loss": 1.0438, + "mean_token_accuracy": 0.6874170303344727, + "num_tokens": 43033909.0, + "step": 1713 + }, + { + "epoch": 0.1882275422798155, + "grad_norm": 2.0277974605560303, + "learning_rate": 9.401756311745335e-07, + "loss": 1.0471, + "mean_token_accuracy": 0.6938719749450684, + "num_tokens": 43067712.0, + "step": 1714 + }, + { + "epoch": 0.18833735998242918, + "grad_norm": 2.2352209091186523, + "learning_rate": 9.407244785949506e-07, + "loss": 1.0198, + "mean_token_accuracy": 0.694832444190979, + "num_tokens": 43093939.0, + "step": 1715 + }, + { + "epoch": 0.18844717768504282, + "grad_norm": 2.1617472171783447, + "learning_rate": 9.412733260153677e-07, + "loss": 1.0722, + "mean_token_accuracy": 0.6824785470962524, + "num_tokens": 43122187.0, + "step": 1716 + }, + { + "epoch": 0.1885569953876565, + "grad_norm": 2.3198301792144775, + "learning_rate": 9.418221734357849e-07, + "loss": 0.9448, + "mean_token_accuracy": 0.7180110812187195, + "num_tokens": 43146206.0, + "step": 1717 + }, + { + "epoch": 0.18866681309027014, + "grad_norm": 2.1225523948669434, + "learning_rate": 9.42371020856202e-07, + "loss": 1.0395, + "mean_token_accuracy": 0.7005214095115662, + "num_tokens": 
43172932.0, + "step": 1718 + }, + { + "epoch": 0.18877663079288381, + "grad_norm": 2.5844902992248535, + "learning_rate": 9.42919868276619e-07, + "loss": 1.0296, + "mean_token_accuracy": 0.6853206157684326, + "num_tokens": 43192034.0, + "step": 1719 + }, + { + "epoch": 0.18888644849549746, + "grad_norm": 2.0189528465270996, + "learning_rate": 9.434687156970362e-07, + "loss": 0.9728, + "mean_token_accuracy": 0.6935635805130005, + "num_tokens": 43220617.0, + "step": 1720 + }, + { + "epoch": 0.18899626619811113, + "grad_norm": 2.1412484645843506, + "learning_rate": 9.440175631174532e-07, + "loss": 1.012, + "mean_token_accuracy": 0.697158932685852, + "num_tokens": 43247921.0, + "step": 1721 + }, + { + "epoch": 0.1891060839007248, + "grad_norm": 2.179253101348877, + "learning_rate": 9.445664105378704e-07, + "loss": 1.0564, + "mean_token_accuracy": 0.6803848743438721, + "num_tokens": 43275577.0, + "step": 1722 + }, + { + "epoch": 0.18921590160333845, + "grad_norm": 2.081206798553467, + "learning_rate": 9.451152579582875e-07, + "loss": 1.0324, + "mean_token_accuracy": 0.6944434642791748, + "num_tokens": 43305133.0, + "step": 1723 + }, + { + "epoch": 0.18932571930595213, + "grad_norm": 2.139467716217041, + "learning_rate": 9.456641053787046e-07, + "loss": 0.9306, + "mean_token_accuracy": 0.707513689994812, + "num_tokens": 43331043.0, + "step": 1724 + }, + { + "epoch": 0.18943553700856577, + "grad_norm": 2.345407485961914, + "learning_rate": 9.462129527991218e-07, + "loss": 1.0512, + "mean_token_accuracy": 0.6878550052642822, + "num_tokens": 43354791.0, + "step": 1725 + }, + { + "epoch": 0.18954535471117945, + "grad_norm": 2.132127285003662, + "learning_rate": 9.467618002195389e-07, + "loss": 1.0875, + "mean_token_accuracy": 0.6885207891464233, + "num_tokens": 43384495.0, + "step": 1726 + }, + { + "epoch": 0.1896551724137931, + "grad_norm": 2.322981357574463, + "learning_rate": 9.47310647639956e-07, + "loss": 0.9399, + "mean_token_accuracy": 0.7130044102668762, + 
"num_tokens": 43410459.0, + "step": 1727 + }, + { + "epoch": 0.18976499011640677, + "grad_norm": 2.2699952125549316, + "learning_rate": 9.478594950603732e-07, + "loss": 1.061, + "mean_token_accuracy": 0.6844862103462219, + "num_tokens": 43435642.0, + "step": 1728 + }, + { + "epoch": 0.1898748078190204, + "grad_norm": 2.4650447368621826, + "learning_rate": 9.484083424807903e-07, + "loss": 0.984, + "mean_token_accuracy": 0.7010929584503174, + "num_tokens": 43457855.0, + "step": 1729 + }, + { + "epoch": 0.1899846255216341, + "grad_norm": 2.0532033443450928, + "learning_rate": 9.489571899012074e-07, + "loss": 1.0518, + "mean_token_accuracy": 0.6883974075317383, + "num_tokens": 43489085.0, + "step": 1730 + }, + { + "epoch": 0.19009444322424776, + "grad_norm": 2.026519536972046, + "learning_rate": 9.495060373216246e-07, + "loss": 0.9698, + "mean_token_accuracy": 0.7033852338790894, + "num_tokens": 43518959.0, + "step": 1731 + }, + { + "epoch": 0.1902042609268614, + "grad_norm": 2.2782094478607178, + "learning_rate": 9.500548847420417e-07, + "loss": 0.9934, + "mean_token_accuracy": 0.6987406015396118, + "num_tokens": 43543656.0, + "step": 1732 + }, + { + "epoch": 0.19031407862947508, + "grad_norm": 2.4137439727783203, + "learning_rate": 9.506037321624588e-07, + "loss": 0.9486, + "mean_token_accuracy": 0.7062510251998901, + "num_tokens": 43565856.0, + "step": 1733 + }, + { + "epoch": 0.19042389633208873, + "grad_norm": 2.558030843734741, + "learning_rate": 9.511525795828759e-07, + "loss": 1.0388, + "mean_token_accuracy": 0.6900935173034668, + "num_tokens": 43588902.0, + "step": 1734 + }, + { + "epoch": 0.1905337140347024, + "grad_norm": 2.0183358192443848, + "learning_rate": 9.517014270032931e-07, + "loss": 1.1041, + "mean_token_accuracy": 0.6800810098648071, + "num_tokens": 43620729.0, + "step": 1735 + }, + { + "epoch": 0.19064353173731605, + "grad_norm": 2.5179989337921143, + "learning_rate": 9.522502744237102e-07, + "loss": 0.9512, + "mean_token_accuracy": 
0.7117324471473694, + "num_tokens": 43642453.0, + "step": 1736 + }, + { + "epoch": 0.19075334943992972, + "grad_norm": 2.0188021659851074, + "learning_rate": 9.527991218441273e-07, + "loss": 1.0729, + "mean_token_accuracy": 0.679563045501709, + "num_tokens": 43676173.0, + "step": 1737 + }, + { + "epoch": 0.19086316714254337, + "grad_norm": 2.096041202545166, + "learning_rate": 9.533479692645444e-07, + "loss": 1.0932, + "mean_token_accuracy": 0.6721934676170349, + "num_tokens": 43708759.0, + "step": 1738 + }, + { + "epoch": 0.19097298484515704, + "grad_norm": 2.3916563987731934, + "learning_rate": 9.538968166849616e-07, + "loss": 0.9341, + "mean_token_accuracy": 0.7136728763580322, + "num_tokens": 43731851.0, + "step": 1739 + }, + { + "epoch": 0.1910828025477707, + "grad_norm": 2.565823793411255, + "learning_rate": 9.544456641053787e-07, + "loss": 0.9592, + "mean_token_accuracy": 0.7105197906494141, + "num_tokens": 43751234.0, + "step": 1740 + }, + { + "epoch": 0.19119262025038436, + "grad_norm": 2.291537046432495, + "learning_rate": 9.549945115257958e-07, + "loss": 1.0955, + "mean_token_accuracy": 0.6739344000816345, + "num_tokens": 43778396.0, + "step": 1741 + }, + { + "epoch": 0.19130243795299803, + "grad_norm": 2.2763454914093018, + "learning_rate": 9.55543358946213e-07, + "loss": 0.976, + "mean_token_accuracy": 0.701663076877594, + "num_tokens": 43803431.0, + "step": 1742 + }, + { + "epoch": 0.19141225565561168, + "grad_norm": 2.0129823684692383, + "learning_rate": 9.5609220636663e-07, + "loss": 1.0366, + "mean_token_accuracy": 0.6839510202407837, + "num_tokens": 43835466.0, + "step": 1743 + }, + { + "epoch": 0.19152207335822535, + "grad_norm": 2.326704740524292, + "learning_rate": 9.566410537870472e-07, + "loss": 1.0918, + "mean_token_accuracy": 0.6726713180541992, + "num_tokens": 43859455.0, + "step": 1744 + }, + { + "epoch": 0.191631891060839, + "grad_norm": 2.201308488845825, + "learning_rate": 9.571899012074642e-07, + "loss": 1.1091, + 
"mean_token_accuracy": 0.6635087132453918, + "num_tokens": 43887195.0, + "step": 1745 + }, + { + "epoch": 0.19174170876345267, + "grad_norm": 2.3218557834625244, + "learning_rate": 9.577387486278815e-07, + "loss": 1.0116, + "mean_token_accuracy": 0.6942388415336609, + "num_tokens": 43911459.0, + "step": 1746 + }, + { + "epoch": 0.19185152646606632, + "grad_norm": 2.3599436283111572, + "learning_rate": 9.582875960482986e-07, + "loss": 0.9108, + "mean_token_accuracy": 0.7176237106323242, + "num_tokens": 43934069.0, + "step": 1747 + }, + { + "epoch": 0.19196134416868, + "grad_norm": 2.2205448150634766, + "learning_rate": 9.588364434687156e-07, + "loss": 1.1035, + "mean_token_accuracy": 0.679337739944458, + "num_tokens": 43961670.0, + "step": 1748 + }, + { + "epoch": 0.19207116187129367, + "grad_norm": 2.5309834480285645, + "learning_rate": 9.593852908891327e-07, + "loss": 0.9095, + "mean_token_accuracy": 0.7168415784835815, + "num_tokens": 43981982.0, + "step": 1749 + }, + { + "epoch": 0.1921809795739073, + "grad_norm": 2.2876009941101074, + "learning_rate": 9.5993413830955e-07, + "loss": 0.9937, + "mean_token_accuracy": 0.6988067626953125, + "num_tokens": 44005327.0, + "step": 1750 + }, + { + "epoch": 0.19229079727652099, + "grad_norm": 2.435267925262451, + "learning_rate": 9.60482985729967e-07, + "loss": 1.0635, + "mean_token_accuracy": 0.6838050484657288, + "num_tokens": 44027714.0, + "step": 1751 + }, + { + "epoch": 0.19240061497913463, + "grad_norm": 2.3144288063049316, + "learning_rate": 9.61031833150384e-07, + "loss": 1.0157, + "mean_token_accuracy": 0.6940395832061768, + "num_tokens": 44052712.0, + "step": 1752 + }, + { + "epoch": 0.1925104326817483, + "grad_norm": 2.521724224090576, + "learning_rate": 9.615806805708014e-07, + "loss": 1.0284, + "mean_token_accuracy": 0.6942777037620544, + "num_tokens": 44075506.0, + "step": 1753 + }, + { + "epoch": 0.19262025038436195, + "grad_norm": 2.0769777297973633, + "learning_rate": 9.621295279912184e-07, + "loss": 
1.0841, + "mean_token_accuracy": 0.6776227951049805, + "num_tokens": 44104544.0, + "step": 1754 + }, + { + "epoch": 0.19273006808697563, + "grad_norm": 2.2960920333862305, + "learning_rate": 9.626783754116355e-07, + "loss": 0.9986, + "mean_token_accuracy": 0.7039448618888855, + "num_tokens": 44134442.0, + "step": 1755 + }, + { + "epoch": 0.19283988578958927, + "grad_norm": 2.7219789028167725, + "learning_rate": 9.632272228320525e-07, + "loss": 0.9877, + "mean_token_accuracy": 0.7021812200546265, + "num_tokens": 44152968.0, + "step": 1756 + }, + { + "epoch": 0.19294970349220295, + "grad_norm": 2.0953774452209473, + "learning_rate": 9.637760702524698e-07, + "loss": 1.0624, + "mean_token_accuracy": 0.6824243068695068, + "num_tokens": 44182055.0, + "step": 1757 + }, + { + "epoch": 0.1930595211948166, + "grad_norm": 2.36838960647583, + "learning_rate": 9.643249176728869e-07, + "loss": 1.0345, + "mean_token_accuracy": 0.6880258321762085, + "num_tokens": 44207464.0, + "step": 1758 + }, + { + "epoch": 0.19316933889743026, + "grad_norm": 2.243967056274414, + "learning_rate": 9.64873765093304e-07, + "loss": 1.0436, + "mean_token_accuracy": 0.6858631372451782, + "num_tokens": 44234478.0, + "step": 1759 + }, + { + "epoch": 0.19327915660004394, + "grad_norm": 2.168595552444458, + "learning_rate": 9.654226125137212e-07, + "loss": 1.0406, + "mean_token_accuracy": 0.6884508728981018, + "num_tokens": 44261914.0, + "step": 1760 + }, + { + "epoch": 0.19338897430265758, + "grad_norm": 2.2799909114837646, + "learning_rate": 9.659714599341383e-07, + "loss": 1.1137, + "mean_token_accuracy": 0.6794285178184509, + "num_tokens": 44291071.0, + "step": 1761 + }, + { + "epoch": 0.19349879200527126, + "grad_norm": 2.012622117996216, + "learning_rate": 9.665203073545553e-07, + "loss": 1.1063, + "mean_token_accuracy": 0.6761857271194458, + "num_tokens": 44323294.0, + "step": 1762 + }, + { + "epoch": 0.1936086097078849, + "grad_norm": 2.4900612831115723, + "learning_rate": 9.670691547749726e-07, + 
"loss": 1.0693, + "mean_token_accuracy": 0.6740827560424805, + "num_tokens": 44347917.0, + "step": 1763 + }, + { + "epoch": 0.19371842741049858, + "grad_norm": 2.2999165058135986, + "learning_rate": 9.676180021953897e-07, + "loss": 1.032, + "mean_token_accuracy": 0.6899101734161377, + "num_tokens": 44374932.0, + "step": 1764 + }, + { + "epoch": 0.19382824511311222, + "grad_norm": 2.3192837238311768, + "learning_rate": 9.681668496158067e-07, + "loss": 0.9553, + "mean_token_accuracy": 0.7135626077651978, + "num_tokens": 44398904.0, + "step": 1765 + }, + { + "epoch": 0.1939380628157259, + "grad_norm": 2.6511809825897217, + "learning_rate": 9.68715697036224e-07, + "loss": 1.0213, + "mean_token_accuracy": 0.6913454532623291, + "num_tokens": 44418657.0, + "step": 1766 + }, + { + "epoch": 0.19404788051833954, + "grad_norm": 2.9175477027893066, + "learning_rate": 9.692645444566409e-07, + "loss": 1.067, + "mean_token_accuracy": 0.6881852746009827, + "num_tokens": 44436772.0, + "step": 1767 + }, + { + "epoch": 0.19415769822095322, + "grad_norm": 2.364819288253784, + "learning_rate": 9.698133918770581e-07, + "loss": 1.0291, + "mean_token_accuracy": 0.6931325197219849, + "num_tokens": 44460098.0, + "step": 1768 + }, + { + "epoch": 0.1942675159235669, + "grad_norm": 2.358994483947754, + "learning_rate": 9.703622392974752e-07, + "loss": 1.0475, + "mean_token_accuracy": 0.6896047592163086, + "num_tokens": 44483132.0, + "step": 1769 + }, + { + "epoch": 0.19437733362618054, + "grad_norm": 2.2859628200531006, + "learning_rate": 9.709110867178923e-07, + "loss": 0.9669, + "mean_token_accuracy": 0.7068904042243958, + "num_tokens": 44506397.0, + "step": 1770 + }, + { + "epoch": 0.1944871513287942, + "grad_norm": 2.402449369430542, + "learning_rate": 9.714599341383095e-07, + "loss": 1.089, + "mean_token_accuracy": 0.6787196397781372, + "num_tokens": 44530528.0, + "step": 1771 + }, + { + "epoch": 0.19459696903140786, + "grad_norm": 2.0836737155914307, + "learning_rate": 
9.720087815587266e-07, + "loss": 0.9991, + "mean_token_accuracy": 0.7016440629959106, + "num_tokens": 44560132.0, + "step": 1772 + }, + { + "epoch": 0.19470678673402153, + "grad_norm": 2.629889965057373, + "learning_rate": 9.725576289791437e-07, + "loss": 1.0607, + "mean_token_accuracy": 0.679009199142456, + "num_tokens": 44580968.0, + "step": 1773 + }, + { + "epoch": 0.19481660443663518, + "grad_norm": 2.2275547981262207, + "learning_rate": 9.73106476399561e-07, + "loss": 1.0129, + "mean_token_accuracy": 0.690546989440918, + "num_tokens": 44607234.0, + "step": 1774 + }, + { + "epoch": 0.19492642213924885, + "grad_norm": 2.333397388458252, + "learning_rate": 9.73655323819978e-07, + "loss": 1.0144, + "mean_token_accuracy": 0.6916961073875427, + "num_tokens": 44633572.0, + "step": 1775 + }, + { + "epoch": 0.1950362398418625, + "grad_norm": 2.4835541248321533, + "learning_rate": 9.74204171240395e-07, + "loss": 0.9834, + "mean_token_accuracy": 0.703150749206543, + "num_tokens": 44655383.0, + "step": 1776 + }, + { + "epoch": 0.19514605754447617, + "grad_norm": 2.0899291038513184, + "learning_rate": 9.747530186608123e-07, + "loss": 1.1095, + "mean_token_accuracy": 0.6752181649208069, + "num_tokens": 44688660.0, + "step": 1777 + }, + { + "epoch": 0.19525587524708984, + "grad_norm": 2.1036739349365234, + "learning_rate": 9.753018660812294e-07, + "loss": 1.1067, + "mean_token_accuracy": 0.6679145097732544, + "num_tokens": 44717640.0, + "step": 1778 + }, + { + "epoch": 0.1953656929497035, + "grad_norm": 2.2974319458007812, + "learning_rate": 9.758507135016465e-07, + "loss": 1.0172, + "mean_token_accuracy": 0.6908804774284363, + "num_tokens": 44742288.0, + "step": 1779 + }, + { + "epoch": 0.19547551065231716, + "grad_norm": 2.2894489765167236, + "learning_rate": 9.763995609220637e-07, + "loss": 1.0565, + "mean_token_accuracy": 0.6794065237045288, + "num_tokens": 44767360.0, + "step": 1780 + }, + { + "epoch": 0.1955853283549308, + "grad_norm": 2.174638509750366, + 
"learning_rate": 9.769484083424808e-07, + "loss": 1.0416, + "mean_token_accuracy": 0.693657636642456, + "num_tokens": 44795565.0, + "step": 1781 + }, + { + "epoch": 0.19569514605754448, + "grad_norm": 2.407572031021118, + "learning_rate": 9.774972557628979e-07, + "loss": 1.1451, + "mean_token_accuracy": 0.6621878743171692, + "num_tokens": 44820123.0, + "step": 1782 + }, + { + "epoch": 0.19580496376015813, + "grad_norm": 2.168095827102661, + "learning_rate": 9.780461031833151e-07, + "loss": 1.0161, + "mean_token_accuracy": 0.6926522254943848, + "num_tokens": 44847463.0, + "step": 1783 + }, + { + "epoch": 0.1959147814627718, + "grad_norm": 2.6492388248443604, + "learning_rate": 9.785949506037322e-07, + "loss": 0.9875, + "mean_token_accuracy": 0.6975008845329285, + "num_tokens": 44867773.0, + "step": 1784 + }, + { + "epoch": 0.19602459916538545, + "grad_norm": 2.0580153465270996, + "learning_rate": 9.791437980241493e-07, + "loss": 0.9906, + "mean_token_accuracy": 0.7047408819198608, + "num_tokens": 44896596.0, + "step": 1785 + }, + { + "epoch": 0.19613441686799912, + "grad_norm": 2.532015323638916, + "learning_rate": 9.796926454445663e-07, + "loss": 1.0269, + "mean_token_accuracy": 0.6879414319992065, + "num_tokens": 44918696.0, + "step": 1786 + }, + { + "epoch": 0.1962442345706128, + "grad_norm": 2.521691083908081, + "learning_rate": 9.802414928649834e-07, + "loss": 1.0282, + "mean_token_accuracy": 0.6887129545211792, + "num_tokens": 44941108.0, + "step": 1787 + }, + { + "epoch": 0.19635405227322644, + "grad_norm": 2.245715379714966, + "learning_rate": 9.807903402854007e-07, + "loss": 0.9377, + "mean_token_accuracy": 0.7253796458244324, + "num_tokens": 44965630.0, + "step": 1788 + }, + { + "epoch": 0.19646386997584012, + "grad_norm": 2.0896072387695312, + "learning_rate": 9.813391877058177e-07, + "loss": 1.0578, + "mean_token_accuracy": 0.6850855946540833, + "num_tokens": 44997548.0, + "step": 1789 + }, + { + "epoch": 0.19657368767845376, + "grad_norm": 
2.3842782974243164, + "learning_rate": 9.818880351262348e-07, + "loss": 0.9436, + "mean_token_accuracy": 0.7109794616699219, + "num_tokens": 45019155.0, + "step": 1790 + }, + { + "epoch": 0.19668350538106744, + "grad_norm": 2.3585972785949707, + "learning_rate": 9.82436882546652e-07, + "loss": 1.0735, + "mean_token_accuracy": 0.6785416007041931, + "num_tokens": 45044411.0, + "step": 1791 + }, + { + "epoch": 0.19679332308368108, + "grad_norm": 2.336786985397339, + "learning_rate": 9.829857299670691e-07, + "loss": 1.0417, + "mean_token_accuracy": 0.6959735155105591, + "num_tokens": 45068858.0, + "step": 1792 + }, + { + "epoch": 0.19690314078629476, + "grad_norm": 2.3300273418426514, + "learning_rate": 9.835345773874862e-07, + "loss": 0.9789, + "mean_token_accuracy": 0.7036383152008057, + "num_tokens": 45092738.0, + "step": 1793 + }, + { + "epoch": 0.1970129584889084, + "grad_norm": 2.2468669414520264, + "learning_rate": 9.840834248079035e-07, + "loss": 1.0054, + "mean_token_accuracy": 0.7053021192550659, + "num_tokens": 45120287.0, + "step": 1794 + }, + { + "epoch": 0.19712277619152208, + "grad_norm": 2.5442094802856445, + "learning_rate": 9.846322722283205e-07, + "loss": 0.9317, + "mean_token_accuracy": 0.7171158790588379, + "num_tokens": 45141418.0, + "step": 1795 + }, + { + "epoch": 0.19723259389413572, + "grad_norm": 2.337358236312866, + "learning_rate": 9.851811196487376e-07, + "loss": 1.0573, + "mean_token_accuracy": 0.6829484105110168, + "num_tokens": 45165490.0, + "step": 1796 + }, + { + "epoch": 0.1973424115967494, + "grad_norm": 2.551353693008423, + "learning_rate": 9.857299670691546e-07, + "loss": 1.0389, + "mean_token_accuracy": 0.6844866871833801, + "num_tokens": 45186122.0, + "step": 1797 + }, + { + "epoch": 0.19745222929936307, + "grad_norm": 2.191787004470825, + "learning_rate": 9.86278814489572e-07, + "loss": 0.9788, + "mean_token_accuracy": 0.7046743631362915, + "num_tokens": 45211932.0, + "step": 1798 + }, + { + "epoch": 0.19756204700197671, + 
"grad_norm": 2.0258896350860596, + "learning_rate": 9.86827661909989e-07, + "loss": 1.082, + "mean_token_accuracy": 0.6778910160064697, + "num_tokens": 45242996.0, + "step": 1799 + }, + { + "epoch": 0.1976718647045904, + "grad_norm": 2.2241477966308594, + "learning_rate": 9.87376509330406e-07, + "loss": 1.0047, + "mean_token_accuracy": 0.6957281827926636, + "num_tokens": 45270504.0, + "step": 1800 + }, + { + "epoch": 0.19778168240720403, + "grad_norm": 2.3843019008636475, + "learning_rate": 9.879253567508233e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6944935321807861, + "num_tokens": 45294564.0, + "step": 1801 + }, + { + "epoch": 0.1978915001098177, + "grad_norm": 2.496187686920166, + "learning_rate": 9.884742041712404e-07, + "loss": 0.9701, + "mean_token_accuracy": 0.7082325220108032, + "num_tokens": 45314434.0, + "step": 1802 + }, + { + "epoch": 0.19800131781243135, + "grad_norm": 2.2839741706848145, + "learning_rate": 9.890230515916574e-07, + "loss": 1.0129, + "mean_token_accuracy": 0.6959453225135803, + "num_tokens": 45340003.0, + "step": 1803 + }, + { + "epoch": 0.19811113551504503, + "grad_norm": 2.5163934230804443, + "learning_rate": 9.895718990120747e-07, + "loss": 0.955, + "mean_token_accuracy": 0.7072257995605469, + "num_tokens": 45363440.0, + "step": 1804 + }, + { + "epoch": 0.19822095321765867, + "grad_norm": 2.2937145233154297, + "learning_rate": 9.901207464324918e-07, + "loss": 0.9795, + "mean_token_accuracy": 0.7080854177474976, + "num_tokens": 45388186.0, + "step": 1805 + }, + { + "epoch": 0.19833077092027235, + "grad_norm": 2.2749688625335693, + "learning_rate": 9.906695938529088e-07, + "loss": 0.9589, + "mean_token_accuracy": 0.7158190011978149, + "num_tokens": 45413049.0, + "step": 1806 + }, + { + "epoch": 0.19844058862288602, + "grad_norm": 2.1027050018310547, + "learning_rate": 9.91218441273326e-07, + "loss": 1.0092, + "mean_token_accuracy": 0.6946057081222534, + "num_tokens": 45442088.0, + "step": 1807 + }, + { + "epoch": 
0.19855040632549967, + "grad_norm": 2.449571132659912, + "learning_rate": 9.91767288693743e-07, + "loss": 1.1319, + "mean_token_accuracy": 0.6860524415969849, + "num_tokens": 45467023.0, + "step": 1808 + }, + { + "epoch": 0.19866022402811334, + "grad_norm": 2.565828800201416, + "learning_rate": 9.923161361141602e-07, + "loss": 1.0394, + "mean_token_accuracy": 0.6879533529281616, + "num_tokens": 45488808.0, + "step": 1809 + }, + { + "epoch": 0.198770041730727, + "grad_norm": 2.4488768577575684, + "learning_rate": 9.928649835345773e-07, + "loss": 1.0122, + "mean_token_accuracy": 0.6923227310180664, + "num_tokens": 45512958.0, + "step": 1810 + }, + { + "epoch": 0.19887985943334066, + "grad_norm": 2.3692522048950195, + "learning_rate": 9.934138309549944e-07, + "loss": 1.0522, + "mean_token_accuracy": 0.67967689037323, + "num_tokens": 45538535.0, + "step": 1811 + }, + { + "epoch": 0.1989896771359543, + "grad_norm": 2.4948229789733887, + "learning_rate": 9.939626783754116e-07, + "loss": 0.9641, + "mean_token_accuracy": 0.7093407511711121, + "num_tokens": 45560314.0, + "step": 1812 + }, + { + "epoch": 0.19909949483856798, + "grad_norm": 2.2624313831329346, + "learning_rate": 9.945115257958287e-07, + "loss": 1.071, + "mean_token_accuracy": 0.6845728158950806, + "num_tokens": 45585985.0, + "step": 1813 + }, + { + "epoch": 0.19920931254118163, + "grad_norm": 2.351118803024292, + "learning_rate": 9.950603732162458e-07, + "loss": 0.9901, + "mean_token_accuracy": 0.7064070105552673, + "num_tokens": 45612585.0, + "step": 1814 + }, + { + "epoch": 0.1993191302437953, + "grad_norm": 2.2466461658477783, + "learning_rate": 9.95609220636663e-07, + "loss": 1.1587, + "mean_token_accuracy": 0.6589902639389038, + "num_tokens": 45638892.0, + "step": 1815 + }, + { + "epoch": 0.19942894794640897, + "grad_norm": 2.5924031734466553, + "learning_rate": 9.9615806805708e-07, + "loss": 1.0362, + "mean_token_accuracy": 0.6916772127151489, + "num_tokens": 45663005.0, + "step": 1816 + }, + { + 
"epoch": 0.19953876564902262, + "grad_norm": 2.8941283226013184, + "learning_rate": 9.967069154774972e-07, + "loss": 0.9976, + "mean_token_accuracy": 0.7035350799560547, + "num_tokens": 45679670.0, + "step": 1817 + }, + { + "epoch": 0.1996485833516363, + "grad_norm": 2.216067314147949, + "learning_rate": 9.972557628979144e-07, + "loss": 1.0923, + "mean_token_accuracy": 0.6786733865737915, + "num_tokens": 45711994.0, + "step": 1818 + }, + { + "epoch": 0.19975840105424994, + "grad_norm": 2.8817036151885986, + "learning_rate": 9.978046103183315e-07, + "loss": 0.9664, + "mean_token_accuracy": 0.7014486789703369, + "num_tokens": 45728079.0, + "step": 1819 + }, + { + "epoch": 0.1998682187568636, + "grad_norm": 2.162057876586914, + "learning_rate": 9.983534577387486e-07, + "loss": 1.0405, + "mean_token_accuracy": 0.6867098808288574, + "num_tokens": 45757471.0, + "step": 1820 + }, + { + "epoch": 0.19997803645947726, + "grad_norm": 2.270915985107422, + "learning_rate": 9.989023051591658e-07, + "loss": 1.0393, + "mean_token_accuracy": 0.6912167072296143, + "num_tokens": 45782336.0, + "step": 1821 + }, + { + "epoch": 0.20008785416209093, + "grad_norm": 2.006577730178833, + "learning_rate": 9.994511525795829e-07, + "loss": 1.0934, + "mean_token_accuracy": 0.6704486608505249, + "num_tokens": 45814430.0, + "step": 1822 + }, + { + "epoch": 0.20019767186470458, + "grad_norm": 2.3136096000671387, + "learning_rate": 1e-06, + "loss": 1.1366, + "mean_token_accuracy": 0.6643933057785034, + "num_tokens": 45839644.0, + "step": 1823 + }, + { + "epoch": 0.20030748956731825, + "grad_norm": 2.4730753898620605, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6993175148963928, + "num_tokens": 45861357.0, + "step": 1824 + }, + { + "epoch": 0.2004173072699319, + "grad_norm": 2.7156665325164795, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7210752367973328, + "num_tokens": 45877965.0, + "step": 1825 + }, + { + "epoch": 0.20052712497254557, + 
"grad_norm": 2.369699001312256, + "learning_rate": 1e-06, + "loss": 1.087, + "mean_token_accuracy": 0.6819252371788025, + "num_tokens": 45904814.0, + "step": 1826 + }, + { + "epoch": 0.20063694267515925, + "grad_norm": 2.1801483631134033, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6944938898086548, + "num_tokens": 45932692.0, + "step": 1827 + }, + { + "epoch": 0.2007467603777729, + "grad_norm": 2.4644532203674316, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7051870822906494, + "num_tokens": 45955021.0, + "step": 1828 + }, + { + "epoch": 0.20085657808038657, + "grad_norm": 2.133944272994995, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7107221484184265, + "num_tokens": 45981666.0, + "step": 1829 + }, + { + "epoch": 0.2009663957830002, + "grad_norm": 2.516556978225708, + "learning_rate": 1e-06, + "loss": 1.0947, + "mean_token_accuracy": 0.6748648881912231, + "num_tokens": 46004755.0, + "step": 1830 + }, + { + "epoch": 0.20107621348561389, + "grad_norm": 2.001862049102783, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7057048678398132, + "num_tokens": 46034092.0, + "step": 1831 + }, + { + "epoch": 0.20118603118822753, + "grad_norm": 2.4676356315612793, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7181251049041748, + "num_tokens": 46054929.0, + "step": 1832 + }, + { + "epoch": 0.2012958488908412, + "grad_norm": 2.2703890800476074, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6870971322059631, + "num_tokens": 46081328.0, + "step": 1833 + }, + { + "epoch": 0.20140566659345485, + "grad_norm": 2.698134422302246, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6987049579620361, + "num_tokens": 46100315.0, + "step": 1834 + }, + { + "epoch": 0.20151548429606853, + "grad_norm": 2.235851526260376, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7001876831054688, + "num_tokens": 
46129280.0, + "step": 1835 + }, + { + "epoch": 0.2016253019986822, + "grad_norm": 2.6661648750305176, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6987947821617126, + "num_tokens": 46148745.0, + "step": 1836 + }, + { + "epoch": 0.20173511970129585, + "grad_norm": 2.0958826541900635, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7021422386169434, + "num_tokens": 46176981.0, + "step": 1837 + }, + { + "epoch": 0.20184493740390952, + "grad_norm": 2.322295904159546, + "learning_rate": 1e-06, + "loss": 1.0993, + "mean_token_accuracy": 0.6908552646636963, + "num_tokens": 46202335.0, + "step": 1838 + }, + { + "epoch": 0.20195475510652316, + "grad_norm": 2.5098235607147217, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7300383448600769, + "num_tokens": 46222610.0, + "step": 1839 + }, + { + "epoch": 0.20206457280913684, + "grad_norm": 2.659735679626465, + "learning_rate": 1e-06, + "loss": 1.0548, + "mean_token_accuracy": 0.6870657801628113, + "num_tokens": 46243830.0, + "step": 1840 + }, + { + "epoch": 0.20217439051175048, + "grad_norm": 2.5007073879241943, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6944179534912109, + "num_tokens": 46264489.0, + "step": 1841 + }, + { + "epoch": 0.20228420821436416, + "grad_norm": 2.3446974754333496, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.7090363502502441, + "num_tokens": 46288038.0, + "step": 1842 + }, + { + "epoch": 0.2023940259169778, + "grad_norm": 2.340611696243286, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7064798474311829, + "num_tokens": 46311794.0, + "step": 1843 + }, + { + "epoch": 0.20250384361959148, + "grad_norm": 2.2876651287078857, + "learning_rate": 1e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.6753227710723877, + "num_tokens": 46337686.0, + "step": 1844 + }, + { + "epoch": 0.20261366132220515, + "grad_norm": 2.571359157562256, + "learning_rate": 1e-06, + 
"loss": 1.0393, + "mean_token_accuracy": 0.6860700845718384, + "num_tokens": 46357661.0, + "step": 1845 + }, + { + "epoch": 0.2027234790248188, + "grad_norm": 2.056124687194824, + "learning_rate": 1e-06, + "loss": 1.0752, + "mean_token_accuracy": 0.6735587120056152, + "num_tokens": 46387253.0, + "step": 1846 + }, + { + "epoch": 0.20283329672743247, + "grad_norm": 2.080204963684082, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7155699133872986, + "num_tokens": 46413208.0, + "step": 1847 + }, + { + "epoch": 0.20294311443004612, + "grad_norm": 2.5655901432037354, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6896013021469116, + "num_tokens": 46433998.0, + "step": 1848 + }, + { + "epoch": 0.2030529321326598, + "grad_norm": 2.437591552734375, + "learning_rate": 1e-06, + "loss": 1.1115, + "mean_token_accuracy": 0.6694599986076355, + "num_tokens": 46458434.0, + "step": 1849 + }, + { + "epoch": 0.20316274983527344, + "grad_norm": 2.493644952774048, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6920116543769836, + "num_tokens": 46479616.0, + "step": 1850 + }, + { + "epoch": 0.2032725675378871, + "grad_norm": 2.5835108757019043, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7180741429328918, + "num_tokens": 46500591.0, + "step": 1851 + }, + { + "epoch": 0.20338238524050076, + "grad_norm": 2.091111898422241, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6802190542221069, + "num_tokens": 46527327.0, + "step": 1852 + }, + { + "epoch": 0.20349220294311443, + "grad_norm": 2.155860424041748, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6976932287216187, + "num_tokens": 46555805.0, + "step": 1853 + }, + { + "epoch": 0.2036020206457281, + "grad_norm": 2.2267115116119385, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6800447702407837, + "num_tokens": 46583334.0, + "step": 1854 + }, + { + "epoch": 
0.20371183834834175, + "grad_norm": 2.2178690433502197, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6965176463127136, + "num_tokens": 46610822.0, + "step": 1855 + }, + { + "epoch": 0.20382165605095542, + "grad_norm": 2.1057024002075195, + "learning_rate": 1e-06, + "loss": 1.0972, + "mean_token_accuracy": 0.6743600964546204, + "num_tokens": 46639995.0, + "step": 1856 + }, + { + "epoch": 0.20393147375356907, + "grad_norm": 2.0366530418395996, + "learning_rate": 1e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6860584020614624, + "num_tokens": 46672123.0, + "step": 1857 + }, + { + "epoch": 0.20404129145618274, + "grad_norm": 2.4032299518585205, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6919894218444824, + "num_tokens": 46695264.0, + "step": 1858 + }, + { + "epoch": 0.2041511091587964, + "grad_norm": 2.4358341693878174, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6790710091590881, + "num_tokens": 46720053.0, + "step": 1859 + }, + { + "epoch": 0.20426092686141006, + "grad_norm": 2.42596435546875, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7141229510307312, + "num_tokens": 46742878.0, + "step": 1860 + }, + { + "epoch": 0.2043707445640237, + "grad_norm": 2.4679129123687744, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6940814852714539, + "num_tokens": 46765175.0, + "step": 1861 + }, + { + "epoch": 0.20448056226663738, + "grad_norm": 2.428067684173584, + "learning_rate": 1e-06, + "loss": 1.0783, + "mean_token_accuracy": 0.6783467531204224, + "num_tokens": 46789281.0, + "step": 1862 + }, + { + "epoch": 0.20459037996925103, + "grad_norm": 2.3196072578430176, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.707592248916626, + "num_tokens": 46813159.0, + "step": 1863 + }, + { + "epoch": 0.2047001976718647, + "grad_norm": 2.112842082977295, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 
0.6978447437286377, + "num_tokens": 46843709.0, + "step": 1864 + }, + { + "epoch": 0.20481001537447838, + "grad_norm": 2.0483360290527344, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7105921506881714, + "num_tokens": 46871793.0, + "step": 1865 + }, + { + "epoch": 0.20491983307709202, + "grad_norm": 2.3707213401794434, + "learning_rate": 1e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.673504114151001, + "num_tokens": 46895075.0, + "step": 1866 + }, + { + "epoch": 0.2050296507797057, + "grad_norm": 3.0081706047058105, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.693781316280365, + "num_tokens": 46910976.0, + "step": 1867 + }, + { + "epoch": 0.20513946848231934, + "grad_norm": 2.2658190727233887, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.698602557182312, + "num_tokens": 46937659.0, + "step": 1868 + }, + { + "epoch": 0.20524928618493302, + "grad_norm": 2.240849733352661, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6997722387313843, + "num_tokens": 46961637.0, + "step": 1869 + }, + { + "epoch": 0.20535910388754666, + "grad_norm": 2.2570362091064453, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6868165135383606, + "num_tokens": 46990407.0, + "step": 1870 + }, + { + "epoch": 0.20546892159016034, + "grad_norm": 2.3342418670654297, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6958929300308228, + "num_tokens": 47016771.0, + "step": 1871 + }, + { + "epoch": 0.20557873929277398, + "grad_norm": 2.386927604675293, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6869351863861084, + "num_tokens": 47040662.0, + "step": 1872 + }, + { + "epoch": 0.20568855699538766, + "grad_norm": 2.1522040367126465, + "learning_rate": 1e-06, + "loss": 1.0815, + "mean_token_accuracy": 0.6746194958686829, + "num_tokens": 47071342.0, + "step": 1873 + }, + { + "epoch": 0.20579837469800133, + "grad_norm": 
2.0364904403686523, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6974906921386719, + "num_tokens": 47103896.0, + "step": 1874 + }, + { + "epoch": 0.20590819240061498, + "grad_norm": 2.2008109092712402, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7104274034500122, + "num_tokens": 47131797.0, + "step": 1875 + }, + { + "epoch": 0.20601801010322865, + "grad_norm": 2.3413286209106445, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6860202550888062, + "num_tokens": 47156371.0, + "step": 1876 + }, + { + "epoch": 0.2061278278058423, + "grad_norm": 2.5509555339813232, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7161629796028137, + "num_tokens": 47176547.0, + "step": 1877 + }, + { + "epoch": 0.20623764550845597, + "grad_norm": 2.151960849761963, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6893997192382812, + "num_tokens": 47203927.0, + "step": 1878 + }, + { + "epoch": 0.20634746321106961, + "grad_norm": 2.6630048751831055, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6836710572242737, + "num_tokens": 47227072.0, + "step": 1879 + }, + { + "epoch": 0.2064572809136833, + "grad_norm": 2.256741523742676, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7014416456222534, + "num_tokens": 47251461.0, + "step": 1880 + }, + { + "epoch": 0.20656709861629693, + "grad_norm": 2.1561384201049805, + "learning_rate": 1e-06, + "loss": 1.101, + "mean_token_accuracy": 0.6706845164299011, + "num_tokens": 47280295.0, + "step": 1881 + }, + { + "epoch": 0.2066769163189106, + "grad_norm": 2.3120219707489014, + "learning_rate": 1e-06, + "loss": 1.0523, + "mean_token_accuracy": 0.6865193843841553, + "num_tokens": 47305790.0, + "step": 1882 + }, + { + "epoch": 0.20678673402152428, + "grad_norm": 2.2585103511810303, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6972310543060303, + "num_tokens": 47333358.0, 
+ "step": 1883 + }, + { + "epoch": 0.20689655172413793, + "grad_norm": 2.4562292098999023, + "learning_rate": 1e-06, + "loss": 1.085, + "mean_token_accuracy": 0.6799046993255615, + "num_tokens": 47354872.0, + "step": 1884 + }, + { + "epoch": 0.2070063694267516, + "grad_norm": 2.215660333633423, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7215579748153687, + "num_tokens": 47380541.0, + "step": 1885 + }, + { + "epoch": 0.20711618712936525, + "grad_norm": 2.15390682220459, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6884719133377075, + "num_tokens": 47409296.0, + "step": 1886 + }, + { + "epoch": 0.20722600483197892, + "grad_norm": 2.388145685195923, + "learning_rate": 1e-06, + "loss": 1.0644, + "mean_token_accuracy": 0.6826938986778259, + "num_tokens": 47431185.0, + "step": 1887 + }, + { + "epoch": 0.20733582253459257, + "grad_norm": 2.2743256092071533, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6861929893493652, + "num_tokens": 47457374.0, + "step": 1888 + }, + { + "epoch": 0.20744564023720624, + "grad_norm": 2.668592691421509, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.699317216873169, + "num_tokens": 47478692.0, + "step": 1889 + }, + { + "epoch": 0.2075554579398199, + "grad_norm": 2.1853771209716797, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6897099614143372, + "num_tokens": 47506577.0, + "step": 1890 + }, + { + "epoch": 0.20766527564243356, + "grad_norm": 2.1789586544036865, + "learning_rate": 1e-06, + "loss": 1.1486, + "mean_token_accuracy": 0.6612690687179565, + "num_tokens": 47538487.0, + "step": 1891 + }, + { + "epoch": 0.20777509334504723, + "grad_norm": 2.512906789779663, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7204810380935669, + "num_tokens": 47559382.0, + "step": 1892 + }, + { + "epoch": 0.20788491104766088, + "grad_norm": 2.280902147293091, + "learning_rate": 1e-06, + "loss": 0.9963, + 
"mean_token_accuracy": 0.6921294331550598, + "num_tokens": 47585753.0, + "step": 1893 + }, + { + "epoch": 0.20799472875027455, + "grad_norm": 2.2774457931518555, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6730077862739563, + "num_tokens": 47610986.0, + "step": 1894 + }, + { + "epoch": 0.2081045464528882, + "grad_norm": 2.3070530891418457, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6776089072227478, + "num_tokens": 47635828.0, + "step": 1895 + }, + { + "epoch": 0.20821436415550187, + "grad_norm": 2.384267568588257, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6849778294563293, + "num_tokens": 47659859.0, + "step": 1896 + }, + { + "epoch": 0.20832418185811552, + "grad_norm": 2.7698636054992676, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7242801785469055, + "num_tokens": 47677622.0, + "step": 1897 + }, + { + "epoch": 0.2084339995607292, + "grad_norm": 2.783299207687378, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6841159462928772, + "num_tokens": 47696909.0, + "step": 1898 + }, + { + "epoch": 0.20854381726334284, + "grad_norm": 2.510348081588745, + "learning_rate": 1e-06, + "loss": 1.0786, + "mean_token_accuracy": 0.6820162534713745, + "num_tokens": 47720156.0, + "step": 1899 + }, + { + "epoch": 0.2086536349659565, + "grad_norm": 2.5114519596099854, + "learning_rate": 1e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.6847738027572632, + "num_tokens": 47742537.0, + "step": 1900 + }, + { + "epoch": 0.20876345266857016, + "grad_norm": 1.9844413995742798, + "learning_rate": 1e-06, + "loss": 1.0552, + "mean_token_accuracy": 0.6844044923782349, + "num_tokens": 47773295.0, + "step": 1901 + }, + { + "epoch": 0.20887327037118383, + "grad_norm": 2.0290520191192627, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7094122171401978, + "num_tokens": 47802541.0, + "step": 1902 + }, + { + "epoch": 0.2089830880737975, + 
"grad_norm": 2.4843649864196777, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6932482719421387, + "num_tokens": 47822811.0, + "step": 1903 + }, + { + "epoch": 0.20909290577641115, + "grad_norm": 2.099483013153076, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6951972246170044, + "num_tokens": 47849681.0, + "step": 1904 + }, + { + "epoch": 0.20920272347902483, + "grad_norm": 2.325974941253662, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7221724987030029, + "num_tokens": 47872590.0, + "step": 1905 + }, + { + "epoch": 0.20931254118163847, + "grad_norm": 2.3712215423583984, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6989796161651611, + "num_tokens": 47894801.0, + "step": 1906 + }, + { + "epoch": 0.20942235888425215, + "grad_norm": 2.422877550125122, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6961145997047424, + "num_tokens": 47916879.0, + "step": 1907 + }, + { + "epoch": 0.2095321765868658, + "grad_norm": 2.4787867069244385, + "learning_rate": 1e-06, + "loss": 1.0877, + "mean_token_accuracy": 0.6821272969245911, + "num_tokens": 47938628.0, + "step": 1908 + }, + { + "epoch": 0.20964199428947947, + "grad_norm": 2.2570269107818604, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.690015971660614, + "num_tokens": 47963150.0, + "step": 1909 + }, + { + "epoch": 0.2097518119920931, + "grad_norm": 2.421330213546753, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.689788281917572, + "num_tokens": 47988215.0, + "step": 1910 + }, + { + "epoch": 0.20986162969470679, + "grad_norm": 2.0169174671173096, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7194011211395264, + "num_tokens": 48016731.0, + "step": 1911 + }, + { + "epoch": 0.20997144739732046, + "grad_norm": 2.5580782890319824, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7276474833488464, + "num_tokens": 
48034663.0, + "step": 1912 + }, + { + "epoch": 0.2100812650999341, + "grad_norm": 2.518181562423706, + "learning_rate": 1e-06, + "loss": 1.0651, + "mean_token_accuracy": 0.6828634142875671, + "num_tokens": 48057109.0, + "step": 1913 + }, + { + "epoch": 0.21019108280254778, + "grad_norm": 2.1419172286987305, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6975865364074707, + "num_tokens": 48085996.0, + "step": 1914 + }, + { + "epoch": 0.21030090050516143, + "grad_norm": 2.3850746154785156, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6885272860527039, + "num_tokens": 48109488.0, + "step": 1915 + }, + { + "epoch": 0.2104107182077751, + "grad_norm": 2.669363498687744, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7303900122642517, + "num_tokens": 48127391.0, + "step": 1916 + }, + { + "epoch": 0.21052053591038875, + "grad_norm": 2.2896366119384766, + "learning_rate": 1e-06, + "loss": 1.0793, + "mean_token_accuracy": 0.6757088899612427, + "num_tokens": 48151969.0, + "step": 1917 + }, + { + "epoch": 0.21063035361300242, + "grad_norm": 2.5276832580566406, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6974761486053467, + "num_tokens": 48173508.0, + "step": 1918 + }, + { + "epoch": 0.21074017131561606, + "grad_norm": 2.571223735809326, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6869673728942871, + "num_tokens": 48194140.0, + "step": 1919 + }, + { + "epoch": 0.21084998901822974, + "grad_norm": 2.0662550926208496, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6892933249473572, + "num_tokens": 48225305.0, + "step": 1920 + }, + { + "epoch": 0.2109598067208434, + "grad_norm": 2.031815528869629, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.691376805305481, + "num_tokens": 48252935.0, + "step": 1921 + }, + { + "epoch": 0.21106962442345706, + "grad_norm": 2.2876901626586914, + "learning_rate": 1e-06, + 
"loss": 1.0154, + "mean_token_accuracy": 0.6922062039375305, + "num_tokens": 48279446.0, + "step": 1922 + }, + { + "epoch": 0.21117944212607073, + "grad_norm": 2.121882915496826, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7160550355911255, + "num_tokens": 48306749.0, + "step": 1923 + }, + { + "epoch": 0.21128925982868438, + "grad_norm": 2.352696657180786, + "learning_rate": 1e-06, + "loss": 1.0313, + "mean_token_accuracy": 0.6906137466430664, + "num_tokens": 48330318.0, + "step": 1924 + }, + { + "epoch": 0.21139907753129805, + "grad_norm": 2.282437562942505, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7058871984481812, + "num_tokens": 48356483.0, + "step": 1925 + }, + { + "epoch": 0.2115088952339117, + "grad_norm": 2.464564085006714, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7075897455215454, + "num_tokens": 48378907.0, + "step": 1926 + }, + { + "epoch": 0.21161871293652537, + "grad_norm": 2.04365873336792, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6811234951019287, + "num_tokens": 48409878.0, + "step": 1927 + }, + { + "epoch": 0.21172853063913902, + "grad_norm": 2.2400519847869873, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6885976195335388, + "num_tokens": 48435526.0, + "step": 1928 + }, + { + "epoch": 0.2118383483417527, + "grad_norm": 2.3840808868408203, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6804670095443726, + "num_tokens": 48459616.0, + "step": 1929 + }, + { + "epoch": 0.21194816604436637, + "grad_norm": 2.3022072315216064, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7082474827766418, + "num_tokens": 48484182.0, + "step": 1930 + }, + { + "epoch": 0.21205798374698, + "grad_norm": 2.4096148014068604, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.733256995677948, + "num_tokens": 48504913.0, + "step": 1931 + }, + { + "epoch": 
0.21216780144959368, + "grad_norm": 2.347572088241577, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6966167688369751, + "num_tokens": 48530244.0, + "step": 1932 + }, + { + "epoch": 0.21227761915220733, + "grad_norm": 2.359590768814087, + "learning_rate": 1e-06, + "loss": 1.097, + "mean_token_accuracy": 0.6761556267738342, + "num_tokens": 48554738.0, + "step": 1933 + }, + { + "epoch": 0.212387436854821, + "grad_norm": 2.0199451446533203, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6900782585144043, + "num_tokens": 48586417.0, + "step": 1934 + }, + { + "epoch": 0.21249725455743465, + "grad_norm": 2.473973274230957, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7322555780410767, + "num_tokens": 48605664.0, + "step": 1935 + }, + { + "epoch": 0.21260707226004832, + "grad_norm": 2.456740140914917, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6990986466407776, + "num_tokens": 48627311.0, + "step": 1936 + }, + { + "epoch": 0.21271688996266197, + "grad_norm": 2.046825647354126, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6954162120819092, + "num_tokens": 48658774.0, + "step": 1937 + }, + { + "epoch": 0.21282670766527564, + "grad_norm": 2.1758625507354736, + "learning_rate": 1e-06, + "loss": 1.1266, + "mean_token_accuracy": 0.6681012511253357, + "num_tokens": 48687711.0, + "step": 1938 + }, + { + "epoch": 0.2129365253678893, + "grad_norm": 1.9006001949310303, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6819002628326416, + "num_tokens": 48724134.0, + "step": 1939 + }, + { + "epoch": 0.21304634307050296, + "grad_norm": 2.5216639041900635, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6847694516181946, + "num_tokens": 48745174.0, + "step": 1940 + }, + { + "epoch": 0.21315616077311664, + "grad_norm": 2.1509523391723633, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 
0.6924325227737427, + "num_tokens": 48774067.0, + "step": 1941 + }, + { + "epoch": 0.21326597847573028, + "grad_norm": 2.1004252433776855, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6895812153816223, + "num_tokens": 48805465.0, + "step": 1942 + }, + { + "epoch": 0.21337579617834396, + "grad_norm": 2.3333470821380615, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6988362669944763, + "num_tokens": 48829739.0, + "step": 1943 + }, + { + "epoch": 0.2134856138809576, + "grad_norm": 2.180206060409546, + "learning_rate": 1e-06, + "loss": 1.1909, + "mean_token_accuracy": 0.6518286466598511, + "num_tokens": 48857851.0, + "step": 1944 + }, + { + "epoch": 0.21359543158357128, + "grad_norm": 2.1966552734375, + "learning_rate": 1e-06, + "loss": 1.0845, + "mean_token_accuracy": 0.6788771748542786, + "num_tokens": 48888028.0, + "step": 1945 + }, + { + "epoch": 0.21370524928618492, + "grad_norm": 2.537302017211914, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.708235502243042, + "num_tokens": 48909875.0, + "step": 1946 + }, + { + "epoch": 0.2138150669887986, + "grad_norm": 2.1065728664398193, + "learning_rate": 1e-06, + "loss": 1.1385, + "mean_token_accuracy": 0.6606022715568542, + "num_tokens": 48942112.0, + "step": 1947 + }, + { + "epoch": 0.21392488469141224, + "grad_norm": 2.0072474479675293, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7152011394500732, + "num_tokens": 48971293.0, + "step": 1948 + }, + { + "epoch": 0.21403470239402592, + "grad_norm": 2.36076283454895, + "learning_rate": 1e-06, + "loss": 1.1188, + "mean_token_accuracy": 0.6657110452651978, + "num_tokens": 48994326.0, + "step": 1949 + }, + { + "epoch": 0.2141445200966396, + "grad_norm": 2.1280040740966797, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.701553225517273, + "num_tokens": 49021075.0, + "step": 1950 + }, + { + "epoch": 0.21425433779925324, + "grad_norm": 2.2186059951782227, 
+ "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6894683241844177, + "num_tokens": 49047609.0, + "step": 1951 + }, + { + "epoch": 0.2143641555018669, + "grad_norm": 2.385849714279175, + "learning_rate": 1e-06, + "loss": 1.0817, + "mean_token_accuracy": 0.6754437685012817, + "num_tokens": 49071854.0, + "step": 1952 + }, + { + "epoch": 0.21447397320448056, + "grad_norm": 2.2824137210845947, + "learning_rate": 1e-06, + "loss": 1.0633, + "mean_token_accuracy": 0.6902182698249817, + "num_tokens": 49096783.0, + "step": 1953 + }, + { + "epoch": 0.21458379090709423, + "grad_norm": 2.621089458465576, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7006957530975342, + "num_tokens": 49115194.0, + "step": 1954 + }, + { + "epoch": 0.21469360860970788, + "grad_norm": 2.6638753414154053, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.713215708732605, + "num_tokens": 49134732.0, + "step": 1955 + }, + { + "epoch": 0.21480342631232155, + "grad_norm": 2.256880760192871, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7210825085639954, + "num_tokens": 49160067.0, + "step": 1956 + }, + { + "epoch": 0.2149132440149352, + "grad_norm": 2.133500099182129, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.6774581670761108, + "num_tokens": 49188816.0, + "step": 1957 + }, + { + "epoch": 0.21502306171754887, + "grad_norm": 2.3635470867156982, + "learning_rate": 1e-06, + "loss": 1.1203, + "mean_token_accuracy": 0.6660624146461487, + "num_tokens": 49212929.0, + "step": 1958 + }, + { + "epoch": 0.21513287942016254, + "grad_norm": 2.5505287647247314, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.7021253108978271, + "num_tokens": 49234103.0, + "step": 1959 + }, + { + "epoch": 0.2152426971227762, + "grad_norm": 2.194465160369873, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6806060671806335, + "num_tokens": 49260784.0, + "step": 1960 + }, + 
{ + "epoch": 0.21535251482538986, + "grad_norm": 2.222200870513916, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6743382215499878, + "num_tokens": 49288468.0, + "step": 1961 + }, + { + "epoch": 0.2154623325280035, + "grad_norm": 2.1101577281951904, + "learning_rate": 1e-06, + "loss": 1.0763, + "mean_token_accuracy": 0.6782536506652832, + "num_tokens": 49317330.0, + "step": 1962 + }, + { + "epoch": 0.21557215023061718, + "grad_norm": 2.520779609680176, + "learning_rate": 1e-06, + "loss": 1.0593, + "mean_token_accuracy": 0.6832654476165771, + "num_tokens": 49340324.0, + "step": 1963 + }, + { + "epoch": 0.21568196793323083, + "grad_norm": 2.0283119678497314, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7070800065994263, + "num_tokens": 49367703.0, + "step": 1964 + }, + { + "epoch": 0.2157917856358445, + "grad_norm": 2.1681599617004395, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6949841380119324, + "num_tokens": 49393781.0, + "step": 1965 + }, + { + "epoch": 0.21590160333845815, + "grad_norm": 2.1561155319213867, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6943297386169434, + "num_tokens": 49420343.0, + "step": 1966 + }, + { + "epoch": 0.21601142104107182, + "grad_norm": 2.3069398403167725, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7030014991760254, + "num_tokens": 49443609.0, + "step": 1967 + }, + { + "epoch": 0.2161212387436855, + "grad_norm": 2.4856009483337402, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 0.7046696543693542, + "num_tokens": 49464852.0, + "step": 1968 + }, + { + "epoch": 0.21623105644629914, + "grad_norm": 2.235412359237671, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6975530385971069, + "num_tokens": 49491303.0, + "step": 1969 + }, + { + "epoch": 0.21634087414891282, + "grad_norm": 2.5921857357025146, + "learning_rate": 1e-06, + "loss": 0.9657, + 
"mean_token_accuracy": 0.7065624594688416, + "num_tokens": 49510537.0, + "step": 1970 + }, + { + "epoch": 0.21645069185152646, + "grad_norm": 2.41689395904541, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6943636536598206, + "num_tokens": 49533091.0, + "step": 1971 + }, + { + "epoch": 0.21656050955414013, + "grad_norm": 2.207854747772217, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6892036199569702, + "num_tokens": 49558810.0, + "step": 1972 + }, + { + "epoch": 0.21667032725675378, + "grad_norm": 2.5270159244537354, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6992186307907104, + "num_tokens": 49579813.0, + "step": 1973 + }, + { + "epoch": 0.21678014495936745, + "grad_norm": 2.1962473392486572, + "learning_rate": 1e-06, + "loss": 1.069, + "mean_token_accuracy": 0.6742621660232544, + "num_tokens": 49606547.0, + "step": 1974 + }, + { + "epoch": 0.2168899626619811, + "grad_norm": 2.202651023864746, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.688467264175415, + "num_tokens": 49631944.0, + "step": 1975 + }, + { + "epoch": 0.21699978036459477, + "grad_norm": 2.0825035572052, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7146783471107483, + "num_tokens": 49657470.0, + "step": 1976 + }, + { + "epoch": 0.21710959806720842, + "grad_norm": 1.959723949432373, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6830534338951111, + "num_tokens": 49691918.0, + "step": 1977 + }, + { + "epoch": 0.2172194157698221, + "grad_norm": 2.080482006072998, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6866227984428406, + "num_tokens": 49719911.0, + "step": 1978 + }, + { + "epoch": 0.21732923347243577, + "grad_norm": 2.3248379230499268, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6759632229804993, + "num_tokens": 49747900.0, + "step": 1979 + }, + { + "epoch": 0.2174390511750494, + "grad_norm": 
2.2694132328033447, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6992307305335999, + "num_tokens": 49772081.0, + "step": 1980 + }, + { + "epoch": 0.2175488688776631, + "grad_norm": 2.375373363494873, + "learning_rate": 1e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.6849392652511597, + "num_tokens": 49796363.0, + "step": 1981 + }, + { + "epoch": 0.21765868658027673, + "grad_norm": 2.2872190475463867, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.673598051071167, + "num_tokens": 49822292.0, + "step": 1982 + }, + { + "epoch": 0.2177685042828904, + "grad_norm": 2.0861804485321045, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7117042541503906, + "num_tokens": 49853865.0, + "step": 1983 + }, + { + "epoch": 0.21787832198550405, + "grad_norm": 2.513725757598877, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6967766284942627, + "num_tokens": 49876233.0, + "step": 1984 + }, + { + "epoch": 0.21798813968811773, + "grad_norm": 1.9297674894332886, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6897668838500977, + "num_tokens": 49909505.0, + "step": 1985 + }, + { + "epoch": 0.21809795739073137, + "grad_norm": 2.708906650543213, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7138892412185669, + "num_tokens": 49926735.0, + "step": 1986 + }, + { + "epoch": 0.21820777509334505, + "grad_norm": 2.3259997367858887, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6807504892349243, + "num_tokens": 49951549.0, + "step": 1987 + }, + { + "epoch": 0.21831759279595872, + "grad_norm": 2.5232861042022705, + "learning_rate": 1e-06, + "loss": 1.0305, + "mean_token_accuracy": 0.6929011344909668, + "num_tokens": 49974920.0, + "step": 1988 + }, + { + "epoch": 0.21842741049857237, + "grad_norm": 2.3558337688446045, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6924656629562378, + "num_tokens": 
50001343.0, + "step": 1989 + }, + { + "epoch": 0.21853722820118604, + "grad_norm": 2.7839601039886475, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7059413194656372, + "num_tokens": 50020555.0, + "step": 1990 + }, + { + "epoch": 0.21864704590379969, + "grad_norm": 2.008159637451172, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.694001317024231, + "num_tokens": 50051879.0, + "step": 1991 + }, + { + "epoch": 0.21875686360641336, + "grad_norm": 2.524523973464966, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6832473278045654, + "num_tokens": 50074969.0, + "step": 1992 + }, + { + "epoch": 0.218866681309027, + "grad_norm": 2.7744171619415283, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7143646478652954, + "num_tokens": 50092444.0, + "step": 1993 + }, + { + "epoch": 0.21897649901164068, + "grad_norm": 2.2113070487976074, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7076555490493774, + "num_tokens": 50117586.0, + "step": 1994 + }, + { + "epoch": 0.21908631671425433, + "grad_norm": 1.995128870010376, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6993818879127502, + "num_tokens": 50148871.0, + "step": 1995 + }, + { + "epoch": 0.219196134416868, + "grad_norm": 2.4430463314056396, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6793913245201111, + "num_tokens": 50172541.0, + "step": 1996 + }, + { + "epoch": 0.21930595211948167, + "grad_norm": 2.247879981994629, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6889113783836365, + "num_tokens": 50198760.0, + "step": 1997 + }, + { + "epoch": 0.21941576982209532, + "grad_norm": 2.262946367263794, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7172313928604126, + "num_tokens": 50224586.0, + "step": 1998 + }, + { + "epoch": 0.219525587524709, + "grad_norm": 2.501370906829834, + "learning_rate": 1e-06, + "loss": 
0.9658, + "mean_token_accuracy": 0.7050769925117493, + "num_tokens": 50246507.0, + "step": 1999 + }, + { + "epoch": 0.21963540522732264, + "grad_norm": 1.927942156791687, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7280640602111816, + "num_tokens": 50277399.0, + "step": 2000 + }, + { + "epoch": 0.2197452229299363, + "grad_norm": 2.499566078186035, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6887005567550659, + "num_tokens": 50299820.0, + "step": 2001 + }, + { + "epoch": 0.21985504063254996, + "grad_norm": 2.5972275733947754, + "learning_rate": 1e-06, + "loss": 1.1341, + "mean_token_accuracy": 0.691347599029541, + "num_tokens": 50320730.0, + "step": 2002 + }, + { + "epoch": 0.21996485833516363, + "grad_norm": 2.255338191986084, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7137113809585571, + "num_tokens": 50347110.0, + "step": 2003 + }, + { + "epoch": 0.22007467603777728, + "grad_norm": 2.3431992530822754, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7078394889831543, + "num_tokens": 50370276.0, + "step": 2004 + }, + { + "epoch": 0.22018449374039095, + "grad_norm": 2.1768951416015625, + "learning_rate": 1e-06, + "loss": 1.1289, + "mean_token_accuracy": 0.6708897352218628, + "num_tokens": 50400467.0, + "step": 2005 + }, + { + "epoch": 0.22029431144300463, + "grad_norm": 2.635629415512085, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7161219120025635, + "num_tokens": 50421501.0, + "step": 2006 + }, + { + "epoch": 0.22040412914561827, + "grad_norm": 2.596776247024536, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7040906548500061, + "num_tokens": 50441501.0, + "step": 2007 + }, + { + "epoch": 0.22051394684823195, + "grad_norm": 2.3556652069091797, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7007156014442444, + "num_tokens": 50463976.0, + "step": 2008 + }, + { + "epoch": 0.2206237645508456, 
+ "grad_norm": 2.3015899658203125, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7145158052444458, + "num_tokens": 50487697.0, + "step": 2009 + }, + { + "epoch": 0.22073358225345927, + "grad_norm": 2.5582268238067627, + "learning_rate": 1e-06, + "loss": 1.0798, + "mean_token_accuracy": 0.6823606491088867, + "num_tokens": 50508832.0, + "step": 2010 + }, + { + "epoch": 0.2208433999560729, + "grad_norm": 2.3879005908966064, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6751831769943237, + "num_tokens": 50533823.0, + "step": 2011 + }, + { + "epoch": 0.22095321765868658, + "grad_norm": 2.1963977813720703, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.71075040102005, + "num_tokens": 50560607.0, + "step": 2012 + }, + { + "epoch": 0.22106303536130023, + "grad_norm": 2.4931037425994873, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6968904137611389, + "num_tokens": 50583326.0, + "step": 2013 + }, + { + "epoch": 0.2211728530639139, + "grad_norm": 1.992456316947937, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.702613353729248, + "num_tokens": 50613168.0, + "step": 2014 + }, + { + "epoch": 0.22128267076652755, + "grad_norm": 2.0310239791870117, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6990818977355957, + "num_tokens": 50645279.0, + "step": 2015 + }, + { + "epoch": 0.22139248846914122, + "grad_norm": 2.1929731369018555, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6900599598884583, + "num_tokens": 50674466.0, + "step": 2016 + }, + { + "epoch": 0.2215023061717549, + "grad_norm": 2.306018352508545, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7145761251449585, + "num_tokens": 50699787.0, + "step": 2017 + }, + { + "epoch": 0.22161212387436854, + "grad_norm": 2.530759811401367, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6879627704620361, + 
"num_tokens": 50722658.0, + "step": 2018 + }, + { + "epoch": 0.22172194157698222, + "grad_norm": 2.4438068866729736, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.7041242122650146, + "num_tokens": 50744971.0, + "step": 2019 + }, + { + "epoch": 0.22183175927959586, + "grad_norm": 2.1867706775665283, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6959993839263916, + "num_tokens": 50769276.0, + "step": 2020 + }, + { + "epoch": 0.22194157698220954, + "grad_norm": 2.3820247650146484, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6900508403778076, + "num_tokens": 50792536.0, + "step": 2021 + }, + { + "epoch": 0.22205139468482318, + "grad_norm": 2.5039637088775635, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.700673520565033, + "num_tokens": 50814758.0, + "step": 2022 + }, + { + "epoch": 0.22216121238743686, + "grad_norm": 2.319925546646118, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7170048356056213, + "num_tokens": 50839764.0, + "step": 2023 + }, + { + "epoch": 0.2222710300900505, + "grad_norm": 2.5037126541137695, + "learning_rate": 1e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.6839791536331177, + "num_tokens": 50861821.0, + "step": 2024 + }, + { + "epoch": 0.22238084779266418, + "grad_norm": 2.439453601837158, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6915714740753174, + "num_tokens": 50886573.0, + "step": 2025 + }, + { + "epoch": 0.22249066549527785, + "grad_norm": 2.353386878967285, + "learning_rate": 1e-06, + "loss": 1.1218, + "mean_token_accuracy": 0.6752877235412598, + "num_tokens": 50910975.0, + "step": 2026 + }, + { + "epoch": 0.2226004831978915, + "grad_norm": 2.411543846130371, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7023316025733948, + "num_tokens": 50934002.0, + "step": 2027 + }, + { + "epoch": 0.22271030090050517, + "grad_norm": 2.342045545578003, + "learning_rate": 
1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6902337670326233, + "num_tokens": 50958944.0, + "step": 2028 + }, + { + "epoch": 0.22282011860311882, + "grad_norm": 2.484362840652466, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7089871168136597, + "num_tokens": 50981114.0, + "step": 2029 + }, + { + "epoch": 0.2229299363057325, + "grad_norm": 2.575772762298584, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7202367186546326, + "num_tokens": 51000386.0, + "step": 2030 + }, + { + "epoch": 0.22303975400834614, + "grad_norm": 2.2417478561401367, + "learning_rate": 1e-06, + "loss": 1.2053, + "mean_token_accuracy": 0.6511048674583435, + "num_tokens": 51029507.0, + "step": 2031 + }, + { + "epoch": 0.2231495717109598, + "grad_norm": 2.190232992172241, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6715776324272156, + "num_tokens": 51058858.0, + "step": 2032 + }, + { + "epoch": 0.22325938941357346, + "grad_norm": 2.4418067932128906, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6895276308059692, + "num_tokens": 51081439.0, + "step": 2033 + }, + { + "epoch": 0.22336920711618713, + "grad_norm": 2.196627140045166, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.696772038936615, + "num_tokens": 51109921.0, + "step": 2034 + }, + { + "epoch": 0.2234790248188008, + "grad_norm": 2.03651762008667, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7075334787368774, + "num_tokens": 51141126.0, + "step": 2035 + }, + { + "epoch": 0.22358884252141445, + "grad_norm": 2.1215221881866455, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7053980827331543, + "num_tokens": 51168640.0, + "step": 2036 + }, + { + "epoch": 0.22369866022402812, + "grad_norm": 2.1349198818206787, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7048134803771973, + "num_tokens": 51195909.0, + "step": 2037 + }, + { + "epoch": 
0.22380847792664177, + "grad_norm": 2.1291205883026123, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6945759654045105, + "num_tokens": 51224087.0, + "step": 2038 + }, + { + "epoch": 0.22391829562925544, + "grad_norm": 2.032676935195923, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7151415944099426, + "num_tokens": 51252165.0, + "step": 2039 + }, + { + "epoch": 0.2240281133318691, + "grad_norm": 2.3034303188323975, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7208583354949951, + "num_tokens": 51276735.0, + "step": 2040 + }, + { + "epoch": 0.22413793103448276, + "grad_norm": 2.3767001628875732, + "learning_rate": 1e-06, + "loss": 1.0872, + "mean_token_accuracy": 0.6731855273246765, + "num_tokens": 51299015.0, + "step": 2041 + }, + { + "epoch": 0.2242477487370964, + "grad_norm": 2.094759702682495, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6984843015670776, + "num_tokens": 51327078.0, + "step": 2042 + }, + { + "epoch": 0.22435756643971008, + "grad_norm": 2.5644986629486084, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7326312065124512, + "num_tokens": 51348484.0, + "step": 2043 + }, + { + "epoch": 0.22446738414232376, + "grad_norm": 2.4807863235473633, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7098376750946045, + "num_tokens": 51371046.0, + "step": 2044 + }, + { + "epoch": 0.2245772018449374, + "grad_norm": 2.181105613708496, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.705081045627594, + "num_tokens": 51396996.0, + "step": 2045 + }, + { + "epoch": 0.22468701954755108, + "grad_norm": 2.459658145904541, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6983360052108765, + "num_tokens": 51419132.0, + "step": 2046 + }, + { + "epoch": 0.22479683725016472, + "grad_norm": 2.2591404914855957, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 
0.7098692059516907, + "num_tokens": 51442890.0, + "step": 2047 + }, + { + "epoch": 0.2249066549527784, + "grad_norm": 2.358250856399536, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7077467441558838, + "num_tokens": 51466085.0, + "step": 2048 + }, + { + "epoch": 0.22501647265539204, + "grad_norm": 2.8452773094177246, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7138423323631287, + "num_tokens": 51483478.0, + "step": 2049 + }, + { + "epoch": 0.22512629035800572, + "grad_norm": 2.0910234451293945, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6772505640983582, + "num_tokens": 51514735.0, + "step": 2050 + }, + { + "epoch": 0.22523610806061936, + "grad_norm": 2.431511163711548, + "learning_rate": 1e-06, + "loss": 1.0485, + "mean_token_accuracy": 0.6866263151168823, + "num_tokens": 51536048.0, + "step": 2051 + }, + { + "epoch": 0.22534592576323303, + "grad_norm": 2.0695817470550537, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6848939061164856, + "num_tokens": 51564173.0, + "step": 2052 + }, + { + "epoch": 0.22545574346584668, + "grad_norm": 2.571281909942627, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6836624145507812, + "num_tokens": 51584842.0, + "step": 2053 + }, + { + "epoch": 0.22556556116846035, + "grad_norm": 2.6255130767822266, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7028502225875854, + "num_tokens": 51604995.0, + "step": 2054 + }, + { + "epoch": 0.22567537887107403, + "grad_norm": 1.9691932201385498, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7246835231781006, + "num_tokens": 51634151.0, + "step": 2055 + }, + { + "epoch": 0.22578519657368767, + "grad_norm": 2.245534896850586, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6783108115196228, + "num_tokens": 51660176.0, + "step": 2056 + }, + { + "epoch": 0.22589501427630135, + "grad_norm": 
2.3857810497283936, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6990474462509155, + "num_tokens": 51681578.0, + "step": 2057 + }, + { + "epoch": 0.226004831978915, + "grad_norm": 2.5221610069274902, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6961569786071777, + "num_tokens": 51703640.0, + "step": 2058 + }, + { + "epoch": 0.22611464968152867, + "grad_norm": 2.0782594680786133, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6920117735862732, + "num_tokens": 51733553.0, + "step": 2059 + }, + { + "epoch": 0.2262244673841423, + "grad_norm": 2.7161641120910645, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6880964636802673, + "num_tokens": 51754711.0, + "step": 2060 + }, + { + "epoch": 0.226334285086756, + "grad_norm": 2.5115954875946045, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.7107924818992615, + "num_tokens": 51775749.0, + "step": 2061 + }, + { + "epoch": 0.22644410278936963, + "grad_norm": 2.203446626663208, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6870777606964111, + "num_tokens": 51804336.0, + "step": 2062 + }, + { + "epoch": 0.2265539204919833, + "grad_norm": 2.2790145874023438, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6935772895812988, + "num_tokens": 51829235.0, + "step": 2063 + }, + { + "epoch": 0.22666373819459698, + "grad_norm": 2.1397783756256104, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6878979206085205, + "num_tokens": 51857420.0, + "step": 2064 + }, + { + "epoch": 0.22677355589721063, + "grad_norm": 2.3287529945373535, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6882129907608032, + "num_tokens": 51882493.0, + "step": 2065 + }, + { + "epoch": 0.2268833735998243, + "grad_norm": 2.183983325958252, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7102267146110535, + "num_tokens": 51909785.0, 
+ "step": 2066 + }, + { + "epoch": 0.22699319130243795, + "grad_norm": 2.654073476791382, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6798595190048218, + "num_tokens": 51933193.0, + "step": 2067 + }, + { + "epoch": 0.22710300900505162, + "grad_norm": 2.589336633682251, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6926640272140503, + "num_tokens": 51955570.0, + "step": 2068 + }, + { + "epoch": 0.22721282670766527, + "grad_norm": 2.1231191158294678, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.6857291460037231, + "num_tokens": 51983780.0, + "step": 2069 + }, + { + "epoch": 0.22732264441027894, + "grad_norm": 2.2899155616760254, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6914081573486328, + "num_tokens": 52009704.0, + "step": 2070 + }, + { + "epoch": 0.22743246211289259, + "grad_norm": 2.4772987365722656, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6982097029685974, + "num_tokens": 52028628.0, + "step": 2071 + }, + { + "epoch": 0.22754227981550626, + "grad_norm": 2.3127191066741943, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.699590802192688, + "num_tokens": 52052686.0, + "step": 2072 + }, + { + "epoch": 0.22765209751811993, + "grad_norm": 2.33449125289917, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7130611538887024, + "num_tokens": 52074581.0, + "step": 2073 + }, + { + "epoch": 0.22776191522073358, + "grad_norm": 2.18507981300354, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 0.693992018699646, + "num_tokens": 52101390.0, + "step": 2074 + }, + { + "epoch": 0.22787173292334725, + "grad_norm": 2.1329283714294434, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.713520884513855, + "num_tokens": 52131750.0, + "step": 2075 + }, + { + "epoch": 0.2279815506259609, + "grad_norm": 2.1788504123687744, + "learning_rate": 1e-06, + "loss": 0.9907, + 
"mean_token_accuracy": 0.699324369430542, + "num_tokens": 52158567.0, + "step": 2076 + }, + { + "epoch": 0.22809136832857457, + "grad_norm": 2.3711912631988525, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6900352239608765, + "num_tokens": 52184289.0, + "step": 2077 + }, + { + "epoch": 0.22820118603118822, + "grad_norm": 2.9844624996185303, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.724798858165741, + "num_tokens": 52200973.0, + "step": 2078 + }, + { + "epoch": 0.2283110037338019, + "grad_norm": 2.537076950073242, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.686352014541626, + "num_tokens": 52222173.0, + "step": 2079 + }, + { + "epoch": 0.22842082143641554, + "grad_norm": 2.4498767852783203, + "learning_rate": 1e-06, + "loss": 1.0393, + "mean_token_accuracy": 0.6864898204803467, + "num_tokens": 52244714.0, + "step": 2080 + }, + { + "epoch": 0.2285306391390292, + "grad_norm": 2.3254194259643555, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6985958814620972, + "num_tokens": 52268128.0, + "step": 2081 + }, + { + "epoch": 0.2286404568416429, + "grad_norm": 2.013786554336548, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6985588073730469, + "num_tokens": 52300376.0, + "step": 2082 + }, + { + "epoch": 0.22875027454425653, + "grad_norm": 2.4718847274780273, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6995320320129395, + "num_tokens": 52321833.0, + "step": 2083 + }, + { + "epoch": 0.2288600922468702, + "grad_norm": 2.3926994800567627, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6969904899597168, + "num_tokens": 52343634.0, + "step": 2084 + }, + { + "epoch": 0.22896990994948385, + "grad_norm": 2.4427788257598877, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7232990860939026, + "num_tokens": 52366253.0, + "step": 2085 + }, + { + "epoch": 0.22907972765209753, + 
"grad_norm": 2.396672487258911, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7052053213119507, + "num_tokens": 52387014.0, + "step": 2086 + }, + { + "epoch": 0.22918954535471117, + "grad_norm": 2.4556431770324707, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7122262716293335, + "num_tokens": 52407942.0, + "step": 2087 + }, + { + "epoch": 0.22929936305732485, + "grad_norm": 2.4376556873321533, + "learning_rate": 1e-06, + "loss": 1.0953, + "mean_token_accuracy": 0.6855592727661133, + "num_tokens": 52430814.0, + "step": 2088 + }, + { + "epoch": 0.2294091807599385, + "grad_norm": 2.570903778076172, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7040070295333862, + "num_tokens": 52453641.0, + "step": 2089 + }, + { + "epoch": 0.22951899846255217, + "grad_norm": 1.9389125108718872, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.6916778087615967, + "num_tokens": 52488161.0, + "step": 2090 + }, + { + "epoch": 0.2296288161651658, + "grad_norm": 2.6264255046844482, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.7054520845413208, + "num_tokens": 52511264.0, + "step": 2091 + }, + { + "epoch": 0.22973863386777948, + "grad_norm": 2.1839981079101562, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6783906817436218, + "num_tokens": 52539887.0, + "step": 2092 + }, + { + "epoch": 0.22984845157039316, + "grad_norm": 1.9881532192230225, + "learning_rate": 1e-06, + "loss": 1.0988, + "mean_token_accuracy": 0.675410807132721, + "num_tokens": 52571023.0, + "step": 2093 + }, + { + "epoch": 0.2299582692730068, + "grad_norm": 2.3656227588653564, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7257188558578491, + "num_tokens": 52592311.0, + "step": 2094 + }, + { + "epoch": 0.23006808697562048, + "grad_norm": 2.25533127784729, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6966568231582642, + 
"num_tokens": 52618400.0, + "step": 2095 + }, + { + "epoch": 0.23017790467823412, + "grad_norm": 2.774844169616699, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6922389268875122, + "num_tokens": 52637580.0, + "step": 2096 + }, + { + "epoch": 0.2302877223808478, + "grad_norm": 1.975168228149414, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7356710433959961, + "num_tokens": 52665661.0, + "step": 2097 + }, + { + "epoch": 0.23039754008346144, + "grad_norm": 2.82094669342041, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7180930376052856, + "num_tokens": 52683006.0, + "step": 2098 + }, + { + "epoch": 0.23050735778607512, + "grad_norm": 2.0542755126953125, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.700950562953949, + "num_tokens": 52711075.0, + "step": 2099 + }, + { + "epoch": 0.23061717548868876, + "grad_norm": 2.127192497253418, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.693577229976654, + "num_tokens": 52741499.0, + "step": 2100 + }, + { + "epoch": 0.23072699319130244, + "grad_norm": 2.4600095748901367, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6868303418159485, + "num_tokens": 52766324.0, + "step": 2101 + }, + { + "epoch": 0.2308368108939161, + "grad_norm": 2.131594657897949, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6951006650924683, + "num_tokens": 52794579.0, + "step": 2102 + }, + { + "epoch": 0.23094662859652976, + "grad_norm": 2.7084672451019287, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.692487895488739, + "num_tokens": 52815261.0, + "step": 2103 + }, + { + "epoch": 0.23105644629914343, + "grad_norm": 2.25893497467041, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6883146166801453, + "num_tokens": 52841587.0, + "step": 2104 + }, + { + "epoch": 0.23116626400175708, + "grad_norm": 2.5212087631225586, + "learning_rate": 
1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.697806715965271, + "num_tokens": 52861544.0, + "step": 2105 + }, + { + "epoch": 0.23127608170437075, + "grad_norm": 2.089907646179199, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6935766339302063, + "num_tokens": 52888634.0, + "step": 2106 + }, + { + "epoch": 0.2313858994069844, + "grad_norm": 2.246443271636963, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6912482976913452, + "num_tokens": 52916291.0, + "step": 2107 + }, + { + "epoch": 0.23149571710959807, + "grad_norm": 2.217594861984253, + "learning_rate": 1e-06, + "loss": 1.055, + "mean_token_accuracy": 0.6796955466270447, + "num_tokens": 52945050.0, + "step": 2108 + }, + { + "epoch": 0.23160553481221172, + "grad_norm": 2.4839026927948, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6871445178985596, + "num_tokens": 52965396.0, + "step": 2109 + }, + { + "epoch": 0.2317153525148254, + "grad_norm": 2.334146499633789, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7097147107124329, + "num_tokens": 52987751.0, + "step": 2110 + }, + { + "epoch": 0.23182517021743906, + "grad_norm": 2.6923954486846924, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7044738531112671, + "num_tokens": 53006975.0, + "step": 2111 + }, + { + "epoch": 0.2319349879200527, + "grad_norm": 2.4341976642608643, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7155849933624268, + "num_tokens": 53030769.0, + "step": 2112 + }, + { + "epoch": 0.23204480562266638, + "grad_norm": 2.4385457038879395, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7030882835388184, + "num_tokens": 53053471.0, + "step": 2113 + }, + { + "epoch": 0.23215462332528003, + "grad_norm": 2.577996015548706, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6923301219940186, + "num_tokens": 53075243.0, + "step": 2114 + }, + { + "epoch": 
0.2322644410278937, + "grad_norm": 2.3142223358154297, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.700895369052887, + "num_tokens": 53100555.0, + "step": 2115 + }, + { + "epoch": 0.23237425873050735, + "grad_norm": 2.242600440979004, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6849604249000549, + "num_tokens": 53127924.0, + "step": 2116 + }, + { + "epoch": 0.23248407643312102, + "grad_norm": 2.0896213054656982, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7085682153701782, + "num_tokens": 53156415.0, + "step": 2117 + }, + { + "epoch": 0.23259389413573467, + "grad_norm": 2.3955955505371094, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6914924383163452, + "num_tokens": 53181571.0, + "step": 2118 + }, + { + "epoch": 0.23270371183834834, + "grad_norm": 2.359788656234741, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6815974116325378, + "num_tokens": 53206840.0, + "step": 2119 + }, + { + "epoch": 0.23281352954096202, + "grad_norm": 2.7836050987243652, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6898103952407837, + "num_tokens": 53224499.0, + "step": 2120 + }, + { + "epoch": 0.23292334724357566, + "grad_norm": 2.2907602787017822, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7297948598861694, + "num_tokens": 53245983.0, + "step": 2121 + }, + { + "epoch": 0.23303316494618934, + "grad_norm": 2.332833766937256, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.699930727481842, + "num_tokens": 53270170.0, + "step": 2122 + }, + { + "epoch": 0.23314298264880298, + "grad_norm": 2.605891704559326, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6970904469490051, + "num_tokens": 53292816.0, + "step": 2123 + }, + { + "epoch": 0.23325280035141666, + "grad_norm": 2.528653383255005, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 
0.6925716400146484, + "num_tokens": 53312510.0, + "step": 2124 + }, + { + "epoch": 0.2333626180540303, + "grad_norm": 2.3793323040008545, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6946766376495361, + "num_tokens": 53337245.0, + "step": 2125 + }, + { + "epoch": 0.23347243575664398, + "grad_norm": 2.2019078731536865, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.699387788772583, + "num_tokens": 53363825.0, + "step": 2126 + }, + { + "epoch": 0.23358225345925762, + "grad_norm": 2.4873907566070557, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6926016211509705, + "num_tokens": 53385234.0, + "step": 2127 + }, + { + "epoch": 0.2336920711618713, + "grad_norm": 2.1314096450805664, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6889772415161133, + "num_tokens": 53415316.0, + "step": 2128 + }, + { + "epoch": 0.23380188886448494, + "grad_norm": 2.004216194152832, + "learning_rate": 1e-06, + "loss": 1.134, + "mean_token_accuracy": 0.6608620882034302, + "num_tokens": 53449575.0, + "step": 2129 + }, + { + "epoch": 0.23391170656709862, + "grad_norm": 2.133880138397217, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7022624015808105, + "num_tokens": 53479092.0, + "step": 2130 + }, + { + "epoch": 0.2340215242697123, + "grad_norm": 2.3363447189331055, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7070730924606323, + "num_tokens": 53501766.0, + "step": 2131 + }, + { + "epoch": 0.23413134197232593, + "grad_norm": 2.359957218170166, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6971629858016968, + "num_tokens": 53525650.0, + "step": 2132 + }, + { + "epoch": 0.2342411596749396, + "grad_norm": 2.24223256111145, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.704131543636322, + "num_tokens": 53551396.0, + "step": 2133 + }, + { + "epoch": 0.23435097737755325, + "grad_norm": 2.2534193992614746, 
+ "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.6876411437988281, + "num_tokens": 53578733.0, + "step": 2134 + }, + { + "epoch": 0.23446079508016693, + "grad_norm": 2.735987663269043, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6993378400802612, + "num_tokens": 53601335.0, + "step": 2135 + }, + { + "epoch": 0.23457061278278057, + "grad_norm": 2.8504111766815186, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.70652174949646, + "num_tokens": 53617238.0, + "step": 2136 + }, + { + "epoch": 0.23468043048539425, + "grad_norm": 2.202261447906494, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6862832307815552, + "num_tokens": 53643476.0, + "step": 2137 + }, + { + "epoch": 0.2347902481880079, + "grad_norm": 2.4667766094207764, + "learning_rate": 1e-06, + "loss": 1.0754, + "mean_token_accuracy": 0.6787671446800232, + "num_tokens": 53667430.0, + "step": 2138 + }, + { + "epoch": 0.23490006589062157, + "grad_norm": 2.3017513751983643, + "learning_rate": 1e-06, + "loss": 1.0917, + "mean_token_accuracy": 0.6729373931884766, + "num_tokens": 53694496.0, + "step": 2139 + }, + { + "epoch": 0.23500988359323524, + "grad_norm": 2.407177686691284, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7220096588134766, + "num_tokens": 53715303.0, + "step": 2140 + }, + { + "epoch": 0.2351197012958489, + "grad_norm": 2.05564022064209, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6987556219100952, + "num_tokens": 53745615.0, + "step": 2141 + }, + { + "epoch": 0.23522951899846256, + "grad_norm": 2.2894270420074463, + "learning_rate": 1e-06, + "loss": 1.0715, + "mean_token_accuracy": 0.6813335418701172, + "num_tokens": 53771688.0, + "step": 2142 + }, + { + "epoch": 0.2353393367010762, + "grad_norm": 1.9064326286315918, + "learning_rate": 1e-06, + "loss": 1.0995, + "mean_token_accuracy": 0.677571177482605, + "num_tokens": 53806127.0, + "step": 2143 + }, + 
{ + "epoch": 0.23544915440368988, + "grad_norm": 2.0601859092712402, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7201737761497498, + "num_tokens": 53835860.0, + "step": 2144 + }, + { + "epoch": 0.23555897210630353, + "grad_norm": 2.3681535720825195, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.697277307510376, + "num_tokens": 53858459.0, + "step": 2145 + }, + { + "epoch": 0.2356687898089172, + "grad_norm": 2.3622612953186035, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7014732360839844, + "num_tokens": 53883027.0, + "step": 2146 + }, + { + "epoch": 0.23577860751153085, + "grad_norm": 2.305140972137451, + "learning_rate": 1e-06, + "loss": 1.1396, + "mean_token_accuracy": 0.6650381684303284, + "num_tokens": 53909437.0, + "step": 2147 + }, + { + "epoch": 0.23588842521414452, + "grad_norm": 1.9479587078094482, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7100943326950073, + "num_tokens": 53941349.0, + "step": 2148 + }, + { + "epoch": 0.2359982429167582, + "grad_norm": 2.4893996715545654, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7208856344223022, + "num_tokens": 53961185.0, + "step": 2149 + }, + { + "epoch": 0.23610806061937184, + "grad_norm": 2.3067824840545654, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6934727430343628, + "num_tokens": 53985560.0, + "step": 2150 + }, + { + "epoch": 0.2362178783219855, + "grad_norm": 2.2612674236297607, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7002620100975037, + "num_tokens": 54011333.0, + "step": 2151 + }, + { + "epoch": 0.23632769602459916, + "grad_norm": 2.2114343643188477, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6817824840545654, + "num_tokens": 54041113.0, + "step": 2152 + }, + { + "epoch": 0.23643751372721283, + "grad_norm": 2.3350799083709717, + "learning_rate": 1e-06, + "loss": 0.8974, + 
"mean_token_accuracy": 0.7200766801834106, + "num_tokens": 54064500.0, + "step": 2153 + }, + { + "epoch": 0.23654733142982648, + "grad_norm": 2.276989459991455, + "learning_rate": 1e-06, + "loss": 1.1015, + "mean_token_accuracy": 0.6744542121887207, + "num_tokens": 54090258.0, + "step": 2154 + }, + { + "epoch": 0.23665714913244015, + "grad_norm": 2.029971122741699, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7195465564727783, + "num_tokens": 54119477.0, + "step": 2155 + }, + { + "epoch": 0.2367669668350538, + "grad_norm": 2.3777103424072266, + "learning_rate": 1e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6796612739562988, + "num_tokens": 54144438.0, + "step": 2156 + }, + { + "epoch": 0.23687678453766747, + "grad_norm": 2.708263397216797, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7173943519592285, + "num_tokens": 54163208.0, + "step": 2157 + }, + { + "epoch": 0.23698660224028115, + "grad_norm": 2.3203728199005127, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7062132954597473, + "num_tokens": 54187301.0, + "step": 2158 + }, + { + "epoch": 0.2370964199428948, + "grad_norm": 2.3494646549224854, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6999123096466064, + "num_tokens": 54210801.0, + "step": 2159 + }, + { + "epoch": 0.23720623764550847, + "grad_norm": 2.450178861618042, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7043713331222534, + "num_tokens": 54234597.0, + "step": 2160 + }, + { + "epoch": 0.2373160553481221, + "grad_norm": 2.476135492324829, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7078233957290649, + "num_tokens": 54255782.0, + "step": 2161 + }, + { + "epoch": 0.2374258730507358, + "grad_norm": 2.231459856033325, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6992075443267822, + "num_tokens": 54281922.0, + "step": 2162 + }, + { + "epoch": 0.23753569075334943, + 
"grad_norm": 2.4796934127807617, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7045416235923767, + "num_tokens": 54303974.0, + "step": 2163 + }, + { + "epoch": 0.2376455084559631, + "grad_norm": 2.4816412925720215, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7058541774749756, + "num_tokens": 54325564.0, + "step": 2164 + }, + { + "epoch": 0.23775532615857675, + "grad_norm": 2.1023573875427246, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.70806884765625, + "num_tokens": 54352610.0, + "step": 2165 + }, + { + "epoch": 0.23786514386119043, + "grad_norm": 2.2296860218048096, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6954408884048462, + "num_tokens": 54378370.0, + "step": 2166 + }, + { + "epoch": 0.23797496156380407, + "grad_norm": 2.1978302001953125, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6861161589622498, + "num_tokens": 54406159.0, + "step": 2167 + }, + { + "epoch": 0.23808477926641775, + "grad_norm": 2.2241687774658203, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7088029384613037, + "num_tokens": 54430869.0, + "step": 2168 + }, + { + "epoch": 0.23819459696903142, + "grad_norm": 2.20198655128479, + "learning_rate": 1e-06, + "loss": 1.0458, + "mean_token_accuracy": 0.6866894364356995, + "num_tokens": 54456628.0, + "step": 2169 + }, + { + "epoch": 0.23830441467164507, + "grad_norm": 2.3069326877593994, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7093581557273865, + "num_tokens": 54481646.0, + "step": 2170 + }, + { + "epoch": 0.23841423237425874, + "grad_norm": 2.339305877685547, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6943153738975525, + "num_tokens": 54507428.0, + "step": 2171 + }, + { + "epoch": 0.23852405007687238, + "grad_norm": 2.4209513664245605, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7137616872787476, + 
"num_tokens": 54528709.0, + "step": 2172 + }, + { + "epoch": 0.23863386777948606, + "grad_norm": 2.1931004524230957, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6993986368179321, + "num_tokens": 54554424.0, + "step": 2173 + }, + { + "epoch": 0.2387436854820997, + "grad_norm": 1.9824107885360718, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.6795756816864014, + "num_tokens": 54588146.0, + "step": 2174 + }, + { + "epoch": 0.23885350318471338, + "grad_norm": 2.193732738494873, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7013936042785645, + "num_tokens": 54615194.0, + "step": 2175 + }, + { + "epoch": 0.23896332088732702, + "grad_norm": 2.050382614135742, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6786014437675476, + "num_tokens": 54647207.0, + "step": 2176 + }, + { + "epoch": 0.2390731385899407, + "grad_norm": 2.899237871170044, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7010189294815063, + "num_tokens": 54662769.0, + "step": 2177 + }, + { + "epoch": 0.23918295629255437, + "grad_norm": 2.6804983615875244, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7050098776817322, + "num_tokens": 54681233.0, + "step": 2178 + }, + { + "epoch": 0.23929277399516802, + "grad_norm": 2.0047616958618164, + "learning_rate": 1e-06, + "loss": 1.0595, + "mean_token_accuracy": 0.6784146428108215, + "num_tokens": 54712260.0, + "step": 2179 + }, + { + "epoch": 0.2394025916977817, + "grad_norm": 2.3577075004577637, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6896274089813232, + "num_tokens": 54737244.0, + "step": 2180 + }, + { + "epoch": 0.23951240940039534, + "grad_norm": 2.3507399559020996, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6987879872322083, + "num_tokens": 54761530.0, + "step": 2181 + }, + { + "epoch": 0.239622227103009, + "grad_norm": 2.302428960800171, + "learning_rate": 
1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.7010058164596558, + "num_tokens": 54786130.0, + "step": 2182 + }, + { + "epoch": 0.23973204480562266, + "grad_norm": 2.2675461769104004, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6849400997161865, + "num_tokens": 54812437.0, + "step": 2183 + }, + { + "epoch": 0.23984186250823633, + "grad_norm": 2.1470906734466553, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.699419379234314, + "num_tokens": 54837036.0, + "step": 2184 + }, + { + "epoch": 0.23995168021084998, + "grad_norm": 2.2488229274749756, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.705047070980072, + "num_tokens": 54861460.0, + "step": 2185 + }, + { + "epoch": 0.24006149791346365, + "grad_norm": 2.3897268772125244, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7109524607658386, + "num_tokens": 54884964.0, + "step": 2186 + }, + { + "epoch": 0.24017131561607732, + "grad_norm": 2.4218978881835938, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6932575106620789, + "num_tokens": 54907009.0, + "step": 2187 + }, + { + "epoch": 0.24028113331869097, + "grad_norm": 1.9168161153793335, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.69026780128479, + "num_tokens": 54940937.0, + "step": 2188 + }, + { + "epoch": 0.24039095102130464, + "grad_norm": 2.321523427963257, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.7158099412918091, + "num_tokens": 54966516.0, + "step": 2189 + }, + { + "epoch": 0.2405007687239183, + "grad_norm": 2.275775909423828, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.6917653679847717, + "num_tokens": 54991933.0, + "step": 2190 + }, + { + "epoch": 0.24061058642653196, + "grad_norm": 2.20770001411438, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7075517177581787, + "num_tokens": 55018455.0, + "step": 2191 + }, + { + "epoch": 
0.2407204041291456, + "grad_norm": 2.4598655700683594, + "learning_rate": 1e-06, + "loss": 1.0802, + "mean_token_accuracy": 0.6765882968902588, + "num_tokens": 55041189.0, + "step": 2192 + }, + { + "epoch": 0.24083022183175928, + "grad_norm": 2.2607083320617676, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7122592926025391, + "num_tokens": 55065445.0, + "step": 2193 + }, + { + "epoch": 0.24094003953437293, + "grad_norm": 2.438775062561035, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.707538902759552, + "num_tokens": 55090316.0, + "step": 2194 + }, + { + "epoch": 0.2410498572369866, + "grad_norm": 2.418393135070801, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7055865526199341, + "num_tokens": 55113304.0, + "step": 2195 + }, + { + "epoch": 0.24115967493960028, + "grad_norm": 2.2057650089263916, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.679230809211731, + "num_tokens": 55142074.0, + "step": 2196 + }, + { + "epoch": 0.24126949264221392, + "grad_norm": 2.038351058959961, + "learning_rate": 1e-06, + "loss": 1.079, + "mean_token_accuracy": 0.6949983835220337, + "num_tokens": 55172960.0, + "step": 2197 + }, + { + "epoch": 0.2413793103448276, + "grad_norm": 2.237985372543335, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6885496973991394, + "num_tokens": 55199028.0, + "step": 2198 + }, + { + "epoch": 0.24148912804744124, + "grad_norm": 2.229856491088867, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7010309100151062, + "num_tokens": 55224227.0, + "step": 2199 + }, + { + "epoch": 0.24159894575005492, + "grad_norm": 2.596426248550415, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6911693811416626, + "num_tokens": 55246575.0, + "step": 2200 + }, + { + "epoch": 0.24170876345266856, + "grad_norm": 2.292419910430908, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 
0.7342162132263184, + "num_tokens": 55269559.0, + "step": 2201 + }, + { + "epoch": 0.24181858115528224, + "grad_norm": 2.0446090698242188, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.684501051902771, + "num_tokens": 55298711.0, + "step": 2202 + }, + { + "epoch": 0.24192839885789588, + "grad_norm": 2.2631380558013916, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6863114833831787, + "num_tokens": 55323925.0, + "step": 2203 + }, + { + "epoch": 0.24203821656050956, + "grad_norm": 2.1230080127716064, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6772652864456177, + "num_tokens": 55353282.0, + "step": 2204 + }, + { + "epoch": 0.2421480342631232, + "grad_norm": 2.289344549179077, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6936261653900146, + "num_tokens": 55378866.0, + "step": 2205 + }, + { + "epoch": 0.24225785196573688, + "grad_norm": 2.7351582050323486, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6895892024040222, + "num_tokens": 55398404.0, + "step": 2206 + }, + { + "epoch": 0.24236766966835055, + "grad_norm": 2.3436975479125977, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6720830798149109, + "num_tokens": 55423540.0, + "step": 2207 + }, + { + "epoch": 0.2424774873709642, + "grad_norm": 2.084501028060913, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7159706354141235, + "num_tokens": 55452266.0, + "step": 2208 + }, + { + "epoch": 0.24258730507357787, + "grad_norm": 2.131943702697754, + "learning_rate": 1e-06, + "loss": 1.0891, + "mean_token_accuracy": 0.6839829683303833, + "num_tokens": 55482000.0, + "step": 2209 + }, + { + "epoch": 0.24269712277619152, + "grad_norm": 2.5807719230651855, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6930810809135437, + "num_tokens": 55504002.0, + "step": 2210 + }, + { + "epoch": 0.2428069404788052, + "grad_norm": 
2.2106640338897705, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7058393359184265, + "num_tokens": 55530220.0, + "step": 2211 + }, + { + "epoch": 0.24291675818141883, + "grad_norm": 2.2193527221679688, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7070245146751404, + "num_tokens": 55556080.0, + "step": 2212 + }, + { + "epoch": 0.2430265758840325, + "grad_norm": 2.3090603351593018, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6883500218391418, + "num_tokens": 55580897.0, + "step": 2213 + }, + { + "epoch": 0.24313639358664615, + "grad_norm": 2.078563928604126, + "learning_rate": 1e-06, + "loss": 1.1058, + "mean_token_accuracy": 0.6701878309249878, + "num_tokens": 55610920.0, + "step": 2214 + }, + { + "epoch": 0.24324621128925983, + "grad_norm": 2.1025753021240234, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7031222581863403, + "num_tokens": 55639346.0, + "step": 2215 + }, + { + "epoch": 0.2433560289918735, + "grad_norm": 2.155458688735962, + "learning_rate": 1e-06, + "loss": 1.1101, + "mean_token_accuracy": 0.6801283359527588, + "num_tokens": 55669144.0, + "step": 2216 + }, + { + "epoch": 0.24346584669448715, + "grad_norm": 2.1923294067382812, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7076976895332336, + "num_tokens": 55696074.0, + "step": 2217 + }, + { + "epoch": 0.24357566439710082, + "grad_norm": 2.354445695877075, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.692630410194397, + "num_tokens": 55720294.0, + "step": 2218 + }, + { + "epoch": 0.24368548209971447, + "grad_norm": 2.564708709716797, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6760811805725098, + "num_tokens": 55742375.0, + "step": 2219 + }, + { + "epoch": 0.24379529980232814, + "grad_norm": 2.1224205493927, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6895186901092529, + "num_tokens": 55772873.0, + 
"step": 2220 + }, + { + "epoch": 0.2439051175049418, + "grad_norm": 2.397538185119629, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7118865251541138, + "num_tokens": 55795230.0, + "step": 2221 + }, + { + "epoch": 0.24401493520755546, + "grad_norm": 2.057709217071533, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6860187649726868, + "num_tokens": 55825221.0, + "step": 2222 + }, + { + "epoch": 0.2441247529101691, + "grad_norm": 2.372244358062744, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7116659879684448, + "num_tokens": 55846808.0, + "step": 2223 + }, + { + "epoch": 0.24423457061278278, + "grad_norm": 2.0057132244110107, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6985883712768555, + "num_tokens": 55876040.0, + "step": 2224 + }, + { + "epoch": 0.24434438831539645, + "grad_norm": 2.048870086669922, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6995150446891785, + "num_tokens": 55905423.0, + "step": 2225 + }, + { + "epoch": 0.2444542060180101, + "grad_norm": 2.426778554916382, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7107402086257935, + "num_tokens": 55928111.0, + "step": 2226 + }, + { + "epoch": 0.24456402372062377, + "grad_norm": 2.391855478286743, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7018898725509644, + "num_tokens": 55950811.0, + "step": 2227 + }, + { + "epoch": 0.24467384142323742, + "grad_norm": 2.3945751190185547, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7183212041854858, + "num_tokens": 55971721.0, + "step": 2228 + }, + { + "epoch": 0.2447836591258511, + "grad_norm": 2.2182133197784424, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6832965016365051, + "num_tokens": 56000670.0, + "step": 2229 + }, + { + "epoch": 0.24489347682846474, + "grad_norm": 2.269892930984497, + "learning_rate": 1e-06, + "loss": 1.0253, + 
"mean_token_accuracy": 0.6983002424240112, + "num_tokens": 56024404.0, + "step": 2230 + }, + { + "epoch": 0.2450032945310784, + "grad_norm": 2.099740743637085, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6833815574645996, + "num_tokens": 56055500.0, + "step": 2231 + }, + { + "epoch": 0.24511311223369206, + "grad_norm": 2.4213106632232666, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6835068464279175, + "num_tokens": 56078603.0, + "step": 2232 + }, + { + "epoch": 0.24522292993630573, + "grad_norm": 2.4459004402160645, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6826993227005005, + "num_tokens": 56102326.0, + "step": 2233 + }, + { + "epoch": 0.2453327476389194, + "grad_norm": 2.603031873703003, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6835683584213257, + "num_tokens": 56124013.0, + "step": 2234 + }, + { + "epoch": 0.24544256534153305, + "grad_norm": 2.639486074447632, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7092322707176208, + "num_tokens": 56143114.0, + "step": 2235 + }, + { + "epoch": 0.24555238304414673, + "grad_norm": 2.2723798751831055, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7116342186927795, + "num_tokens": 56166784.0, + "step": 2236 + }, + { + "epoch": 0.24566220074676037, + "grad_norm": 2.1584150791168213, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6961196660995483, + "num_tokens": 56194436.0, + "step": 2237 + }, + { + "epoch": 0.24577201844937405, + "grad_norm": 2.2619709968566895, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6844212412834167, + "num_tokens": 56219037.0, + "step": 2238 + }, + { + "epoch": 0.2458818361519877, + "grad_norm": 2.240626573562622, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6973243951797485, + "num_tokens": 56246435.0, + "step": 2239 + }, + { + "epoch": 0.24599165385460137, + 
"grad_norm": 2.194093942642212, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6876094937324524, + "num_tokens": 56275718.0, + "step": 2240 + }, + { + "epoch": 0.246101471557215, + "grad_norm": 2.348877191543579, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6874991655349731, + "num_tokens": 56298976.0, + "step": 2241 + }, + { + "epoch": 0.2462112892598287, + "grad_norm": 2.465467691421509, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.716606855392456, + "num_tokens": 56318079.0, + "step": 2242 + }, + { + "epoch": 0.24632110696244233, + "grad_norm": 2.2313482761383057, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6735361814498901, + "num_tokens": 56344472.0, + "step": 2243 + }, + { + "epoch": 0.246430924665056, + "grad_norm": 2.4243576526641846, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7031711339950562, + "num_tokens": 56369431.0, + "step": 2244 + }, + { + "epoch": 0.24654074236766968, + "grad_norm": 2.5803675651550293, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7106964588165283, + "num_tokens": 56389723.0, + "step": 2245 + }, + { + "epoch": 0.24665056007028333, + "grad_norm": 2.3031210899353027, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7161004543304443, + "num_tokens": 56413451.0, + "step": 2246 + }, + { + "epoch": 0.246760377772897, + "grad_norm": 1.9308347702026367, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.720102071762085, + "num_tokens": 56445839.0, + "step": 2247 + }, + { + "epoch": 0.24687019547551065, + "grad_norm": 2.688082695007324, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7194280624389648, + "num_tokens": 56464142.0, + "step": 2248 + }, + { + "epoch": 0.24698001317812432, + "grad_norm": 2.2877442836761475, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6947815418243408, + "num_tokens": 
56488271.0, + "step": 2249 + }, + { + "epoch": 0.24708983088073797, + "grad_norm": 2.088885545730591, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6798728108406067, + "num_tokens": 56517232.0, + "step": 2250 + }, + { + "epoch": 0.24719964858335164, + "grad_norm": 2.2216784954071045, + "learning_rate": 1e-06, + "loss": 1.0998, + "mean_token_accuracy": 0.6768954992294312, + "num_tokens": 56546953.0, + "step": 2251 + }, + { + "epoch": 0.24730946628596528, + "grad_norm": 2.261646032333374, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7176963090896606, + "num_tokens": 56572624.0, + "step": 2252 + }, + { + "epoch": 0.24741928398857896, + "grad_norm": 2.337357759475708, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7177044153213501, + "num_tokens": 56597018.0, + "step": 2253 + }, + { + "epoch": 0.24752910169119263, + "grad_norm": 2.2739529609680176, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7099725008010864, + "num_tokens": 56620787.0, + "step": 2254 + }, + { + "epoch": 0.24763891939380628, + "grad_norm": 2.098174810409546, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7021948099136353, + "num_tokens": 56648256.0, + "step": 2255 + }, + { + "epoch": 0.24774873709641995, + "grad_norm": 2.1874094009399414, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6960831880569458, + "num_tokens": 56674452.0, + "step": 2256 + }, + { + "epoch": 0.2478585547990336, + "grad_norm": 1.910514235496521, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6826067566871643, + "num_tokens": 56710439.0, + "step": 2257 + }, + { + "epoch": 0.24796837250164727, + "grad_norm": 2.1168711185455322, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6960049867630005, + "num_tokens": 56736389.0, + "step": 2258 + }, + { + "epoch": 0.24807819020426092, + "grad_norm": 2.2169058322906494, + "learning_rate": 1e-06, + 
"loss": 1.0721, + "mean_token_accuracy": 0.6737426519393921, + "num_tokens": 56762607.0, + "step": 2259 + }, + { + "epoch": 0.2481880079068746, + "grad_norm": 2.2215209007263184, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.7015739679336548, + "num_tokens": 56787429.0, + "step": 2260 + }, + { + "epoch": 0.24829782560948824, + "grad_norm": 2.269744873046875, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7054465413093567, + "num_tokens": 56811196.0, + "step": 2261 + }, + { + "epoch": 0.2484076433121019, + "grad_norm": 2.290252208709717, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7001038193702698, + "num_tokens": 56835502.0, + "step": 2262 + }, + { + "epoch": 0.24851746101471558, + "grad_norm": 2.307500123977661, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7109091281890869, + "num_tokens": 56858499.0, + "step": 2263 + }, + { + "epoch": 0.24862727871732923, + "grad_norm": 2.0819387435913086, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6813821792602539, + "num_tokens": 56890312.0, + "step": 2264 + }, + { + "epoch": 0.2487370964199429, + "grad_norm": 2.3956103324890137, + "learning_rate": 1e-06, + "loss": 1.0757, + "mean_token_accuracy": 0.6898938417434692, + "num_tokens": 56914891.0, + "step": 2265 + }, + { + "epoch": 0.24884691412255655, + "grad_norm": 2.5345492362976074, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6960490942001343, + "num_tokens": 56935154.0, + "step": 2266 + }, + { + "epoch": 0.24895673182517022, + "grad_norm": 2.4688339233398438, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6959023475646973, + "num_tokens": 56956966.0, + "step": 2267 + }, + { + "epoch": 0.24906654952778387, + "grad_norm": 2.2216532230377197, + "learning_rate": 1e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.6766194105148315, + "num_tokens": 56984621.0, + "step": 2268 + }, + { + "epoch": 
0.24917636723039754, + "grad_norm": 2.20605206489563, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.7001474499702454, + "num_tokens": 57009256.0, + "step": 2269 + }, + { + "epoch": 0.2492861849330112, + "grad_norm": 2.4721016883850098, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7126013040542603, + "num_tokens": 57029495.0, + "step": 2270 + }, + { + "epoch": 0.24939600263562486, + "grad_norm": 2.0741002559661865, + "learning_rate": 1e-06, + "loss": 1.1071, + "mean_token_accuracy": 0.6737792491912842, + "num_tokens": 57060740.0, + "step": 2271 + }, + { + "epoch": 0.24950582033823854, + "grad_norm": 2.139145851135254, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6914717555046082, + "num_tokens": 57088270.0, + "step": 2272 + }, + { + "epoch": 0.24961563804085218, + "grad_norm": 2.2348010540008545, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7186306715011597, + "num_tokens": 57112800.0, + "step": 2273 + }, + { + "epoch": 0.24972545574346586, + "grad_norm": 2.27769136428833, + "learning_rate": 1e-06, + "loss": 1.1189, + "mean_token_accuracy": 0.6853711009025574, + "num_tokens": 57138673.0, + "step": 2274 + }, + { + "epoch": 0.2498352734460795, + "grad_norm": 2.4476821422576904, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6770493984222412, + "num_tokens": 57161421.0, + "step": 2275 + }, + { + "epoch": 0.24994509114869318, + "grad_norm": 2.417736530303955, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.675523579120636, + "num_tokens": 57185499.0, + "step": 2276 + }, + { + "epoch": 0.2500549088513068, + "grad_norm": 2.234651565551758, + "learning_rate": 1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.6926935911178589, + "num_tokens": 57211477.0, + "step": 2277 + }, + { + "epoch": 0.25016472655392047, + "grad_norm": 2.103127956390381, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 
0.7256775498390198, + "num_tokens": 57237537.0, + "step": 2278 + }, + { + "epoch": 0.25027454425653417, + "grad_norm": 2.348043203353882, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7136235237121582, + "num_tokens": 57259318.0, + "step": 2279 + }, + { + "epoch": 0.2503843619591478, + "grad_norm": 2.236584186553955, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6926605701446533, + "num_tokens": 57285112.0, + "step": 2280 + }, + { + "epoch": 0.25049417966176146, + "grad_norm": 2.277740716934204, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6981011629104614, + "num_tokens": 57310150.0, + "step": 2281 + }, + { + "epoch": 0.25060399736437516, + "grad_norm": 2.4154052734375, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7090768814086914, + "num_tokens": 57333030.0, + "step": 2282 + }, + { + "epoch": 0.2507138150669888, + "grad_norm": 2.445446491241455, + "learning_rate": 1e-06, + "loss": 1.1127, + "mean_token_accuracy": 0.6670981049537659, + "num_tokens": 57356541.0, + "step": 2283 + }, + { + "epoch": 0.25082363276960246, + "grad_norm": 2.1758534908294678, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7052783966064453, + "num_tokens": 57383286.0, + "step": 2284 + }, + { + "epoch": 0.2509334504722161, + "grad_norm": 2.1688597202301025, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7177867889404297, + "num_tokens": 57409654.0, + "step": 2285 + }, + { + "epoch": 0.2510432681748298, + "grad_norm": 2.2939000129699707, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7027666568756104, + "num_tokens": 57431986.0, + "step": 2286 + }, + { + "epoch": 0.25115308587744345, + "grad_norm": 2.16989803314209, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7078540921211243, + "num_tokens": 57457539.0, + "step": 2287 + }, + { + "epoch": 0.2512629035800571, + "grad_norm": 2.195882797241211, + 
"learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6849364042282104, + "num_tokens": 57485523.0, + "step": 2288 + }, + { + "epoch": 0.25137272128267074, + "grad_norm": 2.0274524688720703, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6937854290008545, + "num_tokens": 57517265.0, + "step": 2289 + }, + { + "epoch": 0.25148253898528444, + "grad_norm": 2.288722515106201, + "learning_rate": 1e-06, + "loss": 1.11, + "mean_token_accuracy": 0.672199547290802, + "num_tokens": 57543091.0, + "step": 2290 + }, + { + "epoch": 0.2515923566878981, + "grad_norm": 2.23289155960083, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6827161312103271, + "num_tokens": 57569327.0, + "step": 2291 + }, + { + "epoch": 0.25170217439051173, + "grad_norm": 2.330965280532837, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7047749757766724, + "num_tokens": 57592066.0, + "step": 2292 + }, + { + "epoch": 0.25181199209312544, + "grad_norm": 2.4636106491088867, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7165394425392151, + "num_tokens": 57611527.0, + "step": 2293 + }, + { + "epoch": 0.2519218097957391, + "grad_norm": 2.519869327545166, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6959436535835266, + "num_tokens": 57632573.0, + "step": 2294 + }, + { + "epoch": 0.25203162749835273, + "grad_norm": 2.630793571472168, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7073323130607605, + "num_tokens": 57652983.0, + "step": 2295 + }, + { + "epoch": 0.2521414452009664, + "grad_norm": 2.2290380001068115, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.698197603225708, + "num_tokens": 57679898.0, + "step": 2296 + }, + { + "epoch": 0.2522512629035801, + "grad_norm": 2.345107316970825, + "learning_rate": 1e-06, + "loss": 1.1157, + "mean_token_accuracy": 0.6692866086959839, + "num_tokens": 57708533.0, + "step": 2297 + }, + { + 
"epoch": 0.2523610806061937, + "grad_norm": 2.091127634048462, + "learning_rate": 1e-06, + "loss": 1.1267, + "mean_token_accuracy": 0.6616343855857849, + "num_tokens": 57739872.0, + "step": 2298 + }, + { + "epoch": 0.25247089830880737, + "grad_norm": 2.131920576095581, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6867744326591492, + "num_tokens": 57768099.0, + "step": 2299 + }, + { + "epoch": 0.252580716011421, + "grad_norm": 2.164243459701538, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.689995527267456, + "num_tokens": 57797236.0, + "step": 2300 + }, + { + "epoch": 0.2526905337140347, + "grad_norm": 2.1427032947540283, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6889066100120544, + "num_tokens": 57824300.0, + "step": 2301 + }, + { + "epoch": 0.25280035141664836, + "grad_norm": 2.086282968521118, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.709629237651825, + "num_tokens": 57851749.0, + "step": 2302 + }, + { + "epoch": 0.252910169119262, + "grad_norm": 2.2482128143310547, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7301932573318481, + "num_tokens": 57876366.0, + "step": 2303 + }, + { + "epoch": 0.2530199868218757, + "grad_norm": 1.957065463066101, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.702094316482544, + "num_tokens": 57906780.0, + "step": 2304 + }, + { + "epoch": 0.25312980452448935, + "grad_norm": 2.1564056873321533, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7009170055389404, + "num_tokens": 57932010.0, + "step": 2305 + }, + { + "epoch": 0.253239622227103, + "grad_norm": 2.19124174118042, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6775805354118347, + "num_tokens": 57959845.0, + "step": 2306 + }, + { + "epoch": 0.25334943992971665, + "grad_norm": 2.433032512664795, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 
0.6861187219619751, + "num_tokens": 57982050.0, + "step": 2307 + }, + { + "epoch": 0.25345925763233035, + "grad_norm": 2.220229148864746, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6960124969482422, + "num_tokens": 58009999.0, + "step": 2308 + }, + { + "epoch": 0.253569075334944, + "grad_norm": 2.1551127433776855, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.700929582118988, + "num_tokens": 58038846.0, + "step": 2309 + }, + { + "epoch": 0.25367889303755764, + "grad_norm": 2.0398011207580566, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7085423469543457, + "num_tokens": 58068054.0, + "step": 2310 + }, + { + "epoch": 0.25378871074017134, + "grad_norm": 2.2652831077575684, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6989349126815796, + "num_tokens": 58094268.0, + "step": 2311 + }, + { + "epoch": 0.253898528442785, + "grad_norm": 2.1203577518463135, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7099559307098389, + "num_tokens": 58120385.0, + "step": 2312 + }, + { + "epoch": 0.25400834614539863, + "grad_norm": 2.060298204421997, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6967782974243164, + "num_tokens": 58151143.0, + "step": 2313 + }, + { + "epoch": 0.2541181638480123, + "grad_norm": 2.391990900039673, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.714106559753418, + "num_tokens": 58173585.0, + "step": 2314 + }, + { + "epoch": 0.254227981550626, + "grad_norm": 2.1909608840942383, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6900712847709656, + "num_tokens": 58199542.0, + "step": 2315 + }, + { + "epoch": 0.2543377992532396, + "grad_norm": 2.3451030254364014, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7018197178840637, + "num_tokens": 58224053.0, + "step": 2316 + }, + { + "epoch": 0.2544476169558533, + "grad_norm": 2.46077561378479, + 
"learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6905432939529419, + "num_tokens": 58246311.0, + "step": 2317 + }, + { + "epoch": 0.2545574346584669, + "grad_norm": 2.269899368286133, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6984167098999023, + "num_tokens": 58270728.0, + "step": 2318 + }, + { + "epoch": 0.2546672523610806, + "grad_norm": 2.542567253112793, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7305426597595215, + "num_tokens": 58290380.0, + "step": 2319 + }, + { + "epoch": 0.25477707006369427, + "grad_norm": 2.199512004852295, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6966711282730103, + "num_tokens": 58315222.0, + "step": 2320 + }, + { + "epoch": 0.2548868877663079, + "grad_norm": 2.2201666831970215, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6930890679359436, + "num_tokens": 58341008.0, + "step": 2321 + }, + { + "epoch": 0.2549967054689216, + "grad_norm": 2.457231044769287, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6990233659744263, + "num_tokens": 58361855.0, + "step": 2322 + }, + { + "epoch": 0.25510652317153526, + "grad_norm": 2.40449595451355, + "learning_rate": 1e-06, + "loss": 1.0714, + "mean_token_accuracy": 0.6843600273132324, + "num_tokens": 58383282.0, + "step": 2323 + }, + { + "epoch": 0.2552163408741489, + "grad_norm": 2.243551015853882, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.702757716178894, + "num_tokens": 58407761.0, + "step": 2324 + }, + { + "epoch": 0.25532615857676255, + "grad_norm": 2.3434436321258545, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6921384334564209, + "num_tokens": 58429788.0, + "step": 2325 + }, + { + "epoch": 0.25543597627937625, + "grad_norm": 2.392951726913452, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7227658033370972, + "num_tokens": 58450490.0, + "step": 2326 + }, + { + 
"epoch": 0.2555457939819899, + "grad_norm": 2.0916693210601807, + "learning_rate": 1e-06, + "loss": 1.1313, + "mean_token_accuracy": 0.6653953790664673, + "num_tokens": 58479723.0, + "step": 2327 + }, + { + "epoch": 0.25565561168460355, + "grad_norm": 2.131234884262085, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6958224177360535, + "num_tokens": 58506550.0, + "step": 2328 + }, + { + "epoch": 0.25576542938721725, + "grad_norm": 2.183443307876587, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7259788513183594, + "num_tokens": 58530811.0, + "step": 2329 + }, + { + "epoch": 0.2558752470898309, + "grad_norm": 2.28562331199646, + "learning_rate": 1e-06, + "loss": 1.0891, + "mean_token_accuracy": 0.6724904775619507, + "num_tokens": 58556356.0, + "step": 2330 + }, + { + "epoch": 0.25598506479244454, + "grad_norm": 2.2868545055389404, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7083829641342163, + "num_tokens": 58580721.0, + "step": 2331 + }, + { + "epoch": 0.2560948824950582, + "grad_norm": 2.0380427837371826, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6823147535324097, + "num_tokens": 58611194.0, + "step": 2332 + }, + { + "epoch": 0.2562047001976719, + "grad_norm": 2.4688944816589355, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7148977518081665, + "num_tokens": 58631909.0, + "step": 2333 + }, + { + "epoch": 0.25631451790028553, + "grad_norm": 2.3837406635284424, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6889594793319702, + "num_tokens": 58655397.0, + "step": 2334 + }, + { + "epoch": 0.2564243356028992, + "grad_norm": 2.360232353210449, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.680243730545044, + "num_tokens": 58680217.0, + "step": 2335 + }, + { + "epoch": 0.2565341533055128, + "grad_norm": 2.1674060821533203, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 
0.7044316530227661, + "num_tokens": 58709700.0, + "step": 2336 + }, + { + "epoch": 0.2566439710081265, + "grad_norm": 2.4876184463500977, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7091023325920105, + "num_tokens": 58730741.0, + "step": 2337 + }, + { + "epoch": 0.25675378871074017, + "grad_norm": 2.173590660095215, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6801119446754456, + "num_tokens": 58760642.0, + "step": 2338 + }, + { + "epoch": 0.2568636064133538, + "grad_norm": 2.2239511013031006, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7063674926757812, + "num_tokens": 58786255.0, + "step": 2339 + }, + { + "epoch": 0.2569734241159675, + "grad_norm": 2.320669174194336, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6895831823348999, + "num_tokens": 58809874.0, + "step": 2340 + }, + { + "epoch": 0.25708324181858117, + "grad_norm": 2.0961482524871826, + "learning_rate": 1e-06, + "loss": 1.0957, + "mean_token_accuracy": 0.6710644960403442, + "num_tokens": 58838229.0, + "step": 2341 + }, + { + "epoch": 0.2571930595211948, + "grad_norm": 2.295018434524536, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6889511346817017, + "num_tokens": 58862368.0, + "step": 2342 + }, + { + "epoch": 0.25730287722380846, + "grad_norm": 2.2034554481506348, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7121145725250244, + "num_tokens": 58886592.0, + "step": 2343 + }, + { + "epoch": 0.25741269492642216, + "grad_norm": 2.306952476501465, + "learning_rate": 1e-06, + "loss": 1.0489, + "mean_token_accuracy": 0.6833012700080872, + "num_tokens": 58910783.0, + "step": 2344 + }, + { + "epoch": 0.2575225126290358, + "grad_norm": 2.181117534637451, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7073929309844971, + "num_tokens": 58936441.0, + "step": 2345 + }, + { + "epoch": 0.25763233033164945, + "grad_norm": 2.370140314102173, 
+ "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6845480799674988, + "num_tokens": 58959030.0, + "step": 2346 + }, + { + "epoch": 0.2577421480342631, + "grad_norm": 2.60583758354187, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7113242149353027, + "num_tokens": 58978209.0, + "step": 2347 + }, + { + "epoch": 0.2578519657368768, + "grad_norm": 2.3406031131744385, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6840227842330933, + "num_tokens": 59002286.0, + "step": 2348 + }, + { + "epoch": 0.25796178343949044, + "grad_norm": 2.07955002784729, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6724963188171387, + "num_tokens": 59032081.0, + "step": 2349 + }, + { + "epoch": 0.2580716011421041, + "grad_norm": 2.3025238513946533, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6992263793945312, + "num_tokens": 59055610.0, + "step": 2350 + }, + { + "epoch": 0.2581814188447178, + "grad_norm": 2.217850923538208, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6906877756118774, + "num_tokens": 59081320.0, + "step": 2351 + }, + { + "epoch": 0.25829123654733144, + "grad_norm": 2.204719305038452, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7038872241973877, + "num_tokens": 59108392.0, + "step": 2352 + }, + { + "epoch": 0.2584010542499451, + "grad_norm": 2.141209840774536, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.705147385597229, + "num_tokens": 59138390.0, + "step": 2353 + }, + { + "epoch": 0.25851087195255873, + "grad_norm": 2.5891199111938477, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7358096241950989, + "num_tokens": 59156861.0, + "step": 2354 + }, + { + "epoch": 0.25862068965517243, + "grad_norm": 2.216782569885254, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.7000911235809326, + "num_tokens": 59183422.0, + "step": 2355 + }, + { + 
"epoch": 0.2587305073577861, + "grad_norm": 2.3786449432373047, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6956948041915894, + "num_tokens": 59204458.0, + "step": 2356 + }, + { + "epoch": 0.2588403250603997, + "grad_norm": 2.6384689807891846, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7120224237442017, + "num_tokens": 59223801.0, + "step": 2357 + }, + { + "epoch": 0.2589501427630134, + "grad_norm": 2.162215232849121, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6831587553024292, + "num_tokens": 59250644.0, + "step": 2358 + }, + { + "epoch": 0.25905996046562707, + "grad_norm": 2.1918914318084717, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6964749097824097, + "num_tokens": 59277184.0, + "step": 2359 + }, + { + "epoch": 0.2591697781682407, + "grad_norm": 2.0263140201568604, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6885265111923218, + "num_tokens": 59306880.0, + "step": 2360 + }, + { + "epoch": 0.25927959587085436, + "grad_norm": 2.590855360031128, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7156683206558228, + "num_tokens": 59325135.0, + "step": 2361 + }, + { + "epoch": 0.25938941357346806, + "grad_norm": 2.3137216567993164, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7241332530975342, + "num_tokens": 59346774.0, + "step": 2362 + }, + { + "epoch": 0.2594992312760817, + "grad_norm": 2.4015257358551025, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7014692425727844, + "num_tokens": 59369879.0, + "step": 2363 + }, + { + "epoch": 0.25960904897869536, + "grad_norm": 2.236565113067627, + "learning_rate": 1e-06, + "loss": 1.0376, + "mean_token_accuracy": 0.693891704082489, + "num_tokens": 59395619.0, + "step": 2364 + }, + { + "epoch": 0.259718866681309, + "grad_norm": 2.2782747745513916, + "learning_rate": 1e-06, + "loss": 1.0878, + "mean_token_accuracy": 
0.6719852089881897, + "num_tokens": 59424299.0, + "step": 2365 + }, + { + "epoch": 0.2598286843839227, + "grad_norm": 2.096245288848877, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6843655109405518, + "num_tokens": 59452759.0, + "step": 2366 + }, + { + "epoch": 0.25993850208653635, + "grad_norm": 2.334364652633667, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7013121843338013, + "num_tokens": 59476667.0, + "step": 2367 + }, + { + "epoch": 0.26004831978915, + "grad_norm": 2.3523874282836914, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.682348370552063, + "num_tokens": 59501708.0, + "step": 2368 + }, + { + "epoch": 0.2601581374917637, + "grad_norm": 2.2240331172943115, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6938170790672302, + "num_tokens": 59526561.0, + "step": 2369 + }, + { + "epoch": 0.26026795519437734, + "grad_norm": 2.4768526554107666, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6906308531761169, + "num_tokens": 59549044.0, + "step": 2370 + }, + { + "epoch": 0.260377772896991, + "grad_norm": 2.6126925945281982, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7040345072746277, + "num_tokens": 59569134.0, + "step": 2371 + }, + { + "epoch": 0.26048759059960463, + "grad_norm": 2.1031301021575928, + "learning_rate": 1e-06, + "loss": 1.0862, + "mean_token_accuracy": 0.6849391460418701, + "num_tokens": 59599110.0, + "step": 2372 + }, + { + "epoch": 0.26059740830221834, + "grad_norm": 2.1512420177459717, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7205766439437866, + "num_tokens": 59622500.0, + "step": 2373 + }, + { + "epoch": 0.260707226004832, + "grad_norm": 2.372170925140381, + "learning_rate": 1e-06, + "loss": 1.0419, + "mean_token_accuracy": 0.6917601227760315, + "num_tokens": 59645870.0, + "step": 2374 + }, + { + "epoch": 0.26081704370744563, + "grad_norm": 2.336069345474243, + 
"learning_rate": 1e-06, + "loss": 1.1131, + "mean_token_accuracy": 0.6689721345901489, + "num_tokens": 59669754.0, + "step": 2375 + }, + { + "epoch": 0.2609268614100593, + "grad_norm": 2.4280002117156982, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7145185470581055, + "num_tokens": 59690591.0, + "step": 2376 + }, + { + "epoch": 0.261036679112673, + "grad_norm": 2.238051414489746, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6997473239898682, + "num_tokens": 59716648.0, + "step": 2377 + }, + { + "epoch": 0.2611464968152866, + "grad_norm": 2.5934267044067383, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7027902007102966, + "num_tokens": 59735831.0, + "step": 2378 + }, + { + "epoch": 0.26125631451790027, + "grad_norm": 2.288172721862793, + "learning_rate": 1e-06, + "loss": 1.067, + "mean_token_accuracy": 0.6814719438552856, + "num_tokens": 59762294.0, + "step": 2379 + }, + { + "epoch": 0.26136613222051397, + "grad_norm": 2.3060522079467773, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6872516870498657, + "num_tokens": 59788441.0, + "step": 2380 + }, + { + "epoch": 0.2614759499231276, + "grad_norm": 2.5607035160064697, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7065768241882324, + "num_tokens": 59809179.0, + "step": 2381 + }, + { + "epoch": 0.26158576762574126, + "grad_norm": 2.2771270275115967, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7035287022590637, + "num_tokens": 59836579.0, + "step": 2382 + }, + { + "epoch": 0.2616955853283549, + "grad_norm": 2.557321071624756, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.6969177722930908, + "num_tokens": 59857232.0, + "step": 2383 + }, + { + "epoch": 0.2618054030309686, + "grad_norm": 1.9798297882080078, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.700168251991272, + "num_tokens": 59890398.0, + "step": 2384 + }, + { + 
"epoch": 0.26191522073358225, + "grad_norm": 2.19169020652771, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6852776408195496, + "num_tokens": 59916051.0, + "step": 2385 + }, + { + "epoch": 0.2620250384361959, + "grad_norm": 2.244966506958008, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7096320390701294, + "num_tokens": 59942110.0, + "step": 2386 + }, + { + "epoch": 0.2621348561388096, + "grad_norm": 2.6739912033081055, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.6994320154190063, + "num_tokens": 59960556.0, + "step": 2387 + }, + { + "epoch": 0.26224467384142325, + "grad_norm": 2.3624279499053955, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6863350868225098, + "num_tokens": 59983597.0, + "step": 2388 + }, + { + "epoch": 0.2623544915440369, + "grad_norm": 2.007728338241577, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6911590099334717, + "num_tokens": 60015179.0, + "step": 2389 + }, + { + "epoch": 0.26246430924665054, + "grad_norm": 2.6206324100494385, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6926881670951843, + "num_tokens": 60035516.0, + "step": 2390 + }, + { + "epoch": 0.26257412694926424, + "grad_norm": 2.43605899810791, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6978436708450317, + "num_tokens": 60056802.0, + "step": 2391 + }, + { + "epoch": 0.2626839446518779, + "grad_norm": 2.1411478519439697, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6859561204910278, + "num_tokens": 60085783.0, + "step": 2392 + }, + { + "epoch": 0.26279376235449153, + "grad_norm": 2.3166427612304688, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6863341331481934, + "num_tokens": 60109849.0, + "step": 2393 + }, + { + "epoch": 0.2629035800571052, + "grad_norm": 2.208425283432007, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 
0.6994242072105408, + "num_tokens": 60136676.0, + "step": 2394 + }, + { + "epoch": 0.2630133977597189, + "grad_norm": 2.2868380546569824, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7123802900314331, + "num_tokens": 60159844.0, + "step": 2395 + }, + { + "epoch": 0.2631232154623325, + "grad_norm": 1.915921926498413, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6927323341369629, + "num_tokens": 60192740.0, + "step": 2396 + }, + { + "epoch": 0.2632330331649462, + "grad_norm": 2.1335947513580322, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6886714696884155, + "num_tokens": 60220959.0, + "step": 2397 + }, + { + "epoch": 0.2633428508675599, + "grad_norm": 2.3017477989196777, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7007340788841248, + "num_tokens": 60245008.0, + "step": 2398 + }, + { + "epoch": 0.2634526685701735, + "grad_norm": 2.228666305541992, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6920315027236938, + "num_tokens": 60272824.0, + "step": 2399 + }, + { + "epoch": 0.26356248627278717, + "grad_norm": 2.0610783100128174, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.67223060131073, + "num_tokens": 60304602.0, + "step": 2400 + }, + { + "epoch": 0.2636723039754008, + "grad_norm": 2.005711078643799, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6768155097961426, + "num_tokens": 60334911.0, + "step": 2401 + }, + { + "epoch": 0.2637821216780145, + "grad_norm": 2.1553099155426025, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.6948296427726746, + "num_tokens": 60362345.0, + "step": 2402 + }, + { + "epoch": 0.26389193938062816, + "grad_norm": 2.1012659072875977, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7164272665977478, + "num_tokens": 60389952.0, + "step": 2403 + }, + { + "epoch": 0.2640017570832418, + "grad_norm": 2.6835076808929443, + 
"learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.706520140171051, + "num_tokens": 60407685.0, + "step": 2404 + }, + { + "epoch": 0.2641115747858555, + "grad_norm": 2.47257137298584, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7108137011528015, + "num_tokens": 60429269.0, + "step": 2405 + }, + { + "epoch": 0.26422139248846915, + "grad_norm": 2.331496477127075, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6894291043281555, + "num_tokens": 60454022.0, + "step": 2406 + }, + { + "epoch": 0.2643312101910828, + "grad_norm": 2.6776797771453857, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7005617618560791, + "num_tokens": 60475902.0, + "step": 2407 + }, + { + "epoch": 0.26444102789369645, + "grad_norm": 2.260042667388916, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7226529121398926, + "num_tokens": 60500416.0, + "step": 2408 + }, + { + "epoch": 0.26455084559631015, + "grad_norm": 2.2095303535461426, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6895872354507446, + "num_tokens": 60526333.0, + "step": 2409 + }, + { + "epoch": 0.2646606632989238, + "grad_norm": 2.0658650398254395, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.711205005645752, + "num_tokens": 60554685.0, + "step": 2410 + }, + { + "epoch": 0.26477048100153744, + "grad_norm": 2.326105833053589, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6933929324150085, + "num_tokens": 60579761.0, + "step": 2411 + }, + { + "epoch": 0.2648802987041511, + "grad_norm": 2.7606992721557617, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6815733909606934, + "num_tokens": 60599161.0, + "step": 2412 + }, + { + "epoch": 0.2649901164067648, + "grad_norm": 2.47635555267334, + "learning_rate": 1e-06, + "loss": 1.1145, + "mean_token_accuracy": 0.6655948162078857, + "num_tokens": 60624994.0, + "step": 2413 + }, + { + 
"epoch": 0.26509993410937843, + "grad_norm": 2.51257061958313, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6948988437652588, + "num_tokens": 60647183.0, + "step": 2414 + }, + { + "epoch": 0.2652097518119921, + "grad_norm": 2.666116952896118, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6984290480613708, + "num_tokens": 60665926.0, + "step": 2415 + }, + { + "epoch": 0.2653195695146058, + "grad_norm": 2.433034658432007, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.7062555551528931, + "num_tokens": 60690102.0, + "step": 2416 + }, + { + "epoch": 0.2654293872172194, + "grad_norm": 2.1312668323516846, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6755645275115967, + "num_tokens": 60719482.0, + "step": 2417 + }, + { + "epoch": 0.26553920491983307, + "grad_norm": 1.9989104270935059, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6926470994949341, + "num_tokens": 60753881.0, + "step": 2418 + }, + { + "epoch": 0.2656490226224467, + "grad_norm": 2.550128221511841, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7010809183120728, + "num_tokens": 60775288.0, + "step": 2419 + }, + { + "epoch": 0.2657588403250604, + "grad_norm": 2.346162796020508, + "learning_rate": 1e-06, + "loss": 1.1714, + "mean_token_accuracy": 0.6660013794898987, + "num_tokens": 60801479.0, + "step": 2420 + }, + { + "epoch": 0.26586865802767407, + "grad_norm": 2.163036346435547, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6964415311813354, + "num_tokens": 60827501.0, + "step": 2421 + }, + { + "epoch": 0.2659784757302877, + "grad_norm": 2.2519748210906982, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7004199028015137, + "num_tokens": 60855113.0, + "step": 2422 + }, + { + "epoch": 0.26608829343290136, + "grad_norm": 2.786306142807007, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 
0.72296541929245, + "num_tokens": 60870427.0, + "step": 2423 + }, + { + "epoch": 0.26619811113551506, + "grad_norm": 2.2121756076812744, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6854794025421143, + "num_tokens": 60895922.0, + "step": 2424 + }, + { + "epoch": 0.2663079288381287, + "grad_norm": 2.615098714828491, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.709975004196167, + "num_tokens": 60916266.0, + "step": 2425 + }, + { + "epoch": 0.26641774654074235, + "grad_norm": 2.116633892059326, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6753023862838745, + "num_tokens": 60948298.0, + "step": 2426 + }, + { + "epoch": 0.26652756424335605, + "grad_norm": 2.3266854286193848, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7082377672195435, + "num_tokens": 60971928.0, + "step": 2427 + }, + { + "epoch": 0.2666373819459697, + "grad_norm": 1.9322253465652466, + "learning_rate": 1e-06, + "loss": 1.1137, + "mean_token_accuracy": 0.6738818883895874, + "num_tokens": 61007355.0, + "step": 2428 + }, + { + "epoch": 0.26674719964858334, + "grad_norm": 2.302670955657959, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6999996900558472, + "num_tokens": 61030586.0, + "step": 2429 + }, + { + "epoch": 0.266857017351197, + "grad_norm": 2.1872098445892334, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6907274723052979, + "num_tokens": 61055355.0, + "step": 2430 + }, + { + "epoch": 0.2669668350538107, + "grad_norm": 2.3834798336029053, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6921965479850769, + "num_tokens": 61079434.0, + "step": 2431 + }, + { + "epoch": 0.26707665275642434, + "grad_norm": 2.121077060699463, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7063847780227661, + "num_tokens": 61104743.0, + "step": 2432 + }, + { + "epoch": 0.267186470459038, + "grad_norm": 2.3643410205841064, + 
"learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.731655478477478, + "num_tokens": 61125181.0, + "step": 2433 + }, + { + "epoch": 0.2672962881616517, + "grad_norm": 2.0315539836883545, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6860315203666687, + "num_tokens": 61158744.0, + "step": 2434 + }, + { + "epoch": 0.26740610586426533, + "grad_norm": 2.2929346561431885, + "learning_rate": 1e-06, + "loss": 1.1136, + "mean_token_accuracy": 0.670006275177002, + "num_tokens": 61187135.0, + "step": 2435 + }, + { + "epoch": 0.267515923566879, + "grad_norm": 2.4799907207489014, + "learning_rate": 1e-06, + "loss": 1.0665, + "mean_token_accuracy": 0.6826097965240479, + "num_tokens": 61211958.0, + "step": 2436 + }, + { + "epoch": 0.2676257412694926, + "grad_norm": 2.4483091831207275, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6809677481651306, + "num_tokens": 61232776.0, + "step": 2437 + }, + { + "epoch": 0.2677355589721063, + "grad_norm": 2.1427478790283203, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6840055584907532, + "num_tokens": 61260201.0, + "step": 2438 + }, + { + "epoch": 0.26784537667471997, + "grad_norm": 2.336181640625, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.717035174369812, + "num_tokens": 61284006.0, + "step": 2439 + }, + { + "epoch": 0.2679551943773336, + "grad_norm": 2.505847215652466, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7055004239082336, + "num_tokens": 61305532.0, + "step": 2440 + }, + { + "epoch": 0.26806501207994726, + "grad_norm": 2.2890615463256836, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.688288688659668, + "num_tokens": 61330936.0, + "step": 2441 + }, + { + "epoch": 0.26817482978256096, + "grad_norm": 2.1825592517852783, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7102240920066833, + "num_tokens": 61358764.0, + "step": 2442 + }, + { + 
"epoch": 0.2682846474851746, + "grad_norm": 2.192030906677246, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.7007057070732117, + "num_tokens": 61384750.0, + "step": 2443 + }, + { + "epoch": 0.26839446518778826, + "grad_norm": 2.37184739112854, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7075691223144531, + "num_tokens": 61408495.0, + "step": 2444 + }, + { + "epoch": 0.26850428289040196, + "grad_norm": 2.1754181385040283, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6853628158569336, + "num_tokens": 61436285.0, + "step": 2445 + }, + { + "epoch": 0.2686141005930156, + "grad_norm": 2.64428448677063, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7250133156776428, + "num_tokens": 61455657.0, + "step": 2446 + }, + { + "epoch": 0.26872391829562925, + "grad_norm": 2.0443999767303467, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6931607127189636, + "num_tokens": 61485287.0, + "step": 2447 + }, + { + "epoch": 0.2688337359982429, + "grad_norm": 2.1051743030548096, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6963009238243103, + "num_tokens": 61512034.0, + "step": 2448 + }, + { + "epoch": 0.2689435537008566, + "grad_norm": 2.396113872528076, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7092851996421814, + "num_tokens": 61534451.0, + "step": 2449 + }, + { + "epoch": 0.26905337140347024, + "grad_norm": 2.335326671600342, + "learning_rate": 1e-06, + "loss": 1.076, + "mean_token_accuracy": 0.6823700666427612, + "num_tokens": 61559304.0, + "step": 2450 + }, + { + "epoch": 0.2691631891060839, + "grad_norm": 2.281102180480957, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7040243148803711, + "num_tokens": 61585234.0, + "step": 2451 + }, + { + "epoch": 0.26927300680869753, + "grad_norm": 2.165045738220215, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 
0.7006091475486755, + "num_tokens": 61612588.0, + "step": 2452 + }, + { + "epoch": 0.26938282451131124, + "grad_norm": 2.214831590652466, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.7021700739860535, + "num_tokens": 61639429.0, + "step": 2453 + }, + { + "epoch": 0.2694926422139249, + "grad_norm": 2.1539361476898193, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7085070013999939, + "num_tokens": 61664790.0, + "step": 2454 + }, + { + "epoch": 0.26960245991653853, + "grad_norm": 2.569007158279419, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.709041953086853, + "num_tokens": 61685293.0, + "step": 2455 + }, + { + "epoch": 0.26971227761915223, + "grad_norm": 2.0407352447509766, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6980781555175781, + "num_tokens": 61715557.0, + "step": 2456 + }, + { + "epoch": 0.2698220953217659, + "grad_norm": 2.386389970779419, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6972963213920593, + "num_tokens": 61737838.0, + "step": 2457 + }, + { + "epoch": 0.2699319130243795, + "grad_norm": 2.3703219890594482, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7140052318572998, + "num_tokens": 61759453.0, + "step": 2458 + }, + { + "epoch": 0.27004173072699317, + "grad_norm": 2.562359571456909, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7007274031639099, + "num_tokens": 61778119.0, + "step": 2459 + }, + { + "epoch": 0.27015154842960687, + "grad_norm": 1.9079521894454956, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6932992935180664, + "num_tokens": 61809279.0, + "step": 2460 + }, + { + "epoch": 0.2702613661322205, + "grad_norm": 2.1956896781921387, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6884182095527649, + "num_tokens": 61836245.0, + "step": 2461 + }, + { + "epoch": 0.27037118383483416, + "grad_norm": 2.2184841632843018, + 
"learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7058579921722412, + "num_tokens": 61861493.0, + "step": 2462 + }, + { + "epoch": 0.27048100153744786, + "grad_norm": 2.1965315341949463, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6919693946838379, + "num_tokens": 61886821.0, + "step": 2463 + }, + { + "epoch": 0.2705908192400615, + "grad_norm": 2.2284862995147705, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6919086575508118, + "num_tokens": 61913068.0, + "step": 2464 + }, + { + "epoch": 0.27070063694267515, + "grad_norm": 2.4129087924957275, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6863582730293274, + "num_tokens": 61934024.0, + "step": 2465 + }, + { + "epoch": 0.2708104546452888, + "grad_norm": 2.54126238822937, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6979774832725525, + "num_tokens": 61954074.0, + "step": 2466 + }, + { + "epoch": 0.2709202723479025, + "grad_norm": 2.358544111251831, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6896025538444519, + "num_tokens": 61977230.0, + "step": 2467 + }, + { + "epoch": 0.27103009005051615, + "grad_norm": 2.236987829208374, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6942092776298523, + "num_tokens": 62004378.0, + "step": 2468 + }, + { + "epoch": 0.2711399077531298, + "grad_norm": 2.2699005603790283, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.712681233882904, + "num_tokens": 62027589.0, + "step": 2469 + }, + { + "epoch": 0.27124972545574344, + "grad_norm": 2.1218695640563965, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6891078948974609, + "num_tokens": 62057185.0, + "step": 2470 + }, + { + "epoch": 0.27135954315835714, + "grad_norm": 2.4333930015563965, + "learning_rate": 1e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.6653498411178589, + "num_tokens": 62080936.0, + "step": 2471 + }, + 
{ + "epoch": 0.2714693608609708, + "grad_norm": 2.2753899097442627, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.703184187412262, + "num_tokens": 62105708.0, + "step": 2472 + }, + { + "epoch": 0.27157917856358443, + "grad_norm": 2.0254430770874023, + "learning_rate": 1e-06, + "loss": 1.1052, + "mean_token_accuracy": 0.6657108068466187, + "num_tokens": 62138836.0, + "step": 2473 + }, + { + "epoch": 0.27168899626619814, + "grad_norm": 2.396878719329834, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6957377791404724, + "num_tokens": 62162769.0, + "step": 2474 + }, + { + "epoch": 0.2717988139688118, + "grad_norm": 2.107501745223999, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6960406303405762, + "num_tokens": 62190668.0, + "step": 2475 + }, + { + "epoch": 0.2719086316714254, + "grad_norm": 2.425309896469116, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6842442750930786, + "num_tokens": 62213889.0, + "step": 2476 + }, + { + "epoch": 0.2720184493740391, + "grad_norm": 2.2038915157318115, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7029052972793579, + "num_tokens": 62241262.0, + "step": 2477 + }, + { + "epoch": 0.2721282670766528, + "grad_norm": 2.2662715911865234, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.7040034532546997, + "num_tokens": 62264694.0, + "step": 2478 + }, + { + "epoch": 0.2722380847792664, + "grad_norm": 2.2194507122039795, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6913156509399414, + "num_tokens": 62292069.0, + "step": 2479 + }, + { + "epoch": 0.27234790248188007, + "grad_norm": 2.366790533065796, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6966978311538696, + "num_tokens": 62316144.0, + "step": 2480 + }, + { + "epoch": 0.27245772018449377, + "grad_norm": 2.195669412612915, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 
0.6989057660102844, + "num_tokens": 62344256.0, + "step": 2481 + }, + { + "epoch": 0.2725675378871074, + "grad_norm": 1.9451453685760498, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6966017484664917, + "num_tokens": 62377399.0, + "step": 2482 + }, + { + "epoch": 0.27267735558972106, + "grad_norm": 2.474918842315674, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6946871876716614, + "num_tokens": 62399326.0, + "step": 2483 + }, + { + "epoch": 0.2727871732923347, + "grad_norm": 2.013317584991455, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.697385311126709, + "num_tokens": 62429534.0, + "step": 2484 + }, + { + "epoch": 0.2728969909949484, + "grad_norm": 2.5591256618499756, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7119796276092529, + "num_tokens": 62450167.0, + "step": 2485 + }, + { + "epoch": 0.27300680869756205, + "grad_norm": 2.215607166290283, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6842603087425232, + "num_tokens": 62475208.0, + "step": 2486 + }, + { + "epoch": 0.2731166264001757, + "grad_norm": 1.9221874475479126, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6773484945297241, + "num_tokens": 62509984.0, + "step": 2487 + }, + { + "epoch": 0.27322644410278935, + "grad_norm": 2.283442735671997, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6845747232437134, + "num_tokens": 62536742.0, + "step": 2488 + }, + { + "epoch": 0.27333626180540305, + "grad_norm": 2.1317076683044434, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7022824883460999, + "num_tokens": 62565109.0, + "step": 2489 + }, + { + "epoch": 0.2734460795080167, + "grad_norm": 2.107487440109253, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6811997890472412, + "num_tokens": 62593822.0, + "step": 2490 + }, + { + "epoch": 0.27355589721063034, + "grad_norm": 2.6078455448150635, 
+ "learning_rate": 1e-06, + "loss": 1.0867, + "mean_token_accuracy": 0.6690268516540527, + "num_tokens": 62614076.0, + "step": 2491 + }, + { + "epoch": 0.27366571491324404, + "grad_norm": 2.4255995750427246, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7325183749198914, + "num_tokens": 62636397.0, + "step": 2492 + }, + { + "epoch": 0.2737755326158577, + "grad_norm": 2.6530890464782715, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6978088617324829, + "num_tokens": 62657099.0, + "step": 2493 + }, + { + "epoch": 0.27388535031847133, + "grad_norm": 2.2977664470672607, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7215265035629272, + "num_tokens": 62681402.0, + "step": 2494 + }, + { + "epoch": 0.273995168021085, + "grad_norm": 2.643411874771118, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7141866683959961, + "num_tokens": 62700545.0, + "step": 2495 + }, + { + "epoch": 0.2741049857236987, + "grad_norm": 2.18404221534729, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6864283084869385, + "num_tokens": 62730296.0, + "step": 2496 + }, + { + "epoch": 0.2742148034263123, + "grad_norm": 2.2724924087524414, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7136543393135071, + "num_tokens": 62755339.0, + "step": 2497 + }, + { + "epoch": 0.27432462112892597, + "grad_norm": 2.0116944313049316, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7010092735290527, + "num_tokens": 62786430.0, + "step": 2498 + }, + { + "epoch": 0.2744344388315396, + "grad_norm": 2.280759811401367, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6868424415588379, + "num_tokens": 62813861.0, + "step": 2499 + }, + { + "epoch": 0.2745442565341533, + "grad_norm": 2.340407371520996, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.70466148853302, + "num_tokens": 62837815.0, + "step": 2500 + }, + { + 
"epoch": 0.27465407423676697, + "grad_norm": 2.330181360244751, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7053000330924988, + "num_tokens": 62863512.0, + "step": 2501 + }, + { + "epoch": 0.2747638919393806, + "grad_norm": 2.57783842086792, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.713119387626648, + "num_tokens": 62882305.0, + "step": 2502 + }, + { + "epoch": 0.2748737096419943, + "grad_norm": 2.517691135406494, + "learning_rate": 1e-06, + "loss": 1.0836, + "mean_token_accuracy": 0.6763880848884583, + "num_tokens": 62904919.0, + "step": 2503 + }, + { + "epoch": 0.27498352734460796, + "grad_norm": 2.194854259490967, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6891590356826782, + "num_tokens": 62931669.0, + "step": 2504 + }, + { + "epoch": 0.2750933450472216, + "grad_norm": 2.1749629974365234, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7023754119873047, + "num_tokens": 62957452.0, + "step": 2505 + }, + { + "epoch": 0.27520316274983525, + "grad_norm": 2.189581871032715, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7024087905883789, + "num_tokens": 62985616.0, + "step": 2506 + }, + { + "epoch": 0.27531298045244895, + "grad_norm": 2.304112195968628, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7024217844009399, + "num_tokens": 63009977.0, + "step": 2507 + }, + { + "epoch": 0.2754227981550626, + "grad_norm": 2.5682451725006104, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6967606544494629, + "num_tokens": 63030467.0, + "step": 2508 + }, + { + "epoch": 0.27553261585767624, + "grad_norm": 2.294499397277832, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6969573497772217, + "num_tokens": 63055264.0, + "step": 2509 + }, + { + "epoch": 0.27564243356028995, + "grad_norm": 2.449016809463501, + "learning_rate": 1e-06, + "loss": 1.0924, + "mean_token_accuracy": 
0.6700223684310913, + "num_tokens": 63077091.0, + "step": 2510 + }, + { + "epoch": 0.2757522512629036, + "grad_norm": 1.9367485046386719, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6811912059783936, + "num_tokens": 63108777.0, + "step": 2511 + }, + { + "epoch": 0.27586206896551724, + "grad_norm": 2.3571560382843018, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.697460412979126, + "num_tokens": 63133075.0, + "step": 2512 + }, + { + "epoch": 0.2759718866681309, + "grad_norm": 2.3331682682037354, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7004110813140869, + "num_tokens": 63157140.0, + "step": 2513 + }, + { + "epoch": 0.2760817043707446, + "grad_norm": 2.4222309589385986, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7016029357910156, + "num_tokens": 63180735.0, + "step": 2514 + }, + { + "epoch": 0.27619152207335823, + "grad_norm": 2.626286268234253, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7077455520629883, + "num_tokens": 63200306.0, + "step": 2515 + }, + { + "epoch": 0.2763013397759719, + "grad_norm": 2.231816053390503, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6923990249633789, + "num_tokens": 63227248.0, + "step": 2516 + }, + { + "epoch": 0.2764111574785855, + "grad_norm": 2.555615186691284, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.687694787979126, + "num_tokens": 63248551.0, + "step": 2517 + }, + { + "epoch": 0.2765209751811992, + "grad_norm": 1.9705276489257812, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6994972229003906, + "num_tokens": 63282746.0, + "step": 2518 + }, + { + "epoch": 0.27663079288381287, + "grad_norm": 2.3061399459838867, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7012748718261719, + "num_tokens": 63308132.0, + "step": 2519 + }, + { + "epoch": 0.2767406105864265, + "grad_norm": 2.2017579078674316, + 
"learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.690434455871582, + "num_tokens": 63336362.0, + "step": 2520 + }, + { + "epoch": 0.2768504282890402, + "grad_norm": 2.3559160232543945, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6945036053657532, + "num_tokens": 63360514.0, + "step": 2521 + }, + { + "epoch": 0.27696024599165386, + "grad_norm": 2.4050638675689697, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7130299210548401, + "num_tokens": 63382752.0, + "step": 2522 + }, + { + "epoch": 0.2770700636942675, + "grad_norm": 2.2185895442962646, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6961824893951416, + "num_tokens": 63409580.0, + "step": 2523 + }, + { + "epoch": 0.27717988139688116, + "grad_norm": 2.526289939880371, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7024586796760559, + "num_tokens": 63429742.0, + "step": 2524 + }, + { + "epoch": 0.27728969909949486, + "grad_norm": 2.2984886169433594, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7087610960006714, + "num_tokens": 63453635.0, + "step": 2525 + }, + { + "epoch": 0.2773995168021085, + "grad_norm": 2.6738288402557373, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6814554929733276, + "num_tokens": 63471849.0, + "step": 2526 + }, + { + "epoch": 0.27750933450472215, + "grad_norm": 2.039402961730957, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.6979061365127563, + "num_tokens": 63503749.0, + "step": 2527 + }, + { + "epoch": 0.2776191522073358, + "grad_norm": 2.2016842365264893, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6950960159301758, + "num_tokens": 63533285.0, + "step": 2528 + }, + { + "epoch": 0.2777289699099495, + "grad_norm": 2.232560634613037, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7042960524559021, + "num_tokens": 63560575.0, + "step": 2529 + }, + { 
+ "epoch": 0.27783878761256314, + "grad_norm": 2.1273248195648193, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7123779654502869, + "num_tokens": 63585920.0, + "step": 2530 + }, + { + "epoch": 0.2779486053151768, + "grad_norm": 2.5499348640441895, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7124748826026917, + "num_tokens": 63604952.0, + "step": 2531 + }, + { + "epoch": 0.2780584230177905, + "grad_norm": 2.2050559520721436, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7028633952140808, + "num_tokens": 63630594.0, + "step": 2532 + }, + { + "epoch": 0.27816824072040414, + "grad_norm": 2.375535249710083, + "learning_rate": 1e-06, + "loss": 1.111, + "mean_token_accuracy": 0.6647810339927673, + "num_tokens": 63655941.0, + "step": 2533 + }, + { + "epoch": 0.2782780584230178, + "grad_norm": 2.6204283237457275, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.706587553024292, + "num_tokens": 63678117.0, + "step": 2534 + }, + { + "epoch": 0.27838787612563143, + "grad_norm": 2.066242218017578, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7258816361427307, + "num_tokens": 63705102.0, + "step": 2535 + }, + { + "epoch": 0.27849769382824513, + "grad_norm": 2.187535524368286, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.720867931842804, + "num_tokens": 63729866.0, + "step": 2536 + }, + { + "epoch": 0.2786075115308588, + "grad_norm": 2.4411821365356445, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.707390546798706, + "num_tokens": 63750289.0, + "step": 2537 + }, + { + "epoch": 0.2787173292334724, + "grad_norm": 2.314700126647949, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.7026731371879578, + "num_tokens": 63774814.0, + "step": 2538 + }, + { + "epoch": 0.2788271469360861, + "grad_norm": 2.009385585784912, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 
0.6888874769210815, + "num_tokens": 63806736.0, + "step": 2539 + }, + { + "epoch": 0.27893696463869977, + "grad_norm": 2.423470973968506, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.694627046585083, + "num_tokens": 63829841.0, + "step": 2540 + }, + { + "epoch": 0.2790467823413134, + "grad_norm": 2.0396230220794678, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6820549964904785, + "num_tokens": 63859289.0, + "step": 2541 + }, + { + "epoch": 0.27915660004392706, + "grad_norm": 2.3336381912231445, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6835339069366455, + "num_tokens": 63883186.0, + "step": 2542 + }, + { + "epoch": 0.27926641774654076, + "grad_norm": 2.08699893951416, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7090340852737427, + "num_tokens": 63910581.0, + "step": 2543 + }, + { + "epoch": 0.2793762354491544, + "grad_norm": 2.4097633361816406, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.689897894859314, + "num_tokens": 63933877.0, + "step": 2544 + }, + { + "epoch": 0.27948605315176805, + "grad_norm": 2.0726377964019775, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6967602968215942, + "num_tokens": 63962630.0, + "step": 2545 + }, + { + "epoch": 0.2795958708543817, + "grad_norm": 2.4625961780548096, + "learning_rate": 1e-06, + "loss": 1.0699, + "mean_token_accuracy": 0.680037260055542, + "num_tokens": 63984357.0, + "step": 2546 + }, + { + "epoch": 0.2797056885569954, + "grad_norm": 2.3491885662078857, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6852930188179016, + "num_tokens": 64007796.0, + "step": 2547 + }, + { + "epoch": 0.27981550625960905, + "grad_norm": 2.171307325363159, + "learning_rate": 1e-06, + "loss": 1.121, + "mean_token_accuracy": 0.6652302742004395, + "num_tokens": 64037878.0, + "step": 2548 + }, + { + "epoch": 0.2799253239622227, + "grad_norm": 2.2686548233032227, + 
"learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.70911705493927, + "num_tokens": 64063815.0, + "step": 2549 + }, + { + "epoch": 0.2800351416648364, + "grad_norm": 2.4254372119903564, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.7026336193084717, + "num_tokens": 64086977.0, + "step": 2550 + }, + { + "epoch": 0.28014495936745004, + "grad_norm": 2.6359453201293945, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6866934299468994, + "num_tokens": 64107143.0, + "step": 2551 + }, + { + "epoch": 0.2802547770700637, + "grad_norm": 2.360159158706665, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6883187890052795, + "num_tokens": 64129486.0, + "step": 2552 + }, + { + "epoch": 0.28036459477267733, + "grad_norm": 2.1367273330688477, + "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6727042198181152, + "num_tokens": 64158465.0, + "step": 2553 + }, + { + "epoch": 0.28047441247529104, + "grad_norm": 2.024442195892334, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6868480443954468, + "num_tokens": 64189327.0, + "step": 2554 + }, + { + "epoch": 0.2805842301779047, + "grad_norm": 2.49996280670166, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7063494324684143, + "num_tokens": 64210447.0, + "step": 2555 + }, + { + "epoch": 0.2806940478805183, + "grad_norm": 2.393921136856079, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6920029520988464, + "num_tokens": 64232956.0, + "step": 2556 + }, + { + "epoch": 0.28080386558313203, + "grad_norm": 2.210930585861206, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.695279061794281, + "num_tokens": 64259555.0, + "step": 2557 + }, + { + "epoch": 0.2809136832857457, + "grad_norm": 2.471102237701416, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6745966672897339, + "num_tokens": 64282474.0, + "step": 2558 + }, + { + 
"epoch": 0.2810235009883593, + "grad_norm": 2.2767109870910645, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7206125259399414, + "num_tokens": 64305315.0, + "step": 2559 + }, + { + "epoch": 0.28113331869097297, + "grad_norm": 2.5371365547180176, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7089684009552002, + "num_tokens": 64327195.0, + "step": 2560 + }, + { + "epoch": 0.28124313639358667, + "grad_norm": 2.213730573654175, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7070936560630798, + "num_tokens": 64351541.0, + "step": 2561 + }, + { + "epoch": 0.2813529540962003, + "grad_norm": 2.1606037616729736, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6818834543228149, + "num_tokens": 64378646.0, + "step": 2562 + }, + { + "epoch": 0.28146277179881396, + "grad_norm": 2.681396961212158, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7105700969696045, + "num_tokens": 64397793.0, + "step": 2563 + }, + { + "epoch": 0.2815725895014276, + "grad_norm": 2.1793830394744873, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6773754358291626, + "num_tokens": 64426895.0, + "step": 2564 + }, + { + "epoch": 0.2816824072040413, + "grad_norm": 2.3562722206115723, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7092365026473999, + "num_tokens": 64450103.0, + "step": 2565 + }, + { + "epoch": 0.28179222490665495, + "grad_norm": 2.4332337379455566, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7064959406852722, + "num_tokens": 64471234.0, + "step": 2566 + }, + { + "epoch": 0.2819020426092686, + "grad_norm": 2.455705165863037, + "learning_rate": 1e-06, + "loss": 1.0979, + "mean_token_accuracy": 0.6793391108512878, + "num_tokens": 64493239.0, + "step": 2567 + }, + { + "epoch": 0.2820118603118823, + "grad_norm": 2.279736042022705, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 
0.6859068870544434, + "num_tokens": 64522458.0, + "step": 2568 + }, + { + "epoch": 0.28212167801449595, + "grad_norm": 2.452082633972168, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7148061990737915, + "num_tokens": 64543377.0, + "step": 2569 + }, + { + "epoch": 0.2822314957171096, + "grad_norm": 2.28579044342041, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6966915726661682, + "num_tokens": 64567853.0, + "step": 2570 + }, + { + "epoch": 0.28234131341972324, + "grad_norm": 2.3189234733581543, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.7045653462409973, + "num_tokens": 64591294.0, + "step": 2571 + }, + { + "epoch": 0.28245113112233694, + "grad_norm": 2.442819595336914, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.694509744644165, + "num_tokens": 64613156.0, + "step": 2572 + }, + { + "epoch": 0.2825609488249506, + "grad_norm": 2.218254327774048, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.728291392326355, + "num_tokens": 64636049.0, + "step": 2573 + }, + { + "epoch": 0.28267076652756423, + "grad_norm": 2.5835721492767334, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6817720532417297, + "num_tokens": 64657709.0, + "step": 2574 + }, + { + "epoch": 0.2827805842301779, + "grad_norm": 2.185490131378174, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6880613565444946, + "num_tokens": 64682836.0, + "step": 2575 + }, + { + "epoch": 0.2828904019327916, + "grad_norm": 2.3605968952178955, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7042312026023865, + "num_tokens": 64705869.0, + "step": 2576 + }, + { + "epoch": 0.2830002196354052, + "grad_norm": 2.507667064666748, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7018929719924927, + "num_tokens": 64727109.0, + "step": 2577 + }, + { + "epoch": 0.28311003733801887, + "grad_norm": 2.2010276317596436, + 
"learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.7007261514663696, + "num_tokens": 64753913.0, + "step": 2578 + }, + { + "epoch": 0.2832198550406326, + "grad_norm": 2.447721242904663, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7037502527236938, + "num_tokens": 64777938.0, + "step": 2579 + }, + { + "epoch": 0.2833296727432462, + "grad_norm": 2.3094184398651123, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6814182996749878, + "num_tokens": 64804478.0, + "step": 2580 + }, + { + "epoch": 0.28343949044585987, + "grad_norm": 2.060894727706909, + "learning_rate": 1e-06, + "loss": 1.0619, + "mean_token_accuracy": 0.6831807494163513, + "num_tokens": 64833423.0, + "step": 2581 + }, + { + "epoch": 0.2835493081484735, + "grad_norm": 2.176039218902588, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7085230350494385, + "num_tokens": 64863060.0, + "step": 2582 + }, + { + "epoch": 0.2836591258510872, + "grad_norm": 2.452423572540283, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7094262838363647, + "num_tokens": 64884307.0, + "step": 2583 + }, + { + "epoch": 0.28376894355370086, + "grad_norm": 2.0999159812927246, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7065525650978088, + "num_tokens": 64911640.0, + "step": 2584 + }, + { + "epoch": 0.2838787612563145, + "grad_norm": 2.5552403926849365, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.6931823492050171, + "num_tokens": 64932394.0, + "step": 2585 + }, + { + "epoch": 0.2839885789589282, + "grad_norm": 2.1811861991882324, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7131698131561279, + "num_tokens": 64960475.0, + "step": 2586 + }, + { + "epoch": 0.28409839666154185, + "grad_norm": 1.9326868057250977, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7042446136474609, + "num_tokens": 64993395.0, + "step": 2587 + }, + { 
+ "epoch": 0.2842082143641555, + "grad_norm": 2.375236749649048, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6944425106048584, + "num_tokens": 65017291.0, + "step": 2588 + }, + { + "epoch": 0.28431803206676914, + "grad_norm": 2.392366886138916, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.700285792350769, + "num_tokens": 65039370.0, + "step": 2589 + }, + { + "epoch": 0.28442784976938285, + "grad_norm": 2.093299627304077, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.7014915347099304, + "num_tokens": 65067883.0, + "step": 2590 + }, + { + "epoch": 0.2845376674719965, + "grad_norm": 2.208899974822998, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6944417357444763, + "num_tokens": 65092853.0, + "step": 2591 + }, + { + "epoch": 0.28464748517461014, + "grad_norm": 2.188833713531494, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6853203177452087, + "num_tokens": 65119123.0, + "step": 2592 + }, + { + "epoch": 0.2847573028772238, + "grad_norm": 2.2031633853912354, + "learning_rate": 1e-06, + "loss": 1.0958, + "mean_token_accuracy": 0.6742926836013794, + "num_tokens": 65148713.0, + "step": 2593 + }, + { + "epoch": 0.2848671205798375, + "grad_norm": 2.3485186100006104, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6961633563041687, + "num_tokens": 65171973.0, + "step": 2594 + }, + { + "epoch": 0.28497693828245113, + "grad_norm": 2.3994619846343994, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6932390928268433, + "num_tokens": 65195904.0, + "step": 2595 + }, + { + "epoch": 0.2850867559850648, + "grad_norm": 2.3178064823150635, + "learning_rate": 1e-06, + "loss": 1.0246, + "mean_token_accuracy": 0.6941348910331726, + "num_tokens": 65220446.0, + "step": 2596 + }, + { + "epoch": 0.2851965736876785, + "grad_norm": 2.2550504207611084, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 
0.7117663621902466, + "num_tokens": 65245715.0, + "step": 2597 + }, + { + "epoch": 0.2853063913902921, + "grad_norm": 2.125500202178955, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6994136571884155, + "num_tokens": 65273862.0, + "step": 2598 + }, + { + "epoch": 0.28541620909290577, + "grad_norm": 2.333402633666992, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7121127247810364, + "num_tokens": 65297278.0, + "step": 2599 + }, + { + "epoch": 0.2855260267955194, + "grad_norm": 2.2226743698120117, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6878629922866821, + "num_tokens": 65324786.0, + "step": 2600 + }, + { + "epoch": 0.2856358444981331, + "grad_norm": 2.169746160507202, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.709102988243103, + "num_tokens": 65350959.0, + "step": 2601 + }, + { + "epoch": 0.28574566220074676, + "grad_norm": 2.2091236114501953, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7012593746185303, + "num_tokens": 65375532.0, + "step": 2602 + }, + { + "epoch": 0.2858554799033604, + "grad_norm": 2.309642791748047, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7191863656044006, + "num_tokens": 65398814.0, + "step": 2603 + }, + { + "epoch": 0.28596529760597406, + "grad_norm": 2.464226484298706, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6783791780471802, + "num_tokens": 65421427.0, + "step": 2604 + }, + { + "epoch": 0.28607511530858776, + "grad_norm": 2.1282553672790527, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.7016624212265015, + "num_tokens": 65449207.0, + "step": 2605 + }, + { + "epoch": 0.2861849330112014, + "grad_norm": 2.231900215148926, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7045590877532959, + "num_tokens": 65474736.0, + "step": 2606 + }, + { + "epoch": 0.28629475071381505, + "grad_norm": 2.2444019317626953, + 
"learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7102773189544678, + "num_tokens": 65497829.0, + "step": 2607 + }, + { + "epoch": 0.28640456841642875, + "grad_norm": 2.390070915222168, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6911417245864868, + "num_tokens": 65522859.0, + "step": 2608 + }, + { + "epoch": 0.2865143861190424, + "grad_norm": 2.4168248176574707, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7196215987205505, + "num_tokens": 65544340.0, + "step": 2609 + }, + { + "epoch": 0.28662420382165604, + "grad_norm": 2.4785268306732178, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7062402367591858, + "num_tokens": 65565242.0, + "step": 2610 + }, + { + "epoch": 0.2867340215242697, + "grad_norm": 2.1361515522003174, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6954139471054077, + "num_tokens": 65593690.0, + "step": 2611 + }, + { + "epoch": 0.2868438392268834, + "grad_norm": 1.979006290435791, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7054519653320312, + "num_tokens": 65622308.0, + "step": 2612 + }, + { + "epoch": 0.28695365692949704, + "grad_norm": 2.5328707695007324, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7076466083526611, + "num_tokens": 65643457.0, + "step": 2613 + }, + { + "epoch": 0.2870634746321107, + "grad_norm": 2.4056458473205566, + "learning_rate": 1e-06, + "loss": 1.0425, + "mean_token_accuracy": 0.6874262094497681, + "num_tokens": 65666767.0, + "step": 2614 + }, + { + "epoch": 0.2871732923347244, + "grad_norm": 2.3422586917877197, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7016975283622742, + "num_tokens": 65691515.0, + "step": 2615 + }, + { + "epoch": 0.28728311003733803, + "grad_norm": 2.3663077354431152, + "learning_rate": 1e-06, + "loss": 1.1066, + "mean_token_accuracy": 0.6780569553375244, + "num_tokens": 65716278.0, + "step": 2616 + }, + 
{ + "epoch": 0.2873929277399517, + "grad_norm": 2.5675973892211914, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6887905597686768, + "num_tokens": 65736380.0, + "step": 2617 + }, + { + "epoch": 0.2875027454425653, + "grad_norm": 2.4914145469665527, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6900588274002075, + "num_tokens": 65757887.0, + "step": 2618 + }, + { + "epoch": 0.287612563145179, + "grad_norm": 2.5999996662139893, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.6956565380096436, + "num_tokens": 65780569.0, + "step": 2619 + }, + { + "epoch": 0.28772238084779267, + "grad_norm": 2.226823329925537, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6866136789321899, + "num_tokens": 65805394.0, + "step": 2620 + }, + { + "epoch": 0.2878321985504063, + "grad_norm": 2.4114201068878174, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7174851298332214, + "num_tokens": 65827490.0, + "step": 2621 + }, + { + "epoch": 0.28794201625301996, + "grad_norm": 2.4031877517700195, + "learning_rate": 1e-06, + "loss": 1.0914, + "mean_token_accuracy": 0.6794236898422241, + "num_tokens": 65853607.0, + "step": 2622 + }, + { + "epoch": 0.28805183395563366, + "grad_norm": 2.409088134765625, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7219182252883911, + "num_tokens": 65875061.0, + "step": 2623 + }, + { + "epoch": 0.2881616516582473, + "grad_norm": 2.502103328704834, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6933274269104004, + "num_tokens": 65898094.0, + "step": 2624 + }, + { + "epoch": 0.28827146936086095, + "grad_norm": 2.2712795734405518, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7042895555496216, + "num_tokens": 65922277.0, + "step": 2625 + }, + { + "epoch": 0.28838128706347466, + "grad_norm": 2.30246901512146, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 
0.6853289008140564, + "num_tokens": 65947317.0, + "step": 2626 + }, + { + "epoch": 0.2884911047660883, + "grad_norm": 2.3772053718566895, + "learning_rate": 1e-06, + "loss": 0.9975, + "mean_token_accuracy": 0.7072361707687378, + "num_tokens": 65970417.0, + "step": 2627 + }, + { + "epoch": 0.28860092246870195, + "grad_norm": 2.228330135345459, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7052270174026489, + "num_tokens": 65995815.0, + "step": 2628 + }, + { + "epoch": 0.2887107401713156, + "grad_norm": 2.2161977291107178, + "learning_rate": 1e-06, + "loss": 1.0551, + "mean_token_accuracy": 0.6834270358085632, + "num_tokens": 66022991.0, + "step": 2629 + }, + { + "epoch": 0.2888205578739293, + "grad_norm": 2.4873995780944824, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7141168713569641, + "num_tokens": 66043090.0, + "step": 2630 + }, + { + "epoch": 0.28893037557654294, + "grad_norm": 2.202908992767334, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6814894676208496, + "num_tokens": 66069227.0, + "step": 2631 + }, + { + "epoch": 0.2890401932791566, + "grad_norm": 2.230609655380249, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6941406726837158, + "num_tokens": 66096065.0, + "step": 2632 + }, + { + "epoch": 0.2891500109817703, + "grad_norm": 2.2312042713165283, + "learning_rate": 1e-06, + "loss": 1.1083, + "mean_token_accuracy": 0.6764600872993469, + "num_tokens": 66123951.0, + "step": 2633 + }, + { + "epoch": 0.28925982868438394, + "grad_norm": 2.1595592498779297, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6989185214042664, + "num_tokens": 66152132.0, + "step": 2634 + }, + { + "epoch": 0.2893696463869976, + "grad_norm": 2.2639353275299072, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6988635659217834, + "num_tokens": 66177290.0, + "step": 2635 + }, + { + "epoch": 0.2894794640896112, + "grad_norm": 2.3368945121765137, 
+ "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6953414678573608, + "num_tokens": 66200318.0, + "step": 2636 + }, + { + "epoch": 0.28958928179222493, + "grad_norm": 2.367358922958374, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6968969106674194, + "num_tokens": 66224231.0, + "step": 2637 + }, + { + "epoch": 0.2896990994948386, + "grad_norm": 2.372314453125, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7008341550827026, + "num_tokens": 66247823.0, + "step": 2638 + }, + { + "epoch": 0.2898089171974522, + "grad_norm": 2.211585760116577, + "learning_rate": 1e-06, + "loss": 1.0743, + "mean_token_accuracy": 0.6799015998840332, + "num_tokens": 66275609.0, + "step": 2639 + }, + { + "epoch": 0.28991873490006587, + "grad_norm": 2.26098895072937, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7016662955284119, + "num_tokens": 66301584.0, + "step": 2640 + }, + { + "epoch": 0.29002855260267957, + "grad_norm": 2.1995067596435547, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7011446952819824, + "num_tokens": 66326945.0, + "step": 2641 + }, + { + "epoch": 0.2901383703052932, + "grad_norm": 1.925787329673767, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7208143472671509, + "num_tokens": 66359217.0, + "step": 2642 + }, + { + "epoch": 0.29024818800790686, + "grad_norm": 2.2248802185058594, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7191047072410583, + "num_tokens": 66384380.0, + "step": 2643 + }, + { + "epoch": 0.29035800571052056, + "grad_norm": 2.357884168624878, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7113088369369507, + "num_tokens": 66406541.0, + "step": 2644 + }, + { + "epoch": 0.2904678234131342, + "grad_norm": 2.204336166381836, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6939241886138916, + "num_tokens": 66433890.0, + "step": 2645 + }, + { + 
"epoch": 0.29057764111574785, + "grad_norm": 2.3337676525115967, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6819303035736084, + "num_tokens": 66457550.0, + "step": 2646 + }, + { + "epoch": 0.2906874588183615, + "grad_norm": 2.035432815551758, + "learning_rate": 1e-06, + "loss": 1.0731, + "mean_token_accuracy": 0.6854873895645142, + "num_tokens": 66488139.0, + "step": 2647 + }, + { + "epoch": 0.2907972765209752, + "grad_norm": 2.1372010707855225, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7092969417572021, + "num_tokens": 66513967.0, + "step": 2648 + }, + { + "epoch": 0.29090709422358885, + "grad_norm": 2.5840094089508057, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7141435742378235, + "num_tokens": 66535218.0, + "step": 2649 + }, + { + "epoch": 0.2910169119262025, + "grad_norm": 2.1439614295959473, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6950520277023315, + "num_tokens": 66563432.0, + "step": 2650 + }, + { + "epoch": 0.29112672962881614, + "grad_norm": 2.3914554119110107, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.688933253288269, + "num_tokens": 66588932.0, + "step": 2651 + }, + { + "epoch": 0.29123654733142984, + "grad_norm": 2.409661054611206, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.697152853012085, + "num_tokens": 66612223.0, + "step": 2652 + }, + { + "epoch": 0.2913463650340435, + "grad_norm": 2.381716728210449, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7018009424209595, + "num_tokens": 66633905.0, + "step": 2653 + }, + { + "epoch": 0.29145618273665713, + "grad_norm": 2.5518100261688232, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7227147817611694, + "num_tokens": 66653928.0, + "step": 2654 + }, + { + "epoch": 0.29156600043927083, + "grad_norm": 2.1677825450897217, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 
0.7034388780593872, + "num_tokens": 66680080.0, + "step": 2655 + }, + { + "epoch": 0.2916758181418845, + "grad_norm": 2.721212148666382, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7054692506790161, + "num_tokens": 66700271.0, + "step": 2656 + }, + { + "epoch": 0.2917856358444981, + "grad_norm": 2.319235324859619, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.6930310130119324, + "num_tokens": 66725664.0, + "step": 2657 + }, + { + "epoch": 0.29189545354711177, + "grad_norm": 2.399569034576416, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7013370394706726, + "num_tokens": 66747755.0, + "step": 2658 + }, + { + "epoch": 0.2920052712497255, + "grad_norm": 2.3823225498199463, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6960731744766235, + "num_tokens": 66770424.0, + "step": 2659 + }, + { + "epoch": 0.2921150889523391, + "grad_norm": 2.317141056060791, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7197617888450623, + "num_tokens": 66792765.0, + "step": 2660 + }, + { + "epoch": 0.29222490665495277, + "grad_norm": 2.1985883712768555, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6857092976570129, + "num_tokens": 66816133.0, + "step": 2661 + }, + { + "epoch": 0.29233472435756647, + "grad_norm": 2.284254312515259, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7114474177360535, + "num_tokens": 66841107.0, + "step": 2662 + }, + { + "epoch": 0.2924445420601801, + "grad_norm": 2.348750352859497, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6901848316192627, + "num_tokens": 66865103.0, + "step": 2663 + }, + { + "epoch": 0.29255435976279376, + "grad_norm": 2.234010934829712, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7244426012039185, + "num_tokens": 66890120.0, + "step": 2664 + }, + { + "epoch": 0.2926641774654074, + "grad_norm": 2.4694712162017822, + 
"learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6980277895927429, + "num_tokens": 66911898.0, + "step": 2665 + }, + { + "epoch": 0.2927739951680211, + "grad_norm": 2.367783784866333, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7132750749588013, + "num_tokens": 66933167.0, + "step": 2666 + }, + { + "epoch": 0.29288381287063475, + "grad_norm": 2.1498260498046875, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6877090930938721, + "num_tokens": 66958985.0, + "step": 2667 + }, + { + "epoch": 0.2929936305732484, + "grad_norm": 2.2201406955718994, + "learning_rate": 1e-06, + "loss": 1.082, + "mean_token_accuracy": 0.6868523955345154, + "num_tokens": 66985648.0, + "step": 2668 + }, + { + "epoch": 0.29310344827586204, + "grad_norm": 2.099294900894165, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.693402886390686, + "num_tokens": 67013738.0, + "step": 2669 + }, + { + "epoch": 0.29321326597847575, + "grad_norm": 2.7972869873046875, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.710702121257782, + "num_tokens": 67031578.0, + "step": 2670 + }, + { + "epoch": 0.2933230836810894, + "grad_norm": 2.161496162414551, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7045111656188965, + "num_tokens": 67057070.0, + "step": 2671 + }, + { + "epoch": 0.29343290138370304, + "grad_norm": 2.0763866901397705, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6863036155700684, + "num_tokens": 67085655.0, + "step": 2672 + }, + { + "epoch": 0.29354271908631674, + "grad_norm": 2.5628035068511963, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7037544250488281, + "num_tokens": 67104011.0, + "step": 2673 + }, + { + "epoch": 0.2936525367889304, + "grad_norm": 2.219149589538574, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7021877765655518, + "num_tokens": 67129697.0, + "step": 2674 + }, + { 
+ "epoch": 0.29376235449154403, + "grad_norm": 2.2906012535095215, + "learning_rate": 1e-06, + "loss": 1.1119, + "mean_token_accuracy": 0.6723999977111816, + "num_tokens": 67156258.0, + "step": 2675 + }, + { + "epoch": 0.2938721721941577, + "grad_norm": 2.409090280532837, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6939147710800171, + "num_tokens": 67179756.0, + "step": 2676 + }, + { + "epoch": 0.2939819898967714, + "grad_norm": 2.2182600498199463, + "learning_rate": 1e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6863908767700195, + "num_tokens": 67208541.0, + "step": 2677 + }, + { + "epoch": 0.294091807599385, + "grad_norm": 2.571378231048584, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7052197456359863, + "num_tokens": 67229057.0, + "step": 2678 + }, + { + "epoch": 0.29420162530199867, + "grad_norm": 2.582306146621704, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.6992868185043335, + "num_tokens": 67247504.0, + "step": 2679 + }, + { + "epoch": 0.2943114430046123, + "grad_norm": 2.914675712585449, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6905432939529419, + "num_tokens": 67264976.0, + "step": 2680 + }, + { + "epoch": 0.294421260707226, + "grad_norm": 2.1954667568206787, + "learning_rate": 1e-06, + "loss": 1.039, + "mean_token_accuracy": 0.6878048777580261, + "num_tokens": 67292567.0, + "step": 2681 + }, + { + "epoch": 0.29453107840983966, + "grad_norm": 2.4574568271636963, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6916320323944092, + "num_tokens": 67312514.0, + "step": 2682 + }, + { + "epoch": 0.2946408961124533, + "grad_norm": 2.415107250213623, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7077081203460693, + "num_tokens": 67335035.0, + "step": 2683 + }, + { + "epoch": 0.294750713815067, + "grad_norm": 2.3094518184661865, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 
0.6946007013320923, + "num_tokens": 67359333.0, + "step": 2684 + }, + { + "epoch": 0.29486053151768066, + "grad_norm": 2.359457492828369, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6748325228691101, + "num_tokens": 67381219.0, + "step": 2685 + }, + { + "epoch": 0.2949703492202943, + "grad_norm": 2.358384132385254, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7201439738273621, + "num_tokens": 67403990.0, + "step": 2686 + }, + { + "epoch": 0.29508016692290795, + "grad_norm": 2.425229072570801, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7053844928741455, + "num_tokens": 67427120.0, + "step": 2687 + }, + { + "epoch": 0.29518998462552165, + "grad_norm": 2.247347116470337, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6968393325805664, + "num_tokens": 67452791.0, + "step": 2688 + }, + { + "epoch": 0.2952998023281353, + "grad_norm": 2.1344552040100098, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7018333077430725, + "num_tokens": 67479395.0, + "step": 2689 + }, + { + "epoch": 0.29540962003074894, + "grad_norm": 2.7589752674102783, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6954775452613831, + "num_tokens": 67496917.0, + "step": 2690 + }, + { + "epoch": 0.29551943773336264, + "grad_norm": 2.3350372314453125, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6844907402992249, + "num_tokens": 67522025.0, + "step": 2691 + }, + { + "epoch": 0.2956292554359763, + "grad_norm": 2.2152516841888428, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7034913301467896, + "num_tokens": 67547023.0, + "step": 2692 + }, + { + "epoch": 0.29573907313858994, + "grad_norm": 2.1579551696777344, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6840916275978088, + "num_tokens": 67576182.0, + "step": 2693 + }, + { + "epoch": 0.2958488908412036, + "grad_norm": 
2.1935269832611084, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7011462450027466, + "num_tokens": 67602966.0, + "step": 2694 + }, + { + "epoch": 0.2959587085438173, + "grad_norm": 2.369779586791992, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.702312707901001, + "num_tokens": 67625817.0, + "step": 2695 + }, + { + "epoch": 0.29606852624643093, + "grad_norm": 2.2580180168151855, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7286548018455505, + "num_tokens": 67649989.0, + "step": 2696 + }, + { + "epoch": 0.2961783439490446, + "grad_norm": 2.243784189224243, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.7002599835395813, + "num_tokens": 67675261.0, + "step": 2697 + }, + { + "epoch": 0.2962881616516582, + "grad_norm": 2.0476880073547363, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.695215106010437, + "num_tokens": 67704282.0, + "step": 2698 + }, + { + "epoch": 0.2963979793542719, + "grad_norm": 2.7507386207580566, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6864377856254578, + "num_tokens": 67723561.0, + "step": 2699 + }, + { + "epoch": 0.29650779705688557, + "grad_norm": 2.2369039058685303, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6966334581375122, + "num_tokens": 67748698.0, + "step": 2700 + }, + { + "epoch": 0.2966176147594992, + "grad_norm": 2.134589910507202, + "learning_rate": 1e-06, + "loss": 1.0451, + "mean_token_accuracy": 0.6868945956230164, + "num_tokens": 67775961.0, + "step": 2701 + }, + { + "epoch": 0.2967274324621129, + "grad_norm": 2.2717533111572266, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.717738687992096, + "num_tokens": 67799573.0, + "step": 2702 + }, + { + "epoch": 0.29683725016472656, + "grad_norm": 3.311978340148926, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7200181484222412, + "num_tokens": 67812999.0, + 
"step": 2703 + }, + { + "epoch": 0.2969470678673402, + "grad_norm": 2.091855049133301, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7200413346290588, + "num_tokens": 67837727.0, + "step": 2704 + }, + { + "epoch": 0.29705688556995385, + "grad_norm": 2.464418649673462, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6836121082305908, + "num_tokens": 67860661.0, + "step": 2705 + }, + { + "epoch": 0.29716670327256756, + "grad_norm": 2.574251413345337, + "learning_rate": 1e-06, + "loss": 1.0439, + "mean_token_accuracy": 0.6861384510993958, + "num_tokens": 67881072.0, + "step": 2706 + }, + { + "epoch": 0.2972765209751812, + "grad_norm": 2.3869967460632324, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.6939197778701782, + "num_tokens": 67904194.0, + "step": 2707 + }, + { + "epoch": 0.29738633867779485, + "grad_norm": 2.3880326747894287, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.708659291267395, + "num_tokens": 67928290.0, + "step": 2708 + }, + { + "epoch": 0.29749615638040855, + "grad_norm": 2.4790761470794678, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7124089002609253, + "num_tokens": 67949204.0, + "step": 2709 + }, + { + "epoch": 0.2976059740830222, + "grad_norm": 2.2118747234344482, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6872342824935913, + "num_tokens": 67976560.0, + "step": 2710 + }, + { + "epoch": 0.29771579178563584, + "grad_norm": 2.2148756980895996, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6950139999389648, + "num_tokens": 68004575.0, + "step": 2711 + }, + { + "epoch": 0.2978256094882495, + "grad_norm": 2.3359076976776123, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6854970455169678, + "num_tokens": 68028447.0, + "step": 2712 + }, + { + "epoch": 0.2979354271908632, + "grad_norm": 2.633556365966797, + "learning_rate": 1e-06, + "loss": 0.917, + 
"mean_token_accuracy": 0.7182178497314453, + "num_tokens": 68046361.0, + "step": 2713 + }, + { + "epoch": 0.29804524489347684, + "grad_norm": 2.4587836265563965, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6816854476928711, + "num_tokens": 68068624.0, + "step": 2714 + }, + { + "epoch": 0.2981550625960905, + "grad_norm": 2.1245639324188232, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7065246105194092, + "num_tokens": 68095598.0, + "step": 2715 + }, + { + "epoch": 0.2982648802987041, + "grad_norm": 2.548022747039795, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.6776682138442993, + "num_tokens": 68118866.0, + "step": 2716 + }, + { + "epoch": 0.29837469800131783, + "grad_norm": 2.077833414077759, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6867518424987793, + "num_tokens": 68150064.0, + "step": 2717 + }, + { + "epoch": 0.2984845157039315, + "grad_norm": 2.4504504203796387, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7125071287155151, + "num_tokens": 68170497.0, + "step": 2718 + }, + { + "epoch": 0.2985943334065451, + "grad_norm": 2.5614402294158936, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7127130031585693, + "num_tokens": 68191601.0, + "step": 2719 + }, + { + "epoch": 0.2987041511091588, + "grad_norm": 2.548769950866699, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6720218062400818, + "num_tokens": 68213693.0, + "step": 2720 + }, + { + "epoch": 0.29881396881177247, + "grad_norm": 2.2457664012908936, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6961695551872253, + "num_tokens": 68241399.0, + "step": 2721 + }, + { + "epoch": 0.2989237865143861, + "grad_norm": 2.3964507579803467, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6917725801467896, + "num_tokens": 68265833.0, + "step": 2722 + }, + { + "epoch": 0.29903360421699976, + 
"grad_norm": 2.3405957221984863, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.720231294631958, + "num_tokens": 68287063.0, + "step": 2723 + }, + { + "epoch": 0.29914342191961346, + "grad_norm": 2.205317497253418, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7167255878448486, + "num_tokens": 68312729.0, + "step": 2724 + }, + { + "epoch": 0.2992532396222271, + "grad_norm": 2.2503936290740967, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6996970176696777, + "num_tokens": 68340253.0, + "step": 2725 + }, + { + "epoch": 0.29936305732484075, + "grad_norm": 2.4315268993377686, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.700148344039917, + "num_tokens": 68362423.0, + "step": 2726 + }, + { + "epoch": 0.2994728750274544, + "grad_norm": 3.0247392654418945, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7169504165649414, + "num_tokens": 68378271.0, + "step": 2727 + }, + { + "epoch": 0.2995826927300681, + "grad_norm": 2.37040376663208, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6911273002624512, + "num_tokens": 68402921.0, + "step": 2728 + }, + { + "epoch": 0.29969251043268175, + "grad_norm": 2.595452308654785, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6944050788879395, + "num_tokens": 68423331.0, + "step": 2729 + }, + { + "epoch": 0.2998023281352954, + "grad_norm": 2.2246057987213135, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7014281749725342, + "num_tokens": 68448713.0, + "step": 2730 + }, + { + "epoch": 0.2999121458379091, + "grad_norm": 2.3802249431610107, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7045078873634338, + "num_tokens": 68473656.0, + "step": 2731 + }, + { + "epoch": 0.30002196354052274, + "grad_norm": 2.4359655380249023, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7124860286712646, + "num_tokens": 
68496405.0, + "step": 2732 + }, + { + "epoch": 0.3001317812431364, + "grad_norm": 2.4296908378601074, + "learning_rate": 1e-06, + "loss": 1.0375, + "mean_token_accuracy": 0.69645756483078, + "num_tokens": 68519812.0, + "step": 2733 + }, + { + "epoch": 0.30024159894575003, + "grad_norm": 2.5026755332946777, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.695560097694397, + "num_tokens": 68544423.0, + "step": 2734 + }, + { + "epoch": 0.30035141664836373, + "grad_norm": 1.860400676727295, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7029756307601929, + "num_tokens": 68581553.0, + "step": 2735 + }, + { + "epoch": 0.3004612343509774, + "grad_norm": 2.3248233795166016, + "learning_rate": 1e-06, + "loss": 1.0828, + "mean_token_accuracy": 0.6799084544181824, + "num_tokens": 68606339.0, + "step": 2736 + }, + { + "epoch": 0.300571052053591, + "grad_norm": 2.058196783065796, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7301863431930542, + "num_tokens": 68634884.0, + "step": 2737 + }, + { + "epoch": 0.3006808697562047, + "grad_norm": 2.355997085571289, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7270005941390991, + "num_tokens": 68656649.0, + "step": 2738 + }, + { + "epoch": 0.3007906874588184, + "grad_norm": 2.2897276878356934, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6998276114463806, + "num_tokens": 68678949.0, + "step": 2739 + }, + { + "epoch": 0.300900505161432, + "grad_norm": 2.159247875213623, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6926642656326294, + "num_tokens": 68706834.0, + "step": 2740 + }, + { + "epoch": 0.30101032286404567, + "grad_norm": 2.201011896133423, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.680450439453125, + "num_tokens": 68735567.0, + "step": 2741 + }, + { + "epoch": 0.30112014056665937, + "grad_norm": 2.271888256072998, + "learning_rate": 1e-06, + "loss": 1.0685, 
+ "mean_token_accuracy": 0.6713575720787048, + "num_tokens": 68761256.0, + "step": 2742 + }, + { + "epoch": 0.301229958269273, + "grad_norm": 2.0776329040527344, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6915881633758545, + "num_tokens": 68789385.0, + "step": 2743 + }, + { + "epoch": 0.30133977597188666, + "grad_norm": 2.441740036010742, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7039361000061035, + "num_tokens": 68811977.0, + "step": 2744 + }, + { + "epoch": 0.3014495936745003, + "grad_norm": 2.013206720352173, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6868212223052979, + "num_tokens": 68841437.0, + "step": 2745 + }, + { + "epoch": 0.301559411377114, + "grad_norm": 2.618058204650879, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.7152509689331055, + "num_tokens": 68861306.0, + "step": 2746 + }, + { + "epoch": 0.30166922907972765, + "grad_norm": 2.666635036468506, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7047213912010193, + "num_tokens": 68880385.0, + "step": 2747 + }, + { + "epoch": 0.3017790467823413, + "grad_norm": 2.7983272075653076, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6809881925582886, + "num_tokens": 68899747.0, + "step": 2748 + }, + { + "epoch": 0.301888864484955, + "grad_norm": 2.66963267326355, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.688431978225708, + "num_tokens": 68919500.0, + "step": 2749 + }, + { + "epoch": 0.30199868218756865, + "grad_norm": 2.1885733604431152, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6955137252807617, + "num_tokens": 68948051.0, + "step": 2750 + }, + { + "epoch": 0.3021084998901823, + "grad_norm": 2.3442723751068115, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6840051412582397, + "num_tokens": 68971332.0, + "step": 2751 + }, + { + "epoch": 0.30221831759279594, + "grad_norm": 
2.3885514736175537, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6812027096748352, + "num_tokens": 68993873.0, + "step": 2752 + }, + { + "epoch": 0.30232813529540964, + "grad_norm": 2.0649282932281494, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6926971673965454, + "num_tokens": 69025495.0, + "step": 2753 + }, + { + "epoch": 0.3024379529980233, + "grad_norm": 2.2101876735687256, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7115730047225952, + "num_tokens": 69051271.0, + "step": 2754 + }, + { + "epoch": 0.30254777070063693, + "grad_norm": 2.044595718383789, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7174801826477051, + "num_tokens": 69080775.0, + "step": 2755 + }, + { + "epoch": 0.3026575884032506, + "grad_norm": 2.2150492668151855, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6925843954086304, + "num_tokens": 69107097.0, + "step": 2756 + }, + { + "epoch": 0.3027674061058643, + "grad_norm": 2.2832257747650146, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6952252388000488, + "num_tokens": 69130837.0, + "step": 2757 + }, + { + "epoch": 0.3028772238084779, + "grad_norm": 1.978566288948059, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7019034624099731, + "num_tokens": 69160817.0, + "step": 2758 + }, + { + "epoch": 0.30298704151109157, + "grad_norm": 2.134420156478882, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7178559303283691, + "num_tokens": 69189391.0, + "step": 2759 + }, + { + "epoch": 0.30309685921370527, + "grad_norm": 2.6503775119781494, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7171794176101685, + "num_tokens": 69207521.0, + "step": 2760 + }, + { + "epoch": 0.3032066769163189, + "grad_norm": 2.2391817569732666, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7103930115699768, + "num_tokens": 
69233324.0, + "step": 2761 + }, + { + "epoch": 0.30331649461893256, + "grad_norm": 2.2737860679626465, + "learning_rate": 1e-06, + "loss": 1.0588, + "mean_token_accuracy": 0.6908139586448669, + "num_tokens": 69259205.0, + "step": 2762 + }, + { + "epoch": 0.3034263123215462, + "grad_norm": 2.3064260482788086, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.711627721786499, + "num_tokens": 69281320.0, + "step": 2763 + }, + { + "epoch": 0.3035361300241599, + "grad_norm": 2.3746724128723145, + "learning_rate": 1e-06, + "loss": 1.0288, + "mean_token_accuracy": 0.6835583448410034, + "num_tokens": 69303976.0, + "step": 2764 + }, + { + "epoch": 0.30364594772677356, + "grad_norm": 2.192404270172119, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7122348546981812, + "num_tokens": 69329184.0, + "step": 2765 + }, + { + "epoch": 0.3037557654293872, + "grad_norm": 2.42791485786438, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6911358833312988, + "num_tokens": 69353963.0, + "step": 2766 + }, + { + "epoch": 0.3038655831320009, + "grad_norm": 2.208552598953247, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6951279640197754, + "num_tokens": 69383297.0, + "step": 2767 + }, + { + "epoch": 0.30397540083461455, + "grad_norm": 2.290627956390381, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7180633544921875, + "num_tokens": 69408581.0, + "step": 2768 + }, + { + "epoch": 0.3040852185372282, + "grad_norm": 2.3007352352142334, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6938474178314209, + "num_tokens": 69434577.0, + "step": 2769 + }, + { + "epoch": 0.30419503623984184, + "grad_norm": 2.460549831390381, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6814860701560974, + "num_tokens": 69456983.0, + "step": 2770 + }, + { + "epoch": 0.30430485394245554, + "grad_norm": 2.5980846881866455, + "learning_rate": 1e-06, + "loss": 
1.0255, + "mean_token_accuracy": 0.6994659900665283, + "num_tokens": 69478023.0, + "step": 2771 + }, + { + "epoch": 0.3044146716450692, + "grad_norm": 2.583392858505249, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6876501441001892, + "num_tokens": 69500534.0, + "step": 2772 + }, + { + "epoch": 0.30452448934768284, + "grad_norm": 2.209869861602783, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6978738307952881, + "num_tokens": 69527221.0, + "step": 2773 + }, + { + "epoch": 0.3046343070502965, + "grad_norm": 2.543692111968994, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7067774534225464, + "num_tokens": 69549335.0, + "step": 2774 + }, + { + "epoch": 0.3047441247529102, + "grad_norm": 2.067601442337036, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6801194548606873, + "num_tokens": 69579214.0, + "step": 2775 + }, + { + "epoch": 0.30485394245552383, + "grad_norm": 2.058373212814331, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6841800212860107, + "num_tokens": 69608421.0, + "step": 2776 + }, + { + "epoch": 0.3049637601581375, + "grad_norm": 2.890366554260254, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7269526124000549, + "num_tokens": 69626272.0, + "step": 2777 + }, + { + "epoch": 0.3050735778607512, + "grad_norm": 2.2628424167633057, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7032084465026855, + "num_tokens": 69649989.0, + "step": 2778 + }, + { + "epoch": 0.3051833955633648, + "grad_norm": 2.3063995838165283, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7132774591445923, + "num_tokens": 69672813.0, + "step": 2779 + }, + { + "epoch": 0.30529321326597847, + "grad_norm": 1.9171388149261475, + "learning_rate": 1e-06, + "loss": 1.0882, + "mean_token_accuracy": 0.6807069778442383, + "num_tokens": 69706779.0, + "step": 2780 + }, + { + "epoch": 0.3054030309685921, + 
"grad_norm": 2.195906639099121, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6956781148910522, + "num_tokens": 69731903.0, + "step": 2781 + }, + { + "epoch": 0.3055128486712058, + "grad_norm": 2.240743398666382, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6902823448181152, + "num_tokens": 69758701.0, + "step": 2782 + }, + { + "epoch": 0.30562266637381946, + "grad_norm": 2.3542189598083496, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6858819127082825, + "num_tokens": 69782401.0, + "step": 2783 + }, + { + "epoch": 0.3057324840764331, + "grad_norm": 2.029132127761841, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7144769430160522, + "num_tokens": 69808854.0, + "step": 2784 + }, + { + "epoch": 0.30584230177904675, + "grad_norm": 2.028512477874756, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7110785245895386, + "num_tokens": 69837802.0, + "step": 2785 + }, + { + "epoch": 0.30595211948166046, + "grad_norm": 1.9142292737960815, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7044085264205933, + "num_tokens": 69872964.0, + "step": 2786 + }, + { + "epoch": 0.3060619371842741, + "grad_norm": 2.2914481163024902, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6871963739395142, + "num_tokens": 69897935.0, + "step": 2787 + }, + { + "epoch": 0.30617175488688775, + "grad_norm": 2.1805872917175293, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7263649106025696, + "num_tokens": 69923991.0, + "step": 2788 + }, + { + "epoch": 0.30628157258950145, + "grad_norm": 2.531794309616089, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6945909857749939, + "num_tokens": 69946116.0, + "step": 2789 + }, + { + "epoch": 0.3063913902921151, + "grad_norm": 2.2388951778411865, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7281452417373657, + "num_tokens": 
69969721.0, + "step": 2790 + }, + { + "epoch": 0.30650120799472874, + "grad_norm": 2.48435115814209, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7292490601539612, + "num_tokens": 69990642.0, + "step": 2791 + }, + { + "epoch": 0.3066110256973424, + "grad_norm": 2.1973068714141846, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6949381828308105, + "num_tokens": 70015325.0, + "step": 2792 + }, + { + "epoch": 0.3067208433999561, + "grad_norm": 2.213393211364746, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6894385814666748, + "num_tokens": 70042194.0, + "step": 2793 + }, + { + "epoch": 0.30683066110256974, + "grad_norm": 2.4188685417175293, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7111889719963074, + "num_tokens": 70063195.0, + "step": 2794 + }, + { + "epoch": 0.3069404788051834, + "grad_norm": 2.449086904525757, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7014651894569397, + "num_tokens": 70085373.0, + "step": 2795 + }, + { + "epoch": 0.3070502965077971, + "grad_norm": 2.3821914196014404, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7014554738998413, + "num_tokens": 70110180.0, + "step": 2796 + }, + { + "epoch": 0.30716011421041073, + "grad_norm": 2.2322466373443604, + "learning_rate": 1e-06, + "loss": 1.0695, + "mean_token_accuracy": 0.6810818910598755, + "num_tokens": 70137083.0, + "step": 2797 + }, + { + "epoch": 0.3072699319130244, + "grad_norm": 2.1753346920013428, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7103140354156494, + "num_tokens": 70164918.0, + "step": 2798 + }, + { + "epoch": 0.307379749615638, + "grad_norm": 2.480849504470825, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.699871301651001, + "num_tokens": 70186982.0, + "step": 2799 + }, + { + "epoch": 0.3074895673182517, + "grad_norm": 2.186958074569702, + "learning_rate": 1e-06, + "loss": 
1.0813, + "mean_token_accuracy": 0.6906628608703613, + "num_tokens": 70215638.0, + "step": 2800 + }, + { + "epoch": 0.30759938502086537, + "grad_norm": 2.0618515014648438, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6972083449363708, + "num_tokens": 70245529.0, + "step": 2801 + }, + { + "epoch": 0.307709202723479, + "grad_norm": 2.1315040588378906, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6780894994735718, + "num_tokens": 70273704.0, + "step": 2802 + }, + { + "epoch": 0.30781902042609266, + "grad_norm": 2.2941970825195312, + "learning_rate": 1e-06, + "loss": 1.1413, + "mean_token_accuracy": 0.6648054122924805, + "num_tokens": 70300047.0, + "step": 2803 + }, + { + "epoch": 0.30792883812870636, + "grad_norm": 2.022188663482666, + "learning_rate": 1e-06, + "loss": 1.0723, + "mean_token_accuracy": 0.6813027858734131, + "num_tokens": 70330617.0, + "step": 2804 + }, + { + "epoch": 0.30803865583132, + "grad_norm": 2.544215202331543, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7034549713134766, + "num_tokens": 70349962.0, + "step": 2805 + }, + { + "epoch": 0.30814847353393365, + "grad_norm": 2.0976972579956055, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6931514143943787, + "num_tokens": 70378686.0, + "step": 2806 + }, + { + "epoch": 0.30825829123654735, + "grad_norm": 2.3696770668029785, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7229390144348145, + "num_tokens": 70400793.0, + "step": 2807 + }, + { + "epoch": 0.308368108939161, + "grad_norm": 2.0701098442077637, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7277430295944214, + "num_tokens": 70427682.0, + "step": 2808 + }, + { + "epoch": 0.30847792664177465, + "grad_norm": 2.452477216720581, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6979273557662964, + "num_tokens": 70448478.0, + "step": 2809 + }, + { + "epoch": 0.3085877443443883, + 
"grad_norm": 2.30415415763855, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6982042193412781, + "num_tokens": 70471820.0, + "step": 2810 + }, + { + "epoch": 0.308697562047002, + "grad_norm": 2.141977548599243, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6874269247055054, + "num_tokens": 70499388.0, + "step": 2811 + }, + { + "epoch": 0.30880737974961564, + "grad_norm": 2.255225896835327, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7064594030380249, + "num_tokens": 70525183.0, + "step": 2812 + }, + { + "epoch": 0.3089171974522293, + "grad_norm": 2.1618735790252686, + "learning_rate": 1e-06, + "loss": 1.071, + "mean_token_accuracy": 0.6769479513168335, + "num_tokens": 70556632.0, + "step": 2813 + }, + { + "epoch": 0.309027015154843, + "grad_norm": 1.9360164403915405, + "learning_rate": 1e-06, + "loss": 1.0925, + "mean_token_accuracy": 0.6738420128822327, + "num_tokens": 70591759.0, + "step": 2814 + }, + { + "epoch": 0.30913683285745663, + "grad_norm": 2.2639195919036865, + "learning_rate": 1e-06, + "loss": 1.08, + "mean_token_accuracy": 0.6804649233818054, + "num_tokens": 70616102.0, + "step": 2815 + }, + { + "epoch": 0.3092466505600703, + "grad_norm": 2.389733076095581, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.7041547894477844, + "num_tokens": 70641905.0, + "step": 2816 + }, + { + "epoch": 0.3093564682626839, + "grad_norm": 2.180325508117676, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7019388675689697, + "num_tokens": 70669577.0, + "step": 2817 + }, + { + "epoch": 0.3094662859652976, + "grad_norm": 2.4529452323913574, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7007869482040405, + "num_tokens": 70690516.0, + "step": 2818 + }, + { + "epoch": 0.3095761036679113, + "grad_norm": 2.404083490371704, + "learning_rate": 1e-06, + "loss": 1.0813, + "mean_token_accuracy": 0.6801519393920898, + "num_tokens": 
70714700.0, + "step": 2819 + }, + { + "epoch": 0.3096859213705249, + "grad_norm": 2.3304078578948975, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6944113969802856, + "num_tokens": 70738678.0, + "step": 2820 + }, + { + "epoch": 0.30979573907313857, + "grad_norm": 1.9634442329406738, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.688481867313385, + "num_tokens": 70772342.0, + "step": 2821 + }, + { + "epoch": 0.30990555677575227, + "grad_norm": 2.2475814819335938, + "learning_rate": 1e-06, + "loss": 1.0643, + "mean_token_accuracy": 0.6771775484085083, + "num_tokens": 70798890.0, + "step": 2822 + }, + { + "epoch": 0.3100153744783659, + "grad_norm": 2.329914093017578, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7129995226860046, + "num_tokens": 70821487.0, + "step": 2823 + }, + { + "epoch": 0.31012519218097956, + "grad_norm": 2.143531560897827, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6978985071182251, + "num_tokens": 70847163.0, + "step": 2824 + }, + { + "epoch": 0.31023500988359326, + "grad_norm": 2.1537463665008545, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7063530087471008, + "num_tokens": 70875225.0, + "step": 2825 + }, + { + "epoch": 0.3103448275862069, + "grad_norm": 2.0911037921905518, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6739269495010376, + "num_tokens": 70907063.0, + "step": 2826 + }, + { + "epoch": 0.31045464528882055, + "grad_norm": 2.6210365295410156, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7209956049919128, + "num_tokens": 70923802.0, + "step": 2827 + }, + { + "epoch": 0.3105644629914342, + "grad_norm": 2.286818504333496, + "learning_rate": 1e-06, + "loss": 1.0634, + "mean_token_accuracy": 0.6827425956726074, + "num_tokens": 70951449.0, + "step": 2828 + }, + { + "epoch": 0.3106742806940479, + "grad_norm": 2.2386157512664795, + "learning_rate": 1e-06, + 
"loss": 1.045, + "mean_token_accuracy": 0.6895577907562256, + "num_tokens": 70975326.0, + "step": 2829 + }, + { + "epoch": 0.31078409839666155, + "grad_norm": 2.097707748413086, + "learning_rate": 1e-06, + "loss": 1.0635, + "mean_token_accuracy": 0.6812185049057007, + "num_tokens": 71004183.0, + "step": 2830 + }, + { + "epoch": 0.3108939160992752, + "grad_norm": 2.263032913208008, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7095516920089722, + "num_tokens": 71029269.0, + "step": 2831 + }, + { + "epoch": 0.31100373380188884, + "grad_norm": 2.2177062034606934, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.7067364454269409, + "num_tokens": 71054138.0, + "step": 2832 + }, + { + "epoch": 0.31111355150450254, + "grad_norm": 2.3463134765625, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6976603865623474, + "num_tokens": 71078518.0, + "step": 2833 + }, + { + "epoch": 0.3112233692071162, + "grad_norm": 2.3752357959747314, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.687812089920044, + "num_tokens": 71103330.0, + "step": 2834 + }, + { + "epoch": 0.31133318690972983, + "grad_norm": 2.2696638107299805, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6790960431098938, + "num_tokens": 71128159.0, + "step": 2835 + }, + { + "epoch": 0.31144300461234353, + "grad_norm": 2.224012851715088, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7143624424934387, + "num_tokens": 71154388.0, + "step": 2836 + }, + { + "epoch": 0.3115528223149572, + "grad_norm": 2.3062024116516113, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6939388513565063, + "num_tokens": 71179638.0, + "step": 2837 + }, + { + "epoch": 0.3116626400175708, + "grad_norm": 2.2763829231262207, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6914333701133728, + "num_tokens": 71204632.0, + "step": 2838 + }, + { + "epoch": 
0.31177245772018447, + "grad_norm": 2.211146354675293, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6958396434783936, + "num_tokens": 71230321.0, + "step": 2839 + }, + { + "epoch": 0.31188227542279817, + "grad_norm": 2.3272156715393066, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6960151195526123, + "num_tokens": 71252558.0, + "step": 2840 + }, + { + "epoch": 0.3119920931254118, + "grad_norm": 2.1939542293548584, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7037150859832764, + "num_tokens": 71277288.0, + "step": 2841 + }, + { + "epoch": 0.31210191082802546, + "grad_norm": 2.2302401065826416, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6989932060241699, + "num_tokens": 71302605.0, + "step": 2842 + }, + { + "epoch": 0.31221172853063917, + "grad_norm": 1.8601797819137573, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6932060718536377, + "num_tokens": 71336103.0, + "step": 2843 + }, + { + "epoch": 0.3123215462332528, + "grad_norm": 2.368619918823242, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.699959933757782, + "num_tokens": 71359024.0, + "step": 2844 + }, + { + "epoch": 0.31243136393586646, + "grad_norm": 2.1913411617279053, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6932022571563721, + "num_tokens": 71387324.0, + "step": 2845 + }, + { + "epoch": 0.3125411816384801, + "grad_norm": 2.2201831340789795, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7112436890602112, + "num_tokens": 71411634.0, + "step": 2846 + }, + { + "epoch": 0.3126509993410938, + "grad_norm": 2.4515609741210938, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.7003819942474365, + "num_tokens": 71432158.0, + "step": 2847 + }, + { + "epoch": 0.31276081704370745, + "grad_norm": 2.160860300064087, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 
0.7061798572540283, + "num_tokens": 71458184.0, + "step": 2848 + }, + { + "epoch": 0.3128706347463211, + "grad_norm": 2.240323066711426, + "learning_rate": 1e-06, + "loss": 1.0564, + "mean_token_accuracy": 0.6856249570846558, + "num_tokens": 71484583.0, + "step": 2849 + }, + { + "epoch": 0.31298045244893474, + "grad_norm": 2.284066915512085, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6987619996070862, + "num_tokens": 71508868.0, + "step": 2850 + }, + { + "epoch": 0.31309027015154844, + "grad_norm": 2.3088183403015137, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7047021389007568, + "num_tokens": 71532661.0, + "step": 2851 + }, + { + "epoch": 0.3132000878541621, + "grad_norm": 2.0005853176116943, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7013921737670898, + "num_tokens": 71560869.0, + "step": 2852 + }, + { + "epoch": 0.31330990555677574, + "grad_norm": 2.3402395248413086, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6854867935180664, + "num_tokens": 71584571.0, + "step": 2853 + }, + { + "epoch": 0.31341972325938944, + "grad_norm": 2.1612730026245117, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6911486387252808, + "num_tokens": 71613403.0, + "step": 2854 + }, + { + "epoch": 0.3135295409620031, + "grad_norm": 2.0431134700775146, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7149717807769775, + "num_tokens": 71643325.0, + "step": 2855 + }, + { + "epoch": 0.31363935866461673, + "grad_norm": 2.081698179244995, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.698910653591156, + "num_tokens": 71673960.0, + "step": 2856 + }, + { + "epoch": 0.3137491763672304, + "grad_norm": 2.2858192920684814, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6926075220108032, + "num_tokens": 71698639.0, + "step": 2857 + }, + { + "epoch": 0.3138589940698441, + "grad_norm": 1.9676226377487183, 
+ "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7178808450698853, + "num_tokens": 71728052.0, + "step": 2858 + }, + { + "epoch": 0.3139688117724577, + "grad_norm": 2.3793997764587402, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7264877557754517, + "num_tokens": 71752155.0, + "step": 2859 + }, + { + "epoch": 0.31407862947507137, + "grad_norm": 2.3668313026428223, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7068102955818176, + "num_tokens": 71774832.0, + "step": 2860 + }, + { + "epoch": 0.314188447177685, + "grad_norm": 2.082362413406372, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6944538354873657, + "num_tokens": 71805323.0, + "step": 2861 + }, + { + "epoch": 0.3142982648802987, + "grad_norm": 2.319347858428955, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7051020860671997, + "num_tokens": 71828263.0, + "step": 2862 + }, + { + "epoch": 0.31440808258291236, + "grad_norm": 2.2760937213897705, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6952103972434998, + "num_tokens": 71853290.0, + "step": 2863 + }, + { + "epoch": 0.314517900285526, + "grad_norm": 2.2970595359802246, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.6870888471603394, + "num_tokens": 71878050.0, + "step": 2864 + }, + { + "epoch": 0.3146277179881397, + "grad_norm": 2.340658664703369, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7167787551879883, + "num_tokens": 71900498.0, + "step": 2865 + }, + { + "epoch": 0.31473753569075336, + "grad_norm": 2.287468671798706, + "learning_rate": 1e-06, + "loss": 1.0411, + "mean_token_accuracy": 0.6816987991333008, + "num_tokens": 71925321.0, + "step": 2866 + }, + { + "epoch": 0.314847353393367, + "grad_norm": 2.344114065170288, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7154492139816284, + "num_tokens": 71948649.0, + "step": 2867 + }, + { + 
"epoch": 0.31495717109598065, + "grad_norm": 2.237950086593628, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6960000991821289, + "num_tokens": 71971953.0, + "step": 2868 + }, + { + "epoch": 0.31506698879859435, + "grad_norm": 2.6438090801239014, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.702069878578186, + "num_tokens": 71992634.0, + "step": 2869 + }, + { + "epoch": 0.315176806501208, + "grad_norm": 2.3158509731292725, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6886919736862183, + "num_tokens": 72015846.0, + "step": 2870 + }, + { + "epoch": 0.31528662420382164, + "grad_norm": 1.8979462385177612, + "learning_rate": 1e-06, + "loss": 1.0841, + "mean_token_accuracy": 0.6777279376983643, + "num_tokens": 72050492.0, + "step": 2871 + }, + { + "epoch": 0.31539644190643534, + "grad_norm": 2.2518301010131836, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6881394386291504, + "num_tokens": 72076605.0, + "step": 2872 + }, + { + "epoch": 0.315506259609049, + "grad_norm": 2.146775722503662, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7078137993812561, + "num_tokens": 72102635.0, + "step": 2873 + }, + { + "epoch": 0.31561607731166264, + "grad_norm": 2.0579724311828613, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7177479267120361, + "num_tokens": 72130493.0, + "step": 2874 + }, + { + "epoch": 0.3157258950142763, + "grad_norm": 2.310135841369629, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.704666018486023, + "num_tokens": 72154354.0, + "step": 2875 + }, + { + "epoch": 0.31583571271689, + "grad_norm": 2.202277898788452, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6991446614265442, + "num_tokens": 72178923.0, + "step": 2876 + }, + { + "epoch": 0.31594553041950363, + "grad_norm": 2.3538568019866943, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 
0.6973135471343994, + "num_tokens": 72202782.0, + "step": 2877 + }, + { + "epoch": 0.3160553481221173, + "grad_norm": 2.1660923957824707, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6895176768302917, + "num_tokens": 72230783.0, + "step": 2878 + }, + { + "epoch": 0.3161651658247309, + "grad_norm": 2.3652775287628174, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7228618264198303, + "num_tokens": 72254013.0, + "step": 2879 + }, + { + "epoch": 0.3162749835273446, + "grad_norm": 2.2684943675994873, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7292400598526001, + "num_tokens": 72275917.0, + "step": 2880 + }, + { + "epoch": 0.31638480122995827, + "grad_norm": 2.2006139755249023, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7166824340820312, + "num_tokens": 72301049.0, + "step": 2881 + }, + { + "epoch": 0.3164946189325719, + "grad_norm": 2.121612548828125, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.675018846988678, + "num_tokens": 72330790.0, + "step": 2882 + }, + { + "epoch": 0.3166044366351856, + "grad_norm": 2.3320274353027344, + "learning_rate": 1e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6745350956916809, + "num_tokens": 72355314.0, + "step": 2883 + }, + { + "epoch": 0.31671425433779926, + "grad_norm": 2.3119170665740967, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7167413830757141, + "num_tokens": 72378930.0, + "step": 2884 + }, + { + "epoch": 0.3168240720404129, + "grad_norm": 2.341780185699463, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.720099151134491, + "num_tokens": 72400599.0, + "step": 2885 + }, + { + "epoch": 0.31693388974302655, + "grad_norm": 2.0277140140533447, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6926259994506836, + "num_tokens": 72431954.0, + "step": 2886 + }, + { + "epoch": 0.31704370744564025, + "grad_norm": 
2.2879559993743896, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7012534141540527, + "num_tokens": 72457423.0, + "step": 2887 + }, + { + "epoch": 0.3171535251482539, + "grad_norm": 2.359938383102417, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7045738697052002, + "num_tokens": 72482151.0, + "step": 2888 + }, + { + "epoch": 0.31726334285086755, + "grad_norm": 2.2915244102478027, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6898317337036133, + "num_tokens": 72505838.0, + "step": 2889 + }, + { + "epoch": 0.31737316055348125, + "grad_norm": 2.085033893585205, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6917130947113037, + "num_tokens": 72534759.0, + "step": 2890 + }, + { + "epoch": 0.3174829782560949, + "grad_norm": 2.1151602268218994, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6994866728782654, + "num_tokens": 72561046.0, + "step": 2891 + }, + { + "epoch": 0.31759279595870854, + "grad_norm": 2.668717861175537, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.7059148550033569, + "num_tokens": 72581347.0, + "step": 2892 + }, + { + "epoch": 0.3177026136613222, + "grad_norm": 2.2781803607940674, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6853878498077393, + "num_tokens": 72609757.0, + "step": 2893 + }, + { + "epoch": 0.3178124313639359, + "grad_norm": 2.1431925296783447, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7073367238044739, + "num_tokens": 72635984.0, + "step": 2894 + }, + { + "epoch": 0.31792224906654953, + "grad_norm": 2.4593379497528076, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.687290608882904, + "num_tokens": 72660592.0, + "step": 2895 + }, + { + "epoch": 0.3180320667691632, + "grad_norm": 2.4162545204162598, + "learning_rate": 1e-06, + "loss": 1.1347, + "mean_token_accuracy": 0.6680567264556885, + "num_tokens": 72684596.0, + 
"step": 2896 + }, + { + "epoch": 0.3181418844717768, + "grad_norm": 2.2680504322052, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6920263767242432, + "num_tokens": 72708911.0, + "step": 2897 + }, + { + "epoch": 0.3182517021743905, + "grad_norm": 2.165712594985962, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6969488263130188, + "num_tokens": 72734096.0, + "step": 2898 + }, + { + "epoch": 0.3183615198770042, + "grad_norm": 2.3369407653808594, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7087079286575317, + "num_tokens": 72756251.0, + "step": 2899 + }, + { + "epoch": 0.3184713375796178, + "grad_norm": 2.394735097885132, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7032394409179688, + "num_tokens": 72779276.0, + "step": 2900 + }, + { + "epoch": 0.3185811552822315, + "grad_norm": 2.273749351501465, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7040572762489319, + "num_tokens": 72804994.0, + "step": 2901 + }, + { + "epoch": 0.31869097298484517, + "grad_norm": 2.3115274906158447, + "learning_rate": 1e-06, + "loss": 1.0501, + "mean_token_accuracy": 0.6822195649147034, + "num_tokens": 72831085.0, + "step": 2902 + }, + { + "epoch": 0.3188007906874588, + "grad_norm": 2.2624826431274414, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6947582364082336, + "num_tokens": 72856451.0, + "step": 2903 + }, + { + "epoch": 0.31891060839007246, + "grad_norm": 2.9429163932800293, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6844273209571838, + "num_tokens": 72874231.0, + "step": 2904 + }, + { + "epoch": 0.31902042609268616, + "grad_norm": 2.507110357284546, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7092208862304688, + "num_tokens": 72895639.0, + "step": 2905 + }, + { + "epoch": 0.3191302437952998, + "grad_norm": 2.0505402088165283, + "learning_rate": 1e-06, + "loss": 1.008, + 
"mean_token_accuracy": 0.6971858143806458, + "num_tokens": 72923914.0, + "step": 2906 + }, + { + "epoch": 0.31924006149791345, + "grad_norm": 2.2250819206237793, + "learning_rate": 1e-06, + "loss": 1.1111, + "mean_token_accuracy": 0.6647425889968872, + "num_tokens": 72951566.0, + "step": 2907 + }, + { + "epoch": 0.3193498792005271, + "grad_norm": 2.325566530227661, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7214514017105103, + "num_tokens": 72975608.0, + "step": 2908 + }, + { + "epoch": 0.3194596969031408, + "grad_norm": 2.454963207244873, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7048571109771729, + "num_tokens": 72997406.0, + "step": 2909 + }, + { + "epoch": 0.31956951460575445, + "grad_norm": 2.4013233184814453, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6985100507736206, + "num_tokens": 73020888.0, + "step": 2910 + }, + { + "epoch": 0.3196793323083681, + "grad_norm": 2.3250207901000977, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.710536003112793, + "num_tokens": 73042313.0, + "step": 2911 + }, + { + "epoch": 0.3197891500109818, + "grad_norm": 2.3956174850463867, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7080700397491455, + "num_tokens": 73065386.0, + "step": 2912 + }, + { + "epoch": 0.31989896771359544, + "grad_norm": 2.268538236618042, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.706834077835083, + "num_tokens": 73090331.0, + "step": 2913 + }, + { + "epoch": 0.3200087854162091, + "grad_norm": 2.2312238216400146, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6999523043632507, + "num_tokens": 73118504.0, + "step": 2914 + }, + { + "epoch": 0.32011860311882273, + "grad_norm": 2.0171213150024414, + "learning_rate": 1e-06, + "loss": 1.0933, + "mean_token_accuracy": 0.6731925010681152, + "num_tokens": 73148301.0, + "step": 2915 + }, + { + "epoch": 0.32022842082143643, + 
"grad_norm": 2.512526512145996, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7160414457321167, + "num_tokens": 73168868.0, + "step": 2916 + }, + { + "epoch": 0.3203382385240501, + "grad_norm": 2.357482671737671, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7084896564483643, + "num_tokens": 73192531.0, + "step": 2917 + }, + { + "epoch": 0.3204480562266637, + "grad_norm": 2.150904417037964, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.704321026802063, + "num_tokens": 73220087.0, + "step": 2918 + }, + { + "epoch": 0.3205578739292774, + "grad_norm": 2.0905280113220215, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7218285799026489, + "num_tokens": 73245928.0, + "step": 2919 + }, + { + "epoch": 0.32066769163189107, + "grad_norm": 2.1745221614837646, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6901116371154785, + "num_tokens": 73271671.0, + "step": 2920 + }, + { + "epoch": 0.3207775093345047, + "grad_norm": 2.4488162994384766, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6991372108459473, + "num_tokens": 73292732.0, + "step": 2921 + }, + { + "epoch": 0.32088732703711836, + "grad_norm": 2.030211925506592, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6990379691123962, + "num_tokens": 73321452.0, + "step": 2922 + }, + { + "epoch": 0.32099714473973207, + "grad_norm": 1.9661705493927002, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6948028206825256, + "num_tokens": 73353132.0, + "step": 2923 + }, + { + "epoch": 0.3211069624423457, + "grad_norm": 2.183945417404175, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6863033771514893, + "num_tokens": 73380022.0, + "step": 2924 + }, + { + "epoch": 0.32121678014495936, + "grad_norm": 2.0603585243225098, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.704810380935669, + "num_tokens": 
73409267.0, + "step": 2925 + }, + { + "epoch": 0.321326597847573, + "grad_norm": 2.3577640056610107, + "learning_rate": 1e-06, + "loss": 1.0639, + "mean_token_accuracy": 0.6787264943122864, + "num_tokens": 73432777.0, + "step": 2926 + }, + { + "epoch": 0.3214364155501867, + "grad_norm": 2.7548725605010986, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7055022120475769, + "num_tokens": 73449630.0, + "step": 2927 + }, + { + "epoch": 0.32154623325280035, + "grad_norm": 2.1143579483032227, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6737402081489563, + "num_tokens": 73479473.0, + "step": 2928 + }, + { + "epoch": 0.321656050955414, + "grad_norm": 2.4007983207702637, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7256868481636047, + "num_tokens": 73502820.0, + "step": 2929 + }, + { + "epoch": 0.3217658686580277, + "grad_norm": 2.4262239933013916, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.704308271408081, + "num_tokens": 73525678.0, + "step": 2930 + }, + { + "epoch": 0.32187568636064134, + "grad_norm": 2.249006986618042, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6960777044296265, + "num_tokens": 73551269.0, + "step": 2931 + }, + { + "epoch": 0.321985504063255, + "grad_norm": 2.4466335773468018, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.6993241310119629, + "num_tokens": 73571856.0, + "step": 2932 + }, + { + "epoch": 0.32209532176586864, + "grad_norm": 2.771372079849243, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.7062973976135254, + "num_tokens": 73588773.0, + "step": 2933 + }, + { + "epoch": 0.32220513946848234, + "grad_norm": 2.316157817840576, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7091442942619324, + "num_tokens": 73612352.0, + "step": 2934 + }, + { + "epoch": 0.322314957171096, + "grad_norm": 2.444859504699707, + "learning_rate": 1e-06, + "loss": 
0.9428, + "mean_token_accuracy": 0.7110384702682495, + "num_tokens": 73633364.0, + "step": 2935 + }, + { + "epoch": 0.32242477487370963, + "grad_norm": 2.311976432800293, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.7012077569961548, + "num_tokens": 73661235.0, + "step": 2936 + }, + { + "epoch": 0.3225345925763233, + "grad_norm": 2.011009931564331, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.688679575920105, + "num_tokens": 73694162.0, + "step": 2937 + }, + { + "epoch": 0.322644410278937, + "grad_norm": 2.165311098098755, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6971607208251953, + "num_tokens": 73720662.0, + "step": 2938 + }, + { + "epoch": 0.3227542279815506, + "grad_norm": 2.2137930393218994, + "learning_rate": 1e-06, + "loss": 1.0616, + "mean_token_accuracy": 0.6816102266311646, + "num_tokens": 73750383.0, + "step": 2939 + }, + { + "epoch": 0.32286404568416427, + "grad_norm": 2.204291343688965, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7201051712036133, + "num_tokens": 73774628.0, + "step": 2940 + }, + { + "epoch": 0.32297386338677797, + "grad_norm": 2.4320733547210693, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7113039493560791, + "num_tokens": 73796291.0, + "step": 2941 + }, + { + "epoch": 0.3230836810893916, + "grad_norm": 2.1413817405700684, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6841552257537842, + "num_tokens": 73824078.0, + "step": 2942 + }, + { + "epoch": 0.32319349879200526, + "grad_norm": 2.1369404792785645, + "learning_rate": 1e-06, + "loss": 1.1229, + "mean_token_accuracy": 0.6647174954414368, + "num_tokens": 73855211.0, + "step": 2943 + }, + { + "epoch": 0.3233033164946189, + "grad_norm": 2.525294065475464, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7066359519958496, + "num_tokens": 73876312.0, + "step": 2944 + }, + { + "epoch": 0.3234131341972326, + 
"grad_norm": 2.4990036487579346, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6950645446777344, + "num_tokens": 73897942.0, + "step": 2945 + }, + { + "epoch": 0.32352295189984626, + "grad_norm": 2.117750644683838, + "learning_rate": 1e-06, + "loss": 1.0397, + "mean_token_accuracy": 0.691458523273468, + "num_tokens": 73923617.0, + "step": 2946 + }, + { + "epoch": 0.3236327696024599, + "grad_norm": 2.282181978225708, + "learning_rate": 1e-06, + "loss": 1.042, + "mean_token_accuracy": 0.6991794109344482, + "num_tokens": 73950560.0, + "step": 2947 + }, + { + "epoch": 0.3237425873050736, + "grad_norm": 2.6893012523651123, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.7031763195991516, + "num_tokens": 73970390.0, + "step": 2948 + }, + { + "epoch": 0.32385240500768725, + "grad_norm": 2.4657726287841797, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7034945487976074, + "num_tokens": 73992569.0, + "step": 2949 + }, + { + "epoch": 0.3239622227103009, + "grad_norm": 2.30795955657959, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.690754234790802, + "num_tokens": 74017488.0, + "step": 2950 + }, + { + "epoch": 0.32407204041291454, + "grad_norm": 2.5983810424804688, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6906368732452393, + "num_tokens": 74037166.0, + "step": 2951 + }, + { + "epoch": 0.32418185811552824, + "grad_norm": 2.556783437728882, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6961110830307007, + "num_tokens": 74057650.0, + "step": 2952 + }, + { + "epoch": 0.3242916758181419, + "grad_norm": 2.1150429248809814, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7066043615341187, + "num_tokens": 74087398.0, + "step": 2953 + }, + { + "epoch": 0.32440149352075554, + "grad_norm": 2.2296292781829834, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6811155080795288, + "num_tokens": 
74114803.0, + "step": 2954 + }, + { + "epoch": 0.3245113112233692, + "grad_norm": 2.1015117168426514, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7050292491912842, + "num_tokens": 74144500.0, + "step": 2955 + }, + { + "epoch": 0.3246211289259829, + "grad_norm": 2.124476194381714, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7094211578369141, + "num_tokens": 74171707.0, + "step": 2956 + }, + { + "epoch": 0.32473094662859653, + "grad_norm": 1.9533392190933228, + "learning_rate": 1e-06, + "loss": 1.0676, + "mean_token_accuracy": 0.6893404722213745, + "num_tokens": 74201370.0, + "step": 2957 + }, + { + "epoch": 0.3248407643312102, + "grad_norm": 2.1446454524993896, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6753921508789062, + "num_tokens": 74230390.0, + "step": 2958 + }, + { + "epoch": 0.3249505820338239, + "grad_norm": 2.4873604774475098, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7044851183891296, + "num_tokens": 74250232.0, + "step": 2959 + }, + { + "epoch": 0.3250603997364375, + "grad_norm": 2.292675495147705, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.707120418548584, + "num_tokens": 74274064.0, + "step": 2960 + }, + { + "epoch": 0.32517021743905117, + "grad_norm": 2.5845448970794678, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7030895948410034, + "num_tokens": 74294013.0, + "step": 2961 + }, + { + "epoch": 0.3252800351416648, + "grad_norm": 1.9696391820907593, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6966514587402344, + "num_tokens": 74323688.0, + "step": 2962 + }, + { + "epoch": 0.3253898528442785, + "grad_norm": 2.2162177562713623, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7016893625259399, + "num_tokens": 74347588.0, + "step": 2963 + }, + { + "epoch": 0.32549967054689216, + "grad_norm": 2.07613205909729, + "learning_rate": 1e-06, + "loss": 
1.0201, + "mean_token_accuracy": 0.6882578730583191, + "num_tokens": 74375849.0, + "step": 2964 + }, + { + "epoch": 0.3256094882495058, + "grad_norm": 2.251302480697632, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.679117739200592, + "num_tokens": 74401031.0, + "step": 2965 + }, + { + "epoch": 0.3257193059521195, + "grad_norm": 2.0158627033233643, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.686767578125, + "num_tokens": 74432185.0, + "step": 2966 + }, + { + "epoch": 0.32582912365473315, + "grad_norm": 2.2182364463806152, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7293542623519897, + "num_tokens": 74456240.0, + "step": 2967 + }, + { + "epoch": 0.3259389413573468, + "grad_norm": 2.2678420543670654, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7382879257202148, + "num_tokens": 74480700.0, + "step": 2968 + }, + { + "epoch": 0.32604875905996045, + "grad_norm": 2.22857403755188, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7152063846588135, + "num_tokens": 74505225.0, + "step": 2969 + }, + { + "epoch": 0.32615857676257415, + "grad_norm": 2.1460201740264893, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.7066303491592407, + "num_tokens": 74532396.0, + "step": 2970 + }, + { + "epoch": 0.3262683944651878, + "grad_norm": 2.2832653522491455, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7184832096099854, + "num_tokens": 74554326.0, + "step": 2971 + }, + { + "epoch": 0.32637821216780144, + "grad_norm": 2.555522918701172, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.704468846321106, + "num_tokens": 74574134.0, + "step": 2972 + }, + { + "epoch": 0.3264880298704151, + "grad_norm": 2.152167797088623, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7065075039863586, + "num_tokens": 74599703.0, + "step": 2973 + }, + { + "epoch": 0.3265978475730288, + 
"grad_norm": 2.3226423263549805, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6967867016792297, + "num_tokens": 74621515.0, + "step": 2974 + }, + { + "epoch": 0.32670766527564243, + "grad_norm": 2.202256202697754, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6965809464454651, + "num_tokens": 74647093.0, + "step": 2975 + }, + { + "epoch": 0.3268174829782561, + "grad_norm": 2.755463123321533, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6912597417831421, + "num_tokens": 74667325.0, + "step": 2976 + }, + { + "epoch": 0.3269273006808698, + "grad_norm": 2.554382801055908, + "learning_rate": 1e-06, + "loss": 1.0781, + "mean_token_accuracy": 0.6800861358642578, + "num_tokens": 74688319.0, + "step": 2977 + }, + { + "epoch": 0.3270371183834834, + "grad_norm": 2.29060959815979, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.7015506625175476, + "num_tokens": 74714509.0, + "step": 2978 + }, + { + "epoch": 0.3271469360860971, + "grad_norm": 2.4176714420318604, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6965007781982422, + "num_tokens": 74735230.0, + "step": 2979 + }, + { + "epoch": 0.3272567537887107, + "grad_norm": 2.5496058464050293, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7009384036064148, + "num_tokens": 74754822.0, + "step": 2980 + }, + { + "epoch": 0.3273665714913244, + "grad_norm": 2.2488653659820557, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6923056840896606, + "num_tokens": 74778625.0, + "step": 2981 + }, + { + "epoch": 0.32747638919393807, + "grad_norm": 2.2268967628479004, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.7015502452850342, + "num_tokens": 74805066.0, + "step": 2982 + }, + { + "epoch": 0.3275862068965517, + "grad_norm": 2.0382509231567383, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6855646371841431, + "num_tokens": 
74834917.0, + "step": 2983 + }, + { + "epoch": 0.32769602459916536, + "grad_norm": 2.331327438354492, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6839667558670044, + "num_tokens": 74858579.0, + "step": 2984 + }, + { + "epoch": 0.32780584230177906, + "grad_norm": 2.2430968284606934, + "learning_rate": 1e-06, + "loss": 1.0631, + "mean_token_accuracy": 0.6842948198318481, + "num_tokens": 74883331.0, + "step": 2985 + }, + { + "epoch": 0.3279156600043927, + "grad_norm": 2.256019353866577, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7055443525314331, + "num_tokens": 74906498.0, + "step": 2986 + }, + { + "epoch": 0.32802547770700635, + "grad_norm": 2.058209180831909, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6838130354881287, + "num_tokens": 74937519.0, + "step": 2987 + }, + { + "epoch": 0.32813529540962005, + "grad_norm": 2.345923900604248, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7133980989456177, + "num_tokens": 74958930.0, + "step": 2988 + }, + { + "epoch": 0.3282451131122337, + "grad_norm": 2.4372024536132812, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6858312487602234, + "num_tokens": 74981617.0, + "step": 2989 + }, + { + "epoch": 0.32835493081484735, + "grad_norm": 2.1910245418548584, + "learning_rate": 1e-06, + "loss": 1.0585, + "mean_token_accuracy": 0.6871091723442078, + "num_tokens": 75006024.0, + "step": 2990 + }, + { + "epoch": 0.328464748517461, + "grad_norm": 2.3103249073028564, + "learning_rate": 1e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6780299544334412, + "num_tokens": 75029988.0, + "step": 2991 + }, + { + "epoch": 0.3285745662200747, + "grad_norm": 2.215512990951538, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.7058711647987366, + "num_tokens": 75054485.0, + "step": 2992 + }, + { + "epoch": 0.32868438392268834, + "grad_norm": 2.3299508094787598, + "learning_rate": 1e-06, + "loss": 
0.8698, + "mean_token_accuracy": 0.7262634634971619, + "num_tokens": 75075148.0, + "step": 2993 + }, + { + "epoch": 0.328794201625302, + "grad_norm": 2.223289966583252, + "learning_rate": 1e-06, + "loss": 1.0788, + "mean_token_accuracy": 0.67790687084198, + "num_tokens": 75102859.0, + "step": 2994 + }, + { + "epoch": 0.3289040193279157, + "grad_norm": 2.186033248901367, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7068983912467957, + "num_tokens": 75128273.0, + "step": 2995 + }, + { + "epoch": 0.32901383703052933, + "grad_norm": 2.148484468460083, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.691501259803772, + "num_tokens": 75155608.0, + "step": 2996 + }, + { + "epoch": 0.329123654733143, + "grad_norm": 2.1431775093078613, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7030653953552246, + "num_tokens": 75182425.0, + "step": 2997 + }, + { + "epoch": 0.3292334724357566, + "grad_norm": 2.3120810985565186, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6900590658187866, + "num_tokens": 75205699.0, + "step": 2998 + }, + { + "epoch": 0.3293432901383703, + "grad_norm": 2.581068754196167, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6924809217453003, + "num_tokens": 75227410.0, + "step": 2999 + }, + { + "epoch": 0.32945310784098397, + "grad_norm": 2.7735610008239746, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7093019485473633, + "num_tokens": 75246156.0, + "step": 3000 + }, + { + "epoch": 0.3295629255435976, + "grad_norm": 2.2918760776519775, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7070372700691223, + "num_tokens": 75270404.0, + "step": 3001 + }, + { + "epoch": 0.32967274324621126, + "grad_norm": 2.088458776473999, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6844791173934937, + "num_tokens": 75299728.0, + "step": 3002 + }, + { + "epoch": 0.32978256094882497, + 
"grad_norm": 2.312558650970459, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7140241861343384, + "num_tokens": 75325699.0, + "step": 3003 + }, + { + "epoch": 0.3298923786514386, + "grad_norm": 2.2832508087158203, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6885980367660522, + "num_tokens": 75350215.0, + "step": 3004 + }, + { + "epoch": 0.33000219635405226, + "grad_norm": 2.1160922050476074, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6926679015159607, + "num_tokens": 75377635.0, + "step": 3005 + }, + { + "epoch": 0.33011201405666596, + "grad_norm": 2.207484483718872, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.6973435282707214, + "num_tokens": 75404940.0, + "step": 3006 + }, + { + "epoch": 0.3302218317592796, + "grad_norm": 2.3699941635131836, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6922872066497803, + "num_tokens": 75426570.0, + "step": 3007 + }, + { + "epoch": 0.33033164946189325, + "grad_norm": 2.2065927982330322, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.7096160650253296, + "num_tokens": 75453743.0, + "step": 3008 + }, + { + "epoch": 0.3304414671645069, + "grad_norm": 2.43900465965271, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7038701772689819, + "num_tokens": 75475850.0, + "step": 3009 + }, + { + "epoch": 0.3305512848671206, + "grad_norm": 2.1737334728240967, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6952781677246094, + "num_tokens": 75504471.0, + "step": 3010 + }, + { + "epoch": 0.33066110256973424, + "grad_norm": 2.3760297298431396, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6909916996955872, + "num_tokens": 75527084.0, + "step": 3011 + }, + { + "epoch": 0.3307709202723479, + "grad_norm": 2.3051438331604004, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.694977879524231, + "num_tokens": 
75553250.0, + "step": 3012 + }, + { + "epoch": 0.33088073797496154, + "grad_norm": 2.6112797260284424, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6921201944351196, + "num_tokens": 75571922.0, + "step": 3013 + }, + { + "epoch": 0.33099055567757524, + "grad_norm": 2.4610211849212646, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7040435075759888, + "num_tokens": 75593571.0, + "step": 3014 + }, + { + "epoch": 0.3311003733801889, + "grad_norm": 2.2205970287323, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7321046590805054, + "num_tokens": 75616282.0, + "step": 3015 + }, + { + "epoch": 0.33121019108280253, + "grad_norm": 2.2321126461029053, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6964473128318787, + "num_tokens": 75642571.0, + "step": 3016 + }, + { + "epoch": 0.33132000878541623, + "grad_norm": 2.5024783611297607, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7235419750213623, + "num_tokens": 75665188.0, + "step": 3017 + }, + { + "epoch": 0.3314298264880299, + "grad_norm": 2.280890941619873, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7000802755355835, + "num_tokens": 75690458.0, + "step": 3018 + }, + { + "epoch": 0.3315396441906435, + "grad_norm": 2.201951026916504, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7115060687065125, + "num_tokens": 75718143.0, + "step": 3019 + }, + { + "epoch": 0.33164946189325717, + "grad_norm": 2.1738855838775635, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.693388819694519, + "num_tokens": 75745984.0, + "step": 3020 + }, + { + "epoch": 0.33175927959587087, + "grad_norm": 2.221194267272949, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6898465752601624, + "num_tokens": 75772744.0, + "step": 3021 + }, + { + "epoch": 0.3318690972984845, + "grad_norm": 2.4463841915130615, + "learning_rate": 1e-06, + "loss": 
0.9331, + "mean_token_accuracy": 0.7168581485748291, + "num_tokens": 75794177.0, + "step": 3022 + }, + { + "epoch": 0.33197891500109816, + "grad_norm": 2.2687506675720215, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6874912977218628, + "num_tokens": 75819606.0, + "step": 3023 + }, + { + "epoch": 0.33208873270371186, + "grad_norm": 2.347623348236084, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7040305733680725, + "num_tokens": 75845131.0, + "step": 3024 + }, + { + "epoch": 0.3321985504063255, + "grad_norm": 2.174257755279541, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.6855599880218506, + "num_tokens": 75873474.0, + "step": 3025 + }, + { + "epoch": 0.33230836810893916, + "grad_norm": 2.056556224822998, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7011731863021851, + "num_tokens": 75900427.0, + "step": 3026 + }, + { + "epoch": 0.3324181858115528, + "grad_norm": 2.3969039916992188, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6990674734115601, + "num_tokens": 75922631.0, + "step": 3027 + }, + { + "epoch": 0.3325280035141665, + "grad_norm": 2.219168186187744, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.712552011013031, + "num_tokens": 75947804.0, + "step": 3028 + }, + { + "epoch": 0.33263782121678015, + "grad_norm": 1.8933531045913696, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7014285326004028, + "num_tokens": 75981738.0, + "step": 3029 + }, + { + "epoch": 0.3327476389193938, + "grad_norm": 2.194742202758789, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6857858300209045, + "num_tokens": 76008618.0, + "step": 3030 + }, + { + "epoch": 0.33285745662200744, + "grad_norm": 2.276911735534668, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.700429379940033, + "num_tokens": 76032100.0, + "step": 3031 + }, + { + "epoch": 0.33296727432462114, + 
"grad_norm": 2.1540110111236572, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6837643384933472, + "num_tokens": 76059995.0, + "step": 3032 + }, + { + "epoch": 0.3330770920272348, + "grad_norm": 2.1447527408599854, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7000188231468201, + "num_tokens": 76087908.0, + "step": 3033 + }, + { + "epoch": 0.33318690972984844, + "grad_norm": 2.2570178508758545, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.698756754398346, + "num_tokens": 76113111.0, + "step": 3034 + }, + { + "epoch": 0.33329672743246214, + "grad_norm": 2.1023099422454834, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6939586400985718, + "num_tokens": 76140912.0, + "step": 3035 + }, + { + "epoch": 0.3334065451350758, + "grad_norm": 2.0895793437957764, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7099654674530029, + "num_tokens": 76169222.0, + "step": 3036 + }, + { + "epoch": 0.33351636283768943, + "grad_norm": 2.2533657550811768, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7053096294403076, + "num_tokens": 76193929.0, + "step": 3037 + }, + { + "epoch": 0.3336261805403031, + "grad_norm": 2.4502475261688232, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6989991664886475, + "num_tokens": 76216955.0, + "step": 3038 + }, + { + "epoch": 0.3337359982429168, + "grad_norm": 2.553537607192993, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6910231113433838, + "num_tokens": 76237526.0, + "step": 3039 + }, + { + "epoch": 0.3338458159455304, + "grad_norm": 2.1487107276916504, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7013341188430786, + "num_tokens": 76267720.0, + "step": 3040 + }, + { + "epoch": 0.33395563364814407, + "grad_norm": 1.9777430295944214, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.699694037437439, + 
"num_tokens": 76299587.0, + "step": 3041 + }, + { + "epoch": 0.33406545135075777, + "grad_norm": 2.09196138381958, + "learning_rate": 1e-06, + "loss": 1.1006, + "mean_token_accuracy": 0.6694934964179993, + "num_tokens": 76327733.0, + "step": 3042 + }, + { + "epoch": 0.3341752690533714, + "grad_norm": 2.336503505706787, + "learning_rate": 1e-06, + "loss": 1.0624, + "mean_token_accuracy": 0.684998631477356, + "num_tokens": 76354632.0, + "step": 3043 + }, + { + "epoch": 0.33428508675598506, + "grad_norm": 2.01143741607666, + "learning_rate": 1e-06, + "loss": 1.0448, + "mean_token_accuracy": 0.6872023940086365, + "num_tokens": 76386558.0, + "step": 3044 + }, + { + "epoch": 0.3343949044585987, + "grad_norm": 2.3646533489227295, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6967374086380005, + "num_tokens": 76409899.0, + "step": 3045 + }, + { + "epoch": 0.3345047221612124, + "grad_norm": 2.7286126613616943, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6969618201255798, + "num_tokens": 76428510.0, + "step": 3046 + }, + { + "epoch": 0.33461453986382605, + "grad_norm": 2.2543716430664062, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6925511360168457, + "num_tokens": 76457639.0, + "step": 3047 + }, + { + "epoch": 0.3347243575664397, + "grad_norm": 2.4009435176849365, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7243015766143799, + "num_tokens": 76479968.0, + "step": 3048 + }, + { + "epoch": 0.33483417526905335, + "grad_norm": 2.2658021450042725, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6776343584060669, + "num_tokens": 76503455.0, + "step": 3049 + }, + { + "epoch": 0.33494399297166705, + "grad_norm": 2.0036509037017822, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6863001585006714, + "num_tokens": 76534195.0, + "step": 3050 + }, + { + "epoch": 0.3350538106742807, + "grad_norm": 2.411375045776367, + "learning_rate": 
1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7249252200126648, + "num_tokens": 76555721.0, + "step": 3051 + }, + { + "epoch": 0.33516362837689434, + "grad_norm": 2.3009817600250244, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.689582347869873, + "num_tokens": 76579463.0, + "step": 3052 + }, + { + "epoch": 0.33527344607950804, + "grad_norm": 2.143277406692505, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6844642162322998, + "num_tokens": 76607287.0, + "step": 3053 + }, + { + "epoch": 0.3353832637821217, + "grad_norm": 2.2907183170318604, + "learning_rate": 1e-06, + "loss": 0.9949, + "mean_token_accuracy": 0.6970516443252563, + "num_tokens": 76630458.0, + "step": 3054 + }, + { + "epoch": 0.33549308148473533, + "grad_norm": 2.123087167739868, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7154431343078613, + "num_tokens": 76656053.0, + "step": 3055 + }, + { + "epoch": 0.335602899187349, + "grad_norm": 2.4795737266540527, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.725721538066864, + "num_tokens": 76680385.0, + "step": 3056 + }, + { + "epoch": 0.3357127168899627, + "grad_norm": 2.2913269996643066, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7156502604484558, + "num_tokens": 76704258.0, + "step": 3057 + }, + { + "epoch": 0.3358225345925763, + "grad_norm": 2.1161203384399414, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6905035376548767, + "num_tokens": 76731962.0, + "step": 3058 + }, + { + "epoch": 0.33593235229519, + "grad_norm": 2.349273920059204, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7082155346870422, + "num_tokens": 76755186.0, + "step": 3059 + }, + { + "epoch": 0.3360421699978036, + "grad_norm": 2.2114675045013428, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6921372413635254, + "num_tokens": 76779873.0, + "step": 3060 + }, + { + "epoch": 
0.3361519877004173, + "grad_norm": 2.5652239322662354, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7151626348495483, + "num_tokens": 76800536.0, + "step": 3061 + }, + { + "epoch": 0.33626180540303097, + "grad_norm": 1.990675449371338, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7101292610168457, + "num_tokens": 76829969.0, + "step": 3062 + }, + { + "epoch": 0.3363716231056446, + "grad_norm": 2.382704734802246, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6816396117210388, + "num_tokens": 76853530.0, + "step": 3063 + }, + { + "epoch": 0.3364814408082583, + "grad_norm": 2.287632942199707, + "learning_rate": 1e-06, + "loss": 1.0216, + "mean_token_accuracy": 0.6855555772781372, + "num_tokens": 76877920.0, + "step": 3064 + }, + { + "epoch": 0.33659125851087196, + "grad_norm": 1.9531161785125732, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6836024522781372, + "num_tokens": 76908715.0, + "step": 3065 + }, + { + "epoch": 0.3367010762134856, + "grad_norm": 2.0134177207946777, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.6960589289665222, + "num_tokens": 76938558.0, + "step": 3066 + }, + { + "epoch": 0.33681089391609925, + "grad_norm": 2.2053513526916504, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6869066953659058, + "num_tokens": 76963470.0, + "step": 3067 + }, + { + "epoch": 0.33692071161871295, + "grad_norm": 2.3889424800872803, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7093877792358398, + "num_tokens": 76984792.0, + "step": 3068 + }, + { + "epoch": 0.3370305293213266, + "grad_norm": 2.1796681880950928, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.71861732006073, + "num_tokens": 77011445.0, + "step": 3069 + }, + { + "epoch": 0.33714034702394025, + "grad_norm": 2.2851734161376953, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 
0.7262777090072632, + "num_tokens": 77034123.0, + "step": 3070 + }, + { + "epoch": 0.33725016472655395, + "grad_norm": 2.365668296813965, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6878336071968079, + "num_tokens": 77057127.0, + "step": 3071 + }, + { + "epoch": 0.3373599824291676, + "grad_norm": 2.0851802825927734, + "learning_rate": 1e-06, + "loss": 1.1031, + "mean_token_accuracy": 0.6694197654724121, + "num_tokens": 77087419.0, + "step": 3072 + }, + { + "epoch": 0.33746980013178124, + "grad_norm": 2.1642870903015137, + "learning_rate": 1e-06, + "loss": 1.1008, + "mean_token_accuracy": 0.6775911450386047, + "num_tokens": 77114521.0, + "step": 3073 + }, + { + "epoch": 0.3375796178343949, + "grad_norm": 2.725813865661621, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7193711996078491, + "num_tokens": 77134794.0, + "step": 3074 + }, + { + "epoch": 0.3376894355370086, + "grad_norm": 1.9807329177856445, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7195233106613159, + "num_tokens": 77164127.0, + "step": 3075 + }, + { + "epoch": 0.33779925323962223, + "grad_norm": 2.172715187072754, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6922723650932312, + "num_tokens": 77190144.0, + "step": 3076 + }, + { + "epoch": 0.3379090709422359, + "grad_norm": 2.3221280574798584, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6913671493530273, + "num_tokens": 77217566.0, + "step": 3077 + }, + { + "epoch": 0.3380188886448495, + "grad_norm": 2.2117931842803955, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.6975549459457397, + "num_tokens": 77244290.0, + "step": 3078 + }, + { + "epoch": 0.3381287063474632, + "grad_norm": 2.345203161239624, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.71192467212677, + "num_tokens": 77265947.0, + "step": 3079 + }, + { + "epoch": 0.33823852405007687, + "grad_norm": 2.3634629249572754, + 
"learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6964254379272461, + "num_tokens": 77288221.0, + "step": 3080 + }, + { + "epoch": 0.3383483417526905, + "grad_norm": 2.2904183864593506, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.688465416431427, + "num_tokens": 77312772.0, + "step": 3081 + }, + { + "epoch": 0.3384581594553042, + "grad_norm": 2.588268995285034, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6954887509346008, + "num_tokens": 77332748.0, + "step": 3082 + }, + { + "epoch": 0.33856797715791787, + "grad_norm": 2.089754819869995, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7140039205551147, + "num_tokens": 77362655.0, + "step": 3083 + }, + { + "epoch": 0.3386777948605315, + "grad_norm": 2.028385639190674, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7059810161590576, + "num_tokens": 77392725.0, + "step": 3084 + }, + { + "epoch": 0.33878761256314516, + "grad_norm": 2.294246196746826, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7049776911735535, + "num_tokens": 77415741.0, + "step": 3085 + }, + { + "epoch": 0.33889743026575886, + "grad_norm": 2.1698803901672363, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.700558066368103, + "num_tokens": 77442820.0, + "step": 3086 + }, + { + "epoch": 0.3390072479683725, + "grad_norm": 2.4345452785491943, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.6816895008087158, + "num_tokens": 77465459.0, + "step": 3087 + }, + { + "epoch": 0.33911706567098615, + "grad_norm": 2.226149082183838, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7188950777053833, + "num_tokens": 77491112.0, + "step": 3088 + }, + { + "epoch": 0.3392268833735998, + "grad_norm": 2.315293788909912, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7106751203536987, + "num_tokens": 77515919.0, + "step": 3089 + }, + { + 
"epoch": 0.3393367010762135, + "grad_norm": 2.471508502960205, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6912388801574707, + "num_tokens": 77536088.0, + "step": 3090 + }, + { + "epoch": 0.33944651877882714, + "grad_norm": 2.2281124591827393, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.716033935546875, + "num_tokens": 77562074.0, + "step": 3091 + }, + { + "epoch": 0.3395563364814408, + "grad_norm": 2.263632297515869, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6936014890670776, + "num_tokens": 77585358.0, + "step": 3092 + }, + { + "epoch": 0.3396661541840545, + "grad_norm": 2.2288830280303955, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6940150260925293, + "num_tokens": 77610987.0, + "step": 3093 + }, + { + "epoch": 0.33977597188666814, + "grad_norm": 2.1013870239257812, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7085552215576172, + "num_tokens": 77637972.0, + "step": 3094 + }, + { + "epoch": 0.3398857895892818, + "grad_norm": 2.4583423137664795, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7017804384231567, + "num_tokens": 77658685.0, + "step": 3095 + }, + { + "epoch": 0.33999560729189543, + "grad_norm": 2.2500879764556885, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6956847310066223, + "num_tokens": 77683278.0, + "step": 3096 + }, + { + "epoch": 0.34010542499450913, + "grad_norm": 2.464961051940918, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6962646245956421, + "num_tokens": 77706912.0, + "step": 3097 + }, + { + "epoch": 0.3402152426971228, + "grad_norm": 2.4368908405303955, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.6839430928230286, + "num_tokens": 77728912.0, + "step": 3098 + }, + { + "epoch": 0.3403250603997364, + "grad_norm": 1.9568774700164795, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 
0.7069602012634277, + "num_tokens": 77762314.0, + "step": 3099 + }, + { + "epoch": 0.3404348781023501, + "grad_norm": 2.0224609375, + "learning_rate": 1e-06, + "loss": 1.1579, + "mean_token_accuracy": 0.6552478075027466, + "num_tokens": 77794617.0, + "step": 3100 + }, + { + "epoch": 0.34054469580496377, + "grad_norm": 2.3969531059265137, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7128236293792725, + "num_tokens": 77817558.0, + "step": 3101 + }, + { + "epoch": 0.3406545135075774, + "grad_norm": 2.3132925033569336, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6833535432815552, + "num_tokens": 77840870.0, + "step": 3102 + }, + { + "epoch": 0.34076433121019106, + "grad_norm": 2.2688324451446533, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6919394135475159, + "num_tokens": 77865881.0, + "step": 3103 + }, + { + "epoch": 0.34087414891280476, + "grad_norm": 2.3087563514709473, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6965223550796509, + "num_tokens": 77888274.0, + "step": 3104 + }, + { + "epoch": 0.3409839666154184, + "grad_norm": 2.221226215362549, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6948829889297485, + "num_tokens": 77914000.0, + "step": 3105 + }, + { + "epoch": 0.34109378431803206, + "grad_norm": 2.599137306213379, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7372053861618042, + "num_tokens": 77931675.0, + "step": 3106 + }, + { + "epoch": 0.3412036020206457, + "grad_norm": 2.332045316696167, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6954668760299683, + "num_tokens": 77955457.0, + "step": 3107 + }, + { + "epoch": 0.3413134197232594, + "grad_norm": 2.375523567199707, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6725680828094482, + "num_tokens": 77981594.0, + "step": 3108 + }, + { + "epoch": 0.34142323742587305, + "grad_norm": 2.2790825366973877, + 
"learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6907775402069092, + "num_tokens": 78007389.0, + "step": 3109 + }, + { + "epoch": 0.3415330551284867, + "grad_norm": 2.041344165802002, + "learning_rate": 1e-06, + "loss": 1.1197, + "mean_token_accuracy": 0.6655477285385132, + "num_tokens": 78038928.0, + "step": 3110 + }, + { + "epoch": 0.3416428728311004, + "grad_norm": 2.3109004497528076, + "learning_rate": 1e-06, + "loss": 1.1045, + "mean_token_accuracy": 0.6695227026939392, + "num_tokens": 78062774.0, + "step": 3111 + }, + { + "epoch": 0.34175269053371404, + "grad_norm": 2.4241151809692383, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7182257175445557, + "num_tokens": 78083615.0, + "step": 3112 + }, + { + "epoch": 0.3418625082363277, + "grad_norm": 2.1497974395751953, + "learning_rate": 1e-06, + "loss": 1.0129, + "mean_token_accuracy": 0.6880751252174377, + "num_tokens": 78112479.0, + "step": 3113 + }, + { + "epoch": 0.34197232593894134, + "grad_norm": 2.593461513519287, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7253649234771729, + "num_tokens": 78131013.0, + "step": 3114 + }, + { + "epoch": 0.34208214364155504, + "grad_norm": 2.3846988677978516, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7108118534088135, + "num_tokens": 78152092.0, + "step": 3115 + }, + { + "epoch": 0.3421919613441687, + "grad_norm": 2.1204373836517334, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6900022029876709, + "num_tokens": 78179235.0, + "step": 3116 + }, + { + "epoch": 0.34230177904678233, + "grad_norm": 2.056885004043579, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.6890393495559692, + "num_tokens": 78207161.0, + "step": 3117 + }, + { + "epoch": 0.34241159674939603, + "grad_norm": 2.1689610481262207, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7087855935096741, + "num_tokens": 78232500.0, + "step": 3118 + }, 
+ { + "epoch": 0.3425214144520097, + "grad_norm": 2.1871838569641113, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.69890296459198, + "num_tokens": 78257707.0, + "step": 3119 + }, + { + "epoch": 0.3426312321546233, + "grad_norm": 2.1778643131256104, + "learning_rate": 1e-06, + "loss": 1.068, + "mean_token_accuracy": 0.6811010241508484, + "num_tokens": 78284660.0, + "step": 3120 + }, + { + "epoch": 0.34274104985723697, + "grad_norm": 2.0002903938293457, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7077349424362183, + "num_tokens": 78314716.0, + "step": 3121 + }, + { + "epoch": 0.34285086755985067, + "grad_norm": 2.313809871673584, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.696515679359436, + "num_tokens": 78339890.0, + "step": 3122 + }, + { + "epoch": 0.3429606852624643, + "grad_norm": 2.0860087871551514, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6811299920082092, + "num_tokens": 78368332.0, + "step": 3123 + }, + { + "epoch": 0.34307050296507796, + "grad_norm": 2.447822093963623, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7097442150115967, + "num_tokens": 78390894.0, + "step": 3124 + }, + { + "epoch": 0.3431803206676916, + "grad_norm": 2.0828964710235596, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6992416381835938, + "num_tokens": 78419793.0, + "step": 3125 + }, + { + "epoch": 0.3432901383703053, + "grad_norm": 2.477224826812744, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.7080405354499817, + "num_tokens": 78442211.0, + "step": 3126 + }, + { + "epoch": 0.34339995607291895, + "grad_norm": 2.5701792240142822, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7107759714126587, + "num_tokens": 78462255.0, + "step": 3127 + }, + { + "epoch": 0.3435097737755326, + "grad_norm": 2.100090980529785, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 
0.6954529881477356, + "num_tokens": 78491259.0, + "step": 3128 + }, + { + "epoch": 0.3436195914781463, + "grad_norm": 2.561711311340332, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.7007218599319458, + "num_tokens": 78511248.0, + "step": 3129 + }, + { + "epoch": 0.34372940918075995, + "grad_norm": 2.056034564971924, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6965884566307068, + "num_tokens": 78538742.0, + "step": 3130 + }, + { + "epoch": 0.3438392268833736, + "grad_norm": 2.500459909439087, + "learning_rate": 1e-06, + "loss": 1.1003, + "mean_token_accuracy": 0.6726624965667725, + "num_tokens": 78561850.0, + "step": 3131 + }, + { + "epoch": 0.34394904458598724, + "grad_norm": 2.3386049270629883, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6812601089477539, + "num_tokens": 78586270.0, + "step": 3132 + }, + { + "epoch": 0.34405886228860094, + "grad_norm": 2.270824670791626, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7033743262290955, + "num_tokens": 78609674.0, + "step": 3133 + }, + { + "epoch": 0.3441686799912146, + "grad_norm": 2.379528045654297, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6886170506477356, + "num_tokens": 78632305.0, + "step": 3134 + }, + { + "epoch": 0.34427849769382823, + "grad_norm": 2.4652013778686523, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7306491136550903, + "num_tokens": 78651747.0, + "step": 3135 + }, + { + "epoch": 0.3443883153964419, + "grad_norm": 2.453585147857666, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6993511915206909, + "num_tokens": 78673700.0, + "step": 3136 + }, + { + "epoch": 0.3444981330990556, + "grad_norm": 2.64274263381958, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6996229887008667, + "num_tokens": 78693917.0, + "step": 3137 + }, + { + "epoch": 0.3446079508016692, + "grad_norm": 2.483346462249756, + 
"learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7096370458602905, + "num_tokens": 78714608.0, + "step": 3138 + }, + { + "epoch": 0.3447177685042829, + "grad_norm": 2.0445823669433594, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6905710697174072, + "num_tokens": 78742929.0, + "step": 3139 + }, + { + "epoch": 0.3448275862068966, + "grad_norm": 2.0791540145874023, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7064881324768066, + "num_tokens": 78770964.0, + "step": 3140 + }, + { + "epoch": 0.3449374039095102, + "grad_norm": 2.41754412651062, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6862087249755859, + "num_tokens": 78793611.0, + "step": 3141 + }, + { + "epoch": 0.34504722161212387, + "grad_norm": 2.179327964782715, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.6654514670372009, + "num_tokens": 78821257.0, + "step": 3142 + }, + { + "epoch": 0.3451570393147375, + "grad_norm": 2.1068968772888184, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6871693730354309, + "num_tokens": 78848019.0, + "step": 3143 + }, + { + "epoch": 0.3452668570173512, + "grad_norm": 2.347198009490967, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7314964532852173, + "num_tokens": 78869691.0, + "step": 3144 + }, + { + "epoch": 0.34537667471996486, + "grad_norm": 2.4099936485290527, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7157256007194519, + "num_tokens": 78890840.0, + "step": 3145 + }, + { + "epoch": 0.3454864924225785, + "grad_norm": 2.2167012691497803, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7238125801086426, + "num_tokens": 78915624.0, + "step": 3146 + }, + { + "epoch": 0.3455963101251922, + "grad_norm": 2.1667120456695557, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7186118364334106, + "num_tokens": 78940511.0, + "step": 3147 + }, + { 
+ "epoch": 0.34570612782780585, + "grad_norm": 2.511213541030884, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.731449544429779, + "num_tokens": 78959733.0, + "step": 3148 + }, + { + "epoch": 0.3458159455304195, + "grad_norm": 2.2095768451690674, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6991105079650879, + "num_tokens": 78986426.0, + "step": 3149 + }, + { + "epoch": 0.34592576323303315, + "grad_norm": 2.2364413738250732, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7038798332214355, + "num_tokens": 79011585.0, + "step": 3150 + }, + { + "epoch": 0.34603558093564685, + "grad_norm": 2.368762969970703, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6977515816688538, + "num_tokens": 79035294.0, + "step": 3151 + }, + { + "epoch": 0.3461453986382605, + "grad_norm": 2.2590725421905518, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7026362419128418, + "num_tokens": 79061326.0, + "step": 3152 + }, + { + "epoch": 0.34625521634087414, + "grad_norm": 1.979776382446289, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.689781904220581, + "num_tokens": 79091834.0, + "step": 3153 + }, + { + "epoch": 0.3463650340434878, + "grad_norm": 2.4395811557769775, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7175390720367432, + "num_tokens": 79114651.0, + "step": 3154 + }, + { + "epoch": 0.3464748517461015, + "grad_norm": 2.2584903240203857, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.720738410949707, + "num_tokens": 79139612.0, + "step": 3155 + }, + { + "epoch": 0.34658466944871513, + "grad_norm": 2.345309019088745, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.678969144821167, + "num_tokens": 79163652.0, + "step": 3156 + }, + { + "epoch": 0.3466944871513288, + "grad_norm": 2.0136935710906982, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 
0.7129217982292175, + "num_tokens": 79192084.0, + "step": 3157 + }, + { + "epoch": 0.3468043048539425, + "grad_norm": 2.623713731765747, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6931326985359192, + "num_tokens": 79211840.0, + "step": 3158 + }, + { + "epoch": 0.3469141225565561, + "grad_norm": 2.0418074131011963, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6904170513153076, + "num_tokens": 79242348.0, + "step": 3159 + }, + { + "epoch": 0.34702394025916977, + "grad_norm": 2.1033284664154053, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6772189140319824, + "num_tokens": 79271247.0, + "step": 3160 + }, + { + "epoch": 0.3471337579617834, + "grad_norm": 2.158949613571167, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7040213346481323, + "num_tokens": 79298436.0, + "step": 3161 + }, + { + "epoch": 0.3472435756643971, + "grad_norm": 2.1758389472961426, + "learning_rate": 1e-06, + "loss": 1.1022, + "mean_token_accuracy": 0.6726793646812439, + "num_tokens": 79328784.0, + "step": 3162 + }, + { + "epoch": 0.34735339336701077, + "grad_norm": 1.9706978797912598, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6944760680198669, + "num_tokens": 79360895.0, + "step": 3163 + }, + { + "epoch": 0.3474632110696244, + "grad_norm": 2.3418052196502686, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7109267711639404, + "num_tokens": 79383633.0, + "step": 3164 + }, + { + "epoch": 0.34757302877223806, + "grad_norm": 1.9058067798614502, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6914464235305786, + "num_tokens": 79416196.0, + "step": 3165 + }, + { + "epoch": 0.34768284647485176, + "grad_norm": 2.1064207553863525, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7075964212417603, + "num_tokens": 79442356.0, + "step": 3166 + }, + { + "epoch": 0.3477926641774654, + "grad_norm": 
2.2207531929016113, + "learning_rate": 1e-06, + "loss": 1.0433, + "mean_token_accuracy": 0.6855348348617554, + "num_tokens": 79470066.0, + "step": 3167 + }, + { + "epoch": 0.34790248188007905, + "grad_norm": 2.4993956089019775, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7148823738098145, + "num_tokens": 79490872.0, + "step": 3168 + }, + { + "epoch": 0.34801229958269275, + "grad_norm": 1.9990496635437012, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7151286602020264, + "num_tokens": 79517603.0, + "step": 3169 + }, + { + "epoch": 0.3481221172853064, + "grad_norm": 2.5999512672424316, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7111489176750183, + "num_tokens": 79536528.0, + "step": 3170 + }, + { + "epoch": 0.34823193498792004, + "grad_norm": 2.3176751136779785, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.705225944519043, + "num_tokens": 79558743.0, + "step": 3171 + }, + { + "epoch": 0.3483417526905337, + "grad_norm": 2.336035966873169, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7113518118858337, + "num_tokens": 79581784.0, + "step": 3172 + }, + { + "epoch": 0.3484515703931474, + "grad_norm": 2.1475741863250732, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7231210470199585, + "num_tokens": 79608771.0, + "step": 3173 + }, + { + "epoch": 0.34856138809576104, + "grad_norm": 2.4516255855560303, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7011138796806335, + "num_tokens": 79629247.0, + "step": 3174 + }, + { + "epoch": 0.3486712057983747, + "grad_norm": 2.2823195457458496, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7017412185668945, + "num_tokens": 79653737.0, + "step": 3175 + }, + { + "epoch": 0.3487810235009884, + "grad_norm": 2.2596356868743896, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7060104012489319, + "num_tokens": 
79676526.0, + "step": 3176 + }, + { + "epoch": 0.34889084120360203, + "grad_norm": 2.5408387184143066, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6871728897094727, + "num_tokens": 79697356.0, + "step": 3177 + }, + { + "epoch": 0.3490006589062157, + "grad_norm": 2.201502561569214, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6995586156845093, + "num_tokens": 79721553.0, + "step": 3178 + }, + { + "epoch": 0.3491104766088293, + "grad_norm": 2.5325021743774414, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6852223873138428, + "num_tokens": 79743076.0, + "step": 3179 + }, + { + "epoch": 0.349220294311443, + "grad_norm": 2.4209601879119873, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6857950687408447, + "num_tokens": 79765552.0, + "step": 3180 + }, + { + "epoch": 0.34933011201405667, + "grad_norm": 2.45135235786438, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7041183710098267, + "num_tokens": 79787058.0, + "step": 3181 + }, + { + "epoch": 0.3494399297166703, + "grad_norm": 2.3102729320526123, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6812903881072998, + "num_tokens": 79813113.0, + "step": 3182 + }, + { + "epoch": 0.34954974741928396, + "grad_norm": 2.2935502529144287, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6923164129257202, + "num_tokens": 79839519.0, + "step": 3183 + }, + { + "epoch": 0.34965956512189766, + "grad_norm": 1.9284147024154663, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6949631571769714, + "num_tokens": 79872159.0, + "step": 3184 + }, + { + "epoch": 0.3497693828245113, + "grad_norm": 2.406674861907959, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.6983018517494202, + "num_tokens": 79895827.0, + "step": 3185 + }, + { + "epoch": 0.34987920052712496, + "grad_norm": 2.6795496940612793, + "learning_rate": 1e-06, + 
"loss": 0.9349, + "mean_token_accuracy": 0.7098439931869507, + "num_tokens": 79914367.0, + "step": 3186 + }, + { + "epoch": 0.34998901822973866, + "grad_norm": 2.3223795890808105, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.7024625539779663, + "num_tokens": 79939088.0, + "step": 3187 + }, + { + "epoch": 0.3500988359323523, + "grad_norm": 2.5019798278808594, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7028748989105225, + "num_tokens": 79960800.0, + "step": 3188 + }, + { + "epoch": 0.35020865363496595, + "grad_norm": 2.2969090938568115, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7159936428070068, + "num_tokens": 79983257.0, + "step": 3189 + }, + { + "epoch": 0.3503184713375796, + "grad_norm": 2.6501989364624023, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7015211582183838, + "num_tokens": 80001263.0, + "step": 3190 + }, + { + "epoch": 0.3504282890401933, + "grad_norm": 2.1289546489715576, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.704216718673706, + "num_tokens": 80028193.0, + "step": 3191 + }, + { + "epoch": 0.35053810674280694, + "grad_norm": 2.341461181640625, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6929842233657837, + "num_tokens": 80051162.0, + "step": 3192 + }, + { + "epoch": 0.3506479244454206, + "grad_norm": 2.3143744468688965, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7032138109207153, + "num_tokens": 80075675.0, + "step": 3193 + }, + { + "epoch": 0.3507577421480343, + "grad_norm": 2.3159961700439453, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6980084776878357, + "num_tokens": 80100944.0, + "step": 3194 + }, + { + "epoch": 0.35086755985064794, + "grad_norm": 2.1899070739746094, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6925408840179443, + "num_tokens": 80129283.0, + "step": 3195 + }, + { + "epoch": 
0.3509773775532616, + "grad_norm": 2.946521759033203, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7279527187347412, + "num_tokens": 80144090.0, + "step": 3196 + }, + { + "epoch": 0.35108719525587523, + "grad_norm": 2.4484357833862305, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6949704885482788, + "num_tokens": 80165772.0, + "step": 3197 + }, + { + "epoch": 0.35119701295848893, + "grad_norm": 2.247971773147583, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6995368599891663, + "num_tokens": 80191365.0, + "step": 3198 + }, + { + "epoch": 0.3513068306611026, + "grad_norm": 2.1935982704162598, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6905182600021362, + "num_tokens": 80216511.0, + "step": 3199 + }, + { + "epoch": 0.3514166483637162, + "grad_norm": 2.1523494720458984, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6893212795257568, + "num_tokens": 80244042.0, + "step": 3200 + }, + { + "epoch": 0.35152646606632987, + "grad_norm": 2.7837793827056885, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7128000855445862, + "num_tokens": 80261930.0, + "step": 3201 + }, + { + "epoch": 0.35163628376894357, + "grad_norm": 2.1894755363464355, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7090721130371094, + "num_tokens": 80288919.0, + "step": 3202 + }, + { + "epoch": 0.3517461014715572, + "grad_norm": 2.2924628257751465, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.691474199295044, + "num_tokens": 80313688.0, + "step": 3203 + }, + { + "epoch": 0.35185591917417086, + "grad_norm": 2.215627908706665, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6892653107643127, + "num_tokens": 80339166.0, + "step": 3204 + }, + { + "epoch": 0.35196573687678456, + "grad_norm": 2.3188436031341553, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 
0.7101714611053467, + "num_tokens": 80363645.0, + "step": 3205 + }, + { + "epoch": 0.3520755545793982, + "grad_norm": 1.8914384841918945, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6793807148933411, + "num_tokens": 80399603.0, + "step": 3206 + }, + { + "epoch": 0.35218537228201185, + "grad_norm": 2.2640907764434814, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7050219178199768, + "num_tokens": 80423835.0, + "step": 3207 + }, + { + "epoch": 0.3522951899846255, + "grad_norm": 2.2148215770721436, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.685352087020874, + "num_tokens": 80450513.0, + "step": 3208 + }, + { + "epoch": 0.3524050076872392, + "grad_norm": 1.9810103178024292, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.683706521987915, + "num_tokens": 80481243.0, + "step": 3209 + }, + { + "epoch": 0.35251482538985285, + "grad_norm": 2.415166139602661, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7168340682983398, + "num_tokens": 80501474.0, + "step": 3210 + }, + { + "epoch": 0.3526246430924665, + "grad_norm": 2.320692300796509, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6979607939720154, + "num_tokens": 80524381.0, + "step": 3211 + }, + { + "epoch": 0.35273446079508014, + "grad_norm": 2.164839267730713, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.710074245929718, + "num_tokens": 80549858.0, + "step": 3212 + }, + { + "epoch": 0.35284427849769384, + "grad_norm": 1.9388890266418457, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7017436027526855, + "num_tokens": 80580210.0, + "step": 3213 + }, + { + "epoch": 0.3529540962003075, + "grad_norm": 2.3669416904449463, + "learning_rate": 1e-06, + "loss": 1.0803, + "mean_token_accuracy": 0.6825728416442871, + "num_tokens": 80604107.0, + "step": 3214 + }, + { + "epoch": 0.35306391390292113, + "grad_norm": 2.327383518218994, 
+ "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6934349536895752, + "num_tokens": 80627467.0, + "step": 3215 + }, + { + "epoch": 0.35317373160553484, + "grad_norm": 2.0453832149505615, + "learning_rate": 1e-06, + "loss": 1.07, + "mean_token_accuracy": 0.6780928373336792, + "num_tokens": 80658264.0, + "step": 3216 + }, + { + "epoch": 0.3532835493081485, + "grad_norm": 2.3298823833465576, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6982392072677612, + "num_tokens": 80682843.0, + "step": 3217 + }, + { + "epoch": 0.3533933670107621, + "grad_norm": 2.310014486312866, + "learning_rate": 1e-06, + "loss": 1.0251, + "mean_token_accuracy": 0.6964686512947083, + "num_tokens": 80706906.0, + "step": 3218 + }, + { + "epoch": 0.3535031847133758, + "grad_norm": 2.3036816120147705, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6955363750457764, + "num_tokens": 80729370.0, + "step": 3219 + }, + { + "epoch": 0.3536130024159895, + "grad_norm": 2.0362749099731445, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6868103742599487, + "num_tokens": 80759504.0, + "step": 3220 + }, + { + "epoch": 0.3537228201186031, + "grad_norm": 2.355757474899292, + "learning_rate": 1e-06, + "loss": 1.0827, + "mean_token_accuracy": 0.6798877716064453, + "num_tokens": 80783643.0, + "step": 3221 + }, + { + "epoch": 0.35383263782121677, + "grad_norm": 2.103466272354126, + "learning_rate": 1e-06, + "loss": 1.108, + "mean_token_accuracy": 0.6686922311782837, + "num_tokens": 80813651.0, + "step": 3222 + }, + { + "epoch": 0.35394245552383047, + "grad_norm": 2.3785035610198975, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6940819025039673, + "num_tokens": 80835052.0, + "step": 3223 + }, + { + "epoch": 0.3540522732264441, + "grad_norm": 2.3771872520446777, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.6780062913894653, + "num_tokens": 80859875.0, + "step": 3224 + }, + { 
+ "epoch": 0.35416209092905776, + "grad_norm": 2.412048816680908, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7234270572662354, + "num_tokens": 80882390.0, + "step": 3225 + }, + { + "epoch": 0.3542719086316714, + "grad_norm": 2.409597158432007, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7224075794219971, + "num_tokens": 80903598.0, + "step": 3226 + }, + { + "epoch": 0.3543817263342851, + "grad_norm": 2.003286600112915, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.6865584254264832, + "num_tokens": 80935143.0, + "step": 3227 + }, + { + "epoch": 0.35449154403689875, + "grad_norm": 2.321465492248535, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7149709463119507, + "num_tokens": 80958141.0, + "step": 3228 + }, + { + "epoch": 0.3546013617395124, + "grad_norm": 2.4137094020843506, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7319729328155518, + "num_tokens": 80977813.0, + "step": 3229 + }, + { + "epoch": 0.35471117944212605, + "grad_norm": 2.235112190246582, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.7070757746696472, + "num_tokens": 81001562.0, + "step": 3230 + }, + { + "epoch": 0.35482099714473975, + "grad_norm": 2.6273298263549805, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7021976709365845, + "num_tokens": 81022212.0, + "step": 3231 + }, + { + "epoch": 0.3549308148473534, + "grad_norm": 2.123248815536499, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6853835582733154, + "num_tokens": 81051635.0, + "step": 3232 + }, + { + "epoch": 0.35504063254996704, + "grad_norm": 2.3014471530914307, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7186354994773865, + "num_tokens": 81075362.0, + "step": 3233 + }, + { + "epoch": 0.35515045025258074, + "grad_norm": 2.461151599884033, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 
0.6943486928939819, + "num_tokens": 81098434.0, + "step": 3234 + }, + { + "epoch": 0.3552602679551944, + "grad_norm": 2.2542707920074463, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7148190140724182, + "num_tokens": 81122894.0, + "step": 3235 + }, + { + "epoch": 0.35537008565780803, + "grad_norm": 2.120030403137207, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6793162822723389, + "num_tokens": 81150363.0, + "step": 3236 + }, + { + "epoch": 0.3554799033604217, + "grad_norm": 2.213141441345215, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.699124276638031, + "num_tokens": 81178177.0, + "step": 3237 + }, + { + "epoch": 0.3555897210630354, + "grad_norm": 2.0909862518310547, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7205336093902588, + "num_tokens": 81204765.0, + "step": 3238 + }, + { + "epoch": 0.355699538765649, + "grad_norm": 2.3509676456451416, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7025951147079468, + "num_tokens": 81227342.0, + "step": 3239 + }, + { + "epoch": 0.35580935646826267, + "grad_norm": 2.013306140899658, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7100391387939453, + "num_tokens": 81258211.0, + "step": 3240 + }, + { + "epoch": 0.3559191741708763, + "grad_norm": 1.8555023670196533, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7169104814529419, + "num_tokens": 81289721.0, + "step": 3241 + }, + { + "epoch": 0.35602899187349, + "grad_norm": 2.512657880783081, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7120738625526428, + "num_tokens": 81310023.0, + "step": 3242 + }, + { + "epoch": 0.35613880957610367, + "grad_norm": 2.4990200996398926, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7042950391769409, + "num_tokens": 81331825.0, + "step": 3243 + }, + { + "epoch": 0.3562486272787173, + "grad_norm": 2.51141619682312, + 
"learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6991444230079651, + "num_tokens": 81353480.0, + "step": 3244 + }, + { + "epoch": 0.356358444981331, + "grad_norm": 2.026599884033203, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6969157457351685, + "num_tokens": 81382353.0, + "step": 3245 + }, + { + "epoch": 0.35646826268394466, + "grad_norm": 2.5915980339050293, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6974539756774902, + "num_tokens": 81403062.0, + "step": 3246 + }, + { + "epoch": 0.3565780803865583, + "grad_norm": 2.027621030807495, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.711931586265564, + "num_tokens": 81432809.0, + "step": 3247 + }, + { + "epoch": 0.35668789808917195, + "grad_norm": 2.337805986404419, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7211431264877319, + "num_tokens": 81453497.0, + "step": 3248 + }, + { + "epoch": 0.35679771579178565, + "grad_norm": 2.0205204486846924, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7009298205375671, + "num_tokens": 81483711.0, + "step": 3249 + }, + { + "epoch": 0.3569075334943993, + "grad_norm": 2.276348352432251, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6962072253227234, + "num_tokens": 81509746.0, + "step": 3250 + }, + { + "epoch": 0.35701735119701294, + "grad_norm": 2.5879881381988525, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6910815238952637, + "num_tokens": 81530243.0, + "step": 3251 + }, + { + "epoch": 0.35712716889962665, + "grad_norm": 1.987374186515808, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7147741317749023, + "num_tokens": 81560077.0, + "step": 3252 + }, + { + "epoch": 0.3572369866022403, + "grad_norm": 2.2068402767181396, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6939857006072998, + "num_tokens": 81584499.0, + "step": 3253 + }, + { 
+ "epoch": 0.35734680430485394, + "grad_norm": 2.335338830947876, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6898493766784668, + "num_tokens": 81607242.0, + "step": 3254 + }, + { + "epoch": 0.3574566220074676, + "grad_norm": 2.360982894897461, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6978550553321838, + "num_tokens": 81632635.0, + "step": 3255 + }, + { + "epoch": 0.3575664397100813, + "grad_norm": 2.1947085857391357, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7041626572608948, + "num_tokens": 81658466.0, + "step": 3256 + }, + { + "epoch": 0.35767625741269493, + "grad_norm": 2.175222635269165, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6920658946037292, + "num_tokens": 81687082.0, + "step": 3257 + }, + { + "epoch": 0.3577860751153086, + "grad_norm": 2.265833616256714, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6989692449569702, + "num_tokens": 81711218.0, + "step": 3258 + }, + { + "epoch": 0.3578958928179222, + "grad_norm": 2.5844123363494873, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7073318958282471, + "num_tokens": 81730832.0, + "step": 3259 + }, + { + "epoch": 0.3580057105205359, + "grad_norm": 2.008314609527588, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7113488912582397, + "num_tokens": 81760169.0, + "step": 3260 + }, + { + "epoch": 0.35811552822314957, + "grad_norm": 2.407707452774048, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7040783166885376, + "num_tokens": 81781876.0, + "step": 3261 + }, + { + "epoch": 0.3582253459257632, + "grad_norm": 2.3153088092803955, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.696495532989502, + "num_tokens": 81805741.0, + "step": 3262 + }, + { + "epoch": 0.3583351636283769, + "grad_norm": 2.3690905570983887, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 
0.6998462677001953, + "num_tokens": 81830027.0, + "step": 3263 + }, + { + "epoch": 0.35844498133099056, + "grad_norm": 2.518706798553467, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7165473103523254, + "num_tokens": 81851375.0, + "step": 3264 + }, + { + "epoch": 0.3585547990336042, + "grad_norm": 2.186063528060913, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6859135031700134, + "num_tokens": 81878637.0, + "step": 3265 + }, + { + "epoch": 0.35866461673621786, + "grad_norm": 2.101670742034912, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6782062649726868, + "num_tokens": 81908792.0, + "step": 3266 + }, + { + "epoch": 0.35877443443883156, + "grad_norm": 2.485592842102051, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6796143651008606, + "num_tokens": 81930483.0, + "step": 3267 + }, + { + "epoch": 0.3588842521414452, + "grad_norm": 2.179396390914917, + "learning_rate": 1e-06, + "loss": 1.0647, + "mean_token_accuracy": 0.6776559352874756, + "num_tokens": 81959109.0, + "step": 3268 + }, + { + "epoch": 0.35899406984405885, + "grad_norm": 2.340524196624756, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.7024322748184204, + "num_tokens": 81983214.0, + "step": 3269 + }, + { + "epoch": 0.35910388754667255, + "grad_norm": 2.0201830863952637, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6987746953964233, + "num_tokens": 82012788.0, + "step": 3270 + }, + { + "epoch": 0.3592137052492862, + "grad_norm": 2.434685230255127, + "learning_rate": 1e-06, + "loss": 1.0164, + "mean_token_accuracy": 0.6958194971084595, + "num_tokens": 82035975.0, + "step": 3271 + }, + { + "epoch": 0.35932352295189984, + "grad_norm": 2.1679513454437256, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.691940426826477, + "num_tokens": 82060586.0, + "step": 3272 + }, + { + "epoch": 0.3594333406545135, + "grad_norm": 2.620481252670288, + 
"learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7060121297836304, + "num_tokens": 82079823.0, + "step": 3273 + }, + { + "epoch": 0.3595431583571272, + "grad_norm": 2.1675400733947754, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 0.6851650476455688, + "num_tokens": 82106843.0, + "step": 3274 + }, + { + "epoch": 0.35965297605974084, + "grad_norm": 2.4217875003814697, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6959192156791687, + "num_tokens": 82129593.0, + "step": 3275 + }, + { + "epoch": 0.3597627937623545, + "grad_norm": 2.353806257247925, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6783842444419861, + "num_tokens": 82155040.0, + "step": 3276 + }, + { + "epoch": 0.35987261146496813, + "grad_norm": 2.4643354415893555, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6943336725234985, + "num_tokens": 82176638.0, + "step": 3277 + }, + { + "epoch": 0.35998242916758183, + "grad_norm": 2.1534008979797363, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6902530193328857, + "num_tokens": 82202150.0, + "step": 3278 + }, + { + "epoch": 0.3600922468701955, + "grad_norm": 2.1829161643981934, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6790231466293335, + "num_tokens": 82230416.0, + "step": 3279 + }, + { + "epoch": 0.3602020645728091, + "grad_norm": 2.7143588066101074, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7045445442199707, + "num_tokens": 82248049.0, + "step": 3280 + }, + { + "epoch": 0.3603118822754228, + "grad_norm": 2.371849775314331, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7080344557762146, + "num_tokens": 82270279.0, + "step": 3281 + }, + { + "epoch": 0.36042169997803647, + "grad_norm": 2.399893045425415, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7089710235595703, + "num_tokens": 82292483.0, + "step": 3282 + }, + 
{ + "epoch": 0.3605315176806501, + "grad_norm": 2.201986312866211, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7170209884643555, + "num_tokens": 82317429.0, + "step": 3283 + }, + { + "epoch": 0.36064133538326376, + "grad_norm": 2.1373603343963623, + "learning_rate": 1e-06, + "loss": 1.1213, + "mean_token_accuracy": 0.6659445762634277, + "num_tokens": 82346390.0, + "step": 3284 + }, + { + "epoch": 0.36075115308587746, + "grad_norm": 2.596284866333008, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7210656404495239, + "num_tokens": 82366251.0, + "step": 3285 + }, + { + "epoch": 0.3608609707884911, + "grad_norm": 2.6910288333892822, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7027115225791931, + "num_tokens": 82383829.0, + "step": 3286 + }, + { + "epoch": 0.36097078849110475, + "grad_norm": 2.722310781478882, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7081440687179565, + "num_tokens": 82401120.0, + "step": 3287 + }, + { + "epoch": 0.3610806061937184, + "grad_norm": 1.9836735725402832, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6855011582374573, + "num_tokens": 82430791.0, + "step": 3288 + }, + { + "epoch": 0.3611904238963321, + "grad_norm": 2.17417049407959, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7109934091567993, + "num_tokens": 82455688.0, + "step": 3289 + }, + { + "epoch": 0.36130024159894575, + "grad_norm": 2.3344500064849854, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7280600070953369, + "num_tokens": 82476527.0, + "step": 3290 + }, + { + "epoch": 0.3614100593015594, + "grad_norm": 2.2180395126342773, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7082304954528809, + "num_tokens": 82501856.0, + "step": 3291 + }, + { + "epoch": 0.3615198770041731, + "grad_norm": 2.240675687789917, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 
0.677204966545105, + "num_tokens": 82527681.0, + "step": 3292 + }, + { + "epoch": 0.36162969470678674, + "grad_norm": 2.4155802726745605, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7049239873886108, + "num_tokens": 82551989.0, + "step": 3293 + }, + { + "epoch": 0.3617395124094004, + "grad_norm": 2.1872289180755615, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7003651261329651, + "num_tokens": 82579879.0, + "step": 3294 + }, + { + "epoch": 0.36184933011201403, + "grad_norm": 2.2463676929473877, + "learning_rate": 1e-06, + "loss": 1.1107, + "mean_token_accuracy": 0.670167863368988, + "num_tokens": 82608968.0, + "step": 3295 + }, + { + "epoch": 0.36195914781462774, + "grad_norm": 2.0607032775878906, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7019004225730896, + "num_tokens": 82639243.0, + "step": 3296 + }, + { + "epoch": 0.3620689655172414, + "grad_norm": 1.899152159690857, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6810516119003296, + "num_tokens": 82675684.0, + "step": 3297 + }, + { + "epoch": 0.362178783219855, + "grad_norm": 2.2112903594970703, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7074100971221924, + "num_tokens": 82700174.0, + "step": 3298 + }, + { + "epoch": 0.36228860092246873, + "grad_norm": 1.877864122390747, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6900785565376282, + "num_tokens": 82733795.0, + "step": 3299 + }, + { + "epoch": 0.3623984186250824, + "grad_norm": 2.1852951049804688, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.6998541951179504, + "num_tokens": 82759818.0, + "step": 3300 + }, + { + "epoch": 0.362508236327696, + "grad_norm": 2.3382198810577393, + "learning_rate": 1e-06, + "loss": 1.0942, + "mean_token_accuracy": 0.6761075854301453, + "num_tokens": 82790658.0, + "step": 3301 + }, + { + "epoch": 0.36261805403030967, + "grad_norm": 2.618309497833252, + 
"learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7247247695922852, + "num_tokens": 82810269.0, + "step": 3302 + }, + { + "epoch": 0.36272787173292337, + "grad_norm": 2.2501752376556396, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7167420387268066, + "num_tokens": 82834979.0, + "step": 3303 + }, + { + "epoch": 0.362837689435537, + "grad_norm": 2.295835494995117, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.7065684795379639, + "num_tokens": 82860522.0, + "step": 3304 + }, + { + "epoch": 0.36294750713815066, + "grad_norm": 2.224451780319214, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7169665098190308, + "num_tokens": 82883277.0, + "step": 3305 + }, + { + "epoch": 0.3630573248407643, + "grad_norm": 2.537620782852173, + "learning_rate": 1e-06, + "loss": 1.0741, + "mean_token_accuracy": 0.687830924987793, + "num_tokens": 82905365.0, + "step": 3306 + }, + { + "epoch": 0.363167142543378, + "grad_norm": 2.577449321746826, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6968604326248169, + "num_tokens": 82928974.0, + "step": 3307 + }, + { + "epoch": 0.36327696024599165, + "grad_norm": 2.4435012340545654, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7009098529815674, + "num_tokens": 82952537.0, + "step": 3308 + }, + { + "epoch": 0.3633867779486053, + "grad_norm": 2.5803513526916504, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6887090802192688, + "num_tokens": 82972897.0, + "step": 3309 + }, + { + "epoch": 0.363496595651219, + "grad_norm": 2.278418779373169, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6979730129241943, + "num_tokens": 82996528.0, + "step": 3310 + }, + { + "epoch": 0.36360641335383265, + "grad_norm": 2.258880376815796, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6931410431861877, + "num_tokens": 83020241.0, + "step": 3311 + }, + { + 
"epoch": 0.3637162310564463, + "grad_norm": 2.1857500076293945, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7114039063453674, + "num_tokens": 83048249.0, + "step": 3312 + }, + { + "epoch": 0.36382604875905994, + "grad_norm": 2.1641533374786377, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6997100114822388, + "num_tokens": 83076315.0, + "step": 3313 + }, + { + "epoch": 0.36393586646167364, + "grad_norm": 2.019213914871216, + "learning_rate": 1e-06, + "loss": 1.0658, + "mean_token_accuracy": 0.6894826292991638, + "num_tokens": 83108475.0, + "step": 3314 + }, + { + "epoch": 0.3640456841642873, + "grad_norm": 1.9435715675354004, + "learning_rate": 1e-06, + "loss": 1.0286, + "mean_token_accuracy": 0.6873912811279297, + "num_tokens": 83139975.0, + "step": 3315 + }, + { + "epoch": 0.36415550186690093, + "grad_norm": 2.393234968185425, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7049338817596436, + "num_tokens": 83162317.0, + "step": 3316 + }, + { + "epoch": 0.3642653195695146, + "grad_norm": 2.347291946411133, + "learning_rate": 1e-06, + "loss": 1.1024, + "mean_token_accuracy": 0.670421838760376, + "num_tokens": 83187400.0, + "step": 3317 + }, + { + "epoch": 0.3643751372721283, + "grad_norm": 2.0397942066192627, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6825156211853027, + "num_tokens": 83216671.0, + "step": 3318 + }, + { + "epoch": 0.3644849549747419, + "grad_norm": 1.938792109489441, + "learning_rate": 1e-06, + "loss": 1.0566, + "mean_token_accuracy": 0.6808333992958069, + "num_tokens": 83247657.0, + "step": 3319 + }, + { + "epoch": 0.36459477267735557, + "grad_norm": 2.337073564529419, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7256641387939453, + "num_tokens": 83271399.0, + "step": 3320 + }, + { + "epoch": 0.3647045903799693, + "grad_norm": 2.3562119007110596, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 
0.7024866938591003, + "num_tokens": 83295806.0, + "step": 3321 + }, + { + "epoch": 0.3648144080825829, + "grad_norm": 1.9534991979599, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.694560170173645, + "num_tokens": 83327892.0, + "step": 3322 + }, + { + "epoch": 0.36492422578519657, + "grad_norm": 2.5128486156463623, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7015373706817627, + "num_tokens": 83348736.0, + "step": 3323 + }, + { + "epoch": 0.3650340434878102, + "grad_norm": 2.238199234008789, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7080600261688232, + "num_tokens": 83374655.0, + "step": 3324 + }, + { + "epoch": 0.3651438611904239, + "grad_norm": 2.1014654636383057, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7090548872947693, + "num_tokens": 83401555.0, + "step": 3325 + }, + { + "epoch": 0.36525367889303756, + "grad_norm": 2.5751519203186035, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6950127482414246, + "num_tokens": 83421890.0, + "step": 3326 + }, + { + "epoch": 0.3653634965956512, + "grad_norm": 2.817652463912964, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7182841300964355, + "num_tokens": 83439577.0, + "step": 3327 + }, + { + "epoch": 0.3654733142982649, + "grad_norm": 2.223551034927368, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7095439434051514, + "num_tokens": 83466227.0, + "step": 3328 + }, + { + "epoch": 0.36558313200087855, + "grad_norm": 2.3850021362304688, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7087888121604919, + "num_tokens": 83489440.0, + "step": 3329 + }, + { + "epoch": 0.3656929497034922, + "grad_norm": 2.001908302307129, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7093874216079712, + "num_tokens": 83519695.0, + "step": 3330 + }, + { + "epoch": 0.36580276740610584, + "grad_norm": 2.07399845123291, + 
"learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6946258544921875, + "num_tokens": 83547623.0, + "step": 3331 + }, + { + "epoch": 0.36591258510871955, + "grad_norm": 2.609776735305786, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.713897168636322, + "num_tokens": 83565638.0, + "step": 3332 + }, + { + "epoch": 0.3660224028113332, + "grad_norm": 2.681661367416382, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6933214664459229, + "num_tokens": 83585459.0, + "step": 3333 + }, + { + "epoch": 0.36613222051394684, + "grad_norm": 1.9695743322372437, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7142961621284485, + "num_tokens": 83615182.0, + "step": 3334 + }, + { + "epoch": 0.3662420382165605, + "grad_norm": 2.143137216567993, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7277941703796387, + "num_tokens": 83640049.0, + "step": 3335 + }, + { + "epoch": 0.3663518559191742, + "grad_norm": 2.345986843109131, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6988407969474792, + "num_tokens": 83664323.0, + "step": 3336 + }, + { + "epoch": 0.36646167362178783, + "grad_norm": 2.269881248474121, + "learning_rate": 1e-06, + "loss": 1.0967, + "mean_token_accuracy": 0.6748188734054565, + "num_tokens": 83691799.0, + "step": 3337 + }, + { + "epoch": 0.3665714913244015, + "grad_norm": 2.297943592071533, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7170277833938599, + "num_tokens": 83714918.0, + "step": 3338 + }, + { + "epoch": 0.3666813090270152, + "grad_norm": 2.2210323810577393, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.69423907995224, + "num_tokens": 83739146.0, + "step": 3339 + }, + { + "epoch": 0.3667911267296288, + "grad_norm": 2.377549886703491, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7027859091758728, + "num_tokens": 83761848.0, + "step": 3340 + }, + { + 
"epoch": 0.36690094443224247, + "grad_norm": 2.3437633514404297, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6904206871986389, + "num_tokens": 83786090.0, + "step": 3341 + }, + { + "epoch": 0.3670107621348561, + "grad_norm": 2.0286240577697754, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6898343563079834, + "num_tokens": 83815569.0, + "step": 3342 + }, + { + "epoch": 0.3671205798374698, + "grad_norm": 2.9885945320129395, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 0.7400644421577454, + "num_tokens": 83830465.0, + "step": 3343 + }, + { + "epoch": 0.36723039754008346, + "grad_norm": 2.531628131866455, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6890762448310852, + "num_tokens": 83850816.0, + "step": 3344 + }, + { + "epoch": 0.3673402152426971, + "grad_norm": 2.2222936153411865, + "learning_rate": 1e-06, + "loss": 1.0992, + "mean_token_accuracy": 0.6681162714958191, + "num_tokens": 83878432.0, + "step": 3345 + }, + { + "epoch": 0.3674500329453108, + "grad_norm": 2.037842273712158, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6977941989898682, + "num_tokens": 83906374.0, + "step": 3346 + }, + { + "epoch": 0.36755985064792446, + "grad_norm": 2.237609386444092, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.703029215335846, + "num_tokens": 83933184.0, + "step": 3347 + }, + { + "epoch": 0.3676696683505381, + "grad_norm": 2.5182087421417236, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7067801356315613, + "num_tokens": 83954509.0, + "step": 3348 + }, + { + "epoch": 0.36777948605315175, + "grad_norm": 2.5801563262939453, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.686248779296875, + "num_tokens": 83974759.0, + "step": 3349 + }, + { + "epoch": 0.36788930375576545, + "grad_norm": 2.420872926712036, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 
0.7042566537857056, + "num_tokens": 83996521.0, + "step": 3350 + }, + { + "epoch": 0.3679991214583791, + "grad_norm": 2.2858057022094727, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6983848214149475, + "num_tokens": 84020589.0, + "step": 3351 + }, + { + "epoch": 0.36810893916099274, + "grad_norm": 2.2942841053009033, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7069979310035706, + "num_tokens": 84045228.0, + "step": 3352 + }, + { + "epoch": 0.3682187568636064, + "grad_norm": 2.5087599754333496, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7066446542739868, + "num_tokens": 84065266.0, + "step": 3353 + }, + { + "epoch": 0.3683285745662201, + "grad_norm": 2.0718255043029785, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.6911779642105103, + "num_tokens": 84094669.0, + "step": 3354 + }, + { + "epoch": 0.36843839226883374, + "grad_norm": 2.2937498092651367, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7002223134040833, + "num_tokens": 84117378.0, + "step": 3355 + }, + { + "epoch": 0.3685482099714474, + "grad_norm": 2.188991069793701, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7042282819747925, + "num_tokens": 84145447.0, + "step": 3356 + }, + { + "epoch": 0.3686580276740611, + "grad_norm": 2.2253599166870117, + "learning_rate": 1e-06, + "loss": 1.1185, + "mean_token_accuracy": 0.6686707139015198, + "num_tokens": 84174621.0, + "step": 3357 + }, + { + "epoch": 0.36876784537667473, + "grad_norm": 2.0991084575653076, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7110277414321899, + "num_tokens": 84201604.0, + "step": 3358 + }, + { + "epoch": 0.3688776630792884, + "grad_norm": 2.442690134048462, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6862982511520386, + "num_tokens": 84222548.0, + "step": 3359 + }, + { + "epoch": 0.368987480781902, + "grad_norm": 2.320289373397827, + 
"learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6861385107040405, + "num_tokens": 84247095.0, + "step": 3360 + }, + { + "epoch": 0.3690972984845157, + "grad_norm": 2.5593056678771973, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6966966986656189, + "num_tokens": 84266778.0, + "step": 3361 + }, + { + "epoch": 0.36920711618712937, + "grad_norm": 2.0115127563476562, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6992954015731812, + "num_tokens": 84296041.0, + "step": 3362 + }, + { + "epoch": 0.369316933889743, + "grad_norm": 2.2193667888641357, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6855725049972534, + "num_tokens": 84323561.0, + "step": 3363 + }, + { + "epoch": 0.36942675159235666, + "grad_norm": 2.550910234451294, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6898927688598633, + "num_tokens": 84344016.0, + "step": 3364 + }, + { + "epoch": 0.36953656929497036, + "grad_norm": 2.468301773071289, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6979062557220459, + "num_tokens": 84367361.0, + "step": 3365 + }, + { + "epoch": 0.369646386997584, + "grad_norm": 2.5560107231140137, + "learning_rate": 1e-06, + "loss": 1.0833, + "mean_token_accuracy": 0.6839536428451538, + "num_tokens": 84388865.0, + "step": 3366 + }, + { + "epoch": 0.36975620470019765, + "grad_norm": 2.1670405864715576, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7199953198432922, + "num_tokens": 84415915.0, + "step": 3367 + }, + { + "epoch": 0.36986602240281136, + "grad_norm": 2.0916154384613037, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6947606801986694, + "num_tokens": 84446939.0, + "step": 3368 + }, + { + "epoch": 0.369975840105425, + "grad_norm": 2.3331902027130127, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6994851231575012, + "num_tokens": 84470344.0, + "step": 3369 + }, + 
{ + "epoch": 0.37008565780803865, + "grad_norm": 2.166465997695923, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7010617256164551, + "num_tokens": 84496520.0, + "step": 3370 + }, + { + "epoch": 0.3701954755106523, + "grad_norm": 2.315943479537964, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.715048611164093, + "num_tokens": 84523901.0, + "step": 3371 + }, + { + "epoch": 0.370305293213266, + "grad_norm": 2.2395546436309814, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.703445315361023, + "num_tokens": 84549812.0, + "step": 3372 + }, + { + "epoch": 0.37041511091587964, + "grad_norm": 2.1528844833374023, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6892815828323364, + "num_tokens": 84578582.0, + "step": 3373 + }, + { + "epoch": 0.3705249286184933, + "grad_norm": 2.174604892730713, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6988564133644104, + "num_tokens": 84606226.0, + "step": 3374 + }, + { + "epoch": 0.370634746321107, + "grad_norm": 2.218904495239258, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7013605833053589, + "num_tokens": 84631638.0, + "step": 3375 + }, + { + "epoch": 0.37074456402372064, + "grad_norm": 2.5328876972198486, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6997570395469666, + "num_tokens": 84651201.0, + "step": 3376 + }, + { + "epoch": 0.3708543817263343, + "grad_norm": 2.049677610397339, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.71903395652771, + "num_tokens": 84679065.0, + "step": 3377 + }, + { + "epoch": 0.3709641994289479, + "grad_norm": 2.3711588382720947, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7251918315887451, + "num_tokens": 84699688.0, + "step": 3378 + }, + { + "epoch": 0.37107401713156163, + "grad_norm": 2.520390033721924, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 
0.7225218415260315, + "num_tokens": 84719832.0, + "step": 3379 + }, + { + "epoch": 0.3711838348341753, + "grad_norm": 1.9791085720062256, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6935368180274963, + "num_tokens": 84750303.0, + "step": 3380 + }, + { + "epoch": 0.3712936525367889, + "grad_norm": 2.237356185913086, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7015256881713867, + "num_tokens": 84775686.0, + "step": 3381 + }, + { + "epoch": 0.37140347023940257, + "grad_norm": 2.363967180252075, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7062825560569763, + "num_tokens": 84799103.0, + "step": 3382 + }, + { + "epoch": 0.37151328794201627, + "grad_norm": 2.048018217086792, + "learning_rate": 1e-06, + "loss": 0.7588, + "mean_token_accuracy": 0.7608679533004761, + "num_tokens": 84822997.0, + "step": 3383 + }, + { + "epoch": 0.3716231056446299, + "grad_norm": 2.179107427597046, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6982736587524414, + "num_tokens": 84848663.0, + "step": 3384 + }, + { + "epoch": 0.37173292334724356, + "grad_norm": 2.5158145427703857, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6914873123168945, + "num_tokens": 84871830.0, + "step": 3385 + }, + { + "epoch": 0.37184274104985726, + "grad_norm": 1.9388164281845093, + "learning_rate": 1e-06, + "loss": 1.0973, + "mean_token_accuracy": 0.6780332922935486, + "num_tokens": 84907969.0, + "step": 3386 + }, + { + "epoch": 0.3719525587524709, + "grad_norm": 2.4964659214019775, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7062766551971436, + "num_tokens": 84931121.0, + "step": 3387 + }, + { + "epoch": 0.37206237645508455, + "grad_norm": 2.6689555644989014, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6928948163986206, + "num_tokens": 84949333.0, + "step": 3388 + }, + { + "epoch": 0.3721721941576982, + "grad_norm": 
2.4824721813201904, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7032837867736816, + "num_tokens": 84969892.0, + "step": 3389 + }, + { + "epoch": 0.3722820118603119, + "grad_norm": 2.2200353145599365, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7045212984085083, + "num_tokens": 84994362.0, + "step": 3390 + }, + { + "epoch": 0.37239182956292555, + "grad_norm": 2.373180389404297, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7101457118988037, + "num_tokens": 85016789.0, + "step": 3391 + }, + { + "epoch": 0.3725016472655392, + "grad_norm": 2.3141560554504395, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6951431632041931, + "num_tokens": 85043300.0, + "step": 3392 + }, + { + "epoch": 0.37261146496815284, + "grad_norm": 2.3010027408599854, + "learning_rate": 1e-06, + "loss": 1.0952, + "mean_token_accuracy": 0.678705096244812, + "num_tokens": 85066802.0, + "step": 3393 + }, + { + "epoch": 0.37272128267076654, + "grad_norm": 2.602473497390747, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7007101774215698, + "num_tokens": 85084322.0, + "step": 3394 + }, + { + "epoch": 0.3728311003733802, + "grad_norm": 2.0734498500823975, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6767846345901489, + "num_tokens": 85114580.0, + "step": 3395 + }, + { + "epoch": 0.37294091807599383, + "grad_norm": 2.2063615322113037, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6982520818710327, + "num_tokens": 85140148.0, + "step": 3396 + }, + { + "epoch": 0.37305073577860753, + "grad_norm": 1.9971755743026733, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6914193630218506, + "num_tokens": 85169652.0, + "step": 3397 + }, + { + "epoch": 0.3731605534812212, + "grad_norm": 2.1117780208587646, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.709979236125946, + "num_tokens": 
85195568.0, + "step": 3398 + }, + { + "epoch": 0.3732703711838348, + "grad_norm": 2.2670741081237793, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7076672911643982, + "num_tokens": 85220095.0, + "step": 3399 + }, + { + "epoch": 0.37338018888644847, + "grad_norm": 2.1296327114105225, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6875513792037964, + "num_tokens": 85249698.0, + "step": 3400 + }, + { + "epoch": 0.3734900065890622, + "grad_norm": 2.0077998638153076, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.7012283802032471, + "num_tokens": 85281363.0, + "step": 3401 + }, + { + "epoch": 0.3735998242916758, + "grad_norm": 2.353630781173706, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7280901074409485, + "num_tokens": 85303825.0, + "step": 3402 + }, + { + "epoch": 0.37370964199428947, + "grad_norm": 2.1263909339904785, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7210557460784912, + "num_tokens": 85328190.0, + "step": 3403 + }, + { + "epoch": 0.37381945969690317, + "grad_norm": 2.0699222087860107, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6934255957603455, + "num_tokens": 85357104.0, + "step": 3404 + }, + { + "epoch": 0.3739292773995168, + "grad_norm": 2.188593626022339, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7201097011566162, + "num_tokens": 85385080.0, + "step": 3405 + }, + { + "epoch": 0.37403909510213046, + "grad_norm": 2.456937313079834, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7029955983161926, + "num_tokens": 85406517.0, + "step": 3406 + }, + { + "epoch": 0.3741489128047441, + "grad_norm": 2.1660900115966797, + "learning_rate": 1e-06, + "loss": 1.094, + "mean_token_accuracy": 0.6753857135772705, + "num_tokens": 85437120.0, + "step": 3407 + }, + { + "epoch": 0.3742587305073578, + "grad_norm": 2.0219929218292236, + "learning_rate": 1e-06, + 
"loss": 0.9838, + "mean_token_accuracy": 0.7000555992126465, + "num_tokens": 85467487.0, + "step": 3408 + }, + { + "epoch": 0.37436854820997145, + "grad_norm": 2.0832197666168213, + "learning_rate": 1e-06, + "loss": 1.0959, + "mean_token_accuracy": 0.6724570989608765, + "num_tokens": 85499171.0, + "step": 3409 + }, + { + "epoch": 0.3744783659125851, + "grad_norm": 1.9421098232269287, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6918712258338928, + "num_tokens": 85533261.0, + "step": 3410 + }, + { + "epoch": 0.37458818361519874, + "grad_norm": 2.590146780014038, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6995702981948853, + "num_tokens": 85553696.0, + "step": 3411 + }, + { + "epoch": 0.37469800131781245, + "grad_norm": 2.3785715103149414, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7031391859054565, + "num_tokens": 85576995.0, + "step": 3412 + }, + { + "epoch": 0.3748078190204261, + "grad_norm": 2.0231776237487793, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7195020318031311, + "num_tokens": 85606198.0, + "step": 3413 + }, + { + "epoch": 0.37491763672303974, + "grad_norm": 2.0548856258392334, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7005582451820374, + "num_tokens": 85637206.0, + "step": 3414 + }, + { + "epoch": 0.37502745442565344, + "grad_norm": 2.325157403945923, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7115057110786438, + "num_tokens": 85660666.0, + "step": 3415 + }, + { + "epoch": 0.3751372721282671, + "grad_norm": 2.716919183731079, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7007814645767212, + "num_tokens": 85681832.0, + "step": 3416 + }, + { + "epoch": 0.37524708983088073, + "grad_norm": 2.2276611328125, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7128877639770508, + "num_tokens": 85705759.0, + "step": 3417 + }, + { + "epoch": 
0.3753569075334944, + "grad_norm": 2.491840124130249, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6927335262298584, + "num_tokens": 85729903.0, + "step": 3418 + }, + { + "epoch": 0.3754667252361081, + "grad_norm": 2.448551893234253, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6872841119766235, + "num_tokens": 85754050.0, + "step": 3419 + }, + { + "epoch": 0.3755765429387217, + "grad_norm": 2.33272385597229, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7007176876068115, + "num_tokens": 85777115.0, + "step": 3420 + }, + { + "epoch": 0.37568636064133537, + "grad_norm": 2.1862528324127197, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6880296468734741, + "num_tokens": 85802808.0, + "step": 3421 + }, + { + "epoch": 0.37579617834394907, + "grad_norm": 2.34710693359375, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7153661251068115, + "num_tokens": 85827013.0, + "step": 3422 + }, + { + "epoch": 0.3759059960465627, + "grad_norm": 2.063467502593994, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7085846662521362, + "num_tokens": 85855275.0, + "step": 3423 + }, + { + "epoch": 0.37601581374917636, + "grad_norm": 1.9369922876358032, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.7009786367416382, + "num_tokens": 85884639.0, + "step": 3424 + }, + { + "epoch": 0.37612563145179, + "grad_norm": 2.0012776851654053, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6935896873474121, + "num_tokens": 85915101.0, + "step": 3425 + }, + { + "epoch": 0.3762354491544037, + "grad_norm": 2.206770420074463, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6875654458999634, + "num_tokens": 85941796.0, + "step": 3426 + }, + { + "epoch": 0.37634526685701736, + "grad_norm": 2.3223447799682617, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6905766725540161, + 
"num_tokens": 85966895.0, + "step": 3427 + }, + { + "epoch": 0.376455084559631, + "grad_norm": 2.151063919067383, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.701992928981781, + "num_tokens": 85994135.0, + "step": 3428 + }, + { + "epoch": 0.37656490226224465, + "grad_norm": 2.4007556438446045, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7210590839385986, + "num_tokens": 86015698.0, + "step": 3429 + }, + { + "epoch": 0.37667471996485835, + "grad_norm": 2.085218906402588, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6835359334945679, + "num_tokens": 86045740.0, + "step": 3430 + }, + { + "epoch": 0.376784537667472, + "grad_norm": 2.0260226726531982, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7128551602363586, + "num_tokens": 86075597.0, + "step": 3431 + }, + { + "epoch": 0.37689435537008564, + "grad_norm": 2.6459009647369385, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7108279466629028, + "num_tokens": 86094667.0, + "step": 3432 + }, + { + "epoch": 0.37700417307269934, + "grad_norm": 2.180619716644287, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6865177750587463, + "num_tokens": 86121691.0, + "step": 3433 + }, + { + "epoch": 0.377113990775313, + "grad_norm": 2.323831796646118, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.695483922958374, + "num_tokens": 86144335.0, + "step": 3434 + }, + { + "epoch": 0.37722380847792664, + "grad_norm": 2.4278934001922607, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7092132568359375, + "num_tokens": 86165805.0, + "step": 3435 + }, + { + "epoch": 0.3773336261805403, + "grad_norm": 2.3462417125701904, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6934577226638794, + "num_tokens": 86188511.0, + "step": 3436 + }, + { + "epoch": 0.377443443883154, + "grad_norm": 2.309113025665283, + "learning_rate": 1e-06, 
+ "loss": 0.9688, + "mean_token_accuracy": 0.7097316980361938, + "num_tokens": 86212243.0, + "step": 3437 + }, + { + "epoch": 0.37755326158576763, + "grad_norm": 2.017665147781372, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7085990905761719, + "num_tokens": 86242705.0, + "step": 3438 + }, + { + "epoch": 0.3776630792883813, + "grad_norm": 2.4405925273895264, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7191064953804016, + "num_tokens": 86265294.0, + "step": 3439 + }, + { + "epoch": 0.3777728969909949, + "grad_norm": 1.8831462860107422, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6885981559753418, + "num_tokens": 86297757.0, + "step": 3440 + }, + { + "epoch": 0.3778827146936086, + "grad_norm": 2.0952398777008057, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6932965517044067, + "num_tokens": 86326022.0, + "step": 3441 + }, + { + "epoch": 0.37799253239622227, + "grad_norm": 2.4344685077667236, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7015413045883179, + "num_tokens": 86349062.0, + "step": 3442 + }, + { + "epoch": 0.3781023500988359, + "grad_norm": 2.2805707454681396, + "learning_rate": 1e-06, + "loss": 1.0671, + "mean_token_accuracy": 0.6897546648979187, + "num_tokens": 86374945.0, + "step": 3443 + }, + { + "epoch": 0.3782121678014496, + "grad_norm": 2.4093682765960693, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.692003607749939, + "num_tokens": 86397468.0, + "step": 3444 + }, + { + "epoch": 0.37832198550406326, + "grad_norm": 2.172278881072998, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6943173408508301, + "num_tokens": 86422181.0, + "step": 3445 + }, + { + "epoch": 0.3784318032066769, + "grad_norm": 2.2037134170532227, + "learning_rate": 1e-06, + "loss": 1.0488, + "mean_token_accuracy": 0.6780732274055481, + "num_tokens": 86450080.0, + "step": 3446 + }, + { + "epoch": 
0.37854162090929055, + "grad_norm": 2.2881321907043457, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6958829760551453, + "num_tokens": 86473213.0, + "step": 3447 + }, + { + "epoch": 0.37865143861190426, + "grad_norm": 2.250230312347412, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6962740421295166, + "num_tokens": 86500159.0, + "step": 3448 + }, + { + "epoch": 0.3787612563145179, + "grad_norm": 2.68462872505188, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.736396312713623, + "num_tokens": 86520615.0, + "step": 3449 + }, + { + "epoch": 0.37887107401713155, + "grad_norm": 2.4407308101654053, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7250896692276001, + "num_tokens": 86540499.0, + "step": 3450 + }, + { + "epoch": 0.37898089171974525, + "grad_norm": 2.3004231452941895, + "learning_rate": 1e-06, + "loss": 1.0918, + "mean_token_accuracy": 0.6749537587165833, + "num_tokens": 86564167.0, + "step": 3451 + }, + { + "epoch": 0.3790907094223589, + "grad_norm": 2.062426805496216, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7040451765060425, + "num_tokens": 86594334.0, + "step": 3452 + }, + { + "epoch": 0.37920052712497254, + "grad_norm": 2.010270357131958, + "learning_rate": 1e-06, + "loss": 1.0922, + "mean_token_accuracy": 0.6759562492370605, + "num_tokens": 86624946.0, + "step": 3453 + }, + { + "epoch": 0.3793103448275862, + "grad_norm": 2.642489194869995, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7045745849609375, + "num_tokens": 86645647.0, + "step": 3454 + }, + { + "epoch": 0.3794201625301999, + "grad_norm": 2.3977952003479004, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7133768796920776, + "num_tokens": 86667512.0, + "step": 3455 + }, + { + "epoch": 0.37952998023281354, + "grad_norm": 2.4688425064086914, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 
0.7279232144355774, + "num_tokens": 86687249.0, + "step": 3456 + }, + { + "epoch": 0.3796397979354272, + "grad_norm": 2.522660255432129, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7130309343338013, + "num_tokens": 86708020.0, + "step": 3457 + }, + { + "epoch": 0.3797496156380408, + "grad_norm": 2.3609108924865723, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7012068033218384, + "num_tokens": 86731786.0, + "step": 3458 + }, + { + "epoch": 0.37985943334065453, + "grad_norm": 2.5963127613067627, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7061784267425537, + "num_tokens": 86753884.0, + "step": 3459 + }, + { + "epoch": 0.3799692510432682, + "grad_norm": 2.4791512489318848, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7125849723815918, + "num_tokens": 86776363.0, + "step": 3460 + }, + { + "epoch": 0.3800790687458818, + "grad_norm": 2.312523365020752, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6892722845077515, + "num_tokens": 86803367.0, + "step": 3461 + }, + { + "epoch": 0.3801888864484955, + "grad_norm": 1.9921023845672607, + "learning_rate": 1e-06, + "loss": 1.0657, + "mean_token_accuracy": 0.6799489855766296, + "num_tokens": 86835540.0, + "step": 3462 + }, + { + "epoch": 0.38029870415110917, + "grad_norm": 2.3596251010894775, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7236212491989136, + "num_tokens": 86859720.0, + "step": 3463 + }, + { + "epoch": 0.3804085218537228, + "grad_norm": 2.7566468715667725, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.712380051612854, + "num_tokens": 86878717.0, + "step": 3464 + }, + { + "epoch": 0.38051833955633646, + "grad_norm": 2.308892011642456, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7246298789978027, + "num_tokens": 86903565.0, + "step": 3465 + }, + { + "epoch": 0.38062815725895016, + "grad_norm": 2.273815870285034, 
+ "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7230604887008667, + "num_tokens": 86926494.0, + "step": 3466 + }, + { + "epoch": 0.3807379749615638, + "grad_norm": 2.053187847137451, + "learning_rate": 1e-06, + "loss": 1.0249, + "mean_token_accuracy": 0.6841086149215698, + "num_tokens": 86957422.0, + "step": 3467 + }, + { + "epoch": 0.38084779266417745, + "grad_norm": 2.4327189922332764, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.7044395208358765, + "num_tokens": 86980827.0, + "step": 3468 + }, + { + "epoch": 0.3809576103667911, + "grad_norm": 2.0894243717193604, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7202616930007935, + "num_tokens": 87007131.0, + "step": 3469 + }, + { + "epoch": 0.3810674280694048, + "grad_norm": 1.9410008192062378, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7389895915985107, + "num_tokens": 87033707.0, + "step": 3470 + }, + { + "epoch": 0.38117724577201845, + "grad_norm": 2.2437119483947754, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7137502431869507, + "num_tokens": 87057304.0, + "step": 3471 + }, + { + "epoch": 0.3812870634746321, + "grad_norm": 2.3318934440612793, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6935802698135376, + "num_tokens": 87081773.0, + "step": 3472 + }, + { + "epoch": 0.3813968811772458, + "grad_norm": 2.213069200515747, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7185115218162537, + "num_tokens": 87104762.0, + "step": 3473 + }, + { + "epoch": 0.38150669887985944, + "grad_norm": 2.4238054752349854, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7004504203796387, + "num_tokens": 87128506.0, + "step": 3474 + }, + { + "epoch": 0.3816165165824731, + "grad_norm": 2.096249580383301, + "learning_rate": 1e-06, + "loss": 1.0667, + "mean_token_accuracy": 0.6859289407730103, + "num_tokens": 87157577.0, + "step": 3475 + }, 
+ { + "epoch": 0.38172633428508673, + "grad_norm": 2.045660972595215, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6898351907730103, + "num_tokens": 87186757.0, + "step": 3476 + }, + { + "epoch": 0.38183615198770043, + "grad_norm": 2.441950798034668, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6992126703262329, + "num_tokens": 87207859.0, + "step": 3477 + }, + { + "epoch": 0.3819459696903141, + "grad_norm": 2.137638807296753, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7207680344581604, + "num_tokens": 87234517.0, + "step": 3478 + }, + { + "epoch": 0.3820557873929277, + "grad_norm": 2.3194808959960938, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7268614172935486, + "num_tokens": 87256364.0, + "step": 3479 + }, + { + "epoch": 0.3821656050955414, + "grad_norm": 2.2499430179595947, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6881327629089355, + "num_tokens": 87281550.0, + "step": 3480 + }, + { + "epoch": 0.3822754227981551, + "grad_norm": 2.1194310188293457, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7038331031799316, + "num_tokens": 87309545.0, + "step": 3481 + }, + { + "epoch": 0.3823852405007687, + "grad_norm": 2.3691701889038086, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7054129838943481, + "num_tokens": 87331396.0, + "step": 3482 + }, + { + "epoch": 0.38249505820338237, + "grad_norm": 2.3897154331207275, + "learning_rate": 1e-06, + "loss": 1.0708, + "mean_token_accuracy": 0.6799205541610718, + "num_tokens": 87356117.0, + "step": 3483 + }, + { + "epoch": 0.38260487590599607, + "grad_norm": 2.15067195892334, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7058878540992737, + "num_tokens": 87381251.0, + "step": 3484 + }, + { + "epoch": 0.3827146936086097, + "grad_norm": 2.2688379287719727, + "learning_rate": 1e-06, + "loss": 0.9049, + 
"mean_token_accuracy": 0.722761869430542, + "num_tokens": 87404130.0, + "step": 3485 + }, + { + "epoch": 0.38282451131122336, + "grad_norm": 2.05606746673584, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6978920102119446, + "num_tokens": 87432211.0, + "step": 3486 + }, + { + "epoch": 0.382934329013837, + "grad_norm": 2.0504095554351807, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7150646448135376, + "num_tokens": 87460351.0, + "step": 3487 + }, + { + "epoch": 0.3830441467164507, + "grad_norm": 2.079123020172119, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6905224323272705, + "num_tokens": 87490286.0, + "step": 3488 + }, + { + "epoch": 0.38315396441906435, + "grad_norm": 2.2337028980255127, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6989296078681946, + "num_tokens": 87516840.0, + "step": 3489 + }, + { + "epoch": 0.383263782121678, + "grad_norm": 2.164252281188965, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.70341956615448, + "num_tokens": 87542168.0, + "step": 3490 + }, + { + "epoch": 0.3833735998242917, + "grad_norm": 2.4496266841888428, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.723281741142273, + "num_tokens": 87561737.0, + "step": 3491 + }, + { + "epoch": 0.38348341752690535, + "grad_norm": 2.116645574569702, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7339833378791809, + "num_tokens": 87588979.0, + "step": 3492 + }, + { + "epoch": 0.383593235229519, + "grad_norm": 2.2517921924591064, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7188844680786133, + "num_tokens": 87613451.0, + "step": 3493 + }, + { + "epoch": 0.38370305293213264, + "grad_norm": 2.0743801593780518, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.7079043388366699, + "num_tokens": 87642544.0, + "step": 3494 + }, + { + "epoch": 0.38381287063474634, + "grad_norm": 
2.145358085632324, + "learning_rate": 1e-06, + "loss": 1.0748, + "mean_token_accuracy": 0.6767023801803589, + "num_tokens": 87671311.0, + "step": 3495 + }, + { + "epoch": 0.38392268833736, + "grad_norm": 2.1575725078582764, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6925605535507202, + "num_tokens": 87696836.0, + "step": 3496 + }, + { + "epoch": 0.38403250603997363, + "grad_norm": 2.3529419898986816, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7023909091949463, + "num_tokens": 87719633.0, + "step": 3497 + }, + { + "epoch": 0.38414232374258733, + "grad_norm": 2.405353546142578, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7293844223022461, + "num_tokens": 87741537.0, + "step": 3498 + }, + { + "epoch": 0.384252141445201, + "grad_norm": 2.6703684329986572, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7105894088745117, + "num_tokens": 87759846.0, + "step": 3499 + }, + { + "epoch": 0.3843619591478146, + "grad_norm": 2.245612859725952, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7015475034713745, + "num_tokens": 87785761.0, + "step": 3500 + }, + { + "epoch": 0.38447177685042827, + "grad_norm": 2.067244529724121, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7121743559837341, + "num_tokens": 87814094.0, + "step": 3501 + }, + { + "epoch": 0.38458159455304197, + "grad_norm": 2.4410879611968994, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6794044971466064, + "num_tokens": 87837218.0, + "step": 3502 + }, + { + "epoch": 0.3846914122556556, + "grad_norm": 2.152623414993286, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7183202505111694, + "num_tokens": 87863104.0, + "step": 3503 + }, + { + "epoch": 0.38480122995826926, + "grad_norm": 2.0401525497436523, + "learning_rate": 1e-06, + "loss": 1.0978, + "mean_token_accuracy": 0.6710153222084045, + "num_tokens": 87895631.0, + 
"step": 3504 + }, + { + "epoch": 0.3849110476608829, + "grad_norm": 2.3284752368927, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7198085188865662, + "num_tokens": 87918813.0, + "step": 3505 + }, + { + "epoch": 0.3850208653634966, + "grad_norm": 2.273158311843872, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7075213193893433, + "num_tokens": 87943980.0, + "step": 3506 + }, + { + "epoch": 0.38513068306611026, + "grad_norm": 2.626418352127075, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7150631546974182, + "num_tokens": 87964210.0, + "step": 3507 + }, + { + "epoch": 0.3852405007687239, + "grad_norm": 2.0517873764038086, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7137324810028076, + "num_tokens": 87992820.0, + "step": 3508 + }, + { + "epoch": 0.3853503184713376, + "grad_norm": 2.599278688430786, + "learning_rate": 1e-06, + "loss": 0.8235, + "mean_token_accuracy": 0.7433582544326782, + "num_tokens": 88013334.0, + "step": 3509 + }, + { + "epoch": 0.38546013617395125, + "grad_norm": 2.500627040863037, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7105328440666199, + "num_tokens": 88032756.0, + "step": 3510 + }, + { + "epoch": 0.3855699538765649, + "grad_norm": 2.535846471786499, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6954458951950073, + "num_tokens": 88052253.0, + "step": 3511 + }, + { + "epoch": 0.38567977157917854, + "grad_norm": 2.4887428283691406, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7013311386108398, + "num_tokens": 88071855.0, + "step": 3512 + }, + { + "epoch": 0.38578958928179224, + "grad_norm": 2.2987008094787598, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7105614542961121, + "num_tokens": 88093954.0, + "step": 3513 + }, + { + "epoch": 0.3858994069844059, + "grad_norm": 2.189840793609619, + "learning_rate": 1e-06, + "loss": 0.9865, + 
"mean_token_accuracy": 0.6951464414596558, + "num_tokens": 88119524.0, + "step": 3514 + }, + { + "epoch": 0.38600922468701954, + "grad_norm": 2.2353732585906982, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.707727313041687, + "num_tokens": 88147635.0, + "step": 3515 + }, + { + "epoch": 0.3861190423896332, + "grad_norm": 2.339076042175293, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7071664333343506, + "num_tokens": 88170148.0, + "step": 3516 + }, + { + "epoch": 0.3862288600922469, + "grad_norm": 2.6938161849975586, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.711033821105957, + "num_tokens": 88188570.0, + "step": 3517 + }, + { + "epoch": 0.38633867779486053, + "grad_norm": 2.077009439468384, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6949231028556824, + "num_tokens": 88218283.0, + "step": 3518 + }, + { + "epoch": 0.3864484954974742, + "grad_norm": 2.5861141681671143, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7100979089736938, + "num_tokens": 88236312.0, + "step": 3519 + }, + { + "epoch": 0.3865583132000879, + "grad_norm": 2.417790174484253, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7331316471099854, + "num_tokens": 88257114.0, + "step": 3520 + }, + { + "epoch": 0.3866681309027015, + "grad_norm": 1.9010357856750488, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7209267616271973, + "num_tokens": 88291207.0, + "step": 3521 + }, + { + "epoch": 0.38677794860531517, + "grad_norm": 2.4285011291503906, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.6969157457351685, + "num_tokens": 88314161.0, + "step": 3522 + }, + { + "epoch": 0.3868877663079288, + "grad_norm": 2.426933765411377, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6860003471374512, + "num_tokens": 88337053.0, + "step": 3523 + }, + { + "epoch": 0.3869975840105425, + 
"grad_norm": 2.0581862926483154, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7080820798873901, + "num_tokens": 88367727.0, + "step": 3524 + }, + { + "epoch": 0.38710740171315616, + "grad_norm": 2.311847448348999, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.6984857320785522, + "num_tokens": 88388999.0, + "step": 3525 + }, + { + "epoch": 0.3872172194157698, + "grad_norm": 1.937193751335144, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.701117753982544, + "num_tokens": 88420249.0, + "step": 3526 + }, + { + "epoch": 0.3873270371183835, + "grad_norm": 2.037956714630127, + "learning_rate": 1e-06, + "loss": 1.1452, + "mean_token_accuracy": 0.6581757664680481, + "num_tokens": 88450552.0, + "step": 3527 + }, + { + "epoch": 0.38743685482099716, + "grad_norm": 2.525538206100464, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6897457838058472, + "num_tokens": 88471683.0, + "step": 3528 + }, + { + "epoch": 0.3875466725236108, + "grad_norm": 2.625826120376587, + "learning_rate": 1e-06, + "loss": 1.0576, + "mean_token_accuracy": 0.6846853494644165, + "num_tokens": 88492935.0, + "step": 3529 + }, + { + "epoch": 0.38765649022622445, + "grad_norm": 2.3769729137420654, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.709478497505188, + "num_tokens": 88517000.0, + "step": 3530 + }, + { + "epoch": 0.38776630792883815, + "grad_norm": 2.2230119705200195, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7225709557533264, + "num_tokens": 88543168.0, + "step": 3531 + }, + { + "epoch": 0.3878761256314518, + "grad_norm": 2.504079580307007, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6886508464813232, + "num_tokens": 88566458.0, + "step": 3532 + }, + { + "epoch": 0.38798594333406544, + "grad_norm": 2.058605670928955, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6855940222740173, + "num_tokens": 
88597524.0, + "step": 3533 + }, + { + "epoch": 0.3880957610366791, + "grad_norm": 2.4621474742889404, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7254371643066406, + "num_tokens": 88619652.0, + "step": 3534 + }, + { + "epoch": 0.3882055787392928, + "grad_norm": 2.4888813495635986, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7243791818618774, + "num_tokens": 88639699.0, + "step": 3535 + }, + { + "epoch": 0.38831539644190644, + "grad_norm": 2.234278440475464, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7017818689346313, + "num_tokens": 88666509.0, + "step": 3536 + }, + { + "epoch": 0.3884252141445201, + "grad_norm": 2.13942551612854, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7010619044303894, + "num_tokens": 88692686.0, + "step": 3537 + }, + { + "epoch": 0.3885350318471338, + "grad_norm": 2.445607900619507, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6990847587585449, + "num_tokens": 88714656.0, + "step": 3538 + }, + { + "epoch": 0.38864484954974743, + "grad_norm": 2.353773832321167, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6914503574371338, + "num_tokens": 88738259.0, + "step": 3539 + }, + { + "epoch": 0.3887546672523611, + "grad_norm": 1.7988874912261963, + "learning_rate": 1e-06, + "loss": 1.1041, + "mean_token_accuracy": 0.6727316975593567, + "num_tokens": 88778002.0, + "step": 3540 + }, + { + "epoch": 0.3888644849549747, + "grad_norm": 2.145902156829834, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7089236974716187, + "num_tokens": 88804857.0, + "step": 3541 + }, + { + "epoch": 0.3889743026575884, + "grad_norm": 1.833745002746582, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6906948685646057, + "num_tokens": 88837694.0, + "step": 3542 + }, + { + "epoch": 0.38908412036020207, + "grad_norm": 2.461686849594116, + "learning_rate": 1e-06, + "loss": 
0.8918, + "mean_token_accuracy": 0.7220526337623596, + "num_tokens": 88857720.0, + "step": 3543 + }, + { + "epoch": 0.3891939380628157, + "grad_norm": 2.2347781658172607, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6879394054412842, + "num_tokens": 88882044.0, + "step": 3544 + }, + { + "epoch": 0.38930375576542936, + "grad_norm": 2.0846869945526123, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6870019435882568, + "num_tokens": 88910007.0, + "step": 3545 + }, + { + "epoch": 0.38941357346804306, + "grad_norm": 2.2769336700439453, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7137711048126221, + "num_tokens": 88934569.0, + "step": 3546 + }, + { + "epoch": 0.3895233911706567, + "grad_norm": 2.1110658645629883, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.678764820098877, + "num_tokens": 88963453.0, + "step": 3547 + }, + { + "epoch": 0.38963320887327035, + "grad_norm": 2.2631924152374268, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.722642183303833, + "num_tokens": 88986805.0, + "step": 3548 + }, + { + "epoch": 0.38974302657588406, + "grad_norm": 2.0817182064056396, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.6876974105834961, + "num_tokens": 89016060.0, + "step": 3549 + }, + { + "epoch": 0.3898528442784977, + "grad_norm": 2.1985323429107666, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7034504413604736, + "num_tokens": 89042691.0, + "step": 3550 + }, + { + "epoch": 0.38996266198111135, + "grad_norm": 1.981746792793274, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6946871280670166, + "num_tokens": 89072334.0, + "step": 3551 + }, + { + "epoch": 0.390072479683725, + "grad_norm": 2.123430013656616, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.699796199798584, + "num_tokens": 89099787.0, + "step": 3552 + }, + { + "epoch": 0.3901822973863387, + 
"grad_norm": 2.5752782821655273, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7227375507354736, + "num_tokens": 89118539.0, + "step": 3553 + }, + { + "epoch": 0.39029211508895234, + "grad_norm": 2.6027753353118896, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7382336854934692, + "num_tokens": 89137173.0, + "step": 3554 + }, + { + "epoch": 0.390401932791566, + "grad_norm": 2.2035951614379883, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7161831259727478, + "num_tokens": 89161932.0, + "step": 3555 + }, + { + "epoch": 0.3905117504941797, + "grad_norm": 2.2808854579925537, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7189050912857056, + "num_tokens": 89184906.0, + "step": 3556 + }, + { + "epoch": 0.39062156819679333, + "grad_norm": 2.302757978439331, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6917107105255127, + "num_tokens": 89208417.0, + "step": 3557 + }, + { + "epoch": 0.390731385899407, + "grad_norm": 2.368809938430786, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6968240737915039, + "num_tokens": 89230907.0, + "step": 3558 + }, + { + "epoch": 0.3908412036020206, + "grad_norm": 2.0361058712005615, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6864476203918457, + "num_tokens": 89261020.0, + "step": 3559 + }, + { + "epoch": 0.3909510213046343, + "grad_norm": 2.109487771987915, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.6775451898574829, + "num_tokens": 89289789.0, + "step": 3560 + }, + { + "epoch": 0.391060839007248, + "grad_norm": 3.059262275695801, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7300631999969482, + "num_tokens": 89304907.0, + "step": 3561 + }, + { + "epoch": 0.3911706567098616, + "grad_norm": 2.2977960109710693, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7053532004356384, + "num_tokens": 
89327773.0, + "step": 3562 + }, + { + "epoch": 0.39128047441247527, + "grad_norm": 2.309319019317627, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7117709517478943, + "num_tokens": 89351720.0, + "step": 3563 + }, + { + "epoch": 0.39139029211508897, + "grad_norm": 2.289518356323242, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.721580982208252, + "num_tokens": 89375904.0, + "step": 3564 + }, + { + "epoch": 0.3915001098177026, + "grad_norm": 2.46170973777771, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7051573395729065, + "num_tokens": 89397456.0, + "step": 3565 + }, + { + "epoch": 0.39160992752031626, + "grad_norm": 2.1158127784729004, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6937119364738464, + "num_tokens": 89425998.0, + "step": 3566 + }, + { + "epoch": 0.39171974522292996, + "grad_norm": 2.121591091156006, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7091171741485596, + "num_tokens": 89452883.0, + "step": 3567 + }, + { + "epoch": 0.3918295629255436, + "grad_norm": 2.0246927738189697, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.686195969581604, + "num_tokens": 89484820.0, + "step": 3568 + }, + { + "epoch": 0.39193938062815725, + "grad_norm": 2.2407193183898926, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7184087634086609, + "num_tokens": 89508581.0, + "step": 3569 + }, + { + "epoch": 0.3920491983307709, + "grad_norm": 2.364638328552246, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6900657415390015, + "num_tokens": 89530824.0, + "step": 3570 + }, + { + "epoch": 0.3921590160333846, + "grad_norm": 2.6533265113830566, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7109609842300415, + "num_tokens": 89549731.0, + "step": 3571 + }, + { + "epoch": 0.39226883373599825, + "grad_norm": 2.285207509994507, + "learning_rate": 1e-06, + "loss": 
0.986, + "mean_token_accuracy": 0.7046262621879578, + "num_tokens": 89573603.0, + "step": 3572 + }, + { + "epoch": 0.3923786514386119, + "grad_norm": 2.6005799770355225, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.70531165599823, + "num_tokens": 89595757.0, + "step": 3573 + }, + { + "epoch": 0.3924884691412256, + "grad_norm": 2.5718979835510254, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7084868550300598, + "num_tokens": 89614897.0, + "step": 3574 + }, + { + "epoch": 0.39259828684383924, + "grad_norm": 2.371588706970215, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7019771933555603, + "num_tokens": 89638312.0, + "step": 3575 + }, + { + "epoch": 0.3927081045464529, + "grad_norm": 2.6116957664489746, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7273423671722412, + "num_tokens": 89657230.0, + "step": 3576 + }, + { + "epoch": 0.39281792224906653, + "grad_norm": 2.486624240875244, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6985371112823486, + "num_tokens": 89679380.0, + "step": 3577 + }, + { + "epoch": 0.39292773995168023, + "grad_norm": 2.3806185722351074, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6945697069168091, + "num_tokens": 89705833.0, + "step": 3578 + }, + { + "epoch": 0.3930375576542939, + "grad_norm": 1.9802470207214355, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6956166625022888, + "num_tokens": 89736243.0, + "step": 3579 + }, + { + "epoch": 0.3931473753569075, + "grad_norm": 2.0940656661987305, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6981183290481567, + "num_tokens": 89765332.0, + "step": 3580 + }, + { + "epoch": 0.39325719305952117, + "grad_norm": 2.234567880630493, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7083344459533691, + "num_tokens": 89791088.0, + "step": 3581 + }, + { + "epoch": 0.39336701076213487, + 
"grad_norm": 2.3075625896453857, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6982417106628418, + "num_tokens": 89814897.0, + "step": 3582 + }, + { + "epoch": 0.3934768284647485, + "grad_norm": 2.1617014408111572, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.7029025554656982, + "num_tokens": 89839148.0, + "step": 3583 + }, + { + "epoch": 0.39358664616736216, + "grad_norm": 2.1201045513153076, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6940872073173523, + "num_tokens": 89865148.0, + "step": 3584 + }, + { + "epoch": 0.39369646386997587, + "grad_norm": 2.1819705963134766, + "learning_rate": 1e-06, + "loss": 1.0547, + "mean_token_accuracy": 0.6818090677261353, + "num_tokens": 89892224.0, + "step": 3585 + }, + { + "epoch": 0.3938062815725895, + "grad_norm": 2.3286185264587402, + "learning_rate": 1e-06, + "loss": 1.102, + "mean_token_accuracy": 0.6702548265457153, + "num_tokens": 89916799.0, + "step": 3586 + }, + { + "epoch": 0.39391609927520316, + "grad_norm": 2.050642967224121, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7032256722450256, + "num_tokens": 89945166.0, + "step": 3587 + }, + { + "epoch": 0.3940259169778168, + "grad_norm": 2.0209367275238037, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6926882266998291, + "num_tokens": 89976815.0, + "step": 3588 + }, + { + "epoch": 0.3941357346804305, + "grad_norm": 2.070382595062256, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7083423137664795, + "num_tokens": 90004638.0, + "step": 3589 + }, + { + "epoch": 0.39424555238304415, + "grad_norm": 2.0658044815063477, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6919925808906555, + "num_tokens": 90031476.0, + "step": 3590 + }, + { + "epoch": 0.3943553700856578, + "grad_norm": 2.2499141693115234, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7042973041534424, + 
"num_tokens": 90056858.0, + "step": 3591 + }, + { + "epoch": 0.39446518778827144, + "grad_norm": 2.030691385269165, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7013599276542664, + "num_tokens": 90086792.0, + "step": 3592 + }, + { + "epoch": 0.39457500549088514, + "grad_norm": 2.2657182216644287, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6903380155563354, + "num_tokens": 90109241.0, + "step": 3593 + }, + { + "epoch": 0.3946848231934988, + "grad_norm": 2.2664246559143066, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6979215145111084, + "num_tokens": 90136036.0, + "step": 3594 + }, + { + "epoch": 0.39479464089611244, + "grad_norm": 2.3268392086029053, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6900081634521484, + "num_tokens": 90160219.0, + "step": 3595 + }, + { + "epoch": 0.39490445859872614, + "grad_norm": 2.0569145679473877, + "learning_rate": 1e-06, + "loss": 1.1092, + "mean_token_accuracy": 0.6660207509994507, + "num_tokens": 90193169.0, + "step": 3596 + }, + { + "epoch": 0.3950142763013398, + "grad_norm": 2.0029428005218506, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6958195567131042, + "num_tokens": 90223555.0, + "step": 3597 + }, + { + "epoch": 0.39512409400395343, + "grad_norm": 1.9327268600463867, + "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6873181462287903, + "num_tokens": 90254061.0, + "step": 3598 + }, + { + "epoch": 0.3952339117065671, + "grad_norm": 2.233687400817871, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6989545822143555, + "num_tokens": 90278420.0, + "step": 3599 + }, + { + "epoch": 0.3953437294091808, + "grad_norm": 2.1831576824188232, + "learning_rate": 1e-06, + "loss": 1.0696, + "mean_token_accuracy": 0.6813221573829651, + "num_tokens": 90306509.0, + "step": 3600 + }, + { + "epoch": 0.3954535471117944, + "grad_norm": 2.2993292808532715, + "learning_rate": 
1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7107110023498535, + "num_tokens": 90332438.0, + "step": 3601 + }, + { + "epoch": 0.39556336481440807, + "grad_norm": 2.258728265762329, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6884599328041077, + "num_tokens": 90359920.0, + "step": 3602 + }, + { + "epoch": 0.39567318251702177, + "grad_norm": 2.1787266731262207, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7022829651832581, + "num_tokens": 90386013.0, + "step": 3603 + }, + { + "epoch": 0.3957830002196354, + "grad_norm": 2.4078099727630615, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6887962818145752, + "num_tokens": 90409115.0, + "step": 3604 + }, + { + "epoch": 0.39589281792224906, + "grad_norm": 2.1840109825134277, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6885082125663757, + "num_tokens": 90437409.0, + "step": 3605 + }, + { + "epoch": 0.3960026356248627, + "grad_norm": 2.285922050476074, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6938639283180237, + "num_tokens": 90461602.0, + "step": 3606 + }, + { + "epoch": 0.3961124533274764, + "grad_norm": 2.5178287029266357, + "learning_rate": 1e-06, + "loss": 1.0761, + "mean_token_accuracy": 0.6770743727684021, + "num_tokens": 90481775.0, + "step": 3607 + }, + { + "epoch": 0.39622227103009006, + "grad_norm": 2.455204486846924, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6905123591423035, + "num_tokens": 90502336.0, + "step": 3608 + }, + { + "epoch": 0.3963320887327037, + "grad_norm": 2.3230371475219727, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.713819146156311, + "num_tokens": 90526128.0, + "step": 3609 + }, + { + "epoch": 0.39644190643531735, + "grad_norm": 2.187453031539917, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6839483976364136, + "num_tokens": 90551691.0, + "step": 3610 + }, + { + "epoch": 
0.39655172413793105, + "grad_norm": 2.5420656204223633, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7033672332763672, + "num_tokens": 90571176.0, + "step": 3611 + }, + { + "epoch": 0.3966615418405447, + "grad_norm": 2.093378782272339, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7082824110984802, + "num_tokens": 90596926.0, + "step": 3612 + }, + { + "epoch": 0.39677135954315834, + "grad_norm": 2.0869357585906982, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6934663653373718, + "num_tokens": 90625209.0, + "step": 3613 + }, + { + "epoch": 0.39688117724577204, + "grad_norm": 2.2336678504943848, + "learning_rate": 1e-06, + "loss": 1.0683, + "mean_token_accuracy": 0.6849452257156372, + "num_tokens": 90653920.0, + "step": 3614 + }, + { + "epoch": 0.3969909949483857, + "grad_norm": 1.9670792818069458, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6819231510162354, + "num_tokens": 90686729.0, + "step": 3615 + }, + { + "epoch": 0.39710081265099934, + "grad_norm": 2.253117322921753, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6850794553756714, + "num_tokens": 90712729.0, + "step": 3616 + }, + { + "epoch": 0.397210630353613, + "grad_norm": 2.030365228652954, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7041107416152954, + "num_tokens": 90743743.0, + "step": 3617 + }, + { + "epoch": 0.3973204480562267, + "grad_norm": 2.230808734893799, + "learning_rate": 1e-06, + "loss": 1.081, + "mean_token_accuracy": 0.6828252077102661, + "num_tokens": 90770055.0, + "step": 3618 + }, + { + "epoch": 0.39743026575884033, + "grad_norm": 2.152714729309082, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7177085876464844, + "num_tokens": 90794800.0, + "step": 3619 + }, + { + "epoch": 0.397540083461454, + "grad_norm": 2.0672426223754883, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 
0.678206205368042, + "num_tokens": 90825157.0, + "step": 3620 + }, + { + "epoch": 0.3976499011640676, + "grad_norm": 2.304694414138794, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7211346626281738, + "num_tokens": 90845121.0, + "step": 3621 + }, + { + "epoch": 0.3977597188666813, + "grad_norm": 2.3777902126312256, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7099587917327881, + "num_tokens": 90866845.0, + "step": 3622 + }, + { + "epoch": 0.39786953656929497, + "grad_norm": 2.307384729385376, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7084047794342041, + "num_tokens": 90890274.0, + "step": 3623 + }, + { + "epoch": 0.3979793542719086, + "grad_norm": 2.4032175540924072, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7020015120506287, + "num_tokens": 90912666.0, + "step": 3624 + }, + { + "epoch": 0.3980891719745223, + "grad_norm": 2.007044553756714, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6989437341690063, + "num_tokens": 90944310.0, + "step": 3625 + }, + { + "epoch": 0.39819898967713596, + "grad_norm": 1.9753036499023438, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.6898137331008911, + "num_tokens": 90976451.0, + "step": 3626 + }, + { + "epoch": 0.3983088073797496, + "grad_norm": 2.0907626152038574, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7163252830505371, + "num_tokens": 91003364.0, + "step": 3627 + }, + { + "epoch": 0.39841862508236325, + "grad_norm": 2.2119619846343994, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7379123568534851, + "num_tokens": 91027236.0, + "step": 3628 + }, + { + "epoch": 0.39852844278497696, + "grad_norm": 2.117945909500122, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6993600130081177, + "num_tokens": 91053422.0, + "step": 3629 + }, + { + "epoch": 0.3986382604875906, + "grad_norm": 2.123836040496826, + 
"learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7114428281784058, + "num_tokens": 91080766.0, + "step": 3630 + }, + { + "epoch": 0.39874807819020425, + "grad_norm": 2.205031156539917, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6998149156570435, + "num_tokens": 91107263.0, + "step": 3631 + }, + { + "epoch": 0.39885789589281795, + "grad_norm": 2.14983868598938, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6973506212234497, + "num_tokens": 91133821.0, + "step": 3632 + }, + { + "epoch": 0.3989677135954316, + "grad_norm": 2.6178464889526367, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6961655616760254, + "num_tokens": 91153078.0, + "step": 3633 + }, + { + "epoch": 0.39907753129804524, + "grad_norm": 2.1303908824920654, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7082235217094421, + "num_tokens": 91177427.0, + "step": 3634 + }, + { + "epoch": 0.3991873490006589, + "grad_norm": 1.967778205871582, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6999919414520264, + "num_tokens": 91207026.0, + "step": 3635 + }, + { + "epoch": 0.3992971667032726, + "grad_norm": 2.5644428730010986, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7217310667037964, + "num_tokens": 91226131.0, + "step": 3636 + }, + { + "epoch": 0.39940698440588623, + "grad_norm": 2.0202579498291016, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6878098249435425, + "num_tokens": 91259100.0, + "step": 3637 + }, + { + "epoch": 0.3995168021084999, + "grad_norm": 2.126006841659546, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6876477599143982, + "num_tokens": 91288511.0, + "step": 3638 + }, + { + "epoch": 0.3996266198111135, + "grad_norm": 2.3142826557159424, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7090708017349243, + "num_tokens": 91310958.0, + "step": 3639 + }, + { + 
"epoch": 0.3997364375137272, + "grad_norm": 2.0776174068450928, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.7007758617401123, + "num_tokens": 91339953.0, + "step": 3640 + }, + { + "epoch": 0.3998462552163409, + "grad_norm": 2.0660548210144043, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7118856310844421, + "num_tokens": 91367820.0, + "step": 3641 + }, + { + "epoch": 0.3999560729189545, + "grad_norm": 2.245157480239868, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7166000604629517, + "num_tokens": 91391685.0, + "step": 3642 + }, + { + "epoch": 0.4000658906215682, + "grad_norm": 2.279628038406372, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7183128595352173, + "num_tokens": 91414647.0, + "step": 3643 + }, + { + "epoch": 0.40017570832418187, + "grad_norm": 2.08530330657959, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7000229358673096, + "num_tokens": 91443268.0, + "step": 3644 + }, + { + "epoch": 0.4002855260267955, + "grad_norm": 1.891681432723999, + "learning_rate": 1e-06, + "loss": 1.1284, + "mean_token_accuracy": 0.6712546348571777, + "num_tokens": 91478276.0, + "step": 3645 + }, + { + "epoch": 0.40039534372940916, + "grad_norm": 2.3738608360290527, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7098598480224609, + "num_tokens": 91500253.0, + "step": 3646 + }, + { + "epoch": 0.40050516143202286, + "grad_norm": 2.247875690460205, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7108496427536011, + "num_tokens": 91525364.0, + "step": 3647 + }, + { + "epoch": 0.4006149791346365, + "grad_norm": 2.107452392578125, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6860304474830627, + "num_tokens": 91553319.0, + "step": 3648 + }, + { + "epoch": 0.40072479683725015, + "grad_norm": 2.2238271236419678, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 
0.6834433078765869, + "num_tokens": 91579027.0, + "step": 3649 + }, + { + "epoch": 0.4008346145398638, + "grad_norm": 2.523973226547241, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6893926858901978, + "num_tokens": 91598689.0, + "step": 3650 + }, + { + "epoch": 0.4009444322424775, + "grad_norm": 2.2917964458465576, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.681932270526886, + "num_tokens": 91621858.0, + "step": 3651 + }, + { + "epoch": 0.40105424994509115, + "grad_norm": 2.077873468399048, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.702156662940979, + "num_tokens": 91649223.0, + "step": 3652 + }, + { + "epoch": 0.4011640676477048, + "grad_norm": 2.2188007831573486, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7156723141670227, + "num_tokens": 91674171.0, + "step": 3653 + }, + { + "epoch": 0.4012738853503185, + "grad_norm": 2.210024356842041, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6987835764884949, + "num_tokens": 91702742.0, + "step": 3654 + }, + { + "epoch": 0.40138370305293214, + "grad_norm": 2.26997971534729, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6948887705802917, + "num_tokens": 91726852.0, + "step": 3655 + }, + { + "epoch": 0.4014935207555458, + "grad_norm": 2.2338504791259766, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7081294059753418, + "num_tokens": 91750721.0, + "step": 3656 + }, + { + "epoch": 0.40160333845815943, + "grad_norm": 2.2310597896575928, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7107471823692322, + "num_tokens": 91775183.0, + "step": 3657 + }, + { + "epoch": 0.40171315616077313, + "grad_norm": 2.106248140335083, + "learning_rate": 1e-06, + "loss": 1.058, + "mean_token_accuracy": 0.6837012767791748, + "num_tokens": 91803547.0, + "step": 3658 + }, + { + "epoch": 0.4018229738633868, + "grad_norm": 2.3168351650238037, + 
"learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7155599594116211, + "num_tokens": 91828140.0, + "step": 3659 + }, + { + "epoch": 0.4019327915660004, + "grad_norm": 2.371892213821411, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7079416513442993, + "num_tokens": 91853693.0, + "step": 3660 + }, + { + "epoch": 0.4020426092686141, + "grad_norm": 1.9435656070709229, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.698712944984436, + "num_tokens": 91885003.0, + "step": 3661 + }, + { + "epoch": 0.40215242697122777, + "grad_norm": 2.8359599113464355, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7197969555854797, + "num_tokens": 91901053.0, + "step": 3662 + }, + { + "epoch": 0.4022622446738414, + "grad_norm": 2.5682685375213623, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7242698669433594, + "num_tokens": 91920109.0, + "step": 3663 + }, + { + "epoch": 0.40237206237645506, + "grad_norm": 2.535572052001953, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7012943625450134, + "num_tokens": 91939627.0, + "step": 3664 + }, + { + "epoch": 0.40248188007906877, + "grad_norm": 2.556117296218872, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7125565409660339, + "num_tokens": 91958702.0, + "step": 3665 + }, + { + "epoch": 0.4025916977816824, + "grad_norm": 1.9979031085968018, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6926539540290833, + "num_tokens": 91994323.0, + "step": 3666 + }, + { + "epoch": 0.40270151548429606, + "grad_norm": 2.7470388412475586, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6930412650108337, + "num_tokens": 92013136.0, + "step": 3667 + }, + { + "epoch": 0.4028113331869097, + "grad_norm": 2.0728304386138916, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.6974794268608093, + "num_tokens": 92042818.0, + "step": 3668 + }, + 
{ + "epoch": 0.4029211508895234, + "grad_norm": 2.1287097930908203, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6885241866111755, + "num_tokens": 92068817.0, + "step": 3669 + }, + { + "epoch": 0.40303096859213705, + "grad_norm": 1.8888362646102905, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6726992130279541, + "num_tokens": 92101841.0, + "step": 3670 + }, + { + "epoch": 0.4031407862947507, + "grad_norm": 2.2440619468688965, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7219710350036621, + "num_tokens": 92125234.0, + "step": 3671 + }, + { + "epoch": 0.4032506039973644, + "grad_norm": 2.1884148120880127, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6971724033355713, + "num_tokens": 92152318.0, + "step": 3672 + }, + { + "epoch": 0.40336042169997804, + "grad_norm": 2.2118914127349854, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6860247254371643, + "num_tokens": 92180853.0, + "step": 3673 + }, + { + "epoch": 0.4034702394025917, + "grad_norm": 2.08804988861084, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7093244194984436, + "num_tokens": 92207160.0, + "step": 3674 + }, + { + "epoch": 0.40358005710520534, + "grad_norm": 2.5269532203674316, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.6959441900253296, + "num_tokens": 92228547.0, + "step": 3675 + }, + { + "epoch": 0.40368987480781904, + "grad_norm": 2.4677300453186035, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7001912593841553, + "num_tokens": 92252120.0, + "step": 3676 + }, + { + "epoch": 0.4037996925104327, + "grad_norm": 2.3052456378936768, + "learning_rate": 1e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.6717901229858398, + "num_tokens": 92275565.0, + "step": 3677 + }, + { + "epoch": 0.40390951021304633, + "grad_norm": 2.4917116165161133, + "learning_rate": 1e-06, + "loss": 0.9406, + 
"mean_token_accuracy": 0.7194572687149048, + "num_tokens": 92296738.0, + "step": 3678 + }, + { + "epoch": 0.40401932791566003, + "grad_norm": 1.9484350681304932, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6855924129486084, + "num_tokens": 92328957.0, + "step": 3679 + }, + { + "epoch": 0.4041291456182737, + "grad_norm": 2.1254687309265137, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6843284368515015, + "num_tokens": 92358006.0, + "step": 3680 + }, + { + "epoch": 0.4042389633208873, + "grad_norm": 2.260648012161255, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7030286192893982, + "num_tokens": 92380984.0, + "step": 3681 + }, + { + "epoch": 0.40434878102350097, + "grad_norm": 2.271193742752075, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7131418585777283, + "num_tokens": 92403625.0, + "step": 3682 + }, + { + "epoch": 0.40445859872611467, + "grad_norm": 2.1464452743530273, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6947767734527588, + "num_tokens": 92429407.0, + "step": 3683 + }, + { + "epoch": 0.4045684164287283, + "grad_norm": 2.3329379558563232, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7067699432373047, + "num_tokens": 92451871.0, + "step": 3684 + }, + { + "epoch": 0.40467823413134196, + "grad_norm": 1.9620249271392822, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6956729888916016, + "num_tokens": 92484479.0, + "step": 3685 + }, + { + "epoch": 0.4047880518339556, + "grad_norm": 2.4923458099365234, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6913115978240967, + "num_tokens": 92507023.0, + "step": 3686 + }, + { + "epoch": 0.4048978695365693, + "grad_norm": 2.41965913772583, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6993430852890015, + "num_tokens": 92528589.0, + "step": 3687 + }, + { + "epoch": 0.40500768723918296, + 
"grad_norm": 2.1452088356018066, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6937077641487122, + "num_tokens": 92554947.0, + "step": 3688 + }, + { + "epoch": 0.4051175049417966, + "grad_norm": 2.1278622150421143, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7350435256958008, + "num_tokens": 92581071.0, + "step": 3689 + }, + { + "epoch": 0.4052273226444103, + "grad_norm": 2.3324854373931885, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7024107575416565, + "num_tokens": 92604052.0, + "step": 3690 + }, + { + "epoch": 0.40533714034702395, + "grad_norm": 2.2854690551757812, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7213307619094849, + "num_tokens": 92628210.0, + "step": 3691 + }, + { + "epoch": 0.4054469580496376, + "grad_norm": 2.901573896408081, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7189245223999023, + "num_tokens": 92643452.0, + "step": 3692 + }, + { + "epoch": 0.40555677575225124, + "grad_norm": 2.0245165824890137, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6897505521774292, + "num_tokens": 92673099.0, + "step": 3693 + }, + { + "epoch": 0.40566659345486494, + "grad_norm": 2.4525136947631836, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7183418273925781, + "num_tokens": 92694016.0, + "step": 3694 + }, + { + "epoch": 0.4057764111574786, + "grad_norm": 2.178076982498169, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6906781196594238, + "num_tokens": 92719659.0, + "step": 3695 + }, + { + "epoch": 0.40588622886009224, + "grad_norm": 2.2080254554748535, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7064899802207947, + "num_tokens": 92743385.0, + "step": 3696 + }, + { + "epoch": 0.4059960465627059, + "grad_norm": 2.2583179473876953, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.69798743724823, + 
"num_tokens": 92767468.0, + "step": 3697 + }, + { + "epoch": 0.4061058642653196, + "grad_norm": 2.2923967838287354, + "learning_rate": 1e-06, + "loss": 1.0242, + "mean_token_accuracy": 0.6956874132156372, + "num_tokens": 92792300.0, + "step": 3698 + }, + { + "epoch": 0.40621568196793323, + "grad_norm": 2.407503128051758, + "learning_rate": 1e-06, + "loss": 1.1056, + "mean_token_accuracy": 0.6779881119728088, + "num_tokens": 92817184.0, + "step": 3699 + }, + { + "epoch": 0.4063254996705469, + "grad_norm": 2.247072219848633, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7137086987495422, + "num_tokens": 92841730.0, + "step": 3700 + }, + { + "epoch": 0.4064353173731606, + "grad_norm": 2.278334617614746, + "learning_rate": 1e-06, + "loss": 1.0583, + "mean_token_accuracy": 0.6786842942237854, + "num_tokens": 92867104.0, + "step": 3701 + }, + { + "epoch": 0.4065451350757742, + "grad_norm": 2.3245623111724854, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7264049053192139, + "num_tokens": 92889738.0, + "step": 3702 + }, + { + "epoch": 0.40665495277838787, + "grad_norm": 2.398235559463501, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6926847100257874, + "num_tokens": 92912706.0, + "step": 3703 + }, + { + "epoch": 0.4067647704810015, + "grad_norm": 2.0490877628326416, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7087938189506531, + "num_tokens": 92941490.0, + "step": 3704 + }, + { + "epoch": 0.4068745881836152, + "grad_norm": 2.540130853652954, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7005094289779663, + "num_tokens": 92962806.0, + "step": 3705 + }, + { + "epoch": 0.40698440588622886, + "grad_norm": 2.5837903022766113, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7252258062362671, + "num_tokens": 92982210.0, + "step": 3706 + }, + { + "epoch": 0.4070942235888425, + "grad_norm": 2.267970085144043, + "learning_rate": 
1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.711577832698822, + "num_tokens": 93005025.0, + "step": 3707 + }, + { + "epoch": 0.4072040412914562, + "grad_norm": 1.9509382247924805, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.6893274188041687, + "num_tokens": 93036305.0, + "step": 3708 + }, + { + "epoch": 0.40731385899406986, + "grad_norm": 2.3175601959228516, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6980245113372803, + "num_tokens": 93060715.0, + "step": 3709 + }, + { + "epoch": 0.4074236766966835, + "grad_norm": 2.3661160469055176, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7055644989013672, + "num_tokens": 93083220.0, + "step": 3710 + }, + { + "epoch": 0.40753349439929715, + "grad_norm": 2.3350369930267334, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7037243843078613, + "num_tokens": 93107662.0, + "step": 3711 + }, + { + "epoch": 0.40764331210191085, + "grad_norm": 2.381451368331909, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7138508558273315, + "num_tokens": 93129691.0, + "step": 3712 + }, + { + "epoch": 0.4077531298045245, + "grad_norm": 2.4795050621032715, + "learning_rate": 1e-06, + "loss": 1.0704, + "mean_token_accuracy": 0.6805917024612427, + "num_tokens": 93151735.0, + "step": 3713 + }, + { + "epoch": 0.40786294750713814, + "grad_norm": 2.5454094409942627, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7087256908416748, + "num_tokens": 93171339.0, + "step": 3714 + }, + { + "epoch": 0.4079727652097518, + "grad_norm": 2.2013823986053467, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7111307382583618, + "num_tokens": 93197424.0, + "step": 3715 + }, + { + "epoch": 0.4080825829123655, + "grad_norm": 2.3187761306762695, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.7008683681488037, + "num_tokens": 93219277.0, + "step": 3716 + }, + { + "epoch": 
0.40819240061497913, + "grad_norm": 2.5247766971588135, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7239210605621338, + "num_tokens": 93239159.0, + "step": 3717 + }, + { + "epoch": 0.4083022183175928, + "grad_norm": 2.0943667888641357, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.693482518196106, + "num_tokens": 93266253.0, + "step": 3718 + }, + { + "epoch": 0.4084120360202065, + "grad_norm": 2.232264995574951, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6919128894805908, + "num_tokens": 93291422.0, + "step": 3719 + }, + { + "epoch": 0.4085218537228201, + "grad_norm": 2.1883084774017334, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6787593364715576, + "num_tokens": 93316444.0, + "step": 3720 + }, + { + "epoch": 0.4086316714254338, + "grad_norm": 2.2885212898254395, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7078866958618164, + "num_tokens": 93339283.0, + "step": 3721 + }, + { + "epoch": 0.4087414891280474, + "grad_norm": 2.059722661972046, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.7001577019691467, + "num_tokens": 93369706.0, + "step": 3722 + }, + { + "epoch": 0.4088513068306611, + "grad_norm": 2.2809717655181885, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6986058950424194, + "num_tokens": 93393087.0, + "step": 3723 + }, + { + "epoch": 0.40896112453327477, + "grad_norm": 2.0932345390319824, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6855509281158447, + "num_tokens": 93422072.0, + "step": 3724 + }, + { + "epoch": 0.4090709422358884, + "grad_norm": 2.0889923572540283, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7104872465133667, + "num_tokens": 93449644.0, + "step": 3725 + }, + { + "epoch": 0.40918075993850206, + "grad_norm": 2.241492509841919, + "learning_rate": 1e-06, + "loss": 1.0361, + "mean_token_accuracy": 
0.6853530406951904, + "num_tokens": 93476007.0, + "step": 3726 + }, + { + "epoch": 0.40929057764111576, + "grad_norm": 2.469586133956909, + "learning_rate": 1e-06, + "loss": 1.0762, + "mean_token_accuracy": 0.6778615117073059, + "num_tokens": 93499467.0, + "step": 3727 + }, + { + "epoch": 0.4094003953437294, + "grad_norm": 2.3354663848876953, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7019597887992859, + "num_tokens": 93522440.0, + "step": 3728 + }, + { + "epoch": 0.40951021304634305, + "grad_norm": 2.1439406871795654, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.7002209424972534, + "num_tokens": 93547468.0, + "step": 3729 + }, + { + "epoch": 0.40962003074895675, + "grad_norm": 2.2011027336120605, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.69437575340271, + "num_tokens": 93573348.0, + "step": 3730 + }, + { + "epoch": 0.4097298484515704, + "grad_norm": 2.287060499191284, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.6946153044700623, + "num_tokens": 93597733.0, + "step": 3731 + }, + { + "epoch": 0.40983966615418405, + "grad_norm": 2.3612780570983887, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7046531438827515, + "num_tokens": 93621599.0, + "step": 3732 + }, + { + "epoch": 0.4099494838567977, + "grad_norm": 2.369158983230591, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.730124294757843, + "num_tokens": 93642029.0, + "step": 3733 + }, + { + "epoch": 0.4100593015594114, + "grad_norm": 2.5022799968719482, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.732062578201294, + "num_tokens": 93661890.0, + "step": 3734 + }, + { + "epoch": 0.41016911926202504, + "grad_norm": 2.588489294052124, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7199854254722595, + "num_tokens": 93683171.0, + "step": 3735 + }, + { + "epoch": 0.4102789369646387, + "grad_norm": 2.391982078552246, + 
"learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6951223015785217, + "num_tokens": 93705263.0, + "step": 3736 + }, + { + "epoch": 0.4103887546672524, + "grad_norm": 2.4075798988342285, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7139166593551636, + "num_tokens": 93725709.0, + "step": 3737 + }, + { + "epoch": 0.41049857236986603, + "grad_norm": 2.146353244781494, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7306963205337524, + "num_tokens": 93751188.0, + "step": 3738 + }, + { + "epoch": 0.4106083900724797, + "grad_norm": 2.1280221939086914, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6836616396903992, + "num_tokens": 93779635.0, + "step": 3739 + }, + { + "epoch": 0.4107182077750933, + "grad_norm": 2.281449556350708, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6993778347969055, + "num_tokens": 93803308.0, + "step": 3740 + }, + { + "epoch": 0.410828025477707, + "grad_norm": 2.4765872955322266, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6902007460594177, + "num_tokens": 93823822.0, + "step": 3741 + }, + { + "epoch": 0.41093784318032067, + "grad_norm": 2.187723398208618, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.704853892326355, + "num_tokens": 93849545.0, + "step": 3742 + }, + { + "epoch": 0.4110476608829343, + "grad_norm": 2.1152584552764893, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6878432631492615, + "num_tokens": 93880325.0, + "step": 3743 + }, + { + "epoch": 0.41115747858554796, + "grad_norm": 2.1859240531921387, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6869997978210449, + "num_tokens": 93908520.0, + "step": 3744 + }, + { + "epoch": 0.41126729628816167, + "grad_norm": 2.352262258529663, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7009648084640503, + "num_tokens": 93930545.0, + "step": 3745 + }, + { + 
"epoch": 0.4113771139907753, + "grad_norm": 2.1449878215789795, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7050490379333496, + "num_tokens": 93955254.0, + "step": 3746 + }, + { + "epoch": 0.41148693169338896, + "grad_norm": 2.0707437992095947, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6979079842567444, + "num_tokens": 93983909.0, + "step": 3747 + }, + { + "epoch": 0.41159674939600266, + "grad_norm": 2.0597057342529297, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6922245025634766, + "num_tokens": 94011255.0, + "step": 3748 + }, + { + "epoch": 0.4117065670986163, + "grad_norm": 1.995642066001892, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6730180978775024, + "num_tokens": 94042464.0, + "step": 3749 + }, + { + "epoch": 0.41181638480122995, + "grad_norm": 2.3295252323150635, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6999480724334717, + "num_tokens": 94066155.0, + "step": 3750 + }, + { + "epoch": 0.4119262025038436, + "grad_norm": 2.0870699882507324, + "learning_rate": 1e-06, + "loss": 1.0429, + "mean_token_accuracy": 0.6835000514984131, + "num_tokens": 94095273.0, + "step": 3751 + }, + { + "epoch": 0.4120360202064573, + "grad_norm": 1.8513178825378418, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7104943990707397, + "num_tokens": 94129082.0, + "step": 3752 + }, + { + "epoch": 0.41214583790907094, + "grad_norm": 2.2799570560455322, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7126466035842896, + "num_tokens": 94152688.0, + "step": 3753 + }, + { + "epoch": 0.4122556556116846, + "grad_norm": 2.2139575481414795, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7030474543571472, + "num_tokens": 94177972.0, + "step": 3754 + }, + { + "epoch": 0.4123654733142983, + "grad_norm": 2.3178632259368896, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 
0.7039549946784973, + "num_tokens": 94201958.0, + "step": 3755 + }, + { + "epoch": 0.41247529101691194, + "grad_norm": 2.077127695083618, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7269747257232666, + "num_tokens": 94228029.0, + "step": 3756 + }, + { + "epoch": 0.4125851087195256, + "grad_norm": 2.448697566986084, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7099124193191528, + "num_tokens": 94250587.0, + "step": 3757 + }, + { + "epoch": 0.41269492642213923, + "grad_norm": 2.258368968963623, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6861152648925781, + "num_tokens": 94279221.0, + "step": 3758 + }, + { + "epoch": 0.41280474412475293, + "grad_norm": 1.9990562200546265, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6857038736343384, + "num_tokens": 94310778.0, + "step": 3759 + }, + { + "epoch": 0.4129145618273666, + "grad_norm": 2.242680311203003, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7045727968215942, + "num_tokens": 94336284.0, + "step": 3760 + }, + { + "epoch": 0.4130243795299802, + "grad_norm": 2.3576574325561523, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7118588089942932, + "num_tokens": 94360203.0, + "step": 3761 + }, + { + "epoch": 0.41313419723259387, + "grad_norm": 2.230654239654541, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6946594715118408, + "num_tokens": 94387292.0, + "step": 3762 + }, + { + "epoch": 0.41324401493520757, + "grad_norm": 2.160001277923584, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6757994890213013, + "num_tokens": 94416130.0, + "step": 3763 + }, + { + "epoch": 0.4133538326378212, + "grad_norm": 2.216651201248169, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.6982176303863525, + "num_tokens": 94443275.0, + "step": 3764 + }, + { + "epoch": 0.41346365034043486, + "grad_norm": 2.2439990043640137, 
+ "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7102389931678772, + "num_tokens": 94467551.0, + "step": 3765 + }, + { + "epoch": 0.41357346804304856, + "grad_norm": 2.367307662963867, + "learning_rate": 1e-06, + "loss": 1.0426, + "mean_token_accuracy": 0.6813971996307373, + "num_tokens": 94490299.0, + "step": 3766 + }, + { + "epoch": 0.4136832857456622, + "grad_norm": 2.1952502727508545, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.688839316368103, + "num_tokens": 94516464.0, + "step": 3767 + }, + { + "epoch": 0.41379310344827586, + "grad_norm": 2.3748912811279297, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7076306939125061, + "num_tokens": 94542286.0, + "step": 3768 + }, + { + "epoch": 0.4139029211508895, + "grad_norm": 2.4446706771850586, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.703748345375061, + "num_tokens": 94563301.0, + "step": 3769 + }, + { + "epoch": 0.4140127388535032, + "grad_norm": 2.3291590213775635, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6898478269577026, + "num_tokens": 94588729.0, + "step": 3770 + }, + { + "epoch": 0.41412255655611685, + "grad_norm": 2.095181941986084, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6961497068405151, + "num_tokens": 94615875.0, + "step": 3771 + }, + { + "epoch": 0.4142323742587305, + "grad_norm": 2.1060945987701416, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7200225591659546, + "num_tokens": 94643028.0, + "step": 3772 + }, + { + "epoch": 0.41434219196134414, + "grad_norm": 2.418689727783203, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6949781775474548, + "num_tokens": 94664547.0, + "step": 3773 + }, + { + "epoch": 0.41445200966395784, + "grad_norm": 2.092526435852051, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7057299613952637, + "num_tokens": 94693931.0, + "step": 3774 + }, + 
{ + "epoch": 0.4145618273665715, + "grad_norm": 2.3830268383026123, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7261807918548584, + "num_tokens": 94717403.0, + "step": 3775 + }, + { + "epoch": 0.41467164506918514, + "grad_norm": 2.23986554145813, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6928870677947998, + "num_tokens": 94745682.0, + "step": 3776 + }, + { + "epoch": 0.41478146277179884, + "grad_norm": 2.523982048034668, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7105658054351807, + "num_tokens": 94765338.0, + "step": 3777 + }, + { + "epoch": 0.4148912804744125, + "grad_norm": 2.0593812465667725, + "learning_rate": 1e-06, + "loss": 1.1036, + "mean_token_accuracy": 0.6698979735374451, + "num_tokens": 94795946.0, + "step": 3778 + }, + { + "epoch": 0.41500109817702613, + "grad_norm": 2.147284984588623, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6989967823028564, + "num_tokens": 94822936.0, + "step": 3779 + }, + { + "epoch": 0.4151109158796398, + "grad_norm": 2.6317245960235596, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6995422840118408, + "num_tokens": 94843118.0, + "step": 3780 + }, + { + "epoch": 0.4152207335822535, + "grad_norm": 2.852846145629883, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7131203413009644, + "num_tokens": 94859652.0, + "step": 3781 + }, + { + "epoch": 0.4153305512848671, + "grad_norm": 2.099341869354248, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.690975546836853, + "num_tokens": 94891655.0, + "step": 3782 + }, + { + "epoch": 0.41544036898748077, + "grad_norm": 2.611032009124756, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7158013582229614, + "num_tokens": 94909926.0, + "step": 3783 + }, + { + "epoch": 0.41555018669009447, + "grad_norm": 2.24462628364563, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 
0.6858013868331909, + "num_tokens": 94933930.0, + "step": 3784 + }, + { + "epoch": 0.4156600043927081, + "grad_norm": 2.4603652954101562, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6963988542556763, + "num_tokens": 94955848.0, + "step": 3785 + }, + { + "epoch": 0.41576982209532176, + "grad_norm": 2.1285693645477295, + "learning_rate": 1e-06, + "loss": 1.0728, + "mean_token_accuracy": 0.6861346960067749, + "num_tokens": 94981327.0, + "step": 3786 + }, + { + "epoch": 0.4158796397979354, + "grad_norm": 1.9943315982818604, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7246939539909363, + "num_tokens": 95008714.0, + "step": 3787 + }, + { + "epoch": 0.4159894575005491, + "grad_norm": 2.2354812622070312, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7012970447540283, + "num_tokens": 95034254.0, + "step": 3788 + }, + { + "epoch": 0.41609927520316276, + "grad_norm": 2.389871597290039, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6855761408805847, + "num_tokens": 95058284.0, + "step": 3789 + }, + { + "epoch": 0.4162090929057764, + "grad_norm": 2.3863325119018555, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7250920534133911, + "num_tokens": 95079041.0, + "step": 3790 + }, + { + "epoch": 0.41631891060839005, + "grad_norm": 2.267644166946411, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7147920727729797, + "num_tokens": 95099885.0, + "step": 3791 + }, + { + "epoch": 0.41642872831100375, + "grad_norm": 2.0029003620147705, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.6728754043579102, + "num_tokens": 95130548.0, + "step": 3792 + }, + { + "epoch": 0.4165385460136174, + "grad_norm": 2.044461488723755, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7099642753601074, + "num_tokens": 95158684.0, + "step": 3793 + }, + { + "epoch": 0.41664836371623104, + "grad_norm": 
1.8897887468338013, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7116137742996216, + "num_tokens": 95191463.0, + "step": 3794 + }, + { + "epoch": 0.41675818141884474, + "grad_norm": 2.2206075191497803, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6927392482757568, + "num_tokens": 95216282.0, + "step": 3795 + }, + { + "epoch": 0.4168679991214584, + "grad_norm": 2.088826894760132, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.7033205032348633, + "num_tokens": 95243174.0, + "step": 3796 + }, + { + "epoch": 0.41697781682407203, + "grad_norm": 2.3194406032562256, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6840000748634338, + "num_tokens": 95266255.0, + "step": 3797 + }, + { + "epoch": 0.4170876345266857, + "grad_norm": 2.524473190307617, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7148643136024475, + "num_tokens": 95286032.0, + "step": 3798 + }, + { + "epoch": 0.4171974522292994, + "grad_norm": 2.7755932807922363, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7235997915267944, + "num_tokens": 95303128.0, + "step": 3799 + }, + { + "epoch": 0.417307269931913, + "grad_norm": 2.2760987281799316, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7018622159957886, + "num_tokens": 95329513.0, + "step": 3800 + }, + { + "epoch": 0.4174170876345267, + "grad_norm": 2.505913734436035, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6941275000572205, + "num_tokens": 95350251.0, + "step": 3801 + }, + { + "epoch": 0.4175269053371403, + "grad_norm": 2.111607789993286, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.705096960067749, + "num_tokens": 95377928.0, + "step": 3802 + }, + { + "epoch": 0.417636723039754, + "grad_norm": 2.353867769241333, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7177963256835938, + "num_tokens": 95401165.0, + 
"step": 3803 + }, + { + "epoch": 0.41774654074236767, + "grad_norm": 2.3109519481658936, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7446302175521851, + "num_tokens": 95424282.0, + "step": 3804 + }, + { + "epoch": 0.4178563584449813, + "grad_norm": 2.2124173641204834, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.7029293775558472, + "num_tokens": 95449861.0, + "step": 3805 + }, + { + "epoch": 0.417966176147595, + "grad_norm": 2.3698160648345947, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7058306336402893, + "num_tokens": 95472477.0, + "step": 3806 + }, + { + "epoch": 0.41807599385020866, + "grad_norm": 2.002403497695923, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.6972962617874146, + "num_tokens": 95503011.0, + "step": 3807 + }, + { + "epoch": 0.4181858115528223, + "grad_norm": 2.2313485145568848, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7311539053916931, + "num_tokens": 95526862.0, + "step": 3808 + }, + { + "epoch": 0.41829562925543595, + "grad_norm": 2.5053350925445557, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6944621801376343, + "num_tokens": 95548046.0, + "step": 3809 + }, + { + "epoch": 0.41840544695804965, + "grad_norm": 2.2925045490264893, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6887530088424683, + "num_tokens": 95572780.0, + "step": 3810 + }, + { + "epoch": 0.4185152646606633, + "grad_norm": 1.9864442348480225, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7324274778366089, + "num_tokens": 95605178.0, + "step": 3811 + }, + { + "epoch": 0.41862508236327695, + "grad_norm": 2.282355546951294, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6984261274337769, + "num_tokens": 95629235.0, + "step": 3812 + }, + { + "epoch": 0.41873490006589065, + "grad_norm": 2.1800601482391357, + "learning_rate": 1e-06, + "loss": 1.0367, + 
"mean_token_accuracy": 0.6830161809921265, + "num_tokens": 95656430.0, + "step": 3813 + }, + { + "epoch": 0.4188447177685043, + "grad_norm": 2.598893404006958, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.7036652565002441, + "num_tokens": 95675359.0, + "step": 3814 + }, + { + "epoch": 0.41895453547111794, + "grad_norm": 1.878644347190857, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6883333921432495, + "num_tokens": 95708406.0, + "step": 3815 + }, + { + "epoch": 0.4190643531737316, + "grad_norm": 2.379333019256592, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7018951177597046, + "num_tokens": 95729854.0, + "step": 3816 + }, + { + "epoch": 0.4191741708763453, + "grad_norm": 2.356415271759033, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7100143432617188, + "num_tokens": 95751709.0, + "step": 3817 + }, + { + "epoch": 0.41928398857895893, + "grad_norm": 2.1334996223449707, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7219588160514832, + "num_tokens": 95778920.0, + "step": 3818 + }, + { + "epoch": 0.4193938062815726, + "grad_norm": 2.5352559089660645, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6921576857566833, + "num_tokens": 95800706.0, + "step": 3819 + }, + { + "epoch": 0.4195036239841862, + "grad_norm": 2.3320906162261963, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7046409845352173, + "num_tokens": 95826099.0, + "step": 3820 + }, + { + "epoch": 0.4196134416867999, + "grad_norm": 2.255746364593506, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7239730358123779, + "num_tokens": 95849856.0, + "step": 3821 + }, + { + "epoch": 0.41972325938941357, + "grad_norm": 2.4409425258636475, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7269653081893921, + "num_tokens": 95870216.0, + "step": 3822 + }, + { + "epoch": 0.4198330770920272, + 
"grad_norm": 2.3176844120025635, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7075371742248535, + "num_tokens": 95893822.0, + "step": 3823 + }, + { + "epoch": 0.4199428947946409, + "grad_norm": 2.3185930252075195, + "learning_rate": 1e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.6728262305259705, + "num_tokens": 95919036.0, + "step": 3824 + }, + { + "epoch": 0.42005271249725457, + "grad_norm": 2.441094398498535, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.726859986782074, + "num_tokens": 95940378.0, + "step": 3825 + }, + { + "epoch": 0.4201625301998682, + "grad_norm": 2.3577113151550293, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7300559878349304, + "num_tokens": 95962111.0, + "step": 3826 + }, + { + "epoch": 0.42027234790248186, + "grad_norm": 2.3961760997772217, + "learning_rate": 1e-06, + "loss": 1.0477, + "mean_token_accuracy": 0.6889276504516602, + "num_tokens": 95987008.0, + "step": 3827 + }, + { + "epoch": 0.42038216560509556, + "grad_norm": 2.5525104999542236, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7053593397140503, + "num_tokens": 96007418.0, + "step": 3828 + }, + { + "epoch": 0.4204919833077092, + "grad_norm": 2.1867475509643555, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6972933411598206, + "num_tokens": 96034574.0, + "step": 3829 + }, + { + "epoch": 0.42060180101032285, + "grad_norm": 2.3138418197631836, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6817185878753662, + "num_tokens": 96058380.0, + "step": 3830 + }, + { + "epoch": 0.42071161871293655, + "grad_norm": 2.507490396499634, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.716185450553894, + "num_tokens": 96079576.0, + "step": 3831 + }, + { + "epoch": 0.4208214364155502, + "grad_norm": 2.1576433181762695, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6970542669296265, + 
"num_tokens": 96104169.0, + "step": 3832 + }, + { + "epoch": 0.42093125411816384, + "grad_norm": 2.2891178131103516, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7081773281097412, + "num_tokens": 96128377.0, + "step": 3833 + }, + { + "epoch": 0.4210410718207775, + "grad_norm": 2.343299627304077, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6903235912322998, + "num_tokens": 96152241.0, + "step": 3834 + }, + { + "epoch": 0.4211508895233912, + "grad_norm": 2.114912509918213, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7022665739059448, + "num_tokens": 96180634.0, + "step": 3835 + }, + { + "epoch": 0.42126070722600484, + "grad_norm": 2.192206859588623, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7155481576919556, + "num_tokens": 96206968.0, + "step": 3836 + }, + { + "epoch": 0.4213705249286185, + "grad_norm": 2.1874914169311523, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6930674910545349, + "num_tokens": 96234364.0, + "step": 3837 + }, + { + "epoch": 0.42148034263123213, + "grad_norm": 2.1236023902893066, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7155717611312866, + "num_tokens": 96262215.0, + "step": 3838 + }, + { + "epoch": 0.42159016033384583, + "grad_norm": 1.9397627115249634, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7106307744979858, + "num_tokens": 96291559.0, + "step": 3839 + }, + { + "epoch": 0.4216999780364595, + "grad_norm": 2.376718044281006, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7032349109649658, + "num_tokens": 96315012.0, + "step": 3840 + }, + { + "epoch": 0.4218097957390731, + "grad_norm": 2.3032329082489014, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.707741379737854, + "num_tokens": 96337462.0, + "step": 3841 + }, + { + "epoch": 0.4219196134416868, + "grad_norm": 1.9292981624603271, + "learning_rate": 
1e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6837948560714722, + "num_tokens": 96372524.0, + "step": 3842 + }, + { + "epoch": 0.42202943114430047, + "grad_norm": 2.0234334468841553, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.72060227394104, + "num_tokens": 96400914.0, + "step": 3843 + }, + { + "epoch": 0.4221392488469141, + "grad_norm": 2.1913647651672363, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7012739777565002, + "num_tokens": 96425648.0, + "step": 3844 + }, + { + "epoch": 0.42224906654952776, + "grad_norm": 2.002413749694824, + "learning_rate": 1e-06, + "loss": 1.066, + "mean_token_accuracy": 0.6817141175270081, + "num_tokens": 96455153.0, + "step": 3845 + }, + { + "epoch": 0.42235888425214146, + "grad_norm": 2.211477279663086, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7007215619087219, + "num_tokens": 96482375.0, + "step": 3846 + }, + { + "epoch": 0.4224687019547551, + "grad_norm": 2.2119405269622803, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7139912247657776, + "num_tokens": 96505602.0, + "step": 3847 + }, + { + "epoch": 0.42257851965736876, + "grad_norm": 2.1711275577545166, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7083422541618347, + "num_tokens": 96532713.0, + "step": 3848 + }, + { + "epoch": 0.4226883373599824, + "grad_norm": 2.2005529403686523, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.6962857246398926, + "num_tokens": 96559754.0, + "step": 3849 + }, + { + "epoch": 0.4227981550625961, + "grad_norm": 2.093989610671997, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6997169256210327, + "num_tokens": 96587470.0, + "step": 3850 + }, + { + "epoch": 0.42290797276520975, + "grad_norm": 2.4090418815612793, + "learning_rate": 1e-06, + "loss": 1.0991, + "mean_token_accuracy": 0.6896403431892395, + "num_tokens": 96610789.0, + "step": 3851 + }, + { + "epoch": 
0.4230177904678234, + "grad_norm": 2.2925491333007812, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.724004864692688, + "num_tokens": 96633000.0, + "step": 3852 + }, + { + "epoch": 0.4231276081704371, + "grad_norm": 2.4856998920440674, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6941068172454834, + "num_tokens": 96653146.0, + "step": 3853 + }, + { + "epoch": 0.42323742587305074, + "grad_norm": 2.2378087043762207, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7051925659179688, + "num_tokens": 96678631.0, + "step": 3854 + }, + { + "epoch": 0.4233472435756644, + "grad_norm": 2.2874584197998047, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7072291970252991, + "num_tokens": 96701925.0, + "step": 3855 + }, + { + "epoch": 0.42345706127827804, + "grad_norm": 2.398672580718994, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7044090628623962, + "num_tokens": 96725112.0, + "step": 3856 + }, + { + "epoch": 0.42356687898089174, + "grad_norm": 2.2849953174591064, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7065414786338806, + "num_tokens": 96748201.0, + "step": 3857 + }, + { + "epoch": 0.4236766966835054, + "grad_norm": 2.310523271560669, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6970381736755371, + "num_tokens": 96772119.0, + "step": 3858 + }, + { + "epoch": 0.42378651438611903, + "grad_norm": 2.251131296157837, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7194212675094604, + "num_tokens": 96796877.0, + "step": 3859 + }, + { + "epoch": 0.42389633208873273, + "grad_norm": 2.0554866790771484, + "learning_rate": 1e-06, + "loss": 1.0296, + "mean_token_accuracy": 0.6933804750442505, + "num_tokens": 96826123.0, + "step": 3860 + }, + { + "epoch": 0.4240061497913464, + "grad_norm": 2.238053560256958, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 
0.7041330933570862, + "num_tokens": 96852093.0, + "step": 3861 + }, + { + "epoch": 0.42411596749396, + "grad_norm": 2.1343376636505127, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.7055553197860718, + "num_tokens": 96880747.0, + "step": 3862 + }, + { + "epoch": 0.42422578519657367, + "grad_norm": 2.712851047515869, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7132846713066101, + "num_tokens": 96899719.0, + "step": 3863 + }, + { + "epoch": 0.42433560289918737, + "grad_norm": 2.1180896759033203, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6890151500701904, + "num_tokens": 96927576.0, + "step": 3864 + }, + { + "epoch": 0.424445420601801, + "grad_norm": 2.3384642601013184, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7036488056182861, + "num_tokens": 96950553.0, + "step": 3865 + }, + { + "epoch": 0.42455523830441466, + "grad_norm": 2.1167426109313965, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6932023763656616, + "num_tokens": 96980471.0, + "step": 3866 + }, + { + "epoch": 0.4246650560070283, + "grad_norm": 2.0513813495635986, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6925667524337769, + "num_tokens": 97009947.0, + "step": 3867 + }, + { + "epoch": 0.424774873709642, + "grad_norm": 2.0754764080047607, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7253768444061279, + "num_tokens": 97037793.0, + "step": 3868 + }, + { + "epoch": 0.42488469141225566, + "grad_norm": 2.381662130355835, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7267069220542908, + "num_tokens": 97058648.0, + "step": 3869 + }, + { + "epoch": 0.4249945091148693, + "grad_norm": 2.087455987930298, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.6961779594421387, + "num_tokens": 97086262.0, + "step": 3870 + }, + { + "epoch": 0.425104326817483, + "grad_norm": 2.5138702392578125, + 
"learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7129905223846436, + "num_tokens": 97108008.0, + "step": 3871 + }, + { + "epoch": 0.42521414452009665, + "grad_norm": 2.0831942558288574, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6987084150314331, + "num_tokens": 97135578.0, + "step": 3872 + }, + { + "epoch": 0.4253239622227103, + "grad_norm": 2.174673318862915, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7050703763961792, + "num_tokens": 97161483.0, + "step": 3873 + }, + { + "epoch": 0.42543377992532394, + "grad_norm": 2.4667327404022217, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7041193246841431, + "num_tokens": 97182711.0, + "step": 3874 + }, + { + "epoch": 0.42554359762793764, + "grad_norm": 2.1187286376953125, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6925247311592102, + "num_tokens": 97212561.0, + "step": 3875 + }, + { + "epoch": 0.4256534153305513, + "grad_norm": 1.949596881866455, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6936111450195312, + "num_tokens": 97247580.0, + "step": 3876 + }, + { + "epoch": 0.42576323303316493, + "grad_norm": 2.185128927230835, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6991956233978271, + "num_tokens": 97272185.0, + "step": 3877 + }, + { + "epoch": 0.4258730507357786, + "grad_norm": 2.525563955307007, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7162491083145142, + "num_tokens": 97291116.0, + "step": 3878 + }, + { + "epoch": 0.4259828684383923, + "grad_norm": 2.3176565170288086, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6866471767425537, + "num_tokens": 97315562.0, + "step": 3879 + }, + { + "epoch": 0.4260926861410059, + "grad_norm": 2.6575570106506348, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6826869249343872, + "num_tokens": 97335299.0, + "step": 3880 + }, + { 
+ "epoch": 0.4262025038436196, + "grad_norm": 2.2345998287200928, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7127529382705688, + "num_tokens": 97361253.0, + "step": 3881 + }, + { + "epoch": 0.4263123215462333, + "grad_norm": 2.0945422649383545, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.695480465888977, + "num_tokens": 97387633.0, + "step": 3882 + }, + { + "epoch": 0.4264221392488469, + "grad_norm": 2.211031675338745, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6919156312942505, + "num_tokens": 97415622.0, + "step": 3883 + }, + { + "epoch": 0.42653195695146057, + "grad_norm": 2.4764554500579834, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.727651834487915, + "num_tokens": 97436256.0, + "step": 3884 + }, + { + "epoch": 0.4266417746540742, + "grad_norm": 2.370897054672241, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6816169023513794, + "num_tokens": 97460845.0, + "step": 3885 + }, + { + "epoch": 0.4267515923566879, + "grad_norm": 2.0461583137512207, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.715681791305542, + "num_tokens": 97487321.0, + "step": 3886 + }, + { + "epoch": 0.42686141005930156, + "grad_norm": 2.152268886566162, + "learning_rate": 1e-06, + "loss": 1.1378, + "mean_token_accuracy": 0.6606724858283997, + "num_tokens": 97514419.0, + "step": 3887 + }, + { + "epoch": 0.4269712277619152, + "grad_norm": 2.4005043506622314, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7014353275299072, + "num_tokens": 97535981.0, + "step": 3888 + }, + { + "epoch": 0.4270810454645289, + "grad_norm": 2.361776113510132, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6948971748352051, + "num_tokens": 97559374.0, + "step": 3889 + }, + { + "epoch": 0.42719086316714255, + "grad_norm": 2.244389295578003, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 
0.6914259195327759, + "num_tokens": 97585245.0, + "step": 3890 + }, + { + "epoch": 0.4273006808697562, + "grad_norm": 2.4049715995788574, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7061022520065308, + "num_tokens": 97607112.0, + "step": 3891 + }, + { + "epoch": 0.42741049857236985, + "grad_norm": 2.4025943279266357, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7300052642822266, + "num_tokens": 97628043.0, + "step": 3892 + }, + { + "epoch": 0.42752031627498355, + "grad_norm": 2.4551055431365967, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7149630784988403, + "num_tokens": 97650558.0, + "step": 3893 + }, + { + "epoch": 0.4276301339775972, + "grad_norm": 1.9678771495819092, + "learning_rate": 1e-06, + "loss": 1.0971, + "mean_token_accuracy": 0.6776485443115234, + "num_tokens": 97684374.0, + "step": 3894 + }, + { + "epoch": 0.42773995168021084, + "grad_norm": 2.028705358505249, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6978503465652466, + "num_tokens": 97716582.0, + "step": 3895 + }, + { + "epoch": 0.4278497693828245, + "grad_norm": 2.5495028495788574, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7120351791381836, + "num_tokens": 97736370.0, + "step": 3896 + }, + { + "epoch": 0.4279595870854382, + "grad_norm": 2.654172658920288, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7030596733093262, + "num_tokens": 97755346.0, + "step": 3897 + }, + { + "epoch": 0.42806940478805183, + "grad_norm": 1.9093811511993408, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6844387054443359, + "num_tokens": 97790200.0, + "step": 3898 + }, + { + "epoch": 0.4281792224906655, + "grad_norm": 2.4262826442718506, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7120217084884644, + "num_tokens": 97812252.0, + "step": 3899 + }, + { + "epoch": 0.4282890401932792, + "grad_norm": 2.059434175491333, 
+ "learning_rate": 1e-06, + "loss": 1.1152, + "mean_token_accuracy": 0.6720256209373474, + "num_tokens": 97844924.0, + "step": 3900 + }, + { + "epoch": 0.4283988578958928, + "grad_norm": 2.755173683166504, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7210726737976074, + "num_tokens": 97861561.0, + "step": 3901 + }, + { + "epoch": 0.42850867559850647, + "grad_norm": 1.8461616039276123, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7135423421859741, + "num_tokens": 97895285.0, + "step": 3902 + }, + { + "epoch": 0.4286184933011201, + "grad_norm": 2.4030020236968994, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6987717151641846, + "num_tokens": 97918303.0, + "step": 3903 + }, + { + "epoch": 0.4287283110037338, + "grad_norm": 2.114203929901123, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7210955023765564, + "num_tokens": 97944707.0, + "step": 3904 + }, + { + "epoch": 0.42883812870634747, + "grad_norm": 2.205078363418579, + "learning_rate": 1e-06, + "loss": 1.0515, + "mean_token_accuracy": 0.6823198199272156, + "num_tokens": 97971961.0, + "step": 3905 + }, + { + "epoch": 0.4289479464089611, + "grad_norm": 2.0681490898132324, + "learning_rate": 1e-06, + "loss": 1.0672, + "mean_token_accuracy": 0.6813040971755981, + "num_tokens": 98000984.0, + "step": 3906 + }, + { + "epoch": 0.4290577641115748, + "grad_norm": 2.045397996902466, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6982176303863525, + "num_tokens": 98032397.0, + "step": 3907 + }, + { + "epoch": 0.42916758181418846, + "grad_norm": 2.6451830863952637, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7229434251785278, + "num_tokens": 98051970.0, + "step": 3908 + }, + { + "epoch": 0.4292773995168021, + "grad_norm": 2.1607303619384766, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6802355051040649, + "num_tokens": 98078813.0, + "step": 3909 + }, 
+ { + "epoch": 0.42938721721941575, + "grad_norm": 2.2414698600769043, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.730433464050293, + "num_tokens": 98101828.0, + "step": 3910 + }, + { + "epoch": 0.42949703492202945, + "grad_norm": 2.452531099319458, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7225241661071777, + "num_tokens": 98123890.0, + "step": 3911 + }, + { + "epoch": 0.4296068526246431, + "grad_norm": 2.3625712394714355, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7137813568115234, + "num_tokens": 98146985.0, + "step": 3912 + }, + { + "epoch": 0.42971667032725674, + "grad_norm": 2.084012985229492, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6942788362503052, + "num_tokens": 98172715.0, + "step": 3913 + }, + { + "epoch": 0.4298264880298704, + "grad_norm": 2.545877456665039, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.705691397190094, + "num_tokens": 98192320.0, + "step": 3914 + }, + { + "epoch": 0.4299363057324841, + "grad_norm": 1.9380507469177246, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6928070783615112, + "num_tokens": 98225310.0, + "step": 3915 + }, + { + "epoch": 0.43004612343509774, + "grad_norm": 2.271461248397827, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7197915315628052, + "num_tokens": 98249288.0, + "step": 3916 + }, + { + "epoch": 0.4301559411377114, + "grad_norm": 2.0750787258148193, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7146943807601929, + "num_tokens": 98276114.0, + "step": 3917 + }, + { + "epoch": 0.4302657588403251, + "grad_norm": 2.0223093032836914, + "learning_rate": 1e-06, + "loss": 1.0281, + "mean_token_accuracy": 0.6939459443092346, + "num_tokens": 98306855.0, + "step": 3918 + }, + { + "epoch": 0.43037557654293873, + "grad_norm": 2.735882520675659, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 
0.6940176486968994, + "num_tokens": 98324324.0, + "step": 3919 + }, + { + "epoch": 0.4304853942455524, + "grad_norm": 2.256956100463867, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6994431018829346, + "num_tokens": 98349427.0, + "step": 3920 + }, + { + "epoch": 0.430595211948166, + "grad_norm": 2.3144688606262207, + "learning_rate": 1e-06, + "loss": 1.0614, + "mean_token_accuracy": 0.6766643524169922, + "num_tokens": 98374176.0, + "step": 3921 + }, + { + "epoch": 0.4307050296507797, + "grad_norm": 2.334136962890625, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7066308856010437, + "num_tokens": 98399127.0, + "step": 3922 + }, + { + "epoch": 0.43081484735339337, + "grad_norm": 2.0165324211120605, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6939821839332581, + "num_tokens": 98428872.0, + "step": 3923 + }, + { + "epoch": 0.430924665056007, + "grad_norm": 2.100881814956665, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6971864700317383, + "num_tokens": 98456183.0, + "step": 3924 + }, + { + "epoch": 0.43103448275862066, + "grad_norm": 2.291635751724243, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7176162600517273, + "num_tokens": 98478667.0, + "step": 3925 + }, + { + "epoch": 0.43114430046123436, + "grad_norm": 1.9902231693267822, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7072526812553406, + "num_tokens": 98509755.0, + "step": 3926 + }, + { + "epoch": 0.431254118163848, + "grad_norm": 2.2568721771240234, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.696953296661377, + "num_tokens": 98535352.0, + "step": 3927 + }, + { + "epoch": 0.43136393586646166, + "grad_norm": 2.570232391357422, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6901489496231079, + "num_tokens": 98556819.0, + "step": 3928 + }, + { + "epoch": 0.43147375356907536, + "grad_norm": 2.331319808959961, + 
"learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6869210004806519, + "num_tokens": 98579888.0, + "step": 3929 + }, + { + "epoch": 0.431583571271689, + "grad_norm": 2.150881052017212, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6930938363075256, + "num_tokens": 98607125.0, + "step": 3930 + }, + { + "epoch": 0.43169338897430265, + "grad_norm": 2.7945737838745117, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7258110642433167, + "num_tokens": 98624136.0, + "step": 3931 + }, + { + "epoch": 0.4318032066769163, + "grad_norm": 2.0402321815490723, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 0.6782854795455933, + "num_tokens": 98654840.0, + "step": 3932 + }, + { + "epoch": 0.43191302437953, + "grad_norm": 2.527327299118042, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7022570371627808, + "num_tokens": 98674504.0, + "step": 3933 + }, + { + "epoch": 0.43202284208214364, + "grad_norm": 2.289329767227173, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.7019196152687073, + "num_tokens": 98700252.0, + "step": 3934 + }, + { + "epoch": 0.4321326597847573, + "grad_norm": 2.087965250015259, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7241537570953369, + "num_tokens": 98726919.0, + "step": 3935 + }, + { + "epoch": 0.432242477487371, + "grad_norm": 2.538783311843872, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.6988045573234558, + "num_tokens": 98744245.0, + "step": 3936 + }, + { + "epoch": 0.43235229518998464, + "grad_norm": 2.461961507797241, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6957036852836609, + "num_tokens": 98765897.0, + "step": 3937 + }, + { + "epoch": 0.4324621128925983, + "grad_norm": 2.318577527999878, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.712749719619751, + "num_tokens": 98788917.0, + "step": 3938 + }, + { + 
"epoch": 0.43257193059521193, + "grad_norm": 2.397921085357666, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6959085464477539, + "num_tokens": 98810485.0, + "step": 3939 + }, + { + "epoch": 0.43268174829782563, + "grad_norm": 2.118868589401245, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.718704342842102, + "num_tokens": 98835909.0, + "step": 3940 + }, + { + "epoch": 0.4327915660004393, + "grad_norm": 2.542583465576172, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6958381533622742, + "num_tokens": 98857744.0, + "step": 3941 + }, + { + "epoch": 0.4329013837030529, + "grad_norm": 2.4773006439208984, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7071640491485596, + "num_tokens": 98876831.0, + "step": 3942 + }, + { + "epoch": 0.43301120140566657, + "grad_norm": 2.343606472015381, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6965247988700867, + "num_tokens": 98900312.0, + "step": 3943 + }, + { + "epoch": 0.43312101910828027, + "grad_norm": 2.323551654815674, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7078481912612915, + "num_tokens": 98923327.0, + "step": 3944 + }, + { + "epoch": 0.4332308368108939, + "grad_norm": 2.5082807540893555, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.7006280422210693, + "num_tokens": 98943334.0, + "step": 3945 + }, + { + "epoch": 0.43334065451350756, + "grad_norm": 2.696413993835449, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7196766138076782, + "num_tokens": 98960839.0, + "step": 3946 + }, + { + "epoch": 0.43345047221612126, + "grad_norm": 2.255913019180298, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.6971520185470581, + "num_tokens": 98984911.0, + "step": 3947 + }, + { + "epoch": 0.4335602899187349, + "grad_norm": 2.2345590591430664, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 
0.7146270275115967, + "num_tokens": 99010758.0, + "step": 3948 + }, + { + "epoch": 0.43367010762134856, + "grad_norm": 2.441660165786743, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7108999490737915, + "num_tokens": 99031752.0, + "step": 3949 + }, + { + "epoch": 0.4337799253239622, + "grad_norm": 2.3005290031433105, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6891922950744629, + "num_tokens": 99055431.0, + "step": 3950 + }, + { + "epoch": 0.4338897430265759, + "grad_norm": 2.348278760910034, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7207818627357483, + "num_tokens": 99076905.0, + "step": 3951 + }, + { + "epoch": 0.43399956072918955, + "grad_norm": 2.1163887977600098, + "learning_rate": 1e-06, + "loss": 1.1418, + "mean_token_accuracy": 0.6588015556335449, + "num_tokens": 99108228.0, + "step": 3952 + }, + { + "epoch": 0.4341093784318032, + "grad_norm": 2.724263906478882, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7167330980300903, + "num_tokens": 99126174.0, + "step": 3953 + }, + { + "epoch": 0.43421919613441684, + "grad_norm": 2.1861884593963623, + "learning_rate": 1e-06, + "loss": 1.0263, + "mean_token_accuracy": 0.6930059194564819, + "num_tokens": 99152444.0, + "step": 3954 + }, + { + "epoch": 0.43432901383703054, + "grad_norm": 2.156001091003418, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7236674427986145, + "num_tokens": 99176817.0, + "step": 3955 + }, + { + "epoch": 0.4344388315396442, + "grad_norm": 2.5807905197143555, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.715121865272522, + "num_tokens": 99195881.0, + "step": 3956 + }, + { + "epoch": 0.43454864924225783, + "grad_norm": 2.1946661472320557, + "learning_rate": 1e-06, + "loss": 1.0512, + "mean_token_accuracy": 0.6841497421264648, + "num_tokens": 99224107.0, + "step": 3957 + }, + { + "epoch": 0.43465846694487154, + "grad_norm": 2.3554847240448, 
+ "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6855169534683228, + "num_tokens": 99247322.0, + "step": 3958 + }, + { + "epoch": 0.4347682846474852, + "grad_norm": 2.045919895172119, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7093793153762817, + "num_tokens": 99275745.0, + "step": 3959 + }, + { + "epoch": 0.4348781023500988, + "grad_norm": 2.320343017578125, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6995903849601746, + "num_tokens": 99299835.0, + "step": 3960 + }, + { + "epoch": 0.4349879200527125, + "grad_norm": 2.258406400680542, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6851855516433716, + "num_tokens": 99324180.0, + "step": 3961 + }, + { + "epoch": 0.4350977377553262, + "grad_norm": 2.572606325149536, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7031408548355103, + "num_tokens": 99344080.0, + "step": 3962 + }, + { + "epoch": 0.4352075554579398, + "grad_norm": 2.5457305908203125, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7324205636978149, + "num_tokens": 99363217.0, + "step": 3963 + }, + { + "epoch": 0.43531737316055347, + "grad_norm": 2.1955230236053467, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6955759525299072, + "num_tokens": 99388614.0, + "step": 3964 + }, + { + "epoch": 0.43542719086316717, + "grad_norm": 2.214017391204834, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7030149698257446, + "num_tokens": 99414431.0, + "step": 3965 + }, + { + "epoch": 0.4355370085657808, + "grad_norm": 2.2188687324523926, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6855278015136719, + "num_tokens": 99441381.0, + "step": 3966 + }, + { + "epoch": 0.43564682626839446, + "grad_norm": 2.103424072265625, + "learning_rate": 1e-06, + "loss": 1.0766, + "mean_token_accuracy": 0.6779459118843079, + "num_tokens": 99470957.0, + "step": 3967 + }, + 
{ + "epoch": 0.4357566439710081, + "grad_norm": 2.185546875, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7155966758728027, + "num_tokens": 99496786.0, + "step": 3968 + }, + { + "epoch": 0.4358664616736218, + "grad_norm": 2.1711654663085938, + "learning_rate": 1e-06, + "loss": 1.0306, + "mean_token_accuracy": 0.691306471824646, + "num_tokens": 99524614.0, + "step": 3969 + }, + { + "epoch": 0.43597627937623545, + "grad_norm": 2.3214828968048096, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6882283687591553, + "num_tokens": 99550262.0, + "step": 3970 + }, + { + "epoch": 0.4360860970788491, + "grad_norm": 2.631786823272705, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7213640809059143, + "num_tokens": 99568323.0, + "step": 3971 + }, + { + "epoch": 0.43619591478146275, + "grad_norm": 2.152545928955078, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7051806449890137, + "num_tokens": 99593167.0, + "step": 3972 + }, + { + "epoch": 0.43630573248407645, + "grad_norm": 1.944743037223816, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7005846500396729, + "num_tokens": 99624510.0, + "step": 3973 + }, + { + "epoch": 0.4364155501866901, + "grad_norm": 2.1918301582336426, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.703071117401123, + "num_tokens": 99649052.0, + "step": 3974 + }, + { + "epoch": 0.43652536788930374, + "grad_norm": 2.188070058822632, + "learning_rate": 1e-06, + "loss": 1.135, + "mean_token_accuracy": 0.6687817573547363, + "num_tokens": 99674238.0, + "step": 3975 + }, + { + "epoch": 0.43663518559191744, + "grad_norm": 2.3769683837890625, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7130358219146729, + "num_tokens": 99696276.0, + "step": 3976 + }, + { + "epoch": 0.4367450032945311, + "grad_norm": 2.478303909301758, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 
0.7206538915634155, + "num_tokens": 99718273.0, + "step": 3977 + }, + { + "epoch": 0.43685482099714473, + "grad_norm": 2.116016149520874, + "learning_rate": 1e-06, + "loss": 1.0415, + "mean_token_accuracy": 0.6851041316986084, + "num_tokens": 99747390.0, + "step": 3978 + }, + { + "epoch": 0.4369646386997584, + "grad_norm": 2.4744174480438232, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7161492705345154, + "num_tokens": 99766900.0, + "step": 3979 + }, + { + "epoch": 0.4370744564023721, + "grad_norm": 2.3056957721710205, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7288212180137634, + "num_tokens": 99788312.0, + "step": 3980 + }, + { + "epoch": 0.4371842741049857, + "grad_norm": 2.1844866275787354, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.706511378288269, + "num_tokens": 99813292.0, + "step": 3981 + }, + { + "epoch": 0.43729409180759937, + "grad_norm": 2.2340869903564453, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7107880711555481, + "num_tokens": 99839308.0, + "step": 3982 + }, + { + "epoch": 0.4374039095102131, + "grad_norm": 2.280299663543701, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.700964093208313, + "num_tokens": 99863054.0, + "step": 3983 + }, + { + "epoch": 0.4375137272128267, + "grad_norm": 2.1368939876556396, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7073748111724854, + "num_tokens": 99888646.0, + "step": 3984 + }, + { + "epoch": 0.43762354491544037, + "grad_norm": 2.456953287124634, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7047466039657593, + "num_tokens": 99910914.0, + "step": 3985 + }, + { + "epoch": 0.437733362618054, + "grad_norm": 2.2853965759277344, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6941356658935547, + "num_tokens": 99933970.0, + "step": 3986 + }, + { + "epoch": 0.4378431803206677, + "grad_norm": 2.3000874519348145, + 
"learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6870485544204712, + "num_tokens": 99957761.0, + "step": 3987 + }, + { + "epoch": 0.43795299802328136, + "grad_norm": 2.5069541931152344, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6948167085647583, + "num_tokens": 99979479.0, + "step": 3988 + }, + { + "epoch": 0.438062815725895, + "grad_norm": 2.323601722717285, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7251442670822144, + "num_tokens": 100004083.0, + "step": 3989 + }, + { + "epoch": 0.43817263342850865, + "grad_norm": 2.229480266571045, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6886293888092041, + "num_tokens": 100028798.0, + "step": 3990 + }, + { + "epoch": 0.43828245113112235, + "grad_norm": 2.4078991413116455, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7166740894317627, + "num_tokens": 100051721.0, + "step": 3991 + }, + { + "epoch": 0.438392268833736, + "grad_norm": 2.164597272872925, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7084962129592896, + "num_tokens": 100078630.0, + "step": 3992 + }, + { + "epoch": 0.43850208653634964, + "grad_norm": 2.307798385620117, + "learning_rate": 1e-06, + "loss": 1.0379, + "mean_token_accuracy": 0.6881214380264282, + "num_tokens": 100103032.0, + "step": 3993 + }, + { + "epoch": 0.43861190423896335, + "grad_norm": 1.7950234413146973, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7066026926040649, + "num_tokens": 100138499.0, + "step": 3994 + }, + { + "epoch": 0.438721721941577, + "grad_norm": 2.53721022605896, + "learning_rate": 1e-06, + "loss": 1.0534, + "mean_token_accuracy": 0.7041037082672119, + "num_tokens": 100159314.0, + "step": 3995 + }, + { + "epoch": 0.43883153964419064, + "grad_norm": 2.3043293952941895, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7152844667434692, + "num_tokens": 100180645.0, + "step": 3996 + 
}, + { + "epoch": 0.4389413573468043, + "grad_norm": 2.4819743633270264, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7080390453338623, + "num_tokens": 100200487.0, + "step": 3997 + }, + { + "epoch": 0.439051175049418, + "grad_norm": 2.364051580429077, + "learning_rate": 1e-06, + "loss": 1.0473, + "mean_token_accuracy": 0.6817941665649414, + "num_tokens": 100224477.0, + "step": 3998 + }, + { + "epoch": 0.43916099275203163, + "grad_norm": 2.4481399059295654, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7054165601730347, + "num_tokens": 100245800.0, + "step": 3999 + }, + { + "epoch": 0.4392708104546453, + "grad_norm": 2.2026946544647217, + "learning_rate": 1e-06, + "loss": 1.1351, + "mean_token_accuracy": 0.6722240447998047, + "num_tokens": 100274582.0, + "step": 4000 + }, + { + "epoch": 0.4393806281572589, + "grad_norm": 2.226421594619751, + "learning_rate": 1e-06, + "loss": 1.0865, + "mean_token_accuracy": 0.6744072437286377, + "num_tokens": 100301410.0, + "step": 4001 + }, + { + "epoch": 0.4394904458598726, + "grad_norm": 2.39851975440979, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6874092221260071, + "num_tokens": 100323287.0, + "step": 4002 + }, + { + "epoch": 0.43960026356248627, + "grad_norm": 2.1940395832061768, + "learning_rate": 1e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.686305046081543, + "num_tokens": 100351651.0, + "step": 4003 + }, + { + "epoch": 0.4397100812650999, + "grad_norm": 2.5609521865844727, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.691888689994812, + "num_tokens": 100370378.0, + "step": 4004 + }, + { + "epoch": 0.4398198989677136, + "grad_norm": 2.2317185401916504, + "learning_rate": 1e-06, + "loss": 1.0244, + "mean_token_accuracy": 0.6890238523483276, + "num_tokens": 100395590.0, + "step": 4005 + }, + { + "epoch": 0.43992971667032726, + "grad_norm": 2.263523578643799, + "learning_rate": 1e-06, + "loss": 0.9984, + 
"mean_token_accuracy": 0.7029510736465454, + "num_tokens": 100420462.0, + "step": 4006 + }, + { + "epoch": 0.4400395343729409, + "grad_norm": 2.027426242828369, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7105587124824524, + "num_tokens": 100449516.0, + "step": 4007 + }, + { + "epoch": 0.44014935207555456, + "grad_norm": 2.846850633621216, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6860036849975586, + "num_tokens": 100466048.0, + "step": 4008 + }, + { + "epoch": 0.44025916977816826, + "grad_norm": 2.191257953643799, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7013584971427917, + "num_tokens": 100492258.0, + "step": 4009 + }, + { + "epoch": 0.4403689874807819, + "grad_norm": 2.012946367263794, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6942020654678345, + "num_tokens": 100523331.0, + "step": 4010 + }, + { + "epoch": 0.44047880518339555, + "grad_norm": 2.171091318130493, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6948125958442688, + "num_tokens": 100551811.0, + "step": 4011 + }, + { + "epoch": 0.44058862288600925, + "grad_norm": 2.1761977672576904, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6969865560531616, + "num_tokens": 100576167.0, + "step": 4012 + }, + { + "epoch": 0.4406984405886229, + "grad_norm": 2.496126413345337, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.7028475999832153, + "num_tokens": 100595724.0, + "step": 4013 + }, + { + "epoch": 0.44080825829123654, + "grad_norm": 2.175673723220825, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6955586075782776, + "num_tokens": 100620495.0, + "step": 4014 + }, + { + "epoch": 0.4409180759938502, + "grad_norm": 2.0287907123565674, + "learning_rate": 1e-06, + "loss": 1.0318, + "mean_token_accuracy": 0.6956740617752075, + "num_tokens": 100650586.0, + "step": 4015 + }, + { + "epoch": 0.4410278936964639, + 
"grad_norm": 2.2921202182769775, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7213932871818542, + "num_tokens": 100673959.0, + "step": 4016 + }, + { + "epoch": 0.44113771139907754, + "grad_norm": 2.379211664199829, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7465560436248779, + "num_tokens": 100695429.0, + "step": 4017 + }, + { + "epoch": 0.4412475291016912, + "grad_norm": 2.0231170654296875, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.697624683380127, + "num_tokens": 100724184.0, + "step": 4018 + }, + { + "epoch": 0.44135734680430483, + "grad_norm": 2.246220111846924, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6961771249771118, + "num_tokens": 100750146.0, + "step": 4019 + }, + { + "epoch": 0.44146716450691853, + "grad_norm": 2.166564464569092, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7459783554077148, + "num_tokens": 100776163.0, + "step": 4020 + }, + { + "epoch": 0.4415769822095322, + "grad_norm": 2.681859016418457, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7213149070739746, + "num_tokens": 100792906.0, + "step": 4021 + }, + { + "epoch": 0.4416867999121458, + "grad_norm": 2.148946762084961, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.7011083364486694, + "num_tokens": 100818458.0, + "step": 4022 + }, + { + "epoch": 0.4417966176147595, + "grad_norm": 2.073683261871338, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.6803292036056519, + "num_tokens": 100847574.0, + "step": 4023 + }, + { + "epoch": 0.44190643531737317, + "grad_norm": 2.6448025703430176, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7075336575508118, + "num_tokens": 100864155.0, + "step": 4024 + }, + { + "epoch": 0.4420162530199868, + "grad_norm": 2.2694287300109863, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6946774125099182, + 
"num_tokens": 100890636.0, + "step": 4025 + }, + { + "epoch": 0.44212607072260046, + "grad_norm": 2.337766647338867, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6799728274345398, + "num_tokens": 100915694.0, + "step": 4026 + }, + { + "epoch": 0.44223588842521416, + "grad_norm": 2.6047844886779785, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7087209820747375, + "num_tokens": 100935441.0, + "step": 4027 + }, + { + "epoch": 0.4423457061278278, + "grad_norm": 2.3108303546905518, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7136290073394775, + "num_tokens": 100957547.0, + "step": 4028 + }, + { + "epoch": 0.44245552383044146, + "grad_norm": 2.3421690464019775, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7053478360176086, + "num_tokens": 100980591.0, + "step": 4029 + }, + { + "epoch": 0.4425653415330551, + "grad_norm": 2.0358710289001465, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7124547958374023, + "num_tokens": 101010461.0, + "step": 4030 + }, + { + "epoch": 0.4426751592356688, + "grad_norm": 2.1245384216308594, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6920503377914429, + "num_tokens": 101039131.0, + "step": 4031 + }, + { + "epoch": 0.44278497693828245, + "grad_norm": 2.024858236312866, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6978809833526611, + "num_tokens": 101068466.0, + "step": 4032 + }, + { + "epoch": 0.4428947946408961, + "grad_norm": 2.95107102394104, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6979534029960632, + "num_tokens": 101084007.0, + "step": 4033 + }, + { + "epoch": 0.4430046123435098, + "grad_norm": 2.2994537353515625, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7058999538421631, + "num_tokens": 101107824.0, + "step": 4034 + }, + { + "epoch": 0.44311443004612344, + "grad_norm": 2.277214765548706, + 
"learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7040703296661377, + "num_tokens": 101131919.0, + "step": 4035 + }, + { + "epoch": 0.4432242477487371, + "grad_norm": 2.366631269454956, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7074927091598511, + "num_tokens": 101154636.0, + "step": 4036 + }, + { + "epoch": 0.44333406545135073, + "grad_norm": 2.215648651123047, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6964044570922852, + "num_tokens": 101179695.0, + "step": 4037 + }, + { + "epoch": 0.44344388315396444, + "grad_norm": 2.362060785293579, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.694740355014801, + "num_tokens": 101204297.0, + "step": 4038 + }, + { + "epoch": 0.4435537008565781, + "grad_norm": 2.405519485473633, + "learning_rate": 1e-06, + "loss": 1.0776, + "mean_token_accuracy": 0.6862165927886963, + "num_tokens": 101226201.0, + "step": 4039 + }, + { + "epoch": 0.4436635185591917, + "grad_norm": 1.760068416595459, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7040512561798096, + "num_tokens": 101261822.0, + "step": 4040 + }, + { + "epoch": 0.44377333626180543, + "grad_norm": 2.357499361038208, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7187420129776001, + "num_tokens": 101285402.0, + "step": 4041 + }, + { + "epoch": 0.4438831539644191, + "grad_norm": 2.3229174613952637, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7002801895141602, + "num_tokens": 101309034.0, + "step": 4042 + }, + { + "epoch": 0.4439929716670327, + "grad_norm": 2.2802093029022217, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6847125887870789, + "num_tokens": 101333042.0, + "step": 4043 + }, + { + "epoch": 0.44410278936964637, + "grad_norm": 2.319204807281494, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7280305027961731, + "num_tokens": 101353576.0, + "step": 4044 + 
}, + { + "epoch": 0.44421260707226007, + "grad_norm": 2.470979928970337, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7071995735168457, + "num_tokens": 101375511.0, + "step": 4045 + }, + { + "epoch": 0.4443224247748737, + "grad_norm": 2.4462451934814453, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6900326609611511, + "num_tokens": 101397993.0, + "step": 4046 + }, + { + "epoch": 0.44443224247748736, + "grad_norm": 2.3981268405914307, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6921582818031311, + "num_tokens": 101424002.0, + "step": 4047 + }, + { + "epoch": 0.444542060180101, + "grad_norm": 2.3356966972351074, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7044789791107178, + "num_tokens": 101445826.0, + "step": 4048 + }, + { + "epoch": 0.4446518778827147, + "grad_norm": 2.1315834522247314, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.7057870030403137, + "num_tokens": 101473322.0, + "step": 4049 + }, + { + "epoch": 0.44476169558532835, + "grad_norm": 2.274641513824463, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6972576975822449, + "num_tokens": 101497669.0, + "step": 4050 + }, + { + "epoch": 0.444871513287942, + "grad_norm": 2.396050214767456, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6861019134521484, + "num_tokens": 101520094.0, + "step": 4051 + }, + { + "epoch": 0.4449813309905557, + "grad_norm": 2.2686607837677, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7047090530395508, + "num_tokens": 101544420.0, + "step": 4052 + }, + { + "epoch": 0.44509114869316935, + "grad_norm": 2.1025071144104004, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6924270391464233, + "num_tokens": 101572682.0, + "step": 4053 + }, + { + "epoch": 0.445200966395783, + "grad_norm": 2.2268354892730713, + "learning_rate": 1e-06, + "loss": 0.9828, + 
"mean_token_accuracy": 0.7190645337104797, + "num_tokens": 101599432.0, + "step": 4054 + }, + { + "epoch": 0.44531078409839664, + "grad_norm": 2.312129259109497, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7070729732513428, + "num_tokens": 101624124.0, + "step": 4055 + }, + { + "epoch": 0.44542060180101034, + "grad_norm": 2.1185407638549805, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7148979902267456, + "num_tokens": 101651677.0, + "step": 4056 + }, + { + "epoch": 0.445530419503624, + "grad_norm": 2.232525587081909, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6883216500282288, + "num_tokens": 101676085.0, + "step": 4057 + }, + { + "epoch": 0.44564023720623763, + "grad_norm": 2.3860554695129395, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7057523727416992, + "num_tokens": 101698125.0, + "step": 4058 + }, + { + "epoch": 0.44575005490885133, + "grad_norm": 2.5145058631896973, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.682954728603363, + "num_tokens": 101720667.0, + "step": 4059 + }, + { + "epoch": 0.445859872611465, + "grad_norm": 2.2643790245056152, + "learning_rate": 1e-06, + "loss": 1.0612, + "mean_token_accuracy": 0.6895323991775513, + "num_tokens": 101747218.0, + "step": 4060 + }, + { + "epoch": 0.4459696903140786, + "grad_norm": 2.2820584774017334, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7014639377593994, + "num_tokens": 101771626.0, + "step": 4061 + }, + { + "epoch": 0.44607950801669227, + "grad_norm": 2.217362642288208, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7099577188491821, + "num_tokens": 101797612.0, + "step": 4062 + }, + { + "epoch": 0.446189325719306, + "grad_norm": 2.3963310718536377, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7078579068183899, + "num_tokens": 101818711.0, + "step": 4063 + }, + { + "epoch": 0.4462991434219196, + 
"grad_norm": 2.4410030841827393, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.693699300289154, + "num_tokens": 101841836.0, + "step": 4064 + }, + { + "epoch": 0.44640896112453327, + "grad_norm": 2.1032485961914062, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7024884223937988, + "num_tokens": 101868890.0, + "step": 4065 + }, + { + "epoch": 0.4465187788271469, + "grad_norm": 2.075277805328369, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6827889084815979, + "num_tokens": 101897085.0, + "step": 4066 + }, + { + "epoch": 0.4466285965297606, + "grad_norm": 2.093691825866699, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.684246301651001, + "num_tokens": 101925808.0, + "step": 4067 + }, + { + "epoch": 0.44673841423237426, + "grad_norm": 2.389234781265259, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7056221961975098, + "num_tokens": 101948384.0, + "step": 4068 + }, + { + "epoch": 0.4468482319349879, + "grad_norm": 2.9318289756774902, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7381035089492798, + "num_tokens": 101965772.0, + "step": 4069 + }, + { + "epoch": 0.4469580496376016, + "grad_norm": 2.1685354709625244, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7012249231338501, + "num_tokens": 101994667.0, + "step": 4070 + }, + { + "epoch": 0.44706786734021525, + "grad_norm": 2.3634629249572754, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6797318458557129, + "num_tokens": 102019530.0, + "step": 4071 + }, + { + "epoch": 0.4471776850428289, + "grad_norm": 2.2784018516540527, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7039639949798584, + "num_tokens": 102043259.0, + "step": 4072 + }, + { + "epoch": 0.44728750274544254, + "grad_norm": 2.325303792953491, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7021011114120483, + 
"num_tokens": 102066750.0, + "step": 4073 + }, + { + "epoch": 0.44739732044805625, + "grad_norm": 2.491619110107422, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.703474223613739, + "num_tokens": 102089062.0, + "step": 4074 + }, + { + "epoch": 0.4475071381506699, + "grad_norm": 2.4332222938537598, + "learning_rate": 1e-06, + "loss": 1.0876, + "mean_token_accuracy": 0.6735318899154663, + "num_tokens": 102113473.0, + "step": 4075 + }, + { + "epoch": 0.44761695585328354, + "grad_norm": 2.369297742843628, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.716920793056488, + "num_tokens": 102136773.0, + "step": 4076 + }, + { + "epoch": 0.4477267735558972, + "grad_norm": 2.582608938217163, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7161732912063599, + "num_tokens": 102157282.0, + "step": 4077 + }, + { + "epoch": 0.4478365912585109, + "grad_norm": 2.328061103820801, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7046368718147278, + "num_tokens": 102182139.0, + "step": 4078 + }, + { + "epoch": 0.44794640896112453, + "grad_norm": 2.1079447269439697, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6856672167778015, + "num_tokens": 102212712.0, + "step": 4079 + }, + { + "epoch": 0.4480562266637382, + "grad_norm": 2.3565642833709717, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7018882036209106, + "num_tokens": 102237269.0, + "step": 4080 + }, + { + "epoch": 0.4481660443663519, + "grad_norm": 2.4769365787506104, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7108778953552246, + "num_tokens": 102257899.0, + "step": 4081 + }, + { + "epoch": 0.4482758620689655, + "grad_norm": 2.2393200397491455, + "learning_rate": 1e-06, + "loss": 1.0514, + "mean_token_accuracy": 0.6870109438896179, + "num_tokens": 102281894.0, + "step": 4082 + }, + { + "epoch": 0.44838567977157917, + "grad_norm": 2.448734998703003, + 
"learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7328144907951355, + "num_tokens": 102302442.0, + "step": 4083 + }, + { + "epoch": 0.4484954974741928, + "grad_norm": 2.181330919265747, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.708387017250061, + "num_tokens": 102327712.0, + "step": 4084 + }, + { + "epoch": 0.4486053151768065, + "grad_norm": 2.233046293258667, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6924642324447632, + "num_tokens": 102352609.0, + "step": 4085 + }, + { + "epoch": 0.44871513287942016, + "grad_norm": 2.1937334537506104, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.69534832239151, + "num_tokens": 102378490.0, + "step": 4086 + }, + { + "epoch": 0.4488249505820338, + "grad_norm": 2.2673728466033936, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7160216569900513, + "num_tokens": 102403393.0, + "step": 4087 + }, + { + "epoch": 0.4489347682846475, + "grad_norm": 2.24511981010437, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7040956020355225, + "num_tokens": 102426269.0, + "step": 4088 + }, + { + "epoch": 0.44904458598726116, + "grad_norm": 2.729215383529663, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7301688194274902, + "num_tokens": 102443008.0, + "step": 4089 + }, + { + "epoch": 0.4491544036898748, + "grad_norm": 2.5035157203674316, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6977177262306213, + "num_tokens": 102464501.0, + "step": 4090 + }, + { + "epoch": 0.44926422139248845, + "grad_norm": 2.1480484008789062, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6842495799064636, + "num_tokens": 102493112.0, + "step": 4091 + }, + { + "epoch": 0.44937403909510215, + "grad_norm": 1.8202391862869263, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7000410556793213, + "num_tokens": 102530627.0, + "step": 4092 + 
}, + { + "epoch": 0.4494838567977158, + "grad_norm": 2.182126760482788, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7173130512237549, + "num_tokens": 102558326.0, + "step": 4093 + }, + { + "epoch": 0.44959367450032944, + "grad_norm": 2.240659713745117, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7073057889938354, + "num_tokens": 102583110.0, + "step": 4094 + }, + { + "epoch": 0.4497034922029431, + "grad_norm": 2.1921234130859375, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7109559178352356, + "num_tokens": 102609782.0, + "step": 4095 + }, + { + "epoch": 0.4498133099055568, + "grad_norm": 2.179115056991577, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7330853939056396, + "num_tokens": 102634274.0, + "step": 4096 + }, + { + "epoch": 0.44992312760817044, + "grad_norm": 2.2824699878692627, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7080379724502563, + "num_tokens": 102660322.0, + "step": 4097 + }, + { + "epoch": 0.4500329453107841, + "grad_norm": 2.151719093322754, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7215209603309631, + "num_tokens": 102684037.0, + "step": 4098 + }, + { + "epoch": 0.4501427630133978, + "grad_norm": 2.0074310302734375, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6940142512321472, + "num_tokens": 102713296.0, + "step": 4099 + }, + { + "epoch": 0.45025258071601143, + "grad_norm": 2.329439401626587, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7073165774345398, + "num_tokens": 102737971.0, + "step": 4100 + }, + { + "epoch": 0.4503623984186251, + "grad_norm": 2.106719732284546, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7108560800552368, + "num_tokens": 102766315.0, + "step": 4101 + }, + { + "epoch": 0.4504722161212387, + "grad_norm": 2.081725597381592, + "learning_rate": 1e-06, + "loss": 0.9327, + 
"mean_token_accuracy": 0.7133479118347168, + "num_tokens": 102790657.0, + "step": 4102 + }, + { + "epoch": 0.4505820338238524, + "grad_norm": 2.1021597385406494, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6943126916885376, + "num_tokens": 102818972.0, + "step": 4103 + }, + { + "epoch": 0.45069185152646607, + "grad_norm": 2.0848217010498047, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6902007460594177, + "num_tokens": 102846560.0, + "step": 4104 + }, + { + "epoch": 0.4508016692290797, + "grad_norm": 2.4537723064422607, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.6933090686798096, + "num_tokens": 102867961.0, + "step": 4105 + }, + { + "epoch": 0.45091148693169336, + "grad_norm": 2.369908571243286, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7112112045288086, + "num_tokens": 102888831.0, + "step": 4106 + }, + { + "epoch": 0.45102130463430706, + "grad_norm": 2.034458637237549, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6993297338485718, + "num_tokens": 102917138.0, + "step": 4107 + }, + { + "epoch": 0.4511311223369207, + "grad_norm": 2.375544548034668, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7222474813461304, + "num_tokens": 102940272.0, + "step": 4108 + }, + { + "epoch": 0.45124094003953436, + "grad_norm": 2.260650634765625, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7319529056549072, + "num_tokens": 102964914.0, + "step": 4109 + }, + { + "epoch": 0.45135075774214806, + "grad_norm": 2.350426197052002, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7054818868637085, + "num_tokens": 102986930.0, + "step": 4110 + }, + { + "epoch": 0.4514605754447617, + "grad_norm": 2.269162654876709, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6908727884292603, + "num_tokens": 103013171.0, + "step": 4111 + }, + { + "epoch": 0.45157039314737535, + 
"grad_norm": 2.1161983013153076, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6986676454544067, + "num_tokens": 103040770.0, + "step": 4112 + }, + { + "epoch": 0.451680210849989, + "grad_norm": 2.5585551261901855, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6972482800483704, + "num_tokens": 103061240.0, + "step": 4113 + }, + { + "epoch": 0.4517900285526027, + "grad_norm": 1.9800039529800415, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6861478090286255, + "num_tokens": 103094108.0, + "step": 4114 + }, + { + "epoch": 0.45189984625521634, + "grad_norm": 2.004784107208252, + "learning_rate": 1e-06, + "loss": 1.09, + "mean_token_accuracy": 0.677165150642395, + "num_tokens": 103128361.0, + "step": 4115 + }, + { + "epoch": 0.45200966395783, + "grad_norm": 2.3144774436950684, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6987011432647705, + "num_tokens": 103151630.0, + "step": 4116 + }, + { + "epoch": 0.4521194816604437, + "grad_norm": 2.134310007095337, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6932374835014343, + "num_tokens": 103179132.0, + "step": 4117 + }, + { + "epoch": 0.45222929936305734, + "grad_norm": 2.2537426948547363, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7072855830192566, + "num_tokens": 103203015.0, + "step": 4118 + }, + { + "epoch": 0.452339117065671, + "grad_norm": 2.064483404159546, + "learning_rate": 1e-06, + "loss": 1.0519, + "mean_token_accuracy": 0.6798423528671265, + "num_tokens": 103233848.0, + "step": 4119 + }, + { + "epoch": 0.4524489347682846, + "grad_norm": 2.052992582321167, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.691324770450592, + "num_tokens": 103262365.0, + "step": 4120 + }, + { + "epoch": 0.45255875247089833, + "grad_norm": 2.1351213455200195, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7248667478561401, + "num_tokens": 
103285710.0, + "step": 4121 + }, + { + "epoch": 0.452668570173512, + "grad_norm": 2.0491108894348145, + "learning_rate": 1e-06, + "loss": 1.1591, + "mean_token_accuracy": 0.6491892337799072, + "num_tokens": 103318771.0, + "step": 4122 + }, + { + "epoch": 0.4527783878761256, + "grad_norm": 2.085752010345459, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7069331407546997, + "num_tokens": 103346533.0, + "step": 4123 + }, + { + "epoch": 0.45288820557873927, + "grad_norm": 2.4708144664764404, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7160747647285461, + "num_tokens": 103366213.0, + "step": 4124 + }, + { + "epoch": 0.45299802328135297, + "grad_norm": 2.0525004863739014, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7054160237312317, + "num_tokens": 103394135.0, + "step": 4125 + }, + { + "epoch": 0.4531078409839666, + "grad_norm": 2.3346903324127197, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7032734751701355, + "num_tokens": 103418365.0, + "step": 4126 + }, + { + "epoch": 0.45321765868658026, + "grad_norm": 2.441596508026123, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7053053975105286, + "num_tokens": 103440142.0, + "step": 4127 + }, + { + "epoch": 0.45332747638919396, + "grad_norm": 2.2313289642333984, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7041481733322144, + "num_tokens": 103465003.0, + "step": 4128 + }, + { + "epoch": 0.4534372940918076, + "grad_norm": 2.239093065261841, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6871280670166016, + "num_tokens": 103490467.0, + "step": 4129 + }, + { + "epoch": 0.45354711179442125, + "grad_norm": 2.3769612312316895, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7174458503723145, + "num_tokens": 103512901.0, + "step": 4130 + }, + { + "epoch": 0.4536569294970349, + "grad_norm": 2.281618595123291, + "learning_rate": 
1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.695611834526062, + "num_tokens": 103536164.0, + "step": 4131 + }, + { + "epoch": 0.4537667471996486, + "grad_norm": 2.2002859115600586, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7033613324165344, + "num_tokens": 103562629.0, + "step": 4132 + }, + { + "epoch": 0.45387656490226225, + "grad_norm": 2.690328598022461, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7017079591751099, + "num_tokens": 103581293.0, + "step": 4133 + }, + { + "epoch": 0.4539863826048759, + "grad_norm": 2.100374221801758, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6880589723587036, + "num_tokens": 103609329.0, + "step": 4134 + }, + { + "epoch": 0.4540962003074896, + "grad_norm": 2.2832486629486084, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7040356993675232, + "num_tokens": 103634821.0, + "step": 4135 + }, + { + "epoch": 0.45420601801010324, + "grad_norm": 2.3441011905670166, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7088630199432373, + "num_tokens": 103657198.0, + "step": 4136 + }, + { + "epoch": 0.4543158357127169, + "grad_norm": 2.4867746829986572, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6907210350036621, + "num_tokens": 103678674.0, + "step": 4137 + }, + { + "epoch": 0.45442565341533053, + "grad_norm": 2.4245071411132812, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7239683270454407, + "num_tokens": 103697839.0, + "step": 4138 + }, + { + "epoch": 0.45453547111794423, + "grad_norm": 2.4723167419433594, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.708914577960968, + "num_tokens": 103718714.0, + "step": 4139 + }, + { + "epoch": 0.4546452888205579, + "grad_norm": 2.1881284713745117, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6915751099586487, + "num_tokens": 103745422.0, + "step": 4140 + }, + { + 
"epoch": 0.4547551065231715, + "grad_norm": 2.0256779193878174, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.6903842687606812, + "num_tokens": 103774219.0, + "step": 4141 + }, + { + "epoch": 0.45486492422578517, + "grad_norm": 2.233856678009033, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6960729956626892, + "num_tokens": 103797646.0, + "step": 4142 + }, + { + "epoch": 0.4549747419283989, + "grad_norm": 2.2694077491760254, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7271251678466797, + "num_tokens": 103820440.0, + "step": 4143 + }, + { + "epoch": 0.4550845596310125, + "grad_norm": 2.444798231124878, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6892526745796204, + "num_tokens": 103843238.0, + "step": 4144 + }, + { + "epoch": 0.45519437733362617, + "grad_norm": 2.122814655303955, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7077363133430481, + "num_tokens": 103869051.0, + "step": 4145 + }, + { + "epoch": 0.45530419503623987, + "grad_norm": 2.4864323139190674, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7128654718399048, + "num_tokens": 103888813.0, + "step": 4146 + }, + { + "epoch": 0.4554140127388535, + "grad_norm": 2.0339770317077637, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6931651830673218, + "num_tokens": 103919770.0, + "step": 4147 + }, + { + "epoch": 0.45552383044146716, + "grad_norm": 2.236726999282837, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7119832038879395, + "num_tokens": 103945232.0, + "step": 4148 + }, + { + "epoch": 0.4556336481440808, + "grad_norm": 2.5329525470733643, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7121847867965698, + "num_tokens": 103964374.0, + "step": 4149 + }, + { + "epoch": 0.4557434658466945, + "grad_norm": 2.0925259590148926, + "learning_rate": 1e-06, + "loss": 1.0238, + 
"mean_token_accuracy": 0.700633704662323, + "num_tokens": 103992061.0, + "step": 4150 + }, + { + "epoch": 0.45585328354930815, + "grad_norm": 2.435386896133423, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7151954770088196, + "num_tokens": 104013327.0, + "step": 4151 + }, + { + "epoch": 0.4559631012519218, + "grad_norm": 2.2519209384918213, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7021953463554382, + "num_tokens": 104038184.0, + "step": 4152 + }, + { + "epoch": 0.45607291895453544, + "grad_norm": 2.3246042728424072, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7182090282440186, + "num_tokens": 104059570.0, + "step": 4153 + }, + { + "epoch": 0.45618273665714915, + "grad_norm": 2.2866249084472656, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7105334997177124, + "num_tokens": 104083483.0, + "step": 4154 + }, + { + "epoch": 0.4562925543597628, + "grad_norm": 2.046144485473633, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7107669115066528, + "num_tokens": 104111283.0, + "step": 4155 + }, + { + "epoch": 0.45640237206237644, + "grad_norm": 2.041583299636841, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7175429463386536, + "num_tokens": 104140786.0, + "step": 4156 + }, + { + "epoch": 0.45651218976499014, + "grad_norm": 2.361607551574707, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7348496913909912, + "num_tokens": 104161495.0, + "step": 4157 + }, + { + "epoch": 0.4566220074676038, + "grad_norm": 2.1611459255218506, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7175286412239075, + "num_tokens": 104186273.0, + "step": 4158 + }, + { + "epoch": 0.45673182517021743, + "grad_norm": 2.4422669410705566, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7117761969566345, + "num_tokens": 104205482.0, + "step": 4159 + }, + { + "epoch": 0.4568416428728311, 
+ "grad_norm": 2.118767499923706, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.7075955867767334, + "num_tokens": 104231666.0, + "step": 4160 + }, + { + "epoch": 0.4569514605754448, + "grad_norm": 1.7632393836975098, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7208572626113892, + "num_tokens": 104265604.0, + "step": 4161 + }, + { + "epoch": 0.4570612782780584, + "grad_norm": 2.079429864883423, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6994544267654419, + "num_tokens": 104291789.0, + "step": 4162 + }, + { + "epoch": 0.45717109598067207, + "grad_norm": 2.1005465984344482, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7135208249092102, + "num_tokens": 104318823.0, + "step": 4163 + }, + { + "epoch": 0.4572809136832858, + "grad_norm": 2.37160325050354, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7266669273376465, + "num_tokens": 104340968.0, + "step": 4164 + }, + { + "epoch": 0.4573907313858994, + "grad_norm": 2.42999005317688, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7069868445396423, + "num_tokens": 104361752.0, + "step": 4165 + }, + { + "epoch": 0.45750054908851306, + "grad_norm": 2.0331101417541504, + "learning_rate": 1e-06, + "loss": 1.0832, + "mean_token_accuracy": 0.6796536445617676, + "num_tokens": 104392277.0, + "step": 4166 + }, + { + "epoch": 0.4576103667911267, + "grad_norm": 2.140836238861084, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7060844898223877, + "num_tokens": 104417869.0, + "step": 4167 + }, + { + "epoch": 0.4577201844937404, + "grad_norm": 2.033555269241333, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6994307041168213, + "num_tokens": 104447095.0, + "step": 4168 + }, + { + "epoch": 0.45783000219635406, + "grad_norm": 2.289121150970459, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.698614239692688, + 
"num_tokens": 104470877.0, + "step": 4169 + }, + { + "epoch": 0.4579398198989677, + "grad_norm": 2.482081174850464, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7360110878944397, + "num_tokens": 104490295.0, + "step": 4170 + }, + { + "epoch": 0.45804963760158135, + "grad_norm": 2.4681475162506104, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7201575636863708, + "num_tokens": 104509373.0, + "step": 4171 + }, + { + "epoch": 0.45815945530419505, + "grad_norm": 2.3953568935394287, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7189012169837952, + "num_tokens": 104530392.0, + "step": 4172 + }, + { + "epoch": 0.4582692730068087, + "grad_norm": 2.2778048515319824, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6861289739608765, + "num_tokens": 104554838.0, + "step": 4173 + }, + { + "epoch": 0.45837909070942234, + "grad_norm": 2.0287396907806396, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7043970227241516, + "num_tokens": 104583779.0, + "step": 4174 + }, + { + "epoch": 0.45848890841203604, + "grad_norm": 2.1498055458068848, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.7011869549751282, + "num_tokens": 104610820.0, + "step": 4175 + }, + { + "epoch": 0.4585987261146497, + "grad_norm": 2.274416923522949, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.6985578536987305, + "num_tokens": 104637512.0, + "step": 4176 + }, + { + "epoch": 0.45870854381726334, + "grad_norm": 2.3552730083465576, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6935350298881531, + "num_tokens": 104660221.0, + "step": 4177 + }, + { + "epoch": 0.458818361519877, + "grad_norm": 2.2244369983673096, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7108799815177917, + "num_tokens": 104683871.0, + "step": 4178 + }, + { + "epoch": 0.4589281792224907, + "grad_norm": 2.222792148590088, + 
"learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6905993819236755, + "num_tokens": 104709537.0, + "step": 4179 + }, + { + "epoch": 0.45903799692510433, + "grad_norm": 2.2912135124206543, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7246540784835815, + "num_tokens": 104731113.0, + "step": 4180 + }, + { + "epoch": 0.459147814627718, + "grad_norm": 2.410201072692871, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7080368995666504, + "num_tokens": 104753221.0, + "step": 4181 + }, + { + "epoch": 0.4592576323303316, + "grad_norm": 2.0802907943725586, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6940963268280029, + "num_tokens": 104781497.0, + "step": 4182 + }, + { + "epoch": 0.4593674500329453, + "grad_norm": 2.3309037685394287, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.698964536190033, + "num_tokens": 104805183.0, + "step": 4183 + }, + { + "epoch": 0.45947726773555897, + "grad_norm": 2.2788617610931396, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7059905529022217, + "num_tokens": 104830273.0, + "step": 4184 + }, + { + "epoch": 0.4595870854381726, + "grad_norm": 2.37520170211792, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7195284366607666, + "num_tokens": 104854222.0, + "step": 4185 + }, + { + "epoch": 0.4596969031407863, + "grad_norm": 2.4214680194854736, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7014809250831604, + "num_tokens": 104876779.0, + "step": 4186 + }, + { + "epoch": 0.45980672084339996, + "grad_norm": 2.233764171600342, + "learning_rate": 1e-06, + "loss": 1.037, + "mean_token_accuracy": 0.686508297920227, + "num_tokens": 104902783.0, + "step": 4187 + }, + { + "epoch": 0.4599165385460136, + "grad_norm": 2.137237548828125, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7057387828826904, + "num_tokens": 104933285.0, + "step": 4188 + 
}, + { + "epoch": 0.46002635624862726, + "grad_norm": 2.5076422691345215, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7068313360214233, + "num_tokens": 104955144.0, + "step": 4189 + }, + { + "epoch": 0.46013617395124096, + "grad_norm": 2.3713581562042236, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.702299952507019, + "num_tokens": 104979619.0, + "step": 4190 + }, + { + "epoch": 0.4602459916538546, + "grad_norm": 2.1994199752807617, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7015897631645203, + "num_tokens": 105005839.0, + "step": 4191 + }, + { + "epoch": 0.46035580935646825, + "grad_norm": 2.1940834522247314, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6920019388198853, + "num_tokens": 105033607.0, + "step": 4192 + }, + { + "epoch": 0.46046562705908195, + "grad_norm": 2.356112480163574, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7117680311203003, + "num_tokens": 105056873.0, + "step": 4193 + }, + { + "epoch": 0.4605754447616956, + "grad_norm": 2.265913724899292, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7328589558601379, + "num_tokens": 105079832.0, + "step": 4194 + }, + { + "epoch": 0.46068526246430924, + "grad_norm": 1.9648019075393677, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6785012483596802, + "num_tokens": 105111130.0, + "step": 4195 + }, + { + "epoch": 0.4607950801669229, + "grad_norm": 2.2723755836486816, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6878659129142761, + "num_tokens": 105135513.0, + "step": 4196 + }, + { + "epoch": 0.4609048978695366, + "grad_norm": 2.4716074466705322, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7108075022697449, + "num_tokens": 105157343.0, + "step": 4197 + }, + { + "epoch": 0.46101471557215024, + "grad_norm": 2.225693941116333, + "learning_rate": 1e-06, + "loss": 0.9714, + 
"mean_token_accuracy": 0.706588864326477, + "num_tokens": 105182297.0, + "step": 4198 + }, + { + "epoch": 0.4611245332747639, + "grad_norm": 2.365553379058838, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6985740661621094, + "num_tokens": 105205492.0, + "step": 4199 + }, + { + "epoch": 0.4612343509773775, + "grad_norm": 2.3829267024993896, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7111344933509827, + "num_tokens": 105227821.0, + "step": 4200 + }, + { + "epoch": 0.46134416867999123, + "grad_norm": 1.9259785413742065, + "learning_rate": 1e-06, + "loss": 1.0416, + "mean_token_accuracy": 0.6837925910949707, + "num_tokens": 105261557.0, + "step": 4201 + }, + { + "epoch": 0.4614539863826049, + "grad_norm": 1.8558744192123413, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.687859058380127, + "num_tokens": 105297584.0, + "step": 4202 + }, + { + "epoch": 0.4615638040852185, + "grad_norm": 2.390423059463501, + "learning_rate": 1e-06, + "loss": 1.1328, + "mean_token_accuracy": 0.6582558155059814, + "num_tokens": 105320197.0, + "step": 4203 + }, + { + "epoch": 0.4616736217878322, + "grad_norm": 1.8594703674316406, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7142242193222046, + "num_tokens": 105352059.0, + "step": 4204 + }, + { + "epoch": 0.46178343949044587, + "grad_norm": 2.09562611579895, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6840347647666931, + "num_tokens": 105383222.0, + "step": 4205 + }, + { + "epoch": 0.4618932571930595, + "grad_norm": 2.4321048259735107, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.6992138624191284, + "num_tokens": 105406385.0, + "step": 4206 + }, + { + "epoch": 0.46200307489567316, + "grad_norm": 2.322446584701538, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6981496214866638, + "num_tokens": 105429344.0, + "step": 4207 + }, + { + "epoch": 0.46211289259828686, + 
"grad_norm": 2.07670521736145, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.710024356842041, + "num_tokens": 105457177.0, + "step": 4208 + }, + { + "epoch": 0.4622227103009005, + "grad_norm": 2.065624713897705, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.70168536901474, + "num_tokens": 105485843.0, + "step": 4209 + }, + { + "epoch": 0.46233252800351415, + "grad_norm": 2.1180691719055176, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7012767791748047, + "num_tokens": 105511007.0, + "step": 4210 + }, + { + "epoch": 0.46244234570612786, + "grad_norm": 2.4024133682250977, + "learning_rate": 1e-06, + "loss": 1.1177, + "mean_token_accuracy": 0.6713531613349915, + "num_tokens": 105533441.0, + "step": 4211 + }, + { + "epoch": 0.4625521634087415, + "grad_norm": 2.121342420578003, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6951696872711182, + "num_tokens": 105559800.0, + "step": 4212 + }, + { + "epoch": 0.46266198111135515, + "grad_norm": 2.279740810394287, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7088360786437988, + "num_tokens": 105584264.0, + "step": 4213 + }, + { + "epoch": 0.4627717988139688, + "grad_norm": 2.1598381996154785, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7262370586395264, + "num_tokens": 105608757.0, + "step": 4214 + }, + { + "epoch": 0.4628816165165825, + "grad_norm": 2.356342315673828, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7166942358016968, + "num_tokens": 105632549.0, + "step": 4215 + }, + { + "epoch": 0.46299143421919614, + "grad_norm": 2.0501911640167236, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7276879549026489, + "num_tokens": 105659689.0, + "step": 4216 + }, + { + "epoch": 0.4631012519218098, + "grad_norm": 2.209754705429077, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7136176824569702, + 
"num_tokens": 105683539.0, + "step": 4217 + }, + { + "epoch": 0.46321106962442343, + "grad_norm": 2.383089303970337, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7239012122154236, + "num_tokens": 105706945.0, + "step": 4218 + }, + { + "epoch": 0.46332088732703713, + "grad_norm": 2.0562195777893066, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7020809650421143, + "num_tokens": 105736380.0, + "step": 4219 + }, + { + "epoch": 0.4634307050296508, + "grad_norm": 1.9297940731048584, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7014142870903015, + "num_tokens": 105769367.0, + "step": 4220 + }, + { + "epoch": 0.4635405227322644, + "grad_norm": 2.346601963043213, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7058193683624268, + "num_tokens": 105792593.0, + "step": 4221 + }, + { + "epoch": 0.4636503404348781, + "grad_norm": 2.2426960468292236, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7155938148498535, + "num_tokens": 105817291.0, + "step": 4222 + }, + { + "epoch": 0.4637601581374918, + "grad_norm": 2.087210178375244, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6942368745803833, + "num_tokens": 105846149.0, + "step": 4223 + }, + { + "epoch": 0.4638699758401054, + "grad_norm": 2.0727698802948, + "learning_rate": 1e-06, + "loss": 1.125, + "mean_token_accuracy": 0.669169545173645, + "num_tokens": 105875997.0, + "step": 4224 + }, + { + "epoch": 0.46397979354271907, + "grad_norm": 2.2371866703033447, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6985508799552917, + "num_tokens": 105901382.0, + "step": 4225 + }, + { + "epoch": 0.46408961124533277, + "grad_norm": 2.178539276123047, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6988148093223572, + "num_tokens": 105930494.0, + "step": 4226 + }, + { + "epoch": 0.4641994289479464, + "grad_norm": 2.1967453956604004, + 
"learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7319380640983582, + "num_tokens": 105955723.0, + "step": 4227 + }, + { + "epoch": 0.46430924665056006, + "grad_norm": 2.629927396774292, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7016639709472656, + "num_tokens": 105974196.0, + "step": 4228 + }, + { + "epoch": 0.4644190643531737, + "grad_norm": 2.275364398956299, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6950331926345825, + "num_tokens": 105998216.0, + "step": 4229 + }, + { + "epoch": 0.4645288820557874, + "grad_norm": 2.0271060466766357, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6972727179527283, + "num_tokens": 106027549.0, + "step": 4230 + }, + { + "epoch": 0.46463869975840105, + "grad_norm": 2.4235689640045166, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7216981649398804, + "num_tokens": 106047731.0, + "step": 4231 + }, + { + "epoch": 0.4647485174610147, + "grad_norm": 2.3691012859344482, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7061043977737427, + "num_tokens": 106070849.0, + "step": 4232 + }, + { + "epoch": 0.4648583351636284, + "grad_norm": 2.0918185710906982, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7205541133880615, + "num_tokens": 106097944.0, + "step": 4233 + }, + { + "epoch": 0.46496815286624205, + "grad_norm": 2.184475898742676, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7154736518859863, + "num_tokens": 106120903.0, + "step": 4234 + }, + { + "epoch": 0.4650779705688557, + "grad_norm": 2.073289155960083, + "learning_rate": 1e-06, + "loss": 1.1266, + "mean_token_accuracy": 0.6721353530883789, + "num_tokens": 106151332.0, + "step": 4235 + }, + { + "epoch": 0.46518778827146934, + "grad_norm": 1.996350884437561, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7239845395088196, + "num_tokens": 106179941.0, + "step": 
4236 + }, + { + "epoch": 0.46529760597408304, + "grad_norm": 2.3439977169036865, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6937792301177979, + "num_tokens": 106203345.0, + "step": 4237 + }, + { + "epoch": 0.4654074236766967, + "grad_norm": 2.125638723373413, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6775820255279541, + "num_tokens": 106231055.0, + "step": 4238 + }, + { + "epoch": 0.46551724137931033, + "grad_norm": 2.2107532024383545, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7067228555679321, + "num_tokens": 106258733.0, + "step": 4239 + }, + { + "epoch": 0.46562705908192403, + "grad_norm": 2.2856297492980957, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.7021301984786987, + "num_tokens": 106283988.0, + "step": 4240 + }, + { + "epoch": 0.4657368767845377, + "grad_norm": 2.191190719604492, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6961260437965393, + "num_tokens": 106308600.0, + "step": 4241 + }, + { + "epoch": 0.4658466944871513, + "grad_norm": 2.314596176147461, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.715876579284668, + "num_tokens": 106329807.0, + "step": 4242 + }, + { + "epoch": 0.46595651218976497, + "grad_norm": 2.3644423484802246, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7007180452346802, + "num_tokens": 106353190.0, + "step": 4243 + }, + { + "epoch": 0.4660663298923787, + "grad_norm": 2.1943366527557373, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6971578598022461, + "num_tokens": 106379862.0, + "step": 4244 + }, + { + "epoch": 0.4661761475949923, + "grad_norm": 2.041379928588867, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7088030576705933, + "num_tokens": 106409232.0, + "step": 4245 + }, + { + "epoch": 0.46628596529760596, + "grad_norm": 2.7525618076324463, + "learning_rate": 1e-06, + "loss": 0.9734, + 
"mean_token_accuracy": 0.7052894830703735, + "num_tokens": 106427658.0, + "step": 4246 + }, + { + "epoch": 0.4663957830002196, + "grad_norm": 2.083217144012451, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7092827558517456, + "num_tokens": 106453969.0, + "step": 4247 + }, + { + "epoch": 0.4665056007028333, + "grad_norm": 2.5117547512054443, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7037816643714905, + "num_tokens": 106475703.0, + "step": 4248 + }, + { + "epoch": 0.46661541840544696, + "grad_norm": 2.1554930210113525, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6902639269828796, + "num_tokens": 106504188.0, + "step": 4249 + }, + { + "epoch": 0.4667252361080606, + "grad_norm": 2.3491289615631104, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7022402286529541, + "num_tokens": 106526296.0, + "step": 4250 + }, + { + "epoch": 0.4668350538106743, + "grad_norm": 2.4960198402404785, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6904568672180176, + "num_tokens": 106548603.0, + "step": 4251 + }, + { + "epoch": 0.46694487151328795, + "grad_norm": 2.4848735332489014, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7219094038009644, + "num_tokens": 106568464.0, + "step": 4252 + }, + { + "epoch": 0.4670546892159016, + "grad_norm": 2.1107959747314453, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7089532017707825, + "num_tokens": 106594609.0, + "step": 4253 + }, + { + "epoch": 0.46716450691851524, + "grad_norm": 2.254319190979004, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7135865688323975, + "num_tokens": 106616901.0, + "step": 4254 + }, + { + "epoch": 0.46727432462112894, + "grad_norm": 2.6311817169189453, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7115322351455688, + "num_tokens": 106634758.0, + "step": 4255 + }, + { + "epoch": 0.4673841423237426, 
+ "grad_norm": 2.5866963863372803, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.720133364200592, + "num_tokens": 106658552.0, + "step": 4256 + }, + { + "epoch": 0.46749396002635624, + "grad_norm": 2.2306880950927734, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6991173028945923, + "num_tokens": 106683166.0, + "step": 4257 + }, + { + "epoch": 0.4676037777289699, + "grad_norm": 2.4708592891693115, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7114877700805664, + "num_tokens": 106705456.0, + "step": 4258 + }, + { + "epoch": 0.4677135954315836, + "grad_norm": 2.288210391998291, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6984976530075073, + "num_tokens": 106730274.0, + "step": 4259 + }, + { + "epoch": 0.46782341313419723, + "grad_norm": 2.3289883136749268, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7050929069519043, + "num_tokens": 106751903.0, + "step": 4260 + }, + { + "epoch": 0.4679332308368109, + "grad_norm": 2.128631353378296, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6907538175582886, + "num_tokens": 106778801.0, + "step": 4261 + }, + { + "epoch": 0.4680430485394246, + "grad_norm": 2.457634925842285, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7034567594528198, + "num_tokens": 106799391.0, + "step": 4262 + }, + { + "epoch": 0.4681528662420382, + "grad_norm": 2.5349864959716797, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7047414779663086, + "num_tokens": 106819180.0, + "step": 4263 + }, + { + "epoch": 0.46826268394465187, + "grad_norm": 2.2913708686828613, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7015126943588257, + "num_tokens": 106842996.0, + "step": 4264 + }, + { + "epoch": 0.4683725016472655, + "grad_norm": 2.269679546356201, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7033895254135132, + 
"num_tokens": 106868823.0, + "step": 4265 + }, + { + "epoch": 0.4684823193498792, + "grad_norm": 2.333463430404663, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7131892442703247, + "num_tokens": 106892980.0, + "step": 4266 + }, + { + "epoch": 0.46859213705249286, + "grad_norm": 2.569371461868286, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7256842851638794, + "num_tokens": 106912645.0, + "step": 4267 + }, + { + "epoch": 0.4687019547551065, + "grad_norm": 2.5284457206726074, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7127133011817932, + "num_tokens": 106932291.0, + "step": 4268 + }, + { + "epoch": 0.4688117724577202, + "grad_norm": 2.301299810409546, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.7035055160522461, + "num_tokens": 106955820.0, + "step": 4269 + }, + { + "epoch": 0.46892159016033386, + "grad_norm": 2.325805187225342, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7075294256210327, + "num_tokens": 106978592.0, + "step": 4270 + }, + { + "epoch": 0.4690314078629475, + "grad_norm": 2.2927401065826416, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7034238576889038, + "num_tokens": 107002145.0, + "step": 4271 + }, + { + "epoch": 0.46914122556556115, + "grad_norm": 2.3081517219543457, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.689329206943512, + "num_tokens": 107027251.0, + "step": 4272 + }, + { + "epoch": 0.46925104326817485, + "grad_norm": 2.038377046585083, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7140436172485352, + "num_tokens": 107054655.0, + "step": 4273 + }, + { + "epoch": 0.4693608609707885, + "grad_norm": 2.276240825653076, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7048450708389282, + "num_tokens": 107078396.0, + "step": 4274 + }, + { + "epoch": 0.46947067867340214, + "grad_norm": 2.0013582706451416, + 
"learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7092025279998779, + "num_tokens": 107105228.0, + "step": 4275 + }, + { + "epoch": 0.4695804963760158, + "grad_norm": 2.214000701904297, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.703439474105835, + "num_tokens": 107130823.0, + "step": 4276 + }, + { + "epoch": 0.4696903140786295, + "grad_norm": 2.488079786300659, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7008926868438721, + "num_tokens": 107151559.0, + "step": 4277 + }, + { + "epoch": 0.46980013178124314, + "grad_norm": 2.0273280143737793, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6773025989532471, + "num_tokens": 107181013.0, + "step": 4278 + }, + { + "epoch": 0.4699099494838568, + "grad_norm": 2.2613766193389893, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7082335352897644, + "num_tokens": 107205506.0, + "step": 4279 + }, + { + "epoch": 0.4700197671864705, + "grad_norm": 2.0248584747314453, + "learning_rate": 1e-06, + "loss": 1.0572, + "mean_token_accuracy": 0.6815687417984009, + "num_tokens": 107233208.0, + "step": 4280 + }, + { + "epoch": 0.47012958488908413, + "grad_norm": 2.4198648929595947, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7008368968963623, + "num_tokens": 107254860.0, + "step": 4281 + }, + { + "epoch": 0.4702394025916978, + "grad_norm": 2.003002405166626, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6913303732872009, + "num_tokens": 107284198.0, + "step": 4282 + }, + { + "epoch": 0.4703492202943114, + "grad_norm": 2.5643346309661865, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7359253168106079, + "num_tokens": 107305054.0, + "step": 4283 + }, + { + "epoch": 0.4704590379969251, + "grad_norm": 2.2023215293884277, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.709214448928833, + "num_tokens": 107330903.0, + "step": 4284 
+ }, + { + "epoch": 0.47056885569953877, + "grad_norm": 2.5015621185302734, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6964534521102905, + "num_tokens": 107351020.0, + "step": 4285 + }, + { + "epoch": 0.4706786734021524, + "grad_norm": 2.254477024078369, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6905239224433899, + "num_tokens": 107377552.0, + "step": 4286 + }, + { + "epoch": 0.4707884911047661, + "grad_norm": 2.3474981784820557, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.688461184501648, + "num_tokens": 107401883.0, + "step": 4287 + }, + { + "epoch": 0.47089830880737976, + "grad_norm": 2.2083423137664795, + "learning_rate": 1e-06, + "loss": 1.061, + "mean_token_accuracy": 0.6818724870681763, + "num_tokens": 107430639.0, + "step": 4288 + }, + { + "epoch": 0.4710081265099934, + "grad_norm": 1.9456143379211426, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7084507346153259, + "num_tokens": 107460918.0, + "step": 4289 + }, + { + "epoch": 0.47111794421260705, + "grad_norm": 2.3277647495269775, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6965171098709106, + "num_tokens": 107487347.0, + "step": 4290 + }, + { + "epoch": 0.47122776191522076, + "grad_norm": 2.3800644874572754, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.713188886642456, + "num_tokens": 107508304.0, + "step": 4291 + }, + { + "epoch": 0.4713375796178344, + "grad_norm": 1.9990352392196655, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.6954702138900757, + "num_tokens": 107539409.0, + "step": 4292 + }, + { + "epoch": 0.47144739732044805, + "grad_norm": 2.059908151626587, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7028516530990601, + "num_tokens": 107569758.0, + "step": 4293 + }, + { + "epoch": 0.4715572150230617, + "grad_norm": 2.155308961868286, + "learning_rate": 1e-06, + "loss": 1.0106, + 
"mean_token_accuracy": 0.6979107856750488, + "num_tokens": 107595739.0, + "step": 4294 + }, + { + "epoch": 0.4716670327256754, + "grad_norm": 2.1327381134033203, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7209558486938477, + "num_tokens": 107619505.0, + "step": 4295 + }, + { + "epoch": 0.47177685042828904, + "grad_norm": 2.377108573913574, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.708236575126648, + "num_tokens": 107644079.0, + "step": 4296 + }, + { + "epoch": 0.4718866681309027, + "grad_norm": 2.2144529819488525, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7301762104034424, + "num_tokens": 107670879.0, + "step": 4297 + }, + { + "epoch": 0.4719964858335164, + "grad_norm": 2.26247501373291, + "learning_rate": 1e-06, + "loss": 1.03, + "mean_token_accuracy": 0.6936485767364502, + "num_tokens": 107695014.0, + "step": 4298 + }, + { + "epoch": 0.47210630353613003, + "grad_norm": 2.349639415740967, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6838001012802124, + "num_tokens": 107719784.0, + "step": 4299 + }, + { + "epoch": 0.4722161212387437, + "grad_norm": 2.176440954208374, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7074919939041138, + "num_tokens": 107745385.0, + "step": 4300 + }, + { + "epoch": 0.4723259389413573, + "grad_norm": 2.035712957382202, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7094587683677673, + "num_tokens": 107773323.0, + "step": 4301 + }, + { + "epoch": 0.472435756643971, + "grad_norm": 2.367081880569458, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7156345844268799, + "num_tokens": 107795964.0, + "step": 4302 + }, + { + "epoch": 0.4725455743465847, + "grad_norm": 2.3122334480285645, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7112435102462769, + "num_tokens": 107819778.0, + "step": 4303 + }, + { + "epoch": 0.4726553920491983, + 
"grad_norm": 2.153934955596924, + "learning_rate": 1e-06, + "loss": 1.1099, + "mean_token_accuracy": 0.6727365255355835, + "num_tokens": 107849591.0, + "step": 4304 + }, + { + "epoch": 0.47276520975181197, + "grad_norm": 2.223522901535034, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6807892322540283, + "num_tokens": 107877677.0, + "step": 4305 + }, + { + "epoch": 0.47287502745442567, + "grad_norm": 2.2373414039611816, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7101380228996277, + "num_tokens": 107904278.0, + "step": 4306 + }, + { + "epoch": 0.4729848451570393, + "grad_norm": 2.269408941268921, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.695501983165741, + "num_tokens": 107930675.0, + "step": 4307 + }, + { + "epoch": 0.47309466285965296, + "grad_norm": 2.532305955886841, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7192186117172241, + "num_tokens": 107950335.0, + "step": 4308 + }, + { + "epoch": 0.47320448056226666, + "grad_norm": 2.784200429916382, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7043037414550781, + "num_tokens": 107968129.0, + "step": 4309 + }, + { + "epoch": 0.4733142982648803, + "grad_norm": 2.125802516937256, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7109754085540771, + "num_tokens": 107994574.0, + "step": 4310 + }, + { + "epoch": 0.47342411596749395, + "grad_norm": 2.2020223140716553, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.6977695226669312, + "num_tokens": 108020671.0, + "step": 4311 + }, + { + "epoch": 0.4735339336701076, + "grad_norm": 2.115168333053589, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7067700028419495, + "num_tokens": 108046579.0, + "step": 4312 + }, + { + "epoch": 0.4736437513727213, + "grad_norm": 2.1860086917877197, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7243847846984863, + 
"num_tokens": 108072335.0, + "step": 4313 + }, + { + "epoch": 0.47375356907533495, + "grad_norm": 1.8937633037567139, + "learning_rate": 1e-06, + "loss": 1.0834, + "mean_token_accuracy": 0.6848163604736328, + "num_tokens": 108104399.0, + "step": 4314 + }, + { + "epoch": 0.4738633867779486, + "grad_norm": 2.349652051925659, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7091717720031738, + "num_tokens": 108127557.0, + "step": 4315 + }, + { + "epoch": 0.4739732044805623, + "grad_norm": 2.4371840953826904, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7154205441474915, + "num_tokens": 108149468.0, + "step": 4316 + }, + { + "epoch": 0.47408302218317594, + "grad_norm": 2.0830678939819336, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6863309144973755, + "num_tokens": 108178616.0, + "step": 4317 + }, + { + "epoch": 0.4741928398857896, + "grad_norm": 2.057966709136963, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6958878040313721, + "num_tokens": 108205119.0, + "step": 4318 + }, + { + "epoch": 0.47430265758840323, + "grad_norm": 2.0614001750946045, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.682192325592041, + "num_tokens": 108235881.0, + "step": 4319 + }, + { + "epoch": 0.47441247529101693, + "grad_norm": 2.0012223720550537, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7144930362701416, + "num_tokens": 108263573.0, + "step": 4320 + }, + { + "epoch": 0.4745222929936306, + "grad_norm": 2.1175107955932617, + "learning_rate": 1e-06, + "loss": 1.0974, + "mean_token_accuracy": 0.6744514107704163, + "num_tokens": 108292991.0, + "step": 4321 + }, + { + "epoch": 0.4746321106962442, + "grad_norm": 2.184272527694702, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.718985915184021, + "num_tokens": 108317973.0, + "step": 4322 + }, + { + "epoch": 0.47474192839885787, + "grad_norm": 2.2296652793884277, + 
"learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7001085877418518, + "num_tokens": 108342244.0, + "step": 4323 + }, + { + "epoch": 0.4748517461014716, + "grad_norm": 2.23504900932312, + "learning_rate": 1e-06, + "loss": 1.0159, + "mean_token_accuracy": 0.6928764581680298, + "num_tokens": 108367289.0, + "step": 4324 + }, + { + "epoch": 0.4749615638040852, + "grad_norm": 2.267887592315674, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.7032774686813354, + "num_tokens": 108392525.0, + "step": 4325 + }, + { + "epoch": 0.47507138150669886, + "grad_norm": 2.6145222187042236, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7135705947875977, + "num_tokens": 108411407.0, + "step": 4326 + }, + { + "epoch": 0.47518119920931257, + "grad_norm": 2.0583860874176025, + "learning_rate": 1e-06, + "loss": 1.0321, + "mean_token_accuracy": 0.683635950088501, + "num_tokens": 108440277.0, + "step": 4327 + }, + { + "epoch": 0.4752910169119262, + "grad_norm": 1.9849872589111328, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6950204968452454, + "num_tokens": 108473651.0, + "step": 4328 + }, + { + "epoch": 0.47540083461453986, + "grad_norm": 2.4872190952301025, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6980708837509155, + "num_tokens": 108496096.0, + "step": 4329 + }, + { + "epoch": 0.4755106523171535, + "grad_norm": 2.2392477989196777, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6880393028259277, + "num_tokens": 108521040.0, + "step": 4330 + }, + { + "epoch": 0.4756204700197672, + "grad_norm": 2.1998815536499023, + "learning_rate": 1e-06, + "loss": 1.0829, + "mean_token_accuracy": 0.677851676940918, + "num_tokens": 108547176.0, + "step": 4331 + }, + { + "epoch": 0.47573028772238085, + "grad_norm": 2.072948932647705, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7142608761787415, + "num_tokens": 108574125.0, + "step": 4332 
+ }, + { + "epoch": 0.4758401054249945, + "grad_norm": 2.443713665008545, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7043167352676392, + "num_tokens": 108597219.0, + "step": 4333 + }, + { + "epoch": 0.47594992312760814, + "grad_norm": 2.187650680541992, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.718609094619751, + "num_tokens": 108621795.0, + "step": 4334 + }, + { + "epoch": 0.47605974083022184, + "grad_norm": 2.345191717147827, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7079752087593079, + "num_tokens": 108643330.0, + "step": 4335 + }, + { + "epoch": 0.4761695585328355, + "grad_norm": 2.3028159141540527, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6867120265960693, + "num_tokens": 108668796.0, + "step": 4336 + }, + { + "epoch": 0.47627937623544914, + "grad_norm": 2.2266745567321777, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7187970876693726, + "num_tokens": 108692509.0, + "step": 4337 + }, + { + "epoch": 0.47638919393806284, + "grad_norm": 2.210057497024536, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6977236866950989, + "num_tokens": 108717146.0, + "step": 4338 + }, + { + "epoch": 0.4764990116406765, + "grad_norm": 2.6873109340667725, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7162079215049744, + "num_tokens": 108737201.0, + "step": 4339 + }, + { + "epoch": 0.47660882934329013, + "grad_norm": 2.517125129699707, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.7050192356109619, + "num_tokens": 108758764.0, + "step": 4340 + }, + { + "epoch": 0.4767186470459038, + "grad_norm": 2.830460786819458, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.71721351146698, + "num_tokens": 108775183.0, + "step": 4341 + }, + { + "epoch": 0.4768284647485175, + "grad_norm": 2.115710973739624, + "learning_rate": 1e-06, + "loss": 0.9489, + 
"mean_token_accuracy": 0.7102222442626953, + "num_tokens": 108801550.0, + "step": 4342 + }, + { + "epoch": 0.4769382824511311, + "grad_norm": 2.5180325508117676, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7253197431564331, + "num_tokens": 108822773.0, + "step": 4343 + }, + { + "epoch": 0.47704810015374477, + "grad_norm": 2.4468274116516113, + "learning_rate": 1e-06, + "loss": 1.044, + "mean_token_accuracy": 0.6897903680801392, + "num_tokens": 108844608.0, + "step": 4344 + }, + { + "epoch": 0.47715791785635847, + "grad_norm": 1.9860647916793823, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6860766410827637, + "num_tokens": 108876711.0, + "step": 4345 + }, + { + "epoch": 0.4772677355589721, + "grad_norm": 2.113067388534546, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7205584049224854, + "num_tokens": 108902209.0, + "step": 4346 + }, + { + "epoch": 0.47737755326158576, + "grad_norm": 2.293516159057617, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7065248489379883, + "num_tokens": 108926951.0, + "step": 4347 + }, + { + "epoch": 0.4774873709641994, + "grad_norm": 2.0842018127441406, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7053889036178589, + "num_tokens": 108953722.0, + "step": 4348 + }, + { + "epoch": 0.4775971886668131, + "grad_norm": 2.454575777053833, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7141163349151611, + "num_tokens": 108972861.0, + "step": 4349 + }, + { + "epoch": 0.47770700636942676, + "grad_norm": 1.8841794729232788, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6952368021011353, + "num_tokens": 109005799.0, + "step": 4350 + }, + { + "epoch": 0.4778168240720404, + "grad_norm": 2.363071918487549, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7041558027267456, + "num_tokens": 109030679.0, + "step": 4351 + }, + { + "epoch": 0.47792664177465405, 
+ "grad_norm": 2.1507863998413086, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6898813843727112, + "num_tokens": 109056648.0, + "step": 4352 + }, + { + "epoch": 0.47803645947726775, + "grad_norm": 2.0038259029388428, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6963895559310913, + "num_tokens": 109089903.0, + "step": 4353 + }, + { + "epoch": 0.4781462771798814, + "grad_norm": 2.5325396060943604, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7085776329040527, + "num_tokens": 109109982.0, + "step": 4354 + }, + { + "epoch": 0.47825609488249504, + "grad_norm": 2.46266770362854, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7158507108688354, + "num_tokens": 109129763.0, + "step": 4355 + }, + { + "epoch": 0.47836591258510874, + "grad_norm": 2.229311943054199, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.721906304359436, + "num_tokens": 109153564.0, + "step": 4356 + }, + { + "epoch": 0.4784757302877224, + "grad_norm": 2.146047592163086, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.701317548751831, + "num_tokens": 109180545.0, + "step": 4357 + }, + { + "epoch": 0.47858554799033604, + "grad_norm": 2.045921564102173, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6971303820610046, + "num_tokens": 109211346.0, + "step": 4358 + }, + { + "epoch": 0.4786953656929497, + "grad_norm": 2.3169307708740234, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7027714252471924, + "num_tokens": 109235706.0, + "step": 4359 + }, + { + "epoch": 0.4788051833955634, + "grad_norm": 2.141831636428833, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6995970010757446, + "num_tokens": 109264089.0, + "step": 4360 + }, + { + "epoch": 0.47891500109817703, + "grad_norm": 2.720928907394409, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7156742215156555, + 
"num_tokens": 109281757.0, + "step": 4361 + }, + { + "epoch": 0.4790248188007907, + "grad_norm": 2.4403843879699707, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.710987389087677, + "num_tokens": 109301866.0, + "step": 4362 + }, + { + "epoch": 0.4791346365034044, + "grad_norm": 2.586089611053467, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7002279758453369, + "num_tokens": 109322196.0, + "step": 4363 + }, + { + "epoch": 0.479244454206018, + "grad_norm": 2.310720682144165, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7188563942909241, + "num_tokens": 109345816.0, + "step": 4364 + }, + { + "epoch": 0.47935427190863167, + "grad_norm": 2.1284682750701904, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7080110907554626, + "num_tokens": 109372410.0, + "step": 4365 + }, + { + "epoch": 0.4794640896112453, + "grad_norm": 2.124356508255005, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6902742385864258, + "num_tokens": 109400367.0, + "step": 4366 + }, + { + "epoch": 0.479573907313859, + "grad_norm": 2.060068368911743, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6852314472198486, + "num_tokens": 109429434.0, + "step": 4367 + }, + { + "epoch": 0.47968372501647266, + "grad_norm": 2.408407688140869, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6875369548797607, + "num_tokens": 109450547.0, + "step": 4368 + }, + { + "epoch": 0.4797935427190863, + "grad_norm": 2.177215337753296, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6805843114852905, + "num_tokens": 109477064.0, + "step": 4369 + }, + { + "epoch": 0.47990336042169995, + "grad_norm": 2.1562132835388184, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.6897022724151611, + "num_tokens": 109504597.0, + "step": 4370 + }, + { + "epoch": 0.48001317812431366, + "grad_norm": 2.272718906402588, + 
"learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7328677177429199, + "num_tokens": 109527307.0, + "step": 4371 + }, + { + "epoch": 0.4801229958269273, + "grad_norm": 2.2618861198425293, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.69962477684021, + "num_tokens": 109552891.0, + "step": 4372 + }, + { + "epoch": 0.48023281352954095, + "grad_norm": 2.6614551544189453, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6930020451545715, + "num_tokens": 109572660.0, + "step": 4373 + }, + { + "epoch": 0.48034263123215465, + "grad_norm": 2.1988306045532227, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6812132596969604, + "num_tokens": 109599935.0, + "step": 4374 + }, + { + "epoch": 0.4804524489347683, + "grad_norm": 2.1079704761505127, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7049485445022583, + "num_tokens": 109626635.0, + "step": 4375 + }, + { + "epoch": 0.48056226663738194, + "grad_norm": 2.0797951221466064, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.6977888345718384, + "num_tokens": 109654111.0, + "step": 4376 + }, + { + "epoch": 0.4806720843399956, + "grad_norm": 2.3755273818969727, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7140138745307922, + "num_tokens": 109675800.0, + "step": 4377 + }, + { + "epoch": 0.4807819020426093, + "grad_norm": 2.1592495441436768, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7234275341033936, + "num_tokens": 109701299.0, + "step": 4378 + }, + { + "epoch": 0.48089171974522293, + "grad_norm": 2.4662222862243652, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7104756236076355, + "num_tokens": 109723822.0, + "step": 4379 + }, + { + "epoch": 0.4810015374478366, + "grad_norm": 2.529221296310425, + "learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.6909219622612, + "num_tokens": 109744690.0, + "step": 4380 
+ }, + { + "epoch": 0.4811113551504502, + "grad_norm": 2.190150499343872, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7048548460006714, + "num_tokens": 109770232.0, + "step": 4381 + }, + { + "epoch": 0.4812211728530639, + "grad_norm": 2.090804100036621, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7016489505767822, + "num_tokens": 109796699.0, + "step": 4382 + }, + { + "epoch": 0.4813309905556776, + "grad_norm": 2.3362584114074707, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7144919633865356, + "num_tokens": 109817421.0, + "step": 4383 + }, + { + "epoch": 0.4814408082582912, + "grad_norm": 2.2932727336883545, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6964303255081177, + "num_tokens": 109843098.0, + "step": 4384 + }, + { + "epoch": 0.4815506259609049, + "grad_norm": 2.582700490951538, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7008267641067505, + "num_tokens": 109862285.0, + "step": 4385 + }, + { + "epoch": 0.48166044366351857, + "grad_norm": 2.1548280715942383, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7337208986282349, + "num_tokens": 109887494.0, + "step": 4386 + }, + { + "epoch": 0.4817702613661322, + "grad_norm": 2.6701650619506836, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7068934440612793, + "num_tokens": 109906162.0, + "step": 4387 + }, + { + "epoch": 0.48188007906874586, + "grad_norm": 2.243126630783081, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7156581878662109, + "num_tokens": 109928772.0, + "step": 4388 + }, + { + "epoch": 0.48198989677135956, + "grad_norm": 2.230468273162842, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7118097543716431, + "num_tokens": 109952255.0, + "step": 4389 + }, + { + "epoch": 0.4820997144739732, + "grad_norm": 2.4885189533233643, + "learning_rate": 1e-06, + "loss": 0.98, + 
"mean_token_accuracy": 0.7011050581932068, + "num_tokens": 109974487.0, + "step": 4390 + }, + { + "epoch": 0.48220953217658685, + "grad_norm": 2.1931514739990234, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7117741107940674, + "num_tokens": 110000510.0, + "step": 4391 + }, + { + "epoch": 0.48231934987920055, + "grad_norm": 2.064751386642456, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6945581436157227, + "num_tokens": 110028816.0, + "step": 4392 + }, + { + "epoch": 0.4824291675818142, + "grad_norm": 2.480074644088745, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7112301588058472, + "num_tokens": 110048972.0, + "step": 4393 + }, + { + "epoch": 0.48253898528442785, + "grad_norm": 2.8748090267181396, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7199100852012634, + "num_tokens": 110064588.0, + "step": 4394 + }, + { + "epoch": 0.4826488029870415, + "grad_norm": 2.0745744705200195, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7016618251800537, + "num_tokens": 110092338.0, + "step": 4395 + }, + { + "epoch": 0.4827586206896552, + "grad_norm": 2.0599050521850586, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7246535420417786, + "num_tokens": 110117511.0, + "step": 4396 + }, + { + "epoch": 0.48286843839226884, + "grad_norm": 2.512044906616211, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7110516428947449, + "num_tokens": 110138606.0, + "step": 4397 + }, + { + "epoch": 0.4829782560948825, + "grad_norm": 2.55525541305542, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.6943819522857666, + "num_tokens": 110158298.0, + "step": 4398 + }, + { + "epoch": 0.48308807379749613, + "grad_norm": 2.5181851387023926, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.704183042049408, + "num_tokens": 110178975.0, + "step": 4399 + }, + { + "epoch": 0.48319789150010983, 
+ "grad_norm": 2.3571720123291016, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7270015478134155, + "num_tokens": 110200008.0, + "step": 4400 + }, + { + "epoch": 0.4833077092027235, + "grad_norm": 2.1331064701080322, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7095891833305359, + "num_tokens": 110225861.0, + "step": 4401 + }, + { + "epoch": 0.4834175269053371, + "grad_norm": 2.3540382385253906, + "learning_rate": 1e-06, + "loss": 1.054, + "mean_token_accuracy": 0.6835699081420898, + "num_tokens": 110248543.0, + "step": 4402 + }, + { + "epoch": 0.4835273446079508, + "grad_norm": 2.2358944416046143, + "learning_rate": 1e-06, + "loss": 1.0678, + "mean_token_accuracy": 0.6781013607978821, + "num_tokens": 110273229.0, + "step": 4403 + }, + { + "epoch": 0.4836371623105645, + "grad_norm": 2.0459392070770264, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7036097645759583, + "num_tokens": 110300962.0, + "step": 4404 + }, + { + "epoch": 0.4837469800131781, + "grad_norm": 2.052327871322632, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7173094153404236, + "num_tokens": 110328182.0, + "step": 4405 + }, + { + "epoch": 0.48385679771579176, + "grad_norm": 2.1004016399383545, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7119802236557007, + "num_tokens": 110356608.0, + "step": 4406 + }, + { + "epoch": 0.48396661541840547, + "grad_norm": 2.2337944507598877, + "learning_rate": 1e-06, + "loss": 1.0692, + "mean_token_accuracy": 0.6885361671447754, + "num_tokens": 110384453.0, + "step": 4407 + }, + { + "epoch": 0.4840764331210191, + "grad_norm": 2.221027135848999, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6999737024307251, + "num_tokens": 110409295.0, + "step": 4408 + }, + { + "epoch": 0.48418625082363276, + "grad_norm": 2.4484782218933105, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7020000219345093, + 
"num_tokens": 110430099.0, + "step": 4409 + }, + { + "epoch": 0.4842960685262464, + "grad_norm": 2.1409363746643066, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.7003993988037109, + "num_tokens": 110459139.0, + "step": 4410 + }, + { + "epoch": 0.4844058862288601, + "grad_norm": 2.5803487300872803, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7261211276054382, + "num_tokens": 110478933.0, + "step": 4411 + }, + { + "epoch": 0.48451570393147375, + "grad_norm": 2.316956043243408, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7163554430007935, + "num_tokens": 110502841.0, + "step": 4412 + }, + { + "epoch": 0.4846255216340874, + "grad_norm": 2.347839117050171, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7076526284217834, + "num_tokens": 110524246.0, + "step": 4413 + }, + { + "epoch": 0.4847353393367011, + "grad_norm": 2.3624002933502197, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7115559577941895, + "num_tokens": 110547627.0, + "step": 4414 + }, + { + "epoch": 0.48484515703931474, + "grad_norm": 2.263537645339966, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6974389553070068, + "num_tokens": 110573425.0, + "step": 4415 + }, + { + "epoch": 0.4849549747419284, + "grad_norm": 2.757854700088501, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.718076229095459, + "num_tokens": 110590483.0, + "step": 4416 + }, + { + "epoch": 0.48506479244454204, + "grad_norm": 2.3959407806396484, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7098411917686462, + "num_tokens": 110612044.0, + "step": 4417 + }, + { + "epoch": 0.48517461014715574, + "grad_norm": 2.263256311416626, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.7049828767776489, + "num_tokens": 110635460.0, + "step": 4418 + }, + { + "epoch": 0.4852844278497694, + "grad_norm": 2.215681791305542, + 
"learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7227332592010498, + "num_tokens": 110659997.0, + "step": 4419 + }, + { + "epoch": 0.48539424555238303, + "grad_norm": 2.1223556995391846, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6936435103416443, + "num_tokens": 110686231.0, + "step": 4420 + }, + { + "epoch": 0.48550406325499673, + "grad_norm": 2.511759042739868, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7006607055664062, + "num_tokens": 110707043.0, + "step": 4421 + }, + { + "epoch": 0.4856138809576104, + "grad_norm": 2.5116162300109863, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.69576495885849, + "num_tokens": 110729276.0, + "step": 4422 + }, + { + "epoch": 0.485723698660224, + "grad_norm": 2.091083526611328, + "learning_rate": 1e-06, + "loss": 1.0713, + "mean_token_accuracy": 0.6846724152565002, + "num_tokens": 110757761.0, + "step": 4423 + }, + { + "epoch": 0.48583351636283767, + "grad_norm": 2.5266594886779785, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.735754668712616, + "num_tokens": 110778442.0, + "step": 4424 + }, + { + "epoch": 0.48594333406545137, + "grad_norm": 2.283102512359619, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7262008190155029, + "num_tokens": 110799570.0, + "step": 4425 + }, + { + "epoch": 0.486053151768065, + "grad_norm": 2.583550453186035, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7107325196266174, + "num_tokens": 110818715.0, + "step": 4426 + }, + { + "epoch": 0.48616296947067866, + "grad_norm": 1.9535554647445679, + "learning_rate": 1e-06, + "loss": 1.0418, + "mean_token_accuracy": 0.686324954032898, + "num_tokens": 110848250.0, + "step": 4427 + }, + { + "epoch": 0.4862727871732923, + "grad_norm": 2.1639199256896973, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7021807432174683, + "num_tokens": 110876765.0, + "step": 4428 + 
}, + { + "epoch": 0.486382604875906, + "grad_norm": 2.1744585037231445, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.701357364654541, + "num_tokens": 110902183.0, + "step": 4429 + }, + { + "epoch": 0.48649242257851966, + "grad_norm": 2.2450435161590576, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6965768337249756, + "num_tokens": 110928322.0, + "step": 4430 + }, + { + "epoch": 0.4866022402811333, + "grad_norm": 2.1642603874206543, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7034965753555298, + "num_tokens": 110953146.0, + "step": 4431 + }, + { + "epoch": 0.486712057983747, + "grad_norm": 1.9867310523986816, + "learning_rate": 1e-06, + "loss": 1.1087, + "mean_token_accuracy": 0.674101710319519, + "num_tokens": 110986657.0, + "step": 4432 + }, + { + "epoch": 0.48682187568636065, + "grad_norm": 2.25274395942688, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6811168193817139, + "num_tokens": 111011150.0, + "step": 4433 + }, + { + "epoch": 0.4869316933889743, + "grad_norm": 2.0643372535705566, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.706382691860199, + "num_tokens": 111039483.0, + "step": 4434 + }, + { + "epoch": 0.48704151109158794, + "grad_norm": 2.459139108657837, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7006627321243286, + "num_tokens": 111062938.0, + "step": 4435 + }, + { + "epoch": 0.48715132879420164, + "grad_norm": 1.9199013710021973, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.6993356347084045, + "num_tokens": 111093937.0, + "step": 4436 + }, + { + "epoch": 0.4872611464968153, + "grad_norm": 2.488405227661133, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6902391314506531, + "num_tokens": 111113849.0, + "step": 4437 + }, + { + "epoch": 0.48737096419942894, + "grad_norm": 2.0257256031036377, + "learning_rate": 1e-06, + "loss": 0.9785, + 
"mean_token_accuracy": 0.7018252015113831, + "num_tokens": 111145460.0, + "step": 4438 + }, + { + "epoch": 0.48748078190204264, + "grad_norm": 2.221304178237915, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6846708059310913, + "num_tokens": 111171540.0, + "step": 4439 + }, + { + "epoch": 0.4875905996046563, + "grad_norm": 2.3579633235931396, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7024356722831726, + "num_tokens": 111194025.0, + "step": 4440 + }, + { + "epoch": 0.48770041730726993, + "grad_norm": 2.1420295238494873, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7333618998527527, + "num_tokens": 111218278.0, + "step": 4441 + }, + { + "epoch": 0.4878102350098836, + "grad_norm": 2.271683692932129, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7148314714431763, + "num_tokens": 111241825.0, + "step": 4442 + }, + { + "epoch": 0.4879200527124973, + "grad_norm": 2.1117756366729736, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6987691521644592, + "num_tokens": 111269146.0, + "step": 4443 + }, + { + "epoch": 0.4880298704151109, + "grad_norm": 2.4748921394348145, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7306423187255859, + "num_tokens": 111287209.0, + "step": 4444 + }, + { + "epoch": 0.48813968811772457, + "grad_norm": 2.2701845169067383, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.687933623790741, + "num_tokens": 111311539.0, + "step": 4445 + }, + { + "epoch": 0.4882495058203382, + "grad_norm": 2.349968910217285, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.702404260635376, + "num_tokens": 111333821.0, + "step": 4446 + }, + { + "epoch": 0.4883593235229519, + "grad_norm": 2.379129409790039, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.708564281463623, + "num_tokens": 111356770.0, + "step": 4447 + }, + { + "epoch": 0.48846914122556556, + 
"grad_norm": 2.4460010528564453, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7205241322517395, + "num_tokens": 111376574.0, + "step": 4448 + }, + { + "epoch": 0.4885789589281792, + "grad_norm": 2.247875928878784, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7066165208816528, + "num_tokens": 111400429.0, + "step": 4449 + }, + { + "epoch": 0.4886887766307929, + "grad_norm": 2.4287381172180176, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7222427725791931, + "num_tokens": 111421136.0, + "step": 4450 + }, + { + "epoch": 0.48879859433340656, + "grad_norm": 2.1609792709350586, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6910535097122192, + "num_tokens": 111448996.0, + "step": 4451 + }, + { + "epoch": 0.4889084120360202, + "grad_norm": 2.083853006362915, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6809113025665283, + "num_tokens": 111477110.0, + "step": 4452 + }, + { + "epoch": 0.48901822973863385, + "grad_norm": 2.038862466812134, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7115088701248169, + "num_tokens": 111505370.0, + "step": 4453 + }, + { + "epoch": 0.48912804744124755, + "grad_norm": 2.1753358840942383, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7217919826507568, + "num_tokens": 111529142.0, + "step": 4454 + }, + { + "epoch": 0.4892378651438612, + "grad_norm": 2.172903537750244, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6836556196212769, + "num_tokens": 111557397.0, + "step": 4455 + }, + { + "epoch": 0.48934768284647484, + "grad_norm": 2.4196221828460693, + "learning_rate": 1e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6827981472015381, + "num_tokens": 111580987.0, + "step": 4456 + }, + { + "epoch": 0.4894575005490885, + "grad_norm": 2.1686747074127197, + "learning_rate": 1e-06, + "loss": 1.0497, + "mean_token_accuracy": 0.6827697157859802, + 
"num_tokens": 111606701.0, + "step": 4457 + }, + { + "epoch": 0.4895673182517022, + "grad_norm": 2.2021291255950928, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7120695114135742, + "num_tokens": 111634069.0, + "step": 4458 + }, + { + "epoch": 0.48967713595431583, + "grad_norm": 2.406386613845825, + "learning_rate": 1e-06, + "loss": 1.051, + "mean_token_accuracy": 0.6850054860115051, + "num_tokens": 111655339.0, + "step": 4459 + }, + { + "epoch": 0.4897869536569295, + "grad_norm": 2.527203321456909, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7172428369522095, + "num_tokens": 111675691.0, + "step": 4460 + }, + { + "epoch": 0.4898967713595432, + "grad_norm": 2.296365261077881, + "learning_rate": 1e-06, + "loss": 1.0602, + "mean_token_accuracy": 0.6862594485282898, + "num_tokens": 111700796.0, + "step": 4461 + }, + { + "epoch": 0.4900065890621568, + "grad_norm": 2.089787721633911, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.698482096195221, + "num_tokens": 111728580.0, + "step": 4462 + }, + { + "epoch": 0.4901164067647705, + "grad_norm": 2.092994451522827, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.70847088098526, + "num_tokens": 111756147.0, + "step": 4463 + }, + { + "epoch": 0.4902262244673841, + "grad_norm": 2.4029414653778076, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7137777805328369, + "num_tokens": 111777579.0, + "step": 4464 + }, + { + "epoch": 0.4903360421699978, + "grad_norm": 2.889511823654175, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7242146134376526, + "num_tokens": 111793435.0, + "step": 4465 + }, + { + "epoch": 0.49044585987261147, + "grad_norm": 2.1362929344177246, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7191827297210693, + "num_tokens": 111817915.0, + "step": 4466 + }, + { + "epoch": 0.4905556775752251, + "grad_norm": 2.090944528579712, + "learning_rate": 
1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.7013766765594482, + "num_tokens": 111845999.0, + "step": 4467 + }, + { + "epoch": 0.4906654952778388, + "grad_norm": 1.8687173128128052, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6964804530143738, + "num_tokens": 111878610.0, + "step": 4468 + }, + { + "epoch": 0.49077531298045246, + "grad_norm": 2.143710136413574, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.69794100522995, + "num_tokens": 111905728.0, + "step": 4469 + }, + { + "epoch": 0.4908851306830661, + "grad_norm": 1.7893277406692505, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6760400533676147, + "num_tokens": 111945649.0, + "step": 4470 + }, + { + "epoch": 0.49099494838567975, + "grad_norm": 2.499159097671509, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7019872665405273, + "num_tokens": 111966664.0, + "step": 4471 + }, + { + "epoch": 0.49110476608829345, + "grad_norm": 2.022369861602783, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6877790689468384, + "num_tokens": 111996018.0, + "step": 4472 + }, + { + "epoch": 0.4912145837909071, + "grad_norm": 2.0117173194885254, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.695565938949585, + "num_tokens": 112024170.0, + "step": 4473 + }, + { + "epoch": 0.49132440149352075, + "grad_norm": 2.825941801071167, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7209104299545288, + "num_tokens": 112041005.0, + "step": 4474 + }, + { + "epoch": 0.4914342191961344, + "grad_norm": 2.0586466789245605, + "learning_rate": 1e-06, + "loss": 1.0843, + "mean_token_accuracy": 0.6689821481704712, + "num_tokens": 112071468.0, + "step": 4475 + }, + { + "epoch": 0.4915440368987481, + "grad_norm": 2.6444599628448486, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7005681991577148, + "num_tokens": 112089641.0, + "step": 4476 + }, + { + 
"epoch": 0.49165385460136174, + "grad_norm": 2.1045103073120117, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7107114195823669, + "num_tokens": 112116476.0, + "step": 4477 + }, + { + "epoch": 0.4917636723039754, + "grad_norm": 2.0623912811279297, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6894711852073669, + "num_tokens": 112147318.0, + "step": 4478 + }, + { + "epoch": 0.4918734900065891, + "grad_norm": 2.0102782249450684, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6975659728050232, + "num_tokens": 112177029.0, + "step": 4479 + }, + { + "epoch": 0.49198330770920273, + "grad_norm": 2.294069528579712, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7095012068748474, + "num_tokens": 112201438.0, + "step": 4480 + }, + { + "epoch": 0.4920931254118164, + "grad_norm": 2.1170594692230225, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7059834003448486, + "num_tokens": 112227607.0, + "step": 4481 + }, + { + "epoch": 0.49220294311443, + "grad_norm": 2.1771888732910156, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7232009768486023, + "num_tokens": 112253922.0, + "step": 4482 + }, + { + "epoch": 0.4923127608170437, + "grad_norm": 2.0299644470214844, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6976801156997681, + "num_tokens": 112283681.0, + "step": 4483 + }, + { + "epoch": 0.4924225785196574, + "grad_norm": 2.198993682861328, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6895591020584106, + "num_tokens": 112308494.0, + "step": 4484 + }, + { + "epoch": 0.492532396222271, + "grad_norm": 2.602625846862793, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7114369869232178, + "num_tokens": 112328547.0, + "step": 4485 + }, + { + "epoch": 0.49264221392488466, + "grad_norm": 2.269350290298462, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 
0.6946227550506592, + "num_tokens": 112354923.0, + "step": 4486 + }, + { + "epoch": 0.49275203162749837, + "grad_norm": 2.0354115962982178, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7067995071411133, + "num_tokens": 112384369.0, + "step": 4487 + }, + { + "epoch": 0.492861849330112, + "grad_norm": 2.338534355163574, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7055160999298096, + "num_tokens": 112408514.0, + "step": 4488 + }, + { + "epoch": 0.49297166703272566, + "grad_norm": 2.1977782249450684, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7102850675582886, + "num_tokens": 112433142.0, + "step": 4489 + }, + { + "epoch": 0.49308148473533936, + "grad_norm": 2.3409972190856934, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7080044746398926, + "num_tokens": 112455593.0, + "step": 4490 + }, + { + "epoch": 0.493191302437953, + "grad_norm": 2.196873188018799, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6994079351425171, + "num_tokens": 112481015.0, + "step": 4491 + }, + { + "epoch": 0.49330112014056665, + "grad_norm": 2.0719375610351562, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7010183334350586, + "num_tokens": 112508931.0, + "step": 4492 + }, + { + "epoch": 0.4934109378431803, + "grad_norm": 2.191938638687134, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.696746826171875, + "num_tokens": 112534887.0, + "step": 4493 + }, + { + "epoch": 0.493520755545794, + "grad_norm": 2.5315287113189697, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7170646786689758, + "num_tokens": 112555037.0, + "step": 4494 + }, + { + "epoch": 0.49363057324840764, + "grad_norm": 2.355085611343384, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 0.6772825717926025, + "num_tokens": 112577520.0, + "step": 4495 + }, + { + "epoch": 0.4937403909510213, + "grad_norm": 
1.9291512966156006, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6808996200561523, + "num_tokens": 112612682.0, + "step": 4496 + }, + { + "epoch": 0.493850208653635, + "grad_norm": 2.3886969089508057, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.7007371187210083, + "num_tokens": 112635822.0, + "step": 4497 + }, + { + "epoch": 0.49396002635624864, + "grad_norm": 2.4226789474487305, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.724755048751831, + "num_tokens": 112657284.0, + "step": 4498 + }, + { + "epoch": 0.4940698440588623, + "grad_norm": 2.282794952392578, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7136490345001221, + "num_tokens": 112680242.0, + "step": 4499 + }, + { + "epoch": 0.49417966176147593, + "grad_norm": 2.1996090412139893, + "learning_rate": 1e-06, + "loss": 1.0465, + "mean_token_accuracy": 0.6879724860191345, + "num_tokens": 112707170.0, + "step": 4500 + }, + { + "epoch": 0.49428947946408963, + "grad_norm": 2.2535719871520996, + "learning_rate": 1e-06, + "loss": 1.07, + "mean_token_accuracy": 0.69651859998703, + "num_tokens": 112730548.0, + "step": 4501 + }, + { + "epoch": 0.4943992971667033, + "grad_norm": 2.069154739379883, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7007478475570679, + "num_tokens": 112759753.0, + "step": 4502 + }, + { + "epoch": 0.4945091148693169, + "grad_norm": 2.196061611175537, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7081500291824341, + "num_tokens": 112785145.0, + "step": 4503 + }, + { + "epoch": 0.49461893257193057, + "grad_norm": 2.3080062866210938, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7103133201599121, + "num_tokens": 112807094.0, + "step": 4504 + }, + { + "epoch": 0.49472875027454427, + "grad_norm": 2.2838330268859863, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7350543737411499, + "num_tokens": 
112829697.0, + "step": 4505 + }, + { + "epoch": 0.4948385679771579, + "grad_norm": 2.225658416748047, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7128540277481079, + "num_tokens": 112852009.0, + "step": 4506 + }, + { + "epoch": 0.49494838567977156, + "grad_norm": 2.22177791595459, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6878589391708374, + "num_tokens": 112877070.0, + "step": 4507 + }, + { + "epoch": 0.49505820338238526, + "grad_norm": 2.230825901031494, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.708701491355896, + "num_tokens": 112900554.0, + "step": 4508 + }, + { + "epoch": 0.4951680210849989, + "grad_norm": 2.1659321784973145, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7206730842590332, + "num_tokens": 112928691.0, + "step": 4509 + }, + { + "epoch": 0.49527783878761256, + "grad_norm": 2.3274168968200684, + "learning_rate": 1e-06, + "loss": 1.0935, + "mean_token_accuracy": 0.6804777383804321, + "num_tokens": 112954921.0, + "step": 4510 + }, + { + "epoch": 0.4953876564902262, + "grad_norm": 2.1331727504730225, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7088408470153809, + "num_tokens": 112983606.0, + "step": 4511 + }, + { + "epoch": 0.4954974741928399, + "grad_norm": 2.420571804046631, + "learning_rate": 1e-06, + "loss": 1.0674, + "mean_token_accuracy": 0.6950284838676453, + "num_tokens": 113007153.0, + "step": 4512 + }, + { + "epoch": 0.49560729189545355, + "grad_norm": 2.2959773540496826, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6774585843086243, + "num_tokens": 113031035.0, + "step": 4513 + }, + { + "epoch": 0.4957171095980672, + "grad_norm": 2.2184340953826904, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.746450662612915, + "num_tokens": 113052761.0, + "step": 4514 + }, + { + "epoch": 0.4958269273006809, + "grad_norm": 2.4211418628692627, + "learning_rate": 1e-06, + 
"loss": 0.9376, + "mean_token_accuracy": 0.7126231789588928, + "num_tokens": 113072863.0, + "step": 4515 + }, + { + "epoch": 0.49593674500329454, + "grad_norm": 2.1370623111724854, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.705359935760498, + "num_tokens": 113100309.0, + "step": 4516 + }, + { + "epoch": 0.4960465627059082, + "grad_norm": 2.4078238010406494, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.7003874778747559, + "num_tokens": 113121033.0, + "step": 4517 + }, + { + "epoch": 0.49615638040852184, + "grad_norm": 2.246786117553711, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7050957679748535, + "num_tokens": 113145436.0, + "step": 4518 + }, + { + "epoch": 0.49626619811113554, + "grad_norm": 2.3595221042633057, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7022854685783386, + "num_tokens": 113167039.0, + "step": 4519 + }, + { + "epoch": 0.4963760158137492, + "grad_norm": 2.462639570236206, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7225759029388428, + "num_tokens": 113188776.0, + "step": 4520 + }, + { + "epoch": 0.49648583351636283, + "grad_norm": 2.1083974838256836, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7054868340492249, + "num_tokens": 113216273.0, + "step": 4521 + }, + { + "epoch": 0.4965956512189765, + "grad_norm": 2.1219279766082764, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6924642324447632, + "num_tokens": 113246526.0, + "step": 4522 + }, + { + "epoch": 0.4967054689215902, + "grad_norm": 2.3216822147369385, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6990094184875488, + "num_tokens": 113270469.0, + "step": 4523 + }, + { + "epoch": 0.4968152866242038, + "grad_norm": 2.055361747741699, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7163528800010681, + "num_tokens": 113299339.0, + "step": 4524 + }, + { + "epoch": 
0.49692510432681747, + "grad_norm": 2.429547071456909, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7242652177810669, + "num_tokens": 113319705.0, + "step": 4525 + }, + { + "epoch": 0.49703492202943117, + "grad_norm": 2.433070421218872, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7161466479301453, + "num_tokens": 113341693.0, + "step": 4526 + }, + { + "epoch": 0.4971447397320448, + "grad_norm": 2.2060089111328125, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7046483755111694, + "num_tokens": 113365643.0, + "step": 4527 + }, + { + "epoch": 0.49725455743465846, + "grad_norm": 2.468372344970703, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7011727094650269, + "num_tokens": 113385846.0, + "step": 4528 + }, + { + "epoch": 0.4973643751372721, + "grad_norm": 2.2428197860717773, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7107561826705933, + "num_tokens": 113409447.0, + "step": 4529 + }, + { + "epoch": 0.4974741928398858, + "grad_norm": 2.450408458709717, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7193930745124817, + "num_tokens": 113430474.0, + "step": 4530 + }, + { + "epoch": 0.49758401054249946, + "grad_norm": 2.2273130416870117, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7073644399642944, + "num_tokens": 113454481.0, + "step": 4531 + }, + { + "epoch": 0.4976938282451131, + "grad_norm": 2.296243667602539, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7020729184150696, + "num_tokens": 113478759.0, + "step": 4532 + }, + { + "epoch": 0.49780364594772675, + "grad_norm": 2.2815473079681396, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6989860534667969, + "num_tokens": 113501639.0, + "step": 4533 + }, + { + "epoch": 0.49791346365034045, + "grad_norm": 2.542113780975342, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 
0.7351677417755127, + "num_tokens": 113520687.0, + "step": 4534 + }, + { + "epoch": 0.4980232813529541, + "grad_norm": 2.282719850540161, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6971750855445862, + "num_tokens": 113542762.0, + "step": 4535 + }, + { + "epoch": 0.49813309905556774, + "grad_norm": 2.1723392009735107, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7246208190917969, + "num_tokens": 113568930.0, + "step": 4536 + }, + { + "epoch": 0.49824291675818144, + "grad_norm": 2.3130130767822266, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.708228588104248, + "num_tokens": 113592926.0, + "step": 4537 + }, + { + "epoch": 0.4983527344607951, + "grad_norm": 2.250528335571289, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6988949179649353, + "num_tokens": 113616323.0, + "step": 4538 + }, + { + "epoch": 0.49846255216340873, + "grad_norm": 2.1004176139831543, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7128512859344482, + "num_tokens": 113643657.0, + "step": 4539 + }, + { + "epoch": 0.4985723698660224, + "grad_norm": 2.2319014072418213, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.688024640083313, + "num_tokens": 113669259.0, + "step": 4540 + }, + { + "epoch": 0.4986821875686361, + "grad_norm": 2.1938507556915283, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7065107226371765, + "num_tokens": 113694096.0, + "step": 4541 + }, + { + "epoch": 0.4987920052712497, + "grad_norm": 2.76143741607666, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6976723670959473, + "num_tokens": 113711359.0, + "step": 4542 + }, + { + "epoch": 0.4989018229738634, + "grad_norm": 2.2940547466278076, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7050366401672363, + "num_tokens": 113734075.0, + "step": 4543 + }, + { + "epoch": 0.4990116406764771, + "grad_norm": 
2.030003786087036, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7131693959236145, + "num_tokens": 113760661.0, + "step": 4544 + }, + { + "epoch": 0.4991214583790907, + "grad_norm": 2.5750231742858887, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7035592794418335, + "num_tokens": 113780966.0, + "step": 4545 + }, + { + "epoch": 0.49923127608170437, + "grad_norm": 2.1439387798309326, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6970078349113464, + "num_tokens": 113808184.0, + "step": 4546 + }, + { + "epoch": 0.499341093784318, + "grad_norm": 2.23573899269104, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6856955885887146, + "num_tokens": 113835185.0, + "step": 4547 + }, + { + "epoch": 0.4994509114869317, + "grad_norm": 2.130631446838379, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7015582919120789, + "num_tokens": 113861860.0, + "step": 4548 + }, + { + "epoch": 0.49956072918954536, + "grad_norm": 2.1945443153381348, + "learning_rate": 1e-06, + "loss": 1.0301, + "mean_token_accuracy": 0.6934611797332764, + "num_tokens": 113886100.0, + "step": 4549 + }, + { + "epoch": 0.499670546892159, + "grad_norm": 1.9861253499984741, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7050858736038208, + "num_tokens": 113914083.0, + "step": 4550 + }, + { + "epoch": 0.49978036459477265, + "grad_norm": 2.060023784637451, + "learning_rate": 1e-06, + "loss": 1.0601, + "mean_token_accuracy": 0.6773239970207214, + "num_tokens": 113942152.0, + "step": 4551 + }, + { + "epoch": 0.49989018229738635, + "grad_norm": 2.3128607273101807, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7073400020599365, + "num_tokens": 113964812.0, + "step": 4552 + }, + { + "epoch": 0.5, + "grad_norm": 2.0796403884887695, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7062992453575134, + "num_tokens": 113991953.0, + 
"step": 4553 + }, + { + "epoch": 0.5001098177026136, + "grad_norm": 2.1688947677612305, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7103046178817749, + "num_tokens": 114016565.0, + "step": 4554 + }, + { + "epoch": 0.5002196354052273, + "grad_norm": 2.165855646133423, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7100414037704468, + "num_tokens": 114041788.0, + "step": 4555 + }, + { + "epoch": 0.5003294531078409, + "grad_norm": 2.298365354537964, + "learning_rate": 1e-06, + "loss": 1.0853, + "mean_token_accuracy": 0.6741499900817871, + "num_tokens": 114066463.0, + "step": 4556 + }, + { + "epoch": 0.5004392708104547, + "grad_norm": 2.317135810852051, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6884057521820068, + "num_tokens": 114090915.0, + "step": 4557 + }, + { + "epoch": 0.5005490885130683, + "grad_norm": 2.0342864990234375, + "learning_rate": 1e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.6676353216171265, + "num_tokens": 114119750.0, + "step": 4558 + }, + { + "epoch": 0.500658906215682, + "grad_norm": 2.178077459335327, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7029483914375305, + "num_tokens": 114145759.0, + "step": 4559 + }, + { + "epoch": 0.5007687239182956, + "grad_norm": 2.756471872329712, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7424398064613342, + "num_tokens": 114160508.0, + "step": 4560 + }, + { + "epoch": 0.5008785416209093, + "grad_norm": 2.4078731536865234, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7028673887252808, + "num_tokens": 114181433.0, + "step": 4561 + }, + { + "epoch": 0.5009883593235229, + "grad_norm": 2.4951512813568115, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.696260392665863, + "num_tokens": 114200248.0, + "step": 4562 + }, + { + "epoch": 0.5010981770261366, + "grad_norm": 2.1826019287109375, + "learning_rate": 1e-06, + "loss": 0.9214, + 
"mean_token_accuracy": 0.7137399911880493, + "num_tokens": 114223950.0, + "step": 4563 + }, + { + "epoch": 0.5012079947287503, + "grad_norm": 2.488806962966919, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.7093478441238403, + "num_tokens": 114243562.0, + "step": 4564 + }, + { + "epoch": 0.501317812431364, + "grad_norm": 2.2173919677734375, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6972887516021729, + "num_tokens": 114268684.0, + "step": 4565 + }, + { + "epoch": 0.5014276301339776, + "grad_norm": 2.574732542037964, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6901341080665588, + "num_tokens": 114288275.0, + "step": 4566 + }, + { + "epoch": 0.5015374478365913, + "grad_norm": 2.5476698875427246, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.6880640983581543, + "num_tokens": 114309988.0, + "step": 4567 + }, + { + "epoch": 0.5016472655392049, + "grad_norm": 2.3399105072021484, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6988914012908936, + "num_tokens": 114333723.0, + "step": 4568 + }, + { + "epoch": 0.5017570832418186, + "grad_norm": 2.2751049995422363, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7180616855621338, + "num_tokens": 114357515.0, + "step": 4569 + }, + { + "epoch": 0.5018669009444322, + "grad_norm": 2.4838955402374268, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6958333253860474, + "num_tokens": 114378862.0, + "step": 4570 + }, + { + "epoch": 0.5019767186470458, + "grad_norm": 2.310906410217285, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7208751440048218, + "num_tokens": 114402489.0, + "step": 4571 + }, + { + "epoch": 0.5020865363496596, + "grad_norm": 2.190767765045166, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6841906309127808, + "num_tokens": 114429379.0, + "step": 4572 + }, + { + "epoch": 0.5021963540522733, + 
"grad_norm": 2.2097458839416504, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.691891074180603, + "num_tokens": 114454787.0, + "step": 4573 + }, + { + "epoch": 0.5023061717548869, + "grad_norm": 2.401946783065796, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6916232705116272, + "num_tokens": 114477344.0, + "step": 4574 + }, + { + "epoch": 0.5024159894575005, + "grad_norm": 2.178692102432251, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6983485221862793, + "num_tokens": 114504529.0, + "step": 4575 + }, + { + "epoch": 0.5025258071601142, + "grad_norm": 2.430664539337158, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7056621313095093, + "num_tokens": 114527585.0, + "step": 4576 + }, + { + "epoch": 0.5026356248627278, + "grad_norm": 2.293811798095703, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7029763460159302, + "num_tokens": 114550845.0, + "step": 4577 + }, + { + "epoch": 0.5027454425653415, + "grad_norm": 2.5154268741607666, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7283353805541992, + "num_tokens": 114571000.0, + "step": 4578 + }, + { + "epoch": 0.5028552602679552, + "grad_norm": 2.5779080390930176, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7108380198478699, + "num_tokens": 114592013.0, + "step": 4579 + }, + { + "epoch": 0.5029650779705689, + "grad_norm": 2.302119493484497, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6976101994514465, + "num_tokens": 114615102.0, + "step": 4580 + }, + { + "epoch": 0.5030748956731825, + "grad_norm": 2.0927484035491943, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7256648540496826, + "num_tokens": 114639879.0, + "step": 4581 + }, + { + "epoch": 0.5031847133757962, + "grad_norm": 2.272176504135132, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7109050750732422, + 
"num_tokens": 114663627.0, + "step": 4582 + }, + { + "epoch": 0.5032945310784098, + "grad_norm": 2.394482135772705, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6990980505943298, + "num_tokens": 114686890.0, + "step": 4583 + }, + { + "epoch": 0.5034043487810235, + "grad_norm": 2.3703081607818604, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7096268534660339, + "num_tokens": 114709357.0, + "step": 4584 + }, + { + "epoch": 0.5035141664836371, + "grad_norm": 1.9010597467422485, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7134522795677185, + "num_tokens": 114741051.0, + "step": 4585 + }, + { + "epoch": 0.5036239841862509, + "grad_norm": 1.9972130060195923, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6777979135513306, + "num_tokens": 114772494.0, + "step": 4586 + }, + { + "epoch": 0.5037338018888645, + "grad_norm": 2.0236573219299316, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6899574995040894, + "num_tokens": 114800208.0, + "step": 4587 + }, + { + "epoch": 0.5038436195914782, + "grad_norm": 2.208008289337158, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.703851044178009, + "num_tokens": 114825139.0, + "step": 4588 + }, + { + "epoch": 0.5039534372940918, + "grad_norm": 2.1808438301086426, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6945537328720093, + "num_tokens": 114851516.0, + "step": 4589 + }, + { + "epoch": 0.5040632549967055, + "grad_norm": 1.83272123336792, + "learning_rate": 1e-06, + "loss": 1.1412, + "mean_token_accuracy": 0.6730366945266724, + "num_tokens": 114889335.0, + "step": 4590 + }, + { + "epoch": 0.5041730726993191, + "grad_norm": 2.0520410537719727, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7021387815475464, + "num_tokens": 114919875.0, + "step": 4591 + }, + { + "epoch": 0.5042828904019327, + "grad_norm": 2.3188400268554688, + 
"learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7165989875793457, + "num_tokens": 114943406.0, + "step": 4592 + }, + { + "epoch": 0.5043927081045465, + "grad_norm": 2.201144218444824, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7064437866210938, + "num_tokens": 114968864.0, + "step": 4593 + }, + { + "epoch": 0.5045025258071602, + "grad_norm": 2.492910623550415, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7120014429092407, + "num_tokens": 114991569.0, + "step": 4594 + }, + { + "epoch": 0.5046123435097738, + "grad_norm": 2.220261335372925, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.6937364339828491, + "num_tokens": 115018568.0, + "step": 4595 + }, + { + "epoch": 0.5047221612123874, + "grad_norm": 2.1319525241851807, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7281945943832397, + "num_tokens": 115044894.0, + "step": 4596 + }, + { + "epoch": 0.5048319789150011, + "grad_norm": 2.2180027961730957, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7042672038078308, + "num_tokens": 115070493.0, + "step": 4597 + }, + { + "epoch": 0.5049417966176147, + "grad_norm": 2.260732412338257, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7018265128135681, + "num_tokens": 115093343.0, + "step": 4598 + }, + { + "epoch": 0.5050516143202284, + "grad_norm": 2.1991195678710938, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6976041197776794, + "num_tokens": 115119308.0, + "step": 4599 + }, + { + "epoch": 0.505161432022842, + "grad_norm": 2.28094220161438, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6894844770431519, + "num_tokens": 115144159.0, + "step": 4600 + }, + { + "epoch": 0.5052712497254558, + "grad_norm": 2.4299423694610596, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.695908784866333, + "num_tokens": 115166274.0, + "step": 4601 + }, + 
{ + "epoch": 0.5053810674280694, + "grad_norm": 2.198880195617676, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6925510764122009, + "num_tokens": 115191518.0, + "step": 4602 + }, + { + "epoch": 0.5054908851306831, + "grad_norm": 2.1592814922332764, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6900423169136047, + "num_tokens": 115217924.0, + "step": 4603 + }, + { + "epoch": 0.5056007028332967, + "grad_norm": 2.3632776737213135, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7157435417175293, + "num_tokens": 115241094.0, + "step": 4604 + }, + { + "epoch": 0.5057105205359104, + "grad_norm": 2.3322055339813232, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7214577198028564, + "num_tokens": 115264661.0, + "step": 4605 + }, + { + "epoch": 0.505820338238524, + "grad_norm": 2.207256555557251, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6891402006149292, + "num_tokens": 115291962.0, + "step": 4606 + }, + { + "epoch": 0.5059301559411377, + "grad_norm": 2.1883246898651123, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7036981582641602, + "num_tokens": 115317767.0, + "step": 4607 + }, + { + "epoch": 0.5060399736437514, + "grad_norm": 2.1357228755950928, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6944246292114258, + "num_tokens": 115344242.0, + "step": 4608 + }, + { + "epoch": 0.5061497913463651, + "grad_norm": 2.275815963745117, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6987048983573914, + "num_tokens": 115369638.0, + "step": 4609 + }, + { + "epoch": 0.5062596090489787, + "grad_norm": 1.9925105571746826, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6989285349845886, + "num_tokens": 115401198.0, + "step": 4610 + }, + { + "epoch": 0.5063694267515924, + "grad_norm": 2.196120262145996, + "learning_rate": 1e-06, + "loss": 1.112, + 
"mean_token_accuracy": 0.6688868403434753, + "num_tokens": 115429699.0, + "step": 4611 + }, + { + "epoch": 0.506479244454206, + "grad_norm": 1.9640495777130127, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6928067207336426, + "num_tokens": 115462211.0, + "step": 4612 + }, + { + "epoch": 0.5065890621568196, + "grad_norm": 2.04479718208313, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7243054509162903, + "num_tokens": 115489777.0, + "step": 4613 + }, + { + "epoch": 0.5066988798594333, + "grad_norm": 2.233964681625366, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7082177400588989, + "num_tokens": 115514572.0, + "step": 4614 + }, + { + "epoch": 0.506808697562047, + "grad_norm": 2.241523027420044, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6797922253608704, + "num_tokens": 115540958.0, + "step": 4615 + }, + { + "epoch": 0.5069185152646607, + "grad_norm": 2.2874975204467773, + "learning_rate": 1e-06, + "loss": 1.0573, + "mean_token_accuracy": 0.6865644454956055, + "num_tokens": 115565320.0, + "step": 4616 + }, + { + "epoch": 0.5070283329672743, + "grad_norm": 2.066601276397705, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6921757459640503, + "num_tokens": 115593749.0, + "step": 4617 + }, + { + "epoch": 0.507138150669888, + "grad_norm": 2.1077475547790527, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.6880697011947632, + "num_tokens": 115621251.0, + "step": 4618 + }, + { + "epoch": 0.5072479683725016, + "grad_norm": 2.23134708404541, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7132584452629089, + "num_tokens": 115646918.0, + "step": 4619 + }, + { + "epoch": 0.5073577860751153, + "grad_norm": 2.3293559551239014, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7255821228027344, + "num_tokens": 115667864.0, + "step": 4620 + }, + { + "epoch": 0.5074676037777289, + 
"grad_norm": 2.403837203979492, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7017689943313599, + "num_tokens": 115689654.0, + "step": 4621 + }, + { + "epoch": 0.5075774214803427, + "grad_norm": 2.4424071311950684, + "learning_rate": 1e-06, + "loss": 1.0381, + "mean_token_accuracy": 0.6893616914749146, + "num_tokens": 115712453.0, + "step": 4622 + }, + { + "epoch": 0.5076872391829563, + "grad_norm": 2.0888524055480957, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7072335481643677, + "num_tokens": 115739100.0, + "step": 4623 + }, + { + "epoch": 0.50779705688557, + "grad_norm": 2.1910159587860107, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7066197395324707, + "num_tokens": 115763997.0, + "step": 4624 + }, + { + "epoch": 0.5079068745881836, + "grad_norm": 2.0517349243164062, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7042269110679626, + "num_tokens": 115793132.0, + "step": 4625 + }, + { + "epoch": 0.5080166922907973, + "grad_norm": 2.421168327331543, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6928688883781433, + "num_tokens": 115815857.0, + "step": 4626 + }, + { + "epoch": 0.5081265099934109, + "grad_norm": 2.278473138809204, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6843698024749756, + "num_tokens": 115841003.0, + "step": 4627 + }, + { + "epoch": 0.5082363276960246, + "grad_norm": 2.0360562801361084, + "learning_rate": 1e-06, + "loss": 1.0818, + "mean_token_accuracy": 0.6770325303077698, + "num_tokens": 115869131.0, + "step": 4628 + }, + { + "epoch": 0.5083461453986382, + "grad_norm": 1.9894073009490967, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7087301015853882, + "num_tokens": 115899433.0, + "step": 4629 + }, + { + "epoch": 0.508455963101252, + "grad_norm": 2.1367990970611572, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7044587135314941, + 
"num_tokens": 115926111.0, + "step": 4630 + }, + { + "epoch": 0.5085657808038656, + "grad_norm": 2.1682381629943848, + "learning_rate": 1e-06, + "loss": 1.0478, + "mean_token_accuracy": 0.6877251267433167, + "num_tokens": 115954348.0, + "step": 4631 + }, + { + "epoch": 0.5086755985064793, + "grad_norm": 2.362847089767456, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6954969167709351, + "num_tokens": 115978551.0, + "step": 4632 + }, + { + "epoch": 0.5087854162090929, + "grad_norm": 2.3341224193573, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.7125018239021301, + "num_tokens": 116001267.0, + "step": 4633 + }, + { + "epoch": 0.5088952339117065, + "grad_norm": 2.383460283279419, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7030583024024963, + "num_tokens": 116024219.0, + "step": 4634 + }, + { + "epoch": 0.5090050516143202, + "grad_norm": 2.4077625274658203, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7256053686141968, + "num_tokens": 116044599.0, + "step": 4635 + }, + { + "epoch": 0.5091148693169338, + "grad_norm": 2.4318490028381348, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7027112245559692, + "num_tokens": 116066997.0, + "step": 4636 + }, + { + "epoch": 0.5092246870195476, + "grad_norm": 2.228278160095215, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.6858443021774292, + "num_tokens": 116093124.0, + "step": 4637 + }, + { + "epoch": 0.5093345047221612, + "grad_norm": 2.118548631668091, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7136583924293518, + "num_tokens": 116118762.0, + "step": 4638 + }, + { + "epoch": 0.5094443224247749, + "grad_norm": 1.9348098039627075, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6976549029350281, + "num_tokens": 116151752.0, + "step": 4639 + }, + { + "epoch": 0.5095541401273885, + "grad_norm": 2.294064521789551, + "learning_rate": 
1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6895619034767151, + "num_tokens": 116176850.0, + "step": 4640 + }, + { + "epoch": 0.5096639578300022, + "grad_norm": 2.3324100971221924, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7015361785888672, + "num_tokens": 116200222.0, + "step": 4641 + }, + { + "epoch": 0.5097737755326158, + "grad_norm": 2.1637415885925293, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.700756311416626, + "num_tokens": 116226967.0, + "step": 4642 + }, + { + "epoch": 0.5098835932352295, + "grad_norm": 1.9520143270492554, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7159130573272705, + "num_tokens": 116258076.0, + "step": 4643 + }, + { + "epoch": 0.5099934109378432, + "grad_norm": 2.4428892135620117, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7122332453727722, + "num_tokens": 116279172.0, + "step": 4644 + }, + { + "epoch": 0.5101032286404569, + "grad_norm": 1.9668560028076172, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.703039288520813, + "num_tokens": 116310603.0, + "step": 4645 + }, + { + "epoch": 0.5102130463430705, + "grad_norm": 2.5386481285095215, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6833741664886475, + "num_tokens": 116331999.0, + "step": 4646 + }, + { + "epoch": 0.5103228640456842, + "grad_norm": 2.5929713249206543, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6980840563774109, + "num_tokens": 116352597.0, + "step": 4647 + }, + { + "epoch": 0.5104326817482978, + "grad_norm": 2.010820150375366, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6998122334480286, + "num_tokens": 116381105.0, + "step": 4648 + }, + { + "epoch": 0.5105424994509115, + "grad_norm": 2.2419002056121826, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6885347366333008, + "num_tokens": 116405891.0, + "step": 4649 + }, + { + 
"epoch": 0.5106523171535251, + "grad_norm": 2.092132091522217, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7031567096710205, + "num_tokens": 116432838.0, + "step": 4650 + }, + { + "epoch": 0.5107621348561389, + "grad_norm": 2.305746078491211, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7297275066375732, + "num_tokens": 116454584.0, + "step": 4651 + }, + { + "epoch": 0.5108719525587525, + "grad_norm": 2.473470449447632, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6928187608718872, + "num_tokens": 116475605.0, + "step": 4652 + }, + { + "epoch": 0.5109817702613662, + "grad_norm": 2.218221664428711, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7036603093147278, + "num_tokens": 116500231.0, + "step": 4653 + }, + { + "epoch": 0.5110915879639798, + "grad_norm": 2.1083009243011475, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7148185968399048, + "num_tokens": 116526082.0, + "step": 4654 + }, + { + "epoch": 0.5112014056665934, + "grad_norm": 2.077455520629883, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7219197750091553, + "num_tokens": 116551563.0, + "step": 4655 + }, + { + "epoch": 0.5113112233692071, + "grad_norm": 1.8714847564697266, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7279554605484009, + "num_tokens": 116581224.0, + "step": 4656 + }, + { + "epoch": 0.5114210410718207, + "grad_norm": 1.893817663192749, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6987254619598389, + "num_tokens": 116613826.0, + "step": 4657 + }, + { + "epoch": 0.5115308587744345, + "grad_norm": 2.1962831020355225, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.683137059211731, + "num_tokens": 116640294.0, + "step": 4658 + }, + { + "epoch": 0.5116406764770481, + "grad_norm": 2.595055103302002, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 
0.7049845457077026, + "num_tokens": 116659576.0, + "step": 4659 + }, + { + "epoch": 0.5117504941796618, + "grad_norm": 2.127345085144043, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6916764378547668, + "num_tokens": 116690015.0, + "step": 4660 + }, + { + "epoch": 0.5118603118822754, + "grad_norm": 1.9764095544815063, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7101507186889648, + "num_tokens": 116719154.0, + "step": 4661 + }, + { + "epoch": 0.5119701295848891, + "grad_norm": 2.1125998497009277, + "learning_rate": 1e-06, + "loss": 1.0358, + "mean_token_accuracy": 0.6933090686798096, + "num_tokens": 116747362.0, + "step": 4662 + }, + { + "epoch": 0.5120799472875027, + "grad_norm": 2.0585830211639404, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.701716959476471, + "num_tokens": 116776865.0, + "step": 4663 + }, + { + "epoch": 0.5121897649901164, + "grad_norm": 2.312246799468994, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7156381607055664, + "num_tokens": 116799096.0, + "step": 4664 + }, + { + "epoch": 0.51229958269273, + "grad_norm": 2.4041104316711426, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7075163722038269, + "num_tokens": 116821224.0, + "step": 4665 + }, + { + "epoch": 0.5124094003953438, + "grad_norm": 2.0023787021636963, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.6995139718055725, + "num_tokens": 116850932.0, + "step": 4666 + }, + { + "epoch": 0.5125192180979574, + "grad_norm": 2.3842082023620605, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.7081632018089294, + "num_tokens": 116874940.0, + "step": 4667 + }, + { + "epoch": 0.5126290358005711, + "grad_norm": 2.071061372756958, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7154207229614258, + "num_tokens": 116901295.0, + "step": 4668 + }, + { + "epoch": 0.5127388535031847, + "grad_norm": 
2.526183843612671, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7354394793510437, + "num_tokens": 116919140.0, + "step": 4669 + }, + { + "epoch": 0.5128486712057984, + "grad_norm": 2.196815013885498, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7096397280693054, + "num_tokens": 116945592.0, + "step": 4670 + }, + { + "epoch": 0.512958488908412, + "grad_norm": 2.3055291175842285, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7035836577415466, + "num_tokens": 116968614.0, + "step": 4671 + }, + { + "epoch": 0.5130683066110256, + "grad_norm": 2.6755309104919434, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6914359331130981, + "num_tokens": 116989008.0, + "step": 4672 + }, + { + "epoch": 0.5131781243136394, + "grad_norm": 2.361799478530884, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7061919569969177, + "num_tokens": 117011308.0, + "step": 4673 + }, + { + "epoch": 0.513287942016253, + "grad_norm": 2.127537727355957, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6946712732315063, + "num_tokens": 117038973.0, + "step": 4674 + }, + { + "epoch": 0.5133977597188667, + "grad_norm": 2.5046539306640625, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7163411378860474, + "num_tokens": 117059665.0, + "step": 4675 + }, + { + "epoch": 0.5135075774214803, + "grad_norm": 2.4634265899658203, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.708027720451355, + "num_tokens": 117080861.0, + "step": 4676 + }, + { + "epoch": 0.513617395124094, + "grad_norm": 1.9590566158294678, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7165453433990479, + "num_tokens": 117110297.0, + "step": 4677 + }, + { + "epoch": 0.5137272128267076, + "grad_norm": 2.297675371170044, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7291033267974854, + "num_tokens": 
117133729.0, + "step": 4678 + }, + { + "epoch": 0.5138370305293213, + "grad_norm": 2.8581466674804688, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.711777925491333, + "num_tokens": 117149866.0, + "step": 4679 + }, + { + "epoch": 0.513946848231935, + "grad_norm": 2.6740965843200684, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7130720019340515, + "num_tokens": 117169189.0, + "step": 4680 + }, + { + "epoch": 0.5140566659345487, + "grad_norm": 2.5357420444488525, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7241819500923157, + "num_tokens": 117187369.0, + "step": 4681 + }, + { + "epoch": 0.5141664836371623, + "grad_norm": 2.4175548553466797, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7109448909759521, + "num_tokens": 117208334.0, + "step": 4682 + }, + { + "epoch": 0.514276301339776, + "grad_norm": 2.1007094383239746, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7075743675231934, + "num_tokens": 117235648.0, + "step": 4683 + }, + { + "epoch": 0.5143861190423896, + "grad_norm": 2.2101404666900635, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7145180702209473, + "num_tokens": 117260694.0, + "step": 4684 + }, + { + "epoch": 0.5144959367450033, + "grad_norm": 2.1491217613220215, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7278783321380615, + "num_tokens": 117286229.0, + "step": 4685 + }, + { + "epoch": 0.5146057544476169, + "grad_norm": 2.044541835784912, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.708425760269165, + "num_tokens": 117314853.0, + "step": 4686 + }, + { + "epoch": 0.5147155721502307, + "grad_norm": 2.403475046157837, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6963557004928589, + "num_tokens": 117336426.0, + "step": 4687 + }, + { + "epoch": 0.5148253898528443, + "grad_norm": 2.646143674850464, + "learning_rate": 1e-06, + 
"loss": 1.0256, + "mean_token_accuracy": 0.69196617603302, + "num_tokens": 117356466.0, + "step": 4688 + }, + { + "epoch": 0.514935207555458, + "grad_norm": 2.189760684967041, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6925877928733826, + "num_tokens": 117381749.0, + "step": 4689 + }, + { + "epoch": 0.5150450252580716, + "grad_norm": 2.2200114727020264, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7096095681190491, + "num_tokens": 117405767.0, + "step": 4690 + }, + { + "epoch": 0.5151548429606853, + "grad_norm": 2.071016311645508, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7122044563293457, + "num_tokens": 117433147.0, + "step": 4691 + }, + { + "epoch": 0.5152646606632989, + "grad_norm": 1.9854031801223755, + "learning_rate": 1e-06, + "loss": 1.0617, + "mean_token_accuracy": 0.674919605255127, + "num_tokens": 117463239.0, + "step": 4692 + }, + { + "epoch": 0.5153744783659125, + "grad_norm": 2.4661989212036133, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6796897649765015, + "num_tokens": 117486119.0, + "step": 4693 + }, + { + "epoch": 0.5154842960685262, + "grad_norm": 2.519202470779419, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7219575047492981, + "num_tokens": 117505473.0, + "step": 4694 + }, + { + "epoch": 0.51559411377114, + "grad_norm": 2.446943759918213, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6953016519546509, + "num_tokens": 117528403.0, + "step": 4695 + }, + { + "epoch": 0.5157039314737536, + "grad_norm": 2.2955739498138428, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7284391522407532, + "num_tokens": 117550360.0, + "step": 4696 + }, + { + "epoch": 0.5158137491763672, + "grad_norm": 2.1986746788024902, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7201170921325684, + "num_tokens": 117575096.0, + "step": 4697 + }, + { + "epoch": 
0.5159235668789809, + "grad_norm": 2.214146137237549, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7353412508964539, + "num_tokens": 117599088.0, + "step": 4698 + }, + { + "epoch": 0.5160333845815945, + "grad_norm": 1.9876861572265625, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6896493434906006, + "num_tokens": 117631829.0, + "step": 4699 + }, + { + "epoch": 0.5161432022842082, + "grad_norm": 2.127260446548462, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.7221897840499878, + "num_tokens": 117658141.0, + "step": 4700 + }, + { + "epoch": 0.5162530199868218, + "grad_norm": 2.240013837814331, + "learning_rate": 1e-06, + "loss": 0.971, + "mean_token_accuracy": 0.6992436647415161, + "num_tokens": 117683074.0, + "step": 4701 + }, + { + "epoch": 0.5163628376894356, + "grad_norm": 2.275390148162842, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7079893350601196, + "num_tokens": 117707857.0, + "step": 4702 + }, + { + "epoch": 0.5164726553920492, + "grad_norm": 2.2139647006988525, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.719459593296051, + "num_tokens": 117733198.0, + "step": 4703 + }, + { + "epoch": 0.5165824730946629, + "grad_norm": 1.9021540880203247, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6961828470230103, + "num_tokens": 117765084.0, + "step": 4704 + }, + { + "epoch": 0.5166922907972765, + "grad_norm": 2.226203441619873, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7203812003135681, + "num_tokens": 117788430.0, + "step": 4705 + }, + { + "epoch": 0.5168021084998902, + "grad_norm": 2.104696035385132, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7094550132751465, + "num_tokens": 117814603.0, + "step": 4706 + }, + { + "epoch": 0.5169119262025038, + "grad_norm": 2.336667060852051, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 
0.7093422412872314, + "num_tokens": 117836388.0, + "step": 4707 + }, + { + "epoch": 0.5170217439051175, + "grad_norm": 2.6604983806610107, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7167307734489441, + "num_tokens": 117857096.0, + "step": 4708 + }, + { + "epoch": 0.5171315616077312, + "grad_norm": 2.3802578449249268, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7212238311767578, + "num_tokens": 117878180.0, + "step": 4709 + }, + { + "epoch": 0.5172413793103449, + "grad_norm": 2.5503852367401123, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7294797301292419, + "num_tokens": 117897002.0, + "step": 4710 + }, + { + "epoch": 0.5173511970129585, + "grad_norm": 2.1298115253448486, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.723776638507843, + "num_tokens": 117924035.0, + "step": 4711 + }, + { + "epoch": 0.5174610147155722, + "grad_norm": 2.535512685775757, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7020412087440491, + "num_tokens": 117944991.0, + "step": 4712 + }, + { + "epoch": 0.5175708324181858, + "grad_norm": 2.460376501083374, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7165964245796204, + "num_tokens": 117964801.0, + "step": 4713 + }, + { + "epoch": 0.5176806501207994, + "grad_norm": 2.1580753326416016, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6926525831222534, + "num_tokens": 117989909.0, + "step": 4714 + }, + { + "epoch": 0.5177904678234131, + "grad_norm": 2.3040544986724854, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.7043287754058838, + "num_tokens": 118013294.0, + "step": 4715 + }, + { + "epoch": 0.5179002855260268, + "grad_norm": 2.510777235031128, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7051711082458496, + "num_tokens": 118034285.0, + "step": 4716 + }, + { + "epoch": 0.5180101032286405, + "grad_norm": 
2.1135122776031494, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6901902556419373, + "num_tokens": 118062902.0, + "step": 4717 + }, + { + "epoch": 0.5181199209312541, + "grad_norm": 2.5223751068115234, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7097904086112976, + "num_tokens": 118084207.0, + "step": 4718 + }, + { + "epoch": 0.5182297386338678, + "grad_norm": 2.0297493934631348, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7030221223831177, + "num_tokens": 118114867.0, + "step": 4719 + }, + { + "epoch": 0.5183395563364814, + "grad_norm": 2.0862643718719482, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.704313337802887, + "num_tokens": 118141608.0, + "step": 4720 + }, + { + "epoch": 0.5184493740390951, + "grad_norm": 1.8223108053207397, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.7038632035255432, + "num_tokens": 118173092.0, + "step": 4721 + }, + { + "epoch": 0.5185591917417087, + "grad_norm": 2.170659065246582, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7218115329742432, + "num_tokens": 118201579.0, + "step": 4722 + }, + { + "epoch": 0.5186690094443224, + "grad_norm": 2.118659257888794, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7017890214920044, + "num_tokens": 118228318.0, + "step": 4723 + }, + { + "epoch": 0.5187788271469361, + "grad_norm": 2.2642555236816406, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6914488077163696, + "num_tokens": 118251995.0, + "step": 4724 + }, + { + "epoch": 0.5188886448495498, + "grad_norm": 2.0462071895599365, + "learning_rate": 1e-06, + "loss": 1.019, + "mean_token_accuracy": 0.6893592476844788, + "num_tokens": 118280821.0, + "step": 4725 + }, + { + "epoch": 0.5189984625521634, + "grad_norm": 2.3897459506988525, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7025685906410217, + "num_tokens": 
118302659.0, + "step": 4726 + }, + { + "epoch": 0.5191082802547771, + "grad_norm": 2.4533259868621826, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7198770046234131, + "num_tokens": 118323254.0, + "step": 4727 + }, + { + "epoch": 0.5192180979573907, + "grad_norm": 2.1910133361816406, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6927754878997803, + "num_tokens": 118350773.0, + "step": 4728 + }, + { + "epoch": 0.5193279156600044, + "grad_norm": 2.2495617866516113, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.693990170955658, + "num_tokens": 118376024.0, + "step": 4729 + }, + { + "epoch": 0.519437733362618, + "grad_norm": 2.5885162353515625, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7178393006324768, + "num_tokens": 118397090.0, + "step": 4730 + }, + { + "epoch": 0.5195475510652318, + "grad_norm": 2.2376561164855957, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7290949821472168, + "num_tokens": 118420262.0, + "step": 4731 + }, + { + "epoch": 0.5196573687678454, + "grad_norm": 2.1528913974761963, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7143964767456055, + "num_tokens": 118447141.0, + "step": 4732 + }, + { + "epoch": 0.519767186470459, + "grad_norm": 2.065175771713257, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6942490935325623, + "num_tokens": 118473376.0, + "step": 4733 + }, + { + "epoch": 0.5198770041730727, + "grad_norm": 2.2624781131744385, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7369397878646851, + "num_tokens": 118496758.0, + "step": 4734 + }, + { + "epoch": 0.5199868218756863, + "grad_norm": 2.3068044185638428, + "learning_rate": 1e-06, + "loss": 1.1049, + "mean_token_accuracy": 0.6688406467437744, + "num_tokens": 118519747.0, + "step": 4735 + }, + { + "epoch": 0.5200966395783, + "grad_norm": 2.097423791885376, + "learning_rate": 1e-06, + 
"loss": 1.0155, + "mean_token_accuracy": 0.6892343759536743, + "num_tokens": 118548487.0, + "step": 4736 + }, + { + "epoch": 0.5202064572809136, + "grad_norm": 2.2559399604797363, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.715563178062439, + "num_tokens": 118571547.0, + "step": 4737 + }, + { + "epoch": 0.5203162749835274, + "grad_norm": 2.1306679248809814, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7137671709060669, + "num_tokens": 118597616.0, + "step": 4738 + }, + { + "epoch": 0.520426092686141, + "grad_norm": 2.3197546005249023, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.697422981262207, + "num_tokens": 118621349.0, + "step": 4739 + }, + { + "epoch": 0.5205359103887547, + "grad_norm": 2.4969942569732666, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7030166387557983, + "num_tokens": 118643578.0, + "step": 4740 + }, + { + "epoch": 0.5206457280913683, + "grad_norm": 2.213564395904541, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7148270606994629, + "num_tokens": 118667572.0, + "step": 4741 + }, + { + "epoch": 0.520755545793982, + "grad_norm": 2.296647787094116, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7125698924064636, + "num_tokens": 118690162.0, + "step": 4742 + }, + { + "epoch": 0.5208653634965956, + "grad_norm": 2.470338821411133, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7162607312202454, + "num_tokens": 118719330.0, + "step": 4743 + }, + { + "epoch": 0.5209751811992093, + "grad_norm": 2.3367645740509033, + "learning_rate": 1e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.6756221055984497, + "num_tokens": 118743976.0, + "step": 4744 + }, + { + "epoch": 0.521084998901823, + "grad_norm": 2.1443183422088623, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7124234437942505, + "num_tokens": 118769410.0, + "step": 4745 + }, + { + "epoch": 
0.5211948166044367, + "grad_norm": 2.2527830600738525, + "learning_rate": 1e-06, + "loss": 1.0689, + "mean_token_accuracy": 0.6864906549453735, + "num_tokens": 118793921.0, + "step": 4746 + }, + { + "epoch": 0.5213046343070503, + "grad_norm": 2.3038382530212402, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.716766357421875, + "num_tokens": 118816480.0, + "step": 4747 + }, + { + "epoch": 0.521414452009664, + "grad_norm": 2.3361964225769043, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7004505395889282, + "num_tokens": 118839168.0, + "step": 4748 + }, + { + "epoch": 0.5215242697122776, + "grad_norm": 1.8647717237472534, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6899526119232178, + "num_tokens": 118873861.0, + "step": 4749 + }, + { + "epoch": 0.5216340874148913, + "grad_norm": 2.4633209705352783, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7057797312736511, + "num_tokens": 118893798.0, + "step": 4750 + }, + { + "epoch": 0.5217439051175049, + "grad_norm": 2.303466558456421, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7101876139640808, + "num_tokens": 118916766.0, + "step": 4751 + }, + { + "epoch": 0.5218537228201185, + "grad_norm": 2.30367112159729, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7186335325241089, + "num_tokens": 118940741.0, + "step": 4752 + }, + { + "epoch": 0.5219635405227323, + "grad_norm": 2.15748929977417, + "learning_rate": 1e-06, + "loss": 1.0348, + "mean_token_accuracy": 0.687187671661377, + "num_tokens": 118968260.0, + "step": 4753 + }, + { + "epoch": 0.522073358225346, + "grad_norm": 2.040541410446167, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7033005356788635, + "num_tokens": 118998478.0, + "step": 4754 + }, + { + "epoch": 0.5221831759279596, + "grad_norm": 1.9592105150222778, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 
0.6894617080688477, + "num_tokens": 119031092.0, + "step": 4755 + }, + { + "epoch": 0.5222929936305732, + "grad_norm": 2.256347179412842, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.701501727104187, + "num_tokens": 119054910.0, + "step": 4756 + }, + { + "epoch": 0.5224028113331869, + "grad_norm": 2.093973159790039, + "learning_rate": 1e-06, + "loss": 1.0844, + "mean_token_accuracy": 0.6729403734207153, + "num_tokens": 119084881.0, + "step": 4757 + }, + { + "epoch": 0.5225126290358005, + "grad_norm": 2.252063512802124, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6910134553909302, + "num_tokens": 119111768.0, + "step": 4758 + }, + { + "epoch": 0.5226224467384142, + "grad_norm": 1.9955905675888062, + "learning_rate": 1e-06, + "loss": 1.073, + "mean_token_accuracy": 0.680512547492981, + "num_tokens": 119146203.0, + "step": 4759 + }, + { + "epoch": 0.5227322644410279, + "grad_norm": 2.104452610015869, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7270776033401489, + "num_tokens": 119171517.0, + "step": 4760 + }, + { + "epoch": 0.5228420821436416, + "grad_norm": 2.0529799461364746, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7107319831848145, + "num_tokens": 119199192.0, + "step": 4761 + }, + { + "epoch": 0.5229518998462552, + "grad_norm": 2.2304446697235107, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7194983959197998, + "num_tokens": 119223516.0, + "step": 4762 + }, + { + "epoch": 0.5230617175488689, + "grad_norm": 2.1994783878326416, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7312108278274536, + "num_tokens": 119246612.0, + "step": 4763 + }, + { + "epoch": 0.5231715352514825, + "grad_norm": 2.3616607189178467, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7154322266578674, + "num_tokens": 119268554.0, + "step": 4764 + }, + { + "epoch": 0.5232813529540962, + "grad_norm": 
2.460073232650757, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7201733589172363, + "num_tokens": 119287884.0, + "step": 4765 + }, + { + "epoch": 0.5233911706567098, + "grad_norm": 2.231926679611206, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.6981274485588074, + "num_tokens": 119314066.0, + "step": 4766 + }, + { + "epoch": 0.5235009883593236, + "grad_norm": 2.153723955154419, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7003023028373718, + "num_tokens": 119340818.0, + "step": 4767 + }, + { + "epoch": 0.5236108060619372, + "grad_norm": 2.65195631980896, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7034881114959717, + "num_tokens": 119360248.0, + "step": 4768 + }, + { + "epoch": 0.5237206237645509, + "grad_norm": 2.3931243419647217, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7108672857284546, + "num_tokens": 119382306.0, + "step": 4769 + }, + { + "epoch": 0.5238304414671645, + "grad_norm": 1.9864267110824585, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7108386754989624, + "num_tokens": 119411981.0, + "step": 4770 + }, + { + "epoch": 0.5239402591697782, + "grad_norm": 2.2470149993896484, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.73492032289505, + "num_tokens": 119435054.0, + "step": 4771 + }, + { + "epoch": 0.5240500768723918, + "grad_norm": 1.913802981376648, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7267937660217285, + "num_tokens": 119465936.0, + "step": 4772 + }, + { + "epoch": 0.5241598945750054, + "grad_norm": 2.2609574794769287, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7212262153625488, + "num_tokens": 119487075.0, + "step": 4773 + }, + { + "epoch": 0.5242697122776192, + "grad_norm": 2.17144775390625, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7073639035224915, + "num_tokens": 119512218.0, + 
"step": 4774 + }, + { + "epoch": 0.5243795299802329, + "grad_norm": 2.165947914123535, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.6961115002632141, + "num_tokens": 119538691.0, + "step": 4775 + }, + { + "epoch": 0.5244893476828465, + "grad_norm": 2.0490880012512207, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6943403482437134, + "num_tokens": 119566001.0, + "step": 4776 + }, + { + "epoch": 0.5245991653854601, + "grad_norm": 2.3736960887908936, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7338379621505737, + "num_tokens": 119587063.0, + "step": 4777 + }, + { + "epoch": 0.5247089830880738, + "grad_norm": 2.1942479610443115, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7393184304237366, + "num_tokens": 119611106.0, + "step": 4778 + }, + { + "epoch": 0.5248188007906874, + "grad_norm": 2.745418071746826, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6911475658416748, + "num_tokens": 119630176.0, + "step": 4779 + }, + { + "epoch": 0.5249286184933011, + "grad_norm": 2.140263795852661, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7069135904312134, + "num_tokens": 119657757.0, + "step": 4780 + }, + { + "epoch": 0.5250384361959147, + "grad_norm": 2.1884756088256836, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6998007297515869, + "num_tokens": 119684337.0, + "step": 4781 + }, + { + "epoch": 0.5251482538985285, + "grad_norm": 2.2308294773101807, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.7049151659011841, + "num_tokens": 119708941.0, + "step": 4782 + }, + { + "epoch": 0.5252580716011421, + "grad_norm": 2.176218271255493, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6951507329940796, + "num_tokens": 119736664.0, + "step": 4783 + }, + { + "epoch": 0.5253678893037558, + "grad_norm": 2.167053461074829, + "learning_rate": 1e-06, + "loss": 1.0419, 
+ "mean_token_accuracy": 0.6891961097717285, + "num_tokens": 119765730.0, + "step": 4784 + }, + { + "epoch": 0.5254777070063694, + "grad_norm": 2.5924181938171387, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7314404249191284, + "num_tokens": 119783392.0, + "step": 4785 + }, + { + "epoch": 0.5255875247089831, + "grad_norm": 2.3940796852111816, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6943274736404419, + "num_tokens": 119805808.0, + "step": 4786 + }, + { + "epoch": 0.5256973424115967, + "grad_norm": 2.285648822784424, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6951341032981873, + "num_tokens": 119828303.0, + "step": 4787 + }, + { + "epoch": 0.5258071601142104, + "grad_norm": 2.398040294647217, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.708734393119812, + "num_tokens": 119849902.0, + "step": 4788 + }, + { + "epoch": 0.5259169778168241, + "grad_norm": 2.3086891174316406, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7254326343536377, + "num_tokens": 119872160.0, + "step": 4789 + }, + { + "epoch": 0.5260267955194378, + "grad_norm": 2.382843494415283, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7029953002929688, + "num_tokens": 119893919.0, + "step": 4790 + }, + { + "epoch": 0.5261366132220514, + "grad_norm": 2.347370147705078, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7037035226821899, + "num_tokens": 119918893.0, + "step": 4791 + }, + { + "epoch": 0.526246430924665, + "grad_norm": 2.3337416648864746, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6965309977531433, + "num_tokens": 119943746.0, + "step": 4792 + }, + { + "epoch": 0.5263562486272787, + "grad_norm": 2.3498623371124268, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7081422805786133, + "num_tokens": 119966139.0, + "step": 4793 + }, + { + "epoch": 0.5264660663298923, + 
"grad_norm": 2.1528780460357666, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6994706988334656, + "num_tokens": 119991622.0, + "step": 4794 + }, + { + "epoch": 0.526575884032506, + "grad_norm": 2.0136616230010986, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.7003370523452759, + "num_tokens": 120019287.0, + "step": 4795 + }, + { + "epoch": 0.5266857017351197, + "grad_norm": 2.181985855102539, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6779126524925232, + "num_tokens": 120045000.0, + "step": 4796 + }, + { + "epoch": 0.5267955194377334, + "grad_norm": 2.1905875205993652, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7167420387268066, + "num_tokens": 120068610.0, + "step": 4797 + }, + { + "epoch": 0.526905337140347, + "grad_norm": 2.2764458656311035, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7178940773010254, + "num_tokens": 120090957.0, + "step": 4798 + }, + { + "epoch": 0.5270151548429607, + "grad_norm": 2.2111284732818604, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.699187159538269, + "num_tokens": 120117962.0, + "step": 4799 + }, + { + "epoch": 0.5271249725455743, + "grad_norm": 2.601485252380371, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7061967849731445, + "num_tokens": 120136332.0, + "step": 4800 + }, + { + "epoch": 0.527234790248188, + "grad_norm": 2.0919651985168457, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6938210725784302, + "num_tokens": 120161445.0, + "step": 4801 + }, + { + "epoch": 0.5273446079508016, + "grad_norm": 2.436978816986084, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.70029217004776, + "num_tokens": 120184463.0, + "step": 4802 + }, + { + "epoch": 0.5274544256534154, + "grad_norm": 2.373162269592285, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6945359706878662, + "num_tokens": 
120207721.0, + "step": 4803 + }, + { + "epoch": 0.527564243356029, + "grad_norm": 2.1471707820892334, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6781659126281738, + "num_tokens": 120233598.0, + "step": 4804 + }, + { + "epoch": 0.5276740610586427, + "grad_norm": 2.141829490661621, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7154212594032288, + "num_tokens": 120259405.0, + "step": 4805 + }, + { + "epoch": 0.5277838787612563, + "grad_norm": 2.33250093460083, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7135348320007324, + "num_tokens": 120282352.0, + "step": 4806 + }, + { + "epoch": 0.52789369646387, + "grad_norm": 2.2124195098876953, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7095277309417725, + "num_tokens": 120310858.0, + "step": 4807 + }, + { + "epoch": 0.5280035141664836, + "grad_norm": 2.36824369430542, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7013230919837952, + "num_tokens": 120332826.0, + "step": 4808 + }, + { + "epoch": 0.5281133318690973, + "grad_norm": 2.1928794384002686, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.697516679763794, + "num_tokens": 120357814.0, + "step": 4809 + }, + { + "epoch": 0.528223149571711, + "grad_norm": 2.106821060180664, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7072129249572754, + "num_tokens": 120386175.0, + "step": 4810 + }, + { + "epoch": 0.5283329672743247, + "grad_norm": 2.307590961456299, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6889055371284485, + "num_tokens": 120409906.0, + "step": 4811 + }, + { + "epoch": 0.5284427849769383, + "grad_norm": 2.273155450820923, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7142837643623352, + "num_tokens": 120434953.0, + "step": 4812 + }, + { + "epoch": 0.528552602679552, + "grad_norm": 2.2082607746124268, + "learning_rate": 1e-06, + "loss": 
0.9696, + "mean_token_accuracy": 0.7016110420227051, + "num_tokens": 120462147.0, + "step": 4813 + }, + { + "epoch": 0.5286624203821656, + "grad_norm": 2.486043691635132, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7083593010902405, + "num_tokens": 120483256.0, + "step": 4814 + }, + { + "epoch": 0.5287722380847792, + "grad_norm": 1.9984524250030518, + "learning_rate": 1e-06, + "loss": 1.0838, + "mean_token_accuracy": 0.6818230152130127, + "num_tokens": 120512344.0, + "step": 4815 + }, + { + "epoch": 0.5288820557873929, + "grad_norm": 2.2052745819091797, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6946169137954712, + "num_tokens": 120538804.0, + "step": 4816 + }, + { + "epoch": 0.5289918734900065, + "grad_norm": 2.4606752395629883, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7217792272567749, + "num_tokens": 120558916.0, + "step": 4817 + }, + { + "epoch": 0.5291016911926203, + "grad_norm": 1.9277912378311157, + "learning_rate": 1e-06, + "loss": 1.0734, + "mean_token_accuracy": 0.6766065359115601, + "num_tokens": 120589782.0, + "step": 4818 + }, + { + "epoch": 0.5292115088952339, + "grad_norm": 2.244159460067749, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6948673725128174, + "num_tokens": 120617336.0, + "step": 4819 + }, + { + "epoch": 0.5293213265978476, + "grad_norm": 2.274968147277832, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7013518214225769, + "num_tokens": 120643028.0, + "step": 4820 + }, + { + "epoch": 0.5294311443004612, + "grad_norm": 2.169628620147705, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6932775974273682, + "num_tokens": 120670617.0, + "step": 4821 + }, + { + "epoch": 0.5295409620030749, + "grad_norm": 2.007962226867676, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6978750228881836, + "num_tokens": 120700735.0, + "step": 4822 + }, + { + "epoch": 
0.5296507797056885, + "grad_norm": 2.1085128784179688, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.7007365226745605, + "num_tokens": 120729189.0, + "step": 4823 + }, + { + "epoch": 0.5297605974083022, + "grad_norm": 2.2241103649139404, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7141356468200684, + "num_tokens": 120754656.0, + "step": 4824 + }, + { + "epoch": 0.5298704151109159, + "grad_norm": 2.198608160018921, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7051335573196411, + "num_tokens": 120781047.0, + "step": 4825 + }, + { + "epoch": 0.5299802328135296, + "grad_norm": 2.2807157039642334, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.714392900466919, + "num_tokens": 120804487.0, + "step": 4826 + }, + { + "epoch": 0.5300900505161432, + "grad_norm": 2.3811166286468506, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7161116600036621, + "num_tokens": 120827962.0, + "step": 4827 + }, + { + "epoch": 0.5301998682187569, + "grad_norm": 2.158242702484131, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7009866237640381, + "num_tokens": 120852999.0, + "step": 4828 + }, + { + "epoch": 0.5303096859213705, + "grad_norm": 2.3933327198028564, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7128212451934814, + "num_tokens": 120873195.0, + "step": 4829 + }, + { + "epoch": 0.5304195036239842, + "grad_norm": 2.0600650310516357, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.700300931930542, + "num_tokens": 120903815.0, + "step": 4830 + }, + { + "epoch": 0.5305293213265978, + "grad_norm": 2.086337089538574, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7251249551773071, + "num_tokens": 120930766.0, + "step": 4831 + }, + { + "epoch": 0.5306391390292116, + "grad_norm": 2.183229923248291, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 
0.6963748335838318, + "num_tokens": 120955649.0, + "step": 4832 + }, + { + "epoch": 0.5307489567318252, + "grad_norm": 2.574371576309204, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.6976003646850586, + "num_tokens": 120976273.0, + "step": 4833 + }, + { + "epoch": 0.5308587744344389, + "grad_norm": 2.3712046146392822, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7465797662734985, + "num_tokens": 120997211.0, + "step": 4834 + }, + { + "epoch": 0.5309685921370525, + "grad_norm": 2.333718776702881, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6972888112068176, + "num_tokens": 121022974.0, + "step": 4835 + }, + { + "epoch": 0.5310784098396661, + "grad_norm": 2.307166337966919, + "learning_rate": 1e-06, + "loss": 1.0577, + "mean_token_accuracy": 0.6940905451774597, + "num_tokens": 121046527.0, + "step": 4836 + }, + { + "epoch": 0.5311882275422798, + "grad_norm": 2.3452422618865967, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7224195003509521, + "num_tokens": 121068657.0, + "step": 4837 + }, + { + "epoch": 0.5312980452448934, + "grad_norm": 2.1546106338500977, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7132494449615479, + "num_tokens": 121094283.0, + "step": 4838 + }, + { + "epoch": 0.5314078629475072, + "grad_norm": 2.1826441287994385, + "learning_rate": 1e-06, + "loss": 1.0144, + "mean_token_accuracy": 0.6916441917419434, + "num_tokens": 121120430.0, + "step": 4839 + }, + { + "epoch": 0.5315176806501208, + "grad_norm": 2.4003982543945312, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7157273292541504, + "num_tokens": 121140321.0, + "step": 4840 + }, + { + "epoch": 0.5316274983527345, + "grad_norm": 2.06510329246521, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6812441349029541, + "num_tokens": 121168202.0, + "step": 4841 + }, + { + "epoch": 0.5317373160553481, + "grad_norm": 
2.3160834312438965, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7019897699356079, + "num_tokens": 121191210.0, + "step": 4842 + }, + { + "epoch": 0.5318471337579618, + "grad_norm": 2.1574149131774902, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7227843999862671, + "num_tokens": 121214510.0, + "step": 4843 + }, + { + "epoch": 0.5319569514605754, + "grad_norm": 2.472275972366333, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.7077071070671082, + "num_tokens": 121234516.0, + "step": 4844 + }, + { + "epoch": 0.5320667691631891, + "grad_norm": 2.3325023651123047, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6836584806442261, + "num_tokens": 121258107.0, + "step": 4845 + }, + { + "epoch": 0.5321765868658027, + "grad_norm": 2.2862484455108643, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6934945583343506, + "num_tokens": 121281628.0, + "step": 4846 + }, + { + "epoch": 0.5322864045684165, + "grad_norm": 2.1955182552337646, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7067083120346069, + "num_tokens": 121304766.0, + "step": 4847 + }, + { + "epoch": 0.5323962222710301, + "grad_norm": 1.9159132242202759, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6993940472602844, + "num_tokens": 121334797.0, + "step": 4848 + }, + { + "epoch": 0.5325060399736438, + "grad_norm": 2.282670497894287, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7247861623764038, + "num_tokens": 121358273.0, + "step": 4849 + }, + { + "epoch": 0.5326158576762574, + "grad_norm": 2.2852933406829834, + "learning_rate": 1e-06, + "loss": 1.0499, + "mean_token_accuracy": 0.6835412979125977, + "num_tokens": 121382715.0, + "step": 4850 + }, + { + "epoch": 0.532725675378871, + "grad_norm": 2.2372379302978516, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7045615911483765, + "num_tokens": 
121408842.0, + "step": 4851 + }, + { + "epoch": 0.5328354930814847, + "grad_norm": 2.3151819705963135, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7033963203430176, + "num_tokens": 121430683.0, + "step": 4852 + }, + { + "epoch": 0.5329453107840983, + "grad_norm": 2.1352596282958984, + "learning_rate": 1e-06, + "loss": 1.0363, + "mean_token_accuracy": 0.6823273301124573, + "num_tokens": 121459661.0, + "step": 4853 + }, + { + "epoch": 0.5330551284867121, + "grad_norm": 2.6821532249450684, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7027890682220459, + "num_tokens": 121478969.0, + "step": 4854 + }, + { + "epoch": 0.5331649461893258, + "grad_norm": 2.1027042865753174, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7008851766586304, + "num_tokens": 121507624.0, + "step": 4855 + }, + { + "epoch": 0.5332747638919394, + "grad_norm": 2.209362030029297, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.701768159866333, + "num_tokens": 121532156.0, + "step": 4856 + }, + { + "epoch": 0.533384581594553, + "grad_norm": 2.133925199508667, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7044340372085571, + "num_tokens": 121557811.0, + "step": 4857 + }, + { + "epoch": 0.5334943992971667, + "grad_norm": 2.258235454559326, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.720511794090271, + "num_tokens": 121582629.0, + "step": 4858 + }, + { + "epoch": 0.5336042169997803, + "grad_norm": 1.9984558820724487, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7017527222633362, + "num_tokens": 121610382.0, + "step": 4859 + }, + { + "epoch": 0.533714034702394, + "grad_norm": 2.010615587234497, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6984901428222656, + "num_tokens": 121641111.0, + "step": 4860 + }, + { + "epoch": 0.5338238524050077, + "grad_norm": 2.040555238723755, + "learning_rate": 1e-06, + 
"loss": 1.011, + "mean_token_accuracy": 0.6971588134765625, + "num_tokens": 121670123.0, + "step": 4861 + }, + { + "epoch": 0.5339336701076214, + "grad_norm": 2.214082717895508, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.704049825668335, + "num_tokens": 121695702.0, + "step": 4862 + }, + { + "epoch": 0.534043487810235, + "grad_norm": 1.8501254320144653, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.696652889251709, + "num_tokens": 121729501.0, + "step": 4863 + }, + { + "epoch": 0.5341533055128487, + "grad_norm": 2.0229601860046387, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6997360587120056, + "num_tokens": 121756349.0, + "step": 4864 + }, + { + "epoch": 0.5342631232154623, + "grad_norm": 2.129178285598755, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6962465047836304, + "num_tokens": 121783812.0, + "step": 4865 + }, + { + "epoch": 0.534372940918076, + "grad_norm": 2.5670721530914307, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7066208124160767, + "num_tokens": 121804432.0, + "step": 4866 + }, + { + "epoch": 0.5344827586206896, + "grad_norm": 2.4872984886169434, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7050726413726807, + "num_tokens": 121828023.0, + "step": 4867 + }, + { + "epoch": 0.5345925763233034, + "grad_norm": 2.5489425659179688, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7068506479263306, + "num_tokens": 121848535.0, + "step": 4868 + }, + { + "epoch": 0.534702394025917, + "grad_norm": 2.082958936691284, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7209365367889404, + "num_tokens": 121873834.0, + "step": 4869 + }, + { + "epoch": 0.5348122117285307, + "grad_norm": 2.3471808433532715, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.695710301399231, + "num_tokens": 121897264.0, + "step": 4870 + }, + { + "epoch": 
0.5349220294311443, + "grad_norm": 2.383910894393921, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6905857920646667, + "num_tokens": 121923855.0, + "step": 4871 + }, + { + "epoch": 0.535031847133758, + "grad_norm": 2.1904642581939697, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.721422553062439, + "num_tokens": 121946088.0, + "step": 4872 + }, + { + "epoch": 0.5351416648363716, + "grad_norm": 2.0956199169158936, + "learning_rate": 1e-06, + "loss": 1.1052, + "mean_token_accuracy": 0.6754823923110962, + "num_tokens": 121976862.0, + "step": 4873 + }, + { + "epoch": 0.5352514825389852, + "grad_norm": 2.140829086303711, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7078999280929565, + "num_tokens": 122002122.0, + "step": 4874 + }, + { + "epoch": 0.5353613002415989, + "grad_norm": 2.674445152282715, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7032299041748047, + "num_tokens": 122021565.0, + "step": 4875 + }, + { + "epoch": 0.5354711179442126, + "grad_norm": 2.724726438522339, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7157166004180908, + "num_tokens": 122040346.0, + "step": 4876 + }, + { + "epoch": 0.5355809356468263, + "grad_norm": 2.3387396335601807, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6834625005722046, + "num_tokens": 122063772.0, + "step": 4877 + }, + { + "epoch": 0.5356907533494399, + "grad_norm": 2.0793943405151367, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6746422648429871, + "num_tokens": 122095945.0, + "step": 4878 + }, + { + "epoch": 0.5358005710520536, + "grad_norm": 2.0443360805511475, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6959187388420105, + "num_tokens": 122123793.0, + "step": 4879 + }, + { + "epoch": 0.5359103887546672, + "grad_norm": 2.208794593811035, + "learning_rate": 1e-06, + "loss": 1.0901, + "mean_token_accuracy": 
0.6791372895240784, + "num_tokens": 122148966.0, + "step": 4880 + }, + { + "epoch": 0.5360202064572809, + "grad_norm": 2.0539398193359375, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.707831084728241, + "num_tokens": 122175149.0, + "step": 4881 + }, + { + "epoch": 0.5361300241598945, + "grad_norm": 2.457118511199951, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7065702676773071, + "num_tokens": 122195161.0, + "step": 4882 + }, + { + "epoch": 0.5362398418625083, + "grad_norm": 2.3306403160095215, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7255996465682983, + "num_tokens": 122217763.0, + "step": 4883 + }, + { + "epoch": 0.5363496595651219, + "grad_norm": 2.401726722717285, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7224991321563721, + "num_tokens": 122238345.0, + "step": 4884 + }, + { + "epoch": 0.5364594772677356, + "grad_norm": 2.5272669792175293, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7170008420944214, + "num_tokens": 122258161.0, + "step": 4885 + }, + { + "epoch": 0.5365692949703492, + "grad_norm": 2.1482174396514893, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.704652726650238, + "num_tokens": 122282870.0, + "step": 4886 + }, + { + "epoch": 0.5366791126729629, + "grad_norm": 2.4077165126800537, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7097983360290527, + "num_tokens": 122304801.0, + "step": 4887 + }, + { + "epoch": 0.5367889303755765, + "grad_norm": 2.1339495182037354, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7124436497688293, + "num_tokens": 122334384.0, + "step": 4888 + }, + { + "epoch": 0.5368987480781902, + "grad_norm": 2.4004263877868652, + "learning_rate": 1e-06, + "loss": 1.0587, + "mean_token_accuracy": 0.6832704544067383, + "num_tokens": 122359859.0, + "step": 4889 + }, + { + "epoch": 0.5370085657808039, + "grad_norm": 
1.9445661306381226, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.688225269317627, + "num_tokens": 122392306.0, + "step": 4890 + }, + { + "epoch": 0.5371183834834176, + "grad_norm": 2.5426647663116455, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7000741362571716, + "num_tokens": 122414955.0, + "step": 4891 + }, + { + "epoch": 0.5372282011860312, + "grad_norm": 2.3976762294769287, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7171187996864319, + "num_tokens": 122438315.0, + "step": 4892 + }, + { + "epoch": 0.5373380188886449, + "grad_norm": 1.9041246175765991, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6848347187042236, + "num_tokens": 122471130.0, + "step": 4893 + }, + { + "epoch": 0.5374478365912585, + "grad_norm": 2.264035224914551, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7138786315917969, + "num_tokens": 122494268.0, + "step": 4894 + }, + { + "epoch": 0.5375576542938721, + "grad_norm": 2.145705223083496, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6901879906654358, + "num_tokens": 122521271.0, + "step": 4895 + }, + { + "epoch": 0.5376674719964858, + "grad_norm": 2.334771156311035, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7137727737426758, + "num_tokens": 122542681.0, + "step": 4896 + }, + { + "epoch": 0.5377772896990995, + "grad_norm": 2.2070531845092773, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6984230279922485, + "num_tokens": 122571837.0, + "step": 4897 + }, + { + "epoch": 0.5378871074017132, + "grad_norm": 2.3596558570861816, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6854044795036316, + "num_tokens": 122596190.0, + "step": 4898 + }, + { + "epoch": 0.5379969251043268, + "grad_norm": 2.4925999641418457, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7012588381767273, + "num_tokens": 
122617969.0, + "step": 4899 + }, + { + "epoch": 0.5381067428069405, + "grad_norm": 2.364157199859619, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7036581635475159, + "num_tokens": 122641407.0, + "step": 4900 + }, + { + "epoch": 0.5382165605095541, + "grad_norm": 2.2619550228118896, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6938316822052002, + "num_tokens": 122667961.0, + "step": 4901 + }, + { + "epoch": 0.5383263782121678, + "grad_norm": 2.1505038738250732, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6911910772323608, + "num_tokens": 122693239.0, + "step": 4902 + }, + { + "epoch": 0.5384361959147814, + "grad_norm": 2.050480842590332, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6891028881072998, + "num_tokens": 122723038.0, + "step": 4903 + }, + { + "epoch": 0.5385460136173951, + "grad_norm": 2.12186336517334, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7030994892120361, + "num_tokens": 122752143.0, + "step": 4904 + }, + { + "epoch": 0.5386558313200088, + "grad_norm": 2.299118757247925, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6769891977310181, + "num_tokens": 122776775.0, + "step": 4905 + }, + { + "epoch": 0.5387656490226225, + "grad_norm": 2.4294772148132324, + "learning_rate": 1e-06, + "loss": 1.0921, + "mean_token_accuracy": 0.678036630153656, + "num_tokens": 122799534.0, + "step": 4906 + }, + { + "epoch": 0.5388754667252361, + "grad_norm": 2.5392234325408936, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7198467254638672, + "num_tokens": 122818698.0, + "step": 4907 + }, + { + "epoch": 0.5389852844278498, + "grad_norm": 2.473249673843384, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7294217944145203, + "num_tokens": 122839410.0, + "step": 4908 + }, + { + "epoch": 0.5390951021304634, + "grad_norm": 1.9518723487854004, + "learning_rate": 1e-06, + 
"loss": 0.9273, + "mean_token_accuracy": 0.7111078500747681, + "num_tokens": 122869169.0, + "step": 4909 + }, + { + "epoch": 0.5392049198330771, + "grad_norm": 2.1997311115264893, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7191931009292603, + "num_tokens": 122896151.0, + "step": 4910 + }, + { + "epoch": 0.5393147375356907, + "grad_norm": 2.2735989093780518, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7052299976348877, + "num_tokens": 122919171.0, + "step": 4911 + }, + { + "epoch": 0.5394245552383045, + "grad_norm": 2.4653592109680176, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6921359300613403, + "num_tokens": 122940613.0, + "step": 4912 + }, + { + "epoch": 0.5395343729409181, + "grad_norm": 2.2086105346679688, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6981198787689209, + "num_tokens": 122966318.0, + "step": 4913 + }, + { + "epoch": 0.5396441906435318, + "grad_norm": 2.157012462615967, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.700532078742981, + "num_tokens": 122992865.0, + "step": 4914 + }, + { + "epoch": 0.5397540083461454, + "grad_norm": 2.120954990386963, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7022964954376221, + "num_tokens": 123018147.0, + "step": 4915 + }, + { + "epoch": 0.539863826048759, + "grad_norm": 2.1091606616973877, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7142212390899658, + "num_tokens": 123044485.0, + "step": 4916 + }, + { + "epoch": 0.5399736437513727, + "grad_norm": 2.1404061317443848, + "learning_rate": 1e-06, + "loss": 1.0199, + "mean_token_accuracy": 0.691306471824646, + "num_tokens": 123070617.0, + "step": 4917 + }, + { + "epoch": 0.5400834614539863, + "grad_norm": 2.285478353500366, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.71168053150177, + "num_tokens": 123093168.0, + "step": 4918 + }, + { + "epoch": 
0.5401932791566001, + "grad_norm": 2.5698654651641846, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6763840913772583, + "num_tokens": 123113772.0, + "step": 4919 + }, + { + "epoch": 0.5403030968592137, + "grad_norm": 2.898068904876709, + "learning_rate": 1e-06, + "loss": 1.023, + "mean_token_accuracy": 0.6871685981750488, + "num_tokens": 123130363.0, + "step": 4920 + }, + { + "epoch": 0.5404129145618274, + "grad_norm": 2.0022919178009033, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.702654242515564, + "num_tokens": 123160696.0, + "step": 4921 + }, + { + "epoch": 0.540522732264441, + "grad_norm": 1.997599482536316, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6880185008049011, + "num_tokens": 123190496.0, + "step": 4922 + }, + { + "epoch": 0.5406325499670547, + "grad_norm": 2.302462339401245, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6946582794189453, + "num_tokens": 123215511.0, + "step": 4923 + }, + { + "epoch": 0.5407423676696683, + "grad_norm": 2.588183641433716, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.695818305015564, + "num_tokens": 123235239.0, + "step": 4924 + }, + { + "epoch": 0.540852185372282, + "grad_norm": 2.0754408836364746, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6847214698791504, + "num_tokens": 123265439.0, + "step": 4925 + }, + { + "epoch": 0.5409620030748957, + "grad_norm": 2.259153127670288, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7223727107048035, + "num_tokens": 123288341.0, + "step": 4926 + }, + { + "epoch": 0.5410718207775094, + "grad_norm": 2.1491496562957764, + "learning_rate": 1e-06, + "loss": 1.0768, + "mean_token_accuracy": 0.6818789839744568, + "num_tokens": 123314484.0, + "step": 4927 + }, + { + "epoch": 0.541181638480123, + "grad_norm": 2.1695480346679688, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 
0.7232207655906677, + "num_tokens": 123340194.0, + "step": 4928 + }, + { + "epoch": 0.5412914561827367, + "grad_norm": 2.2697720527648926, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6951824426651001, + "num_tokens": 123364526.0, + "step": 4929 + }, + { + "epoch": 0.5414012738853503, + "grad_norm": 2.5997939109802246, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7094098925590515, + "num_tokens": 123383755.0, + "step": 4930 + }, + { + "epoch": 0.541511091587964, + "grad_norm": 2.3369083404541016, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.714550256729126, + "num_tokens": 123404866.0, + "step": 4931 + }, + { + "epoch": 0.5416209092905776, + "grad_norm": 2.2713654041290283, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6978796720504761, + "num_tokens": 123428791.0, + "step": 4932 + }, + { + "epoch": 0.5417307269931912, + "grad_norm": 2.0612642765045166, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7055202126502991, + "num_tokens": 123455596.0, + "step": 4933 + }, + { + "epoch": 0.541840544695805, + "grad_norm": 2.16530704498291, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.6880929470062256, + "num_tokens": 123483679.0, + "step": 4934 + }, + { + "epoch": 0.5419503623984187, + "grad_norm": 2.505281448364258, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7133464813232422, + "num_tokens": 123503505.0, + "step": 4935 + }, + { + "epoch": 0.5420601801010323, + "grad_norm": 2.0510613918304443, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7270909547805786, + "num_tokens": 123530639.0, + "step": 4936 + }, + { + "epoch": 0.5421699978036459, + "grad_norm": 2.256894111633301, + "learning_rate": 1e-06, + "loss": 1.0949, + "mean_token_accuracy": 0.6750551462173462, + "num_tokens": 123554755.0, + "step": 4937 + }, + { + "epoch": 0.5422798155062596, + "grad_norm": 
2.0972607135772705, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6966363191604614, + "num_tokens": 123583875.0, + "step": 4938 + }, + { + "epoch": 0.5423896332088732, + "grad_norm": 2.1759064197540283, + "learning_rate": 1e-06, + "loss": 1.028, + "mean_token_accuracy": 0.700995683670044, + "num_tokens": 123610499.0, + "step": 4939 + }, + { + "epoch": 0.5424994509114869, + "grad_norm": 2.2011451721191406, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7031543850898743, + "num_tokens": 123635849.0, + "step": 4940 + }, + { + "epoch": 0.5426092686141006, + "grad_norm": 2.394848585128784, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6827645301818848, + "num_tokens": 123660327.0, + "step": 4941 + }, + { + "epoch": 0.5427190863167143, + "grad_norm": 2.370431423187256, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.73036128282547, + "num_tokens": 123682061.0, + "step": 4942 + }, + { + "epoch": 0.5428289040193279, + "grad_norm": 2.1944727897644043, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7135754823684692, + "num_tokens": 123705616.0, + "step": 4943 + }, + { + "epoch": 0.5429387217219416, + "grad_norm": 2.3060922622680664, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7150508165359497, + "num_tokens": 123728755.0, + "step": 4944 + }, + { + "epoch": 0.5430485394245552, + "grad_norm": 2.2243356704711914, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7170479893684387, + "num_tokens": 123753232.0, + "step": 4945 + }, + { + "epoch": 0.5431583571271689, + "grad_norm": 1.9693032503128052, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6926214098930359, + "num_tokens": 123784245.0, + "step": 4946 + }, + { + "epoch": 0.5432681748297825, + "grad_norm": 2.3530774116516113, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.701270580291748, + "num_tokens": 
123806647.0, + "step": 4947 + }, + { + "epoch": 0.5433779925323963, + "grad_norm": 2.3095791339874268, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7082593441009521, + "num_tokens": 123829635.0, + "step": 4948 + }, + { + "epoch": 0.5434878102350099, + "grad_norm": 2.224860906600952, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.689827561378479, + "num_tokens": 123854935.0, + "step": 4949 + }, + { + "epoch": 0.5435976279376236, + "grad_norm": 2.083159923553467, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7002982497215271, + "num_tokens": 123882772.0, + "step": 4950 + }, + { + "epoch": 0.5437074456402372, + "grad_norm": 2.273837089538574, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7004204988479614, + "num_tokens": 123906639.0, + "step": 4951 + }, + { + "epoch": 0.5438172633428509, + "grad_norm": 2.608854293823242, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.698512613773346, + "num_tokens": 123926351.0, + "step": 4952 + }, + { + "epoch": 0.5439270810454645, + "grad_norm": 2.4361979961395264, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7174496650695801, + "num_tokens": 123947788.0, + "step": 4953 + }, + { + "epoch": 0.5440368987480781, + "grad_norm": 2.026657819747925, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6753221750259399, + "num_tokens": 123979313.0, + "step": 4954 + }, + { + "epoch": 0.5441467164506919, + "grad_norm": 2.046088457107544, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7004643082618713, + "num_tokens": 124007059.0, + "step": 4955 + }, + { + "epoch": 0.5442565341533055, + "grad_norm": 2.224733352661133, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7044252753257751, + "num_tokens": 124033095.0, + "step": 4956 + }, + { + "epoch": 0.5443663518559192, + "grad_norm": 2.4299099445343018, + "learning_rate": 1e-06, + "loss": 
0.9384, + "mean_token_accuracy": 0.7210427522659302, + "num_tokens": 124054008.0, + "step": 4957 + }, + { + "epoch": 0.5444761695585328, + "grad_norm": 2.5017616748809814, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6936727166175842, + "num_tokens": 124074689.0, + "step": 4958 + }, + { + "epoch": 0.5445859872611465, + "grad_norm": 2.5246450901031494, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7213399410247803, + "num_tokens": 124094387.0, + "step": 4959 + }, + { + "epoch": 0.5446958049637601, + "grad_norm": 2.2487528324127197, + "learning_rate": 1e-06, + "loss": 1.0951, + "mean_token_accuracy": 0.6725839376449585, + "num_tokens": 124121136.0, + "step": 4960 + }, + { + "epoch": 0.5448056226663738, + "grad_norm": 2.2523937225341797, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.6792216300964355, + "num_tokens": 124145407.0, + "step": 4961 + }, + { + "epoch": 0.5449154403689875, + "grad_norm": 2.594806671142578, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7130778431892395, + "num_tokens": 124165153.0, + "step": 4962 + }, + { + "epoch": 0.5450252580716012, + "grad_norm": 2.413240671157837, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7131001353263855, + "num_tokens": 124187070.0, + "step": 4963 + }, + { + "epoch": 0.5451350757742148, + "grad_norm": 2.7544736862182617, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.705001711845398, + "num_tokens": 124209600.0, + "step": 4964 + }, + { + "epoch": 0.5452448934768285, + "grad_norm": 2.7243266105651855, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7172644734382629, + "num_tokens": 124227668.0, + "step": 4965 + }, + { + "epoch": 0.5453547111794421, + "grad_norm": 2.3950510025024414, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7173230648040771, + "num_tokens": 124249428.0, + "step": 4966 + }, + { + "epoch": 
0.5454645288820558, + "grad_norm": 2.005316734313965, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7061360478401184, + "num_tokens": 124280528.0, + "step": 4967 + }, + { + "epoch": 0.5455743465846694, + "grad_norm": 2.4616923332214355, + "learning_rate": 1e-06, + "loss": 1.0742, + "mean_token_accuracy": 0.6779187917709351, + "num_tokens": 124301837.0, + "step": 4968 + }, + { + "epoch": 0.5456841642872831, + "grad_norm": 2.098599910736084, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7093590497970581, + "num_tokens": 124330731.0, + "step": 4969 + }, + { + "epoch": 0.5457939819898968, + "grad_norm": 2.2608797550201416, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7117703557014465, + "num_tokens": 124354962.0, + "step": 4970 + }, + { + "epoch": 0.5459037996925105, + "grad_norm": 2.6046788692474365, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7211798429489136, + "num_tokens": 124374713.0, + "step": 4971 + }, + { + "epoch": 0.5460136173951241, + "grad_norm": 2.068726062774658, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7048380374908447, + "num_tokens": 124405860.0, + "step": 4972 + }, + { + "epoch": 0.5461234350977378, + "grad_norm": 2.4168941974639893, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6994431018829346, + "num_tokens": 124428161.0, + "step": 4973 + }, + { + "epoch": 0.5462332528003514, + "grad_norm": 2.1227617263793945, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7011575698852539, + "num_tokens": 124454825.0, + "step": 4974 + }, + { + "epoch": 0.546343070502965, + "grad_norm": 2.353029489517212, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7070932388305664, + "num_tokens": 124476314.0, + "step": 4975 + }, + { + "epoch": 0.5464528882055787, + "grad_norm": 2.109135150909424, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 
0.7008046507835388, + "num_tokens": 124503972.0, + "step": 4976 + }, + { + "epoch": 0.5465627059081924, + "grad_norm": 2.2028207778930664, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6899304389953613, + "num_tokens": 124529952.0, + "step": 4977 + }, + { + "epoch": 0.5466725236108061, + "grad_norm": 2.1976139545440674, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6995970606803894, + "num_tokens": 124556023.0, + "step": 4978 + }, + { + "epoch": 0.5467823413134197, + "grad_norm": 2.117490291595459, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6872194409370422, + "num_tokens": 124583312.0, + "step": 4979 + }, + { + "epoch": 0.5468921590160334, + "grad_norm": 2.0749807357788086, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7165813446044922, + "num_tokens": 124607737.0, + "step": 4980 + }, + { + "epoch": 0.547001976718647, + "grad_norm": 2.507270097732544, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6918860673904419, + "num_tokens": 124628320.0, + "step": 4981 + }, + { + "epoch": 0.5471117944212607, + "grad_norm": 2.3325307369232178, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7184538841247559, + "num_tokens": 124649722.0, + "step": 4982 + }, + { + "epoch": 0.5472216121238743, + "grad_norm": 2.621968984603882, + "learning_rate": 1e-06, + "loss": 1.0233, + "mean_token_accuracy": 0.6957963705062866, + "num_tokens": 124667605.0, + "step": 4983 + }, + { + "epoch": 0.5473314298264881, + "grad_norm": 2.1152212619781494, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7194949984550476, + "num_tokens": 124695393.0, + "step": 4984 + }, + { + "epoch": 0.5474412475291017, + "grad_norm": 2.6412999629974365, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7101783156394958, + "num_tokens": 124715145.0, + "step": 4985 + }, + { + "epoch": 0.5475510652317154, + "grad_norm": 
2.1228723526000977, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7251893281936646, + "num_tokens": 124741393.0, + "step": 4986 + }, + { + "epoch": 0.547660882934329, + "grad_norm": 2.3945751190185547, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7104148268699646, + "num_tokens": 124764422.0, + "step": 4987 + }, + { + "epoch": 0.5477707006369427, + "grad_norm": 2.1360220909118652, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.6922661662101746, + "num_tokens": 124791829.0, + "step": 4988 + }, + { + "epoch": 0.5478805183395563, + "grad_norm": 2.118361473083496, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7143331170082092, + "num_tokens": 124818775.0, + "step": 4989 + }, + { + "epoch": 0.54799033604217, + "grad_norm": 2.238548517227173, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7176289558410645, + "num_tokens": 124844686.0, + "step": 4990 + }, + { + "epoch": 0.5481001537447837, + "grad_norm": 2.529334783554077, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7001514434814453, + "num_tokens": 124864788.0, + "step": 4991 + }, + { + "epoch": 0.5482099714473974, + "grad_norm": 2.32296085357666, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7098082900047302, + "num_tokens": 124888236.0, + "step": 4992 + }, + { + "epoch": 0.548319789150011, + "grad_norm": 2.0945096015930176, + "learning_rate": 1e-06, + "loss": 1.1009, + "mean_token_accuracy": 0.671958327293396, + "num_tokens": 124917270.0, + "step": 4993 + }, + { + "epoch": 0.5484296068526247, + "grad_norm": 2.2111003398895264, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7237659096717834, + "num_tokens": 124941027.0, + "step": 4994 + }, + { + "epoch": 0.5485394245552383, + "grad_norm": 1.8782576322555542, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7065421938896179, + "num_tokens": 124972735.0, 
+ "step": 4995 + }, + { + "epoch": 0.5486492422578519, + "grad_norm": 2.101212739944458, + "learning_rate": 1e-06, + "loss": 1.101, + "mean_token_accuracy": 0.6680829524993896, + "num_tokens": 125002392.0, + "step": 4996 + }, + { + "epoch": 0.5487590599604656, + "grad_norm": 2.2407479286193848, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7086185216903687, + "num_tokens": 125026837.0, + "step": 4997 + }, + { + "epoch": 0.5488688776630792, + "grad_norm": 2.444483518600464, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7223760485649109, + "num_tokens": 125048450.0, + "step": 4998 + }, + { + "epoch": 0.548978695365693, + "grad_norm": 2.0686376094818115, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.6923125982284546, + "num_tokens": 125075702.0, + "step": 4999 + }, + { + "epoch": 0.5490885130683066, + "grad_norm": 2.632500171661377, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6937326788902283, + "num_tokens": 125094659.0, + "step": 5000 + }, + { + "epoch": 0.5491983307709203, + "grad_norm": 2.071726083755493, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7131307721138, + "num_tokens": 125122829.0, + "step": 5001 + }, + { + "epoch": 0.5493081484735339, + "grad_norm": 2.140709638595581, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7036588191986084, + "num_tokens": 125149165.0, + "step": 5002 + }, + { + "epoch": 0.5494179661761476, + "grad_norm": 2.2918922901153564, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7235631942749023, + "num_tokens": 125170625.0, + "step": 5003 + }, + { + "epoch": 0.5495277838787612, + "grad_norm": 2.0518107414245605, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7227264642715454, + "num_tokens": 125199698.0, + "step": 5004 + }, + { + "epoch": 0.5496376015813749, + "grad_norm": 2.1803741455078125, + "learning_rate": 1e-06, + "loss": 0.8752, + 
"mean_token_accuracy": 0.7300436496734619, + "num_tokens": 125224036.0, + "step": 5005 + }, + { + "epoch": 0.5497474192839886, + "grad_norm": 2.166280746459961, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7193076610565186, + "num_tokens": 125247641.0, + "step": 5006 + }, + { + "epoch": 0.5498572369866023, + "grad_norm": 2.200894594192505, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7128100395202637, + "num_tokens": 125272200.0, + "step": 5007 + }, + { + "epoch": 0.5499670546892159, + "grad_norm": 2.412851572036743, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6944997906684875, + "num_tokens": 125295165.0, + "step": 5008 + }, + { + "epoch": 0.5500768723918296, + "grad_norm": 2.2925620079040527, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7080401182174683, + "num_tokens": 125317981.0, + "step": 5009 + }, + { + "epoch": 0.5501866900944432, + "grad_norm": 2.511138677597046, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7092751264572144, + "num_tokens": 125337613.0, + "step": 5010 + }, + { + "epoch": 0.5502965077970569, + "grad_norm": 2.322984457015991, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7247400879859924, + "num_tokens": 125358167.0, + "step": 5011 + }, + { + "epoch": 0.5504063254996705, + "grad_norm": 2.118107557296753, + "learning_rate": 1e-06, + "loss": 1.0637, + "mean_token_accuracy": 0.6896231174468994, + "num_tokens": 125386918.0, + "step": 5012 + }, + { + "epoch": 0.5505161432022843, + "grad_norm": 2.1533021926879883, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6908998489379883, + "num_tokens": 125411558.0, + "step": 5013 + }, + { + "epoch": 0.5506259609048979, + "grad_norm": 2.1222498416900635, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7336925268173218, + "num_tokens": 125436563.0, + "step": 5014 + }, + { + "epoch": 0.5507357786075116, + 
"grad_norm": 2.1965129375457764, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7205609083175659, + "num_tokens": 125461754.0, + "step": 5015 + }, + { + "epoch": 0.5508455963101252, + "grad_norm": 2.1358132362365723, + "learning_rate": 1e-06, + "loss": 1.0568, + "mean_token_accuracy": 0.6851025819778442, + "num_tokens": 125488236.0, + "step": 5016 + }, + { + "epoch": 0.5509554140127388, + "grad_norm": 2.4080069065093994, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7138345241546631, + "num_tokens": 125510012.0, + "step": 5017 + }, + { + "epoch": 0.5510652317153525, + "grad_norm": 2.3674211502075195, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7117944955825806, + "num_tokens": 125533019.0, + "step": 5018 + }, + { + "epoch": 0.5511750494179661, + "grad_norm": 2.0901663303375244, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7037825584411621, + "num_tokens": 125560023.0, + "step": 5019 + }, + { + "epoch": 0.5512848671205799, + "grad_norm": 2.277951240539551, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7163205146789551, + "num_tokens": 125584904.0, + "step": 5020 + }, + { + "epoch": 0.5513946848231935, + "grad_norm": 2.087930917739868, + "learning_rate": 1e-06, + "loss": 1.1044, + "mean_token_accuracy": 0.6768227815628052, + "num_tokens": 125614269.0, + "step": 5021 + }, + { + "epoch": 0.5515045025258072, + "grad_norm": 2.3780977725982666, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7117968797683716, + "num_tokens": 125635104.0, + "step": 5022 + }, + { + "epoch": 0.5516143202284208, + "grad_norm": 2.520873546600342, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7439113259315491, + "num_tokens": 125654084.0, + "step": 5023 + }, + { + "epoch": 0.5517241379310345, + "grad_norm": 2.0315403938293457, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6941319704055786, + 
"num_tokens": 125683950.0, + "step": 5024 + }, + { + "epoch": 0.5518339556336481, + "grad_norm": 2.1799845695495605, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6938576698303223, + "num_tokens": 125710357.0, + "step": 5025 + }, + { + "epoch": 0.5519437733362618, + "grad_norm": 2.3885433673858643, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7135452032089233, + "num_tokens": 125732690.0, + "step": 5026 + }, + { + "epoch": 0.5520535910388754, + "grad_norm": 2.2524445056915283, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7249577045440674, + "num_tokens": 125756863.0, + "step": 5027 + }, + { + "epoch": 0.5521634087414892, + "grad_norm": 2.34714674949646, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7123228311538696, + "num_tokens": 125781574.0, + "step": 5028 + }, + { + "epoch": 0.5522732264441028, + "grad_norm": 2.600210428237915, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6770927906036377, + "num_tokens": 125801756.0, + "step": 5029 + }, + { + "epoch": 0.5523830441467165, + "grad_norm": 2.39984393119812, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7004225254058838, + "num_tokens": 125823421.0, + "step": 5030 + }, + { + "epoch": 0.5524928618493301, + "grad_norm": 2.4116575717926025, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6955221891403198, + "num_tokens": 125846754.0, + "step": 5031 + }, + { + "epoch": 0.5526026795519438, + "grad_norm": 1.9697537422180176, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.710981547832489, + "num_tokens": 125875970.0, + "step": 5032 + }, + { + "epoch": 0.5527124972545574, + "grad_norm": 2.189460277557373, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6935102939605713, + "num_tokens": 125902330.0, + "step": 5033 + }, + { + "epoch": 0.552822314957171, + "grad_norm": 2.044119358062744, + "learning_rate": 
1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7055026292800903, + "num_tokens": 125929115.0, + "step": 5034 + }, + { + "epoch": 0.5529321326597848, + "grad_norm": 2.4896459579467773, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7105464935302734, + "num_tokens": 125948015.0, + "step": 5035 + }, + { + "epoch": 0.5530419503623984, + "grad_norm": 2.0854969024658203, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7096964716911316, + "num_tokens": 125976634.0, + "step": 5036 + }, + { + "epoch": 0.5531517680650121, + "grad_norm": 2.5543465614318848, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7290701866149902, + "num_tokens": 125994701.0, + "step": 5037 + }, + { + "epoch": 0.5532615857676257, + "grad_norm": 2.1020848751068115, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6867565512657166, + "num_tokens": 126023515.0, + "step": 5038 + }, + { + "epoch": 0.5533714034702394, + "grad_norm": 2.0631027221679688, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6982633471488953, + "num_tokens": 126052994.0, + "step": 5039 + }, + { + "epoch": 0.553481221172853, + "grad_norm": 2.1165661811828613, + "learning_rate": 1e-06, + "loss": 1.1362, + "mean_token_accuracy": 0.6616588830947876, + "num_tokens": 126081505.0, + "step": 5040 + }, + { + "epoch": 0.5535910388754667, + "grad_norm": 2.0544791221618652, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6910514831542969, + "num_tokens": 126111825.0, + "step": 5041 + }, + { + "epoch": 0.5537008565780804, + "grad_norm": 2.4251322746276855, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7168855667114258, + "num_tokens": 126133606.0, + "step": 5042 + }, + { + "epoch": 0.5538106742806941, + "grad_norm": 2.2486541271209717, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6770070195198059, + "num_tokens": 126157968.0, + "step": 5043 + }, + { + 
"epoch": 0.5539204919833077, + "grad_norm": 2.16947865486145, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7050541639328003, + "num_tokens": 126183783.0, + "step": 5044 + }, + { + "epoch": 0.5540303096859214, + "grad_norm": 1.9732776880264282, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7000399827957153, + "num_tokens": 126213541.0, + "step": 5045 + }, + { + "epoch": 0.554140127388535, + "grad_norm": 2.333662748336792, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.709312915802002, + "num_tokens": 126237078.0, + "step": 5046 + }, + { + "epoch": 0.5542499450911487, + "grad_norm": 2.1752986907958984, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7096068859100342, + "num_tokens": 126264837.0, + "step": 5047 + }, + { + "epoch": 0.5543597627937623, + "grad_norm": 2.141031503677368, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7146486639976501, + "num_tokens": 126291166.0, + "step": 5048 + }, + { + "epoch": 0.5544695804963761, + "grad_norm": 2.190422773361206, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7164725065231323, + "num_tokens": 126314713.0, + "step": 5049 + }, + { + "epoch": 0.5545793981989897, + "grad_norm": 2.530085802078247, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.6946648359298706, + "num_tokens": 126335939.0, + "step": 5050 + }, + { + "epoch": 0.5546892159016034, + "grad_norm": 2.679133653640747, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7061318755149841, + "num_tokens": 126353415.0, + "step": 5051 + }, + { + "epoch": 0.554799033604217, + "grad_norm": 1.9558472633361816, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7192080616950989, + "num_tokens": 126383233.0, + "step": 5052 + }, + { + "epoch": 0.5549088513068307, + "grad_norm": 2.1450023651123047, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 
0.69950270652771, + "num_tokens": 126409922.0, + "step": 5053 + }, + { + "epoch": 0.5550186690094443, + "grad_norm": 2.1463282108306885, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7100440263748169, + "num_tokens": 126435929.0, + "step": 5054 + }, + { + "epoch": 0.555128486712058, + "grad_norm": 2.3600497245788574, + "learning_rate": 1e-06, + "loss": 1.0147, + "mean_token_accuracy": 0.6948624849319458, + "num_tokens": 126461039.0, + "step": 5055 + }, + { + "epoch": 0.5552383044146716, + "grad_norm": 2.1459131240844727, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6976211071014404, + "num_tokens": 126487558.0, + "step": 5056 + }, + { + "epoch": 0.5553481221172853, + "grad_norm": 1.987642526626587, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6971855163574219, + "num_tokens": 126519286.0, + "step": 5057 + }, + { + "epoch": 0.555457939819899, + "grad_norm": 2.47377610206604, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.707246720790863, + "num_tokens": 126539713.0, + "step": 5058 + }, + { + "epoch": 0.5555677575225126, + "grad_norm": 2.4116461277008057, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6905388236045837, + "num_tokens": 126562180.0, + "step": 5059 + }, + { + "epoch": 0.5556775752251263, + "grad_norm": 2.0919899940490723, + "learning_rate": 1e-06, + "loss": 1.0471, + "mean_token_accuracy": 0.6875264048576355, + "num_tokens": 126590113.0, + "step": 5060 + }, + { + "epoch": 0.5557873929277399, + "grad_norm": 2.3755180835723877, + "learning_rate": 1e-06, + "loss": 1.0537, + "mean_token_accuracy": 0.6931703686714172, + "num_tokens": 126612620.0, + "step": 5061 + }, + { + "epoch": 0.5558972106303536, + "grad_norm": 2.2163500785827637, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6904069781303406, + "num_tokens": 126639763.0, + "step": 5062 + }, + { + "epoch": 0.5560070283329672, + "grad_norm": 
2.463029146194458, + "learning_rate": 1e-06, + "loss": 1.0626, + "mean_token_accuracy": 0.6728551387786865, + "num_tokens": 126662483.0, + "step": 5063 + }, + { + "epoch": 0.556116846035581, + "grad_norm": 2.3023016452789307, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7038357257843018, + "num_tokens": 126686924.0, + "step": 5064 + }, + { + "epoch": 0.5562266637381946, + "grad_norm": 1.9550070762634277, + "learning_rate": 1e-06, + "loss": 1.056, + "mean_token_accuracy": 0.6888525485992432, + "num_tokens": 126719318.0, + "step": 5065 + }, + { + "epoch": 0.5563364814408083, + "grad_norm": 2.500640392303467, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6967830657958984, + "num_tokens": 126741082.0, + "step": 5066 + }, + { + "epoch": 0.5564462991434219, + "grad_norm": 2.3108510971069336, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7185890078544617, + "num_tokens": 126765391.0, + "step": 5067 + }, + { + "epoch": 0.5565561168460356, + "grad_norm": 2.138789415359497, + "learning_rate": 1e-06, + "loss": 1.0881, + "mean_token_accuracy": 0.6791842579841614, + "num_tokens": 126796653.0, + "step": 5068 + }, + { + "epoch": 0.5566659345486492, + "grad_norm": 2.2180583477020264, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6947689056396484, + "num_tokens": 126819777.0, + "step": 5069 + }, + { + "epoch": 0.5567757522512629, + "grad_norm": 2.1498336791992188, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.6976796388626099, + "num_tokens": 126845848.0, + "step": 5070 + }, + { + "epoch": 0.5568855699538766, + "grad_norm": 2.5331640243530273, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7347346544265747, + "num_tokens": 126864386.0, + "step": 5071 + }, + { + "epoch": 0.5569953876564903, + "grad_norm": 2.1655611991882324, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6834357976913452, + "num_tokens": 
126888909.0, + "step": 5072 + }, + { + "epoch": 0.5571052053591039, + "grad_norm": 2.125253200531006, + "learning_rate": 1e-06, + "loss": 1.0859, + "mean_token_accuracy": 0.6759498119354248, + "num_tokens": 126915688.0, + "step": 5073 + }, + { + "epoch": 0.5572150230617176, + "grad_norm": 2.078341484069824, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6948875188827515, + "num_tokens": 126943201.0, + "step": 5074 + }, + { + "epoch": 0.5573248407643312, + "grad_norm": 2.2752668857574463, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6942381858825684, + "num_tokens": 126966672.0, + "step": 5075 + }, + { + "epoch": 0.5574346584669448, + "grad_norm": 2.413379430770874, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.714072585105896, + "num_tokens": 126988314.0, + "step": 5076 + }, + { + "epoch": 0.5575444761695585, + "grad_norm": 2.202313184738159, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6846274733543396, + "num_tokens": 127013264.0, + "step": 5077 + }, + { + "epoch": 0.5576542938721722, + "grad_norm": 2.139641523361206, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7177442312240601, + "num_tokens": 127039700.0, + "step": 5078 + }, + { + "epoch": 0.5577641115747859, + "grad_norm": 2.220874547958374, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6998100876808167, + "num_tokens": 127064645.0, + "step": 5079 + }, + { + "epoch": 0.5578739292773995, + "grad_norm": 2.281033754348755, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6997843384742737, + "num_tokens": 127088300.0, + "step": 5080 + }, + { + "epoch": 0.5579837469800132, + "grad_norm": 2.2684195041656494, + "learning_rate": 1e-06, + "loss": 0.9865, + "mean_token_accuracy": 0.7142621278762817, + "num_tokens": 127112782.0, + "step": 5081 + }, + { + "epoch": 0.5580935646826268, + "grad_norm": 2.192791223526001, + "learning_rate": 1e-06, + 
"loss": 0.9288, + "mean_token_accuracy": 0.7188359498977661, + "num_tokens": 127136431.0, + "step": 5082 + }, + { + "epoch": 0.5582033823852405, + "grad_norm": 2.0937016010284424, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7016056776046753, + "num_tokens": 127162337.0, + "step": 5083 + }, + { + "epoch": 0.5583132000878541, + "grad_norm": 2.1690561771392822, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6996537446975708, + "num_tokens": 127187661.0, + "step": 5084 + }, + { + "epoch": 0.5584230177904678, + "grad_norm": 2.1243441104888916, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7039275169372559, + "num_tokens": 127214172.0, + "step": 5085 + }, + { + "epoch": 0.5585328354930815, + "grad_norm": 2.6287429332733154, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6930659413337708, + "num_tokens": 127232791.0, + "step": 5086 + }, + { + "epoch": 0.5586426531956952, + "grad_norm": 2.026435613632202, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.70527184009552, + "num_tokens": 127261576.0, + "step": 5087 + }, + { + "epoch": 0.5587524708983088, + "grad_norm": 2.4019808769226074, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6894510388374329, + "num_tokens": 127284348.0, + "step": 5088 + }, + { + "epoch": 0.5588622886009225, + "grad_norm": 2.500493049621582, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6940061450004578, + "num_tokens": 127304978.0, + "step": 5089 + }, + { + "epoch": 0.5589721063035361, + "grad_norm": 2.1961207389831543, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6865056753158569, + "num_tokens": 127329850.0, + "step": 5090 + }, + { + "epoch": 0.5590819240061498, + "grad_norm": 2.2014248371124268, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7122625112533569, + "num_tokens": 127353677.0, + "step": 5091 + }, + { + "epoch": 
0.5591917417087634, + "grad_norm": 2.2767250537872314, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6855220794677734, + "num_tokens": 127381217.0, + "step": 5092 + }, + { + "epoch": 0.5593015594113772, + "grad_norm": 1.980075716972351, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7104253768920898, + "num_tokens": 127408919.0, + "step": 5093 + }, + { + "epoch": 0.5594113771139908, + "grad_norm": 2.0726211071014404, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6931982040405273, + "num_tokens": 127438647.0, + "step": 5094 + }, + { + "epoch": 0.5595211948166045, + "grad_norm": 2.1907036304473877, + "learning_rate": 1e-06, + "loss": 1.0618, + "mean_token_accuracy": 0.685761570930481, + "num_tokens": 127465237.0, + "step": 5095 + }, + { + "epoch": 0.5596310125192181, + "grad_norm": 2.0538361072540283, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6995092630386353, + "num_tokens": 127495001.0, + "step": 5096 + }, + { + "epoch": 0.5597408302218317, + "grad_norm": 2.005160093307495, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6797431707382202, + "num_tokens": 127525175.0, + "step": 5097 + }, + { + "epoch": 0.5598506479244454, + "grad_norm": 2.284473419189453, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6913048028945923, + "num_tokens": 127549887.0, + "step": 5098 + }, + { + "epoch": 0.559960465627059, + "grad_norm": 2.1534712314605713, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7035115957260132, + "num_tokens": 127579938.0, + "step": 5099 + }, + { + "epoch": 0.5600702833296728, + "grad_norm": 2.25539493560791, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6985348463058472, + "num_tokens": 127607434.0, + "step": 5100 + }, + { + "epoch": 0.5601801010322864, + "grad_norm": 2.3960254192352295, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 
0.7039556503295898, + "num_tokens": 127629168.0, + "step": 5101 + }, + { + "epoch": 0.5602899187349001, + "grad_norm": 2.2695720195770264, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6926048994064331, + "num_tokens": 127652917.0, + "step": 5102 + }, + { + "epoch": 0.5603997364375137, + "grad_norm": 2.1751301288604736, + "learning_rate": 1e-06, + "loss": 1.0255, + "mean_token_accuracy": 0.6976202726364136, + "num_tokens": 127680151.0, + "step": 5103 + }, + { + "epoch": 0.5605095541401274, + "grad_norm": 2.3287787437438965, + "learning_rate": 1e-06, + "loss": 1.0239, + "mean_token_accuracy": 0.6913858652114868, + "num_tokens": 127702511.0, + "step": 5104 + }, + { + "epoch": 0.560619371842741, + "grad_norm": 2.2337486743927, + "learning_rate": 1e-06, + "loss": 1.1143, + "mean_token_accuracy": 0.6728230118751526, + "num_tokens": 127729692.0, + "step": 5105 + }, + { + "epoch": 0.5607291895453547, + "grad_norm": 2.1690433025360107, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6957651376724243, + "num_tokens": 127754824.0, + "step": 5106 + }, + { + "epoch": 0.5608390072479684, + "grad_norm": 2.1868813037872314, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7326135635375977, + "num_tokens": 127781709.0, + "step": 5107 + }, + { + "epoch": 0.5609488249505821, + "grad_norm": 2.392496347427368, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7032603025436401, + "num_tokens": 127803727.0, + "step": 5108 + }, + { + "epoch": 0.5610586426531957, + "grad_norm": 2.2370493412017822, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6755973696708679, + "num_tokens": 127829337.0, + "step": 5109 + }, + { + "epoch": 0.5611684603558094, + "grad_norm": 2.477630853652954, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7036670446395874, + "num_tokens": 127849777.0, + "step": 5110 + }, + { + "epoch": 0.561278278058423, + "grad_norm": 
1.968867301940918, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6947943568229675, + "num_tokens": 127878343.0, + "step": 5111 + }, + { + "epoch": 0.5613880957610367, + "grad_norm": 2.1533076763153076, + "learning_rate": 1e-06, + "loss": 1.0727, + "mean_token_accuracy": 0.6943613290786743, + "num_tokens": 127904297.0, + "step": 5112 + }, + { + "epoch": 0.5614979134636503, + "grad_norm": 2.3920464515686035, + "learning_rate": 1e-06, + "loss": 1.0898, + "mean_token_accuracy": 0.6773388385772705, + "num_tokens": 127927771.0, + "step": 5113 + }, + { + "epoch": 0.5616077311662641, + "grad_norm": 2.3602893352508545, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7065674066543579, + "num_tokens": 127949369.0, + "step": 5114 + }, + { + "epoch": 0.5617175488688777, + "grad_norm": 2.1426610946655273, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6956535577774048, + "num_tokens": 127976464.0, + "step": 5115 + }, + { + "epoch": 0.5618273665714913, + "grad_norm": 2.074932813644409, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.730964720249176, + "num_tokens": 128003097.0, + "step": 5116 + }, + { + "epoch": 0.561937184274105, + "grad_norm": 2.252772092819214, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7302542924880981, + "num_tokens": 128025967.0, + "step": 5117 + }, + { + "epoch": 0.5620470019767186, + "grad_norm": 1.9701790809631348, + "learning_rate": 1e-06, + "loss": 1.0928, + "mean_token_accuracy": 0.6718336343765259, + "num_tokens": 128059634.0, + "step": 5118 + }, + { + "epoch": 0.5621568196793323, + "grad_norm": 2.532074213027954, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7120439410209656, + "num_tokens": 128080893.0, + "step": 5119 + }, + { + "epoch": 0.5622666373819459, + "grad_norm": 2.120899200439453, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7022109031677246, + "num_tokens": 
128110084.0, + "step": 5120 + }, + { + "epoch": 0.5623764550845596, + "grad_norm": 2.02518892288208, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6923333406448364, + "num_tokens": 128140253.0, + "step": 5121 + }, + { + "epoch": 0.5624862727871733, + "grad_norm": 2.136559009552002, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7197826504707336, + "num_tokens": 128164960.0, + "step": 5122 + }, + { + "epoch": 0.562596090489787, + "grad_norm": 1.9758541584014893, + "learning_rate": 1e-06, + "loss": 1.0884, + "mean_token_accuracy": 0.6726627349853516, + "num_tokens": 128198437.0, + "step": 5123 + }, + { + "epoch": 0.5627059081924006, + "grad_norm": 2.206435203552246, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.719250500202179, + "num_tokens": 128223531.0, + "step": 5124 + }, + { + "epoch": 0.5628157258950143, + "grad_norm": 2.392932176589966, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.692743718624115, + "num_tokens": 128248216.0, + "step": 5125 + }, + { + "epoch": 0.5629255435976279, + "grad_norm": 2.0962793827056885, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7010234594345093, + "num_tokens": 128275994.0, + "step": 5126 + }, + { + "epoch": 0.5630353613002416, + "grad_norm": 2.343040943145752, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7104321718215942, + "num_tokens": 128298227.0, + "step": 5127 + }, + { + "epoch": 0.5631451790028552, + "grad_norm": 2.4878666400909424, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7170984745025635, + "num_tokens": 128318515.0, + "step": 5128 + }, + { + "epoch": 0.563254996705469, + "grad_norm": 2.177901029586792, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7022208571434021, + "num_tokens": 128345246.0, + "step": 5129 + }, + { + "epoch": 0.5633648144080826, + "grad_norm": 2.064474582672119, + "learning_rate": 1e-06, + "loss": 
1.1113, + "mean_token_accuracy": 0.6737034320831299, + "num_tokens": 128376492.0, + "step": 5130 + }, + { + "epoch": 0.5634746321106963, + "grad_norm": 1.8587323427200317, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.722109317779541, + "num_tokens": 128409796.0, + "step": 5131 + }, + { + "epoch": 0.5635844498133099, + "grad_norm": 2.3148512840270996, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6929771900177002, + "num_tokens": 128434883.0, + "step": 5132 + }, + { + "epoch": 0.5636942675159236, + "grad_norm": 2.355060577392578, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7037097811698914, + "num_tokens": 128456854.0, + "step": 5133 + }, + { + "epoch": 0.5638040852185372, + "grad_norm": 2.139660120010376, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7331657409667969, + "num_tokens": 128483660.0, + "step": 5134 + }, + { + "epoch": 0.5639139029211508, + "grad_norm": 2.058152675628662, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7042622566223145, + "num_tokens": 128511788.0, + "step": 5135 + }, + { + "epoch": 0.5640237206237646, + "grad_norm": 2.2664451599121094, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7012403011322021, + "num_tokens": 128535496.0, + "step": 5136 + }, + { + "epoch": 0.5641335383263782, + "grad_norm": 2.052114486694336, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7066963315010071, + "num_tokens": 128563187.0, + "step": 5137 + }, + { + "epoch": 0.5642433560289919, + "grad_norm": 2.431040048599243, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.7030504941940308, + "num_tokens": 128587049.0, + "step": 5138 + }, + { + "epoch": 0.5643531737316055, + "grad_norm": 2.123311758041382, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7201124429702759, + "num_tokens": 128612045.0, + "step": 5139 + }, + { + "epoch": 
0.5644629914342192, + "grad_norm": 2.210320472717285, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6870692372322083, + "num_tokens": 128639535.0, + "step": 5140 + }, + { + "epoch": 0.5645728091368328, + "grad_norm": 2.2267589569091797, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7263636589050293, + "num_tokens": 128666096.0, + "step": 5141 + }, + { + "epoch": 0.5646826268394465, + "grad_norm": 2.2871692180633545, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.6841374039649963, + "num_tokens": 128691392.0, + "step": 5142 + }, + { + "epoch": 0.5647924445420602, + "grad_norm": 2.2366139888763428, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7082812786102295, + "num_tokens": 128717948.0, + "step": 5143 + }, + { + "epoch": 0.5649022622446739, + "grad_norm": 2.4423813819885254, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7172333002090454, + "num_tokens": 128737456.0, + "step": 5144 + }, + { + "epoch": 0.5650120799472875, + "grad_norm": 2.2803874015808105, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.706340491771698, + "num_tokens": 128760046.0, + "step": 5145 + }, + { + "epoch": 0.5651218976499012, + "grad_norm": 2.1426265239715576, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7028557658195496, + "num_tokens": 128787813.0, + "step": 5146 + }, + { + "epoch": 0.5652317153525148, + "grad_norm": 2.280707359313965, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6827521920204163, + "num_tokens": 128812659.0, + "step": 5147 + }, + { + "epoch": 0.5653415330551285, + "grad_norm": 2.127066135406494, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7071450352668762, + "num_tokens": 128839008.0, + "step": 5148 + }, + { + "epoch": 0.5654513507577421, + "grad_norm": 2.0866715908050537, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 
0.7153679728507996, + "num_tokens": 128866320.0, + "step": 5149 + }, + { + "epoch": 0.5655611684603558, + "grad_norm": 2.2772696018218994, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7073016166687012, + "num_tokens": 128889587.0, + "step": 5150 + }, + { + "epoch": 0.5656709861629695, + "grad_norm": 2.5114126205444336, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7084436416625977, + "num_tokens": 128910567.0, + "step": 5151 + }, + { + "epoch": 0.5657808038655832, + "grad_norm": 2.2601616382598877, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.713653564453125, + "num_tokens": 128933570.0, + "step": 5152 + }, + { + "epoch": 0.5658906215681968, + "grad_norm": 2.183950424194336, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6980805993080139, + "num_tokens": 128960701.0, + "step": 5153 + }, + { + "epoch": 0.5660004392708105, + "grad_norm": 2.0080862045288086, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7133285999298096, + "num_tokens": 128989707.0, + "step": 5154 + }, + { + "epoch": 0.5661102569734241, + "grad_norm": 2.147106647491455, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7087909579277039, + "num_tokens": 129017890.0, + "step": 5155 + }, + { + "epoch": 0.5662200746760377, + "grad_norm": 2.211632490158081, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6863605976104736, + "num_tokens": 129044770.0, + "step": 5156 + }, + { + "epoch": 0.5663298923786514, + "grad_norm": 2.2902584075927734, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7153310775756836, + "num_tokens": 129070080.0, + "step": 5157 + }, + { + "epoch": 0.5664397100812651, + "grad_norm": 2.1650543212890625, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7088108062744141, + "num_tokens": 129096930.0, + "step": 5158 + }, + { + "epoch": 0.5665495277838788, + "grad_norm": 
2.6954598426818848, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7004591226577759, + "num_tokens": 129114923.0, + "step": 5159 + }, + { + "epoch": 0.5666593454864924, + "grad_norm": 2.246582508087158, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7281904220581055, + "num_tokens": 129139204.0, + "step": 5160 + }, + { + "epoch": 0.5667691631891061, + "grad_norm": 2.8362598419189453, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7324875593185425, + "num_tokens": 129154136.0, + "step": 5161 + }, + { + "epoch": 0.5668789808917197, + "grad_norm": 2.265202283859253, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7189006209373474, + "num_tokens": 129176776.0, + "step": 5162 + }, + { + "epoch": 0.5669887985943334, + "grad_norm": 1.9424465894699097, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6999001502990723, + "num_tokens": 129207860.0, + "step": 5163 + }, + { + "epoch": 0.567098616296947, + "grad_norm": 1.9930906295776367, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7149729132652283, + "num_tokens": 129238204.0, + "step": 5164 + }, + { + "epoch": 0.5672084339995608, + "grad_norm": 2.4325907230377197, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6942362785339355, + "num_tokens": 129259505.0, + "step": 5165 + }, + { + "epoch": 0.5673182517021744, + "grad_norm": 2.0685982704162598, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6998124122619629, + "num_tokens": 129290574.0, + "step": 5166 + }, + { + "epoch": 0.5674280694047881, + "grad_norm": 2.0908701419830322, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6970587968826294, + "num_tokens": 129319526.0, + "step": 5167 + }, + { + "epoch": 0.5675378871074017, + "grad_norm": 2.5807011127471924, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7297335863113403, + "num_tokens": 
129337400.0, + "step": 5168 + }, + { + "epoch": 0.5676477048100154, + "grad_norm": 1.950039267539978, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6968086957931519, + "num_tokens": 129368905.0, + "step": 5169 + }, + { + "epoch": 0.567757522512629, + "grad_norm": 2.614802837371826, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7075896859169006, + "num_tokens": 129388273.0, + "step": 5170 + }, + { + "epoch": 0.5678673402152427, + "grad_norm": 2.202932357788086, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7153077125549316, + "num_tokens": 129413447.0, + "step": 5171 + }, + { + "epoch": 0.5679771579178564, + "grad_norm": 2.5361111164093018, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6975947618484497, + "num_tokens": 129434368.0, + "step": 5172 + }, + { + "epoch": 0.5680869756204701, + "grad_norm": 2.094606399536133, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.703652024269104, + "num_tokens": 129462137.0, + "step": 5173 + }, + { + "epoch": 0.5681967933230837, + "grad_norm": 2.0982015132904053, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.700162410736084, + "num_tokens": 129491507.0, + "step": 5174 + }, + { + "epoch": 0.5683066110256974, + "grad_norm": 2.5157108306884766, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7239862680435181, + "num_tokens": 129512510.0, + "step": 5175 + }, + { + "epoch": 0.568416428728311, + "grad_norm": 2.1193430423736572, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6949822902679443, + "num_tokens": 129541259.0, + "step": 5176 + }, + { + "epoch": 0.5685262464309246, + "grad_norm": 2.6247148513793945, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.706821858882904, + "num_tokens": 129562669.0, + "step": 5177 + }, + { + "epoch": 0.5686360641335383, + "grad_norm": 1.8700803518295288, + "learning_rate": 1e-06, + 
"loss": 1.0383, + "mean_token_accuracy": 0.6875219345092773, + "num_tokens": 129597397.0, + "step": 5178 + }, + { + "epoch": 0.5687458818361519, + "grad_norm": 2.2608025074005127, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6955681443214417, + "num_tokens": 129622596.0, + "step": 5179 + }, + { + "epoch": 0.5688556995387657, + "grad_norm": 2.058069944381714, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6920673847198486, + "num_tokens": 129651056.0, + "step": 5180 + }, + { + "epoch": 0.5689655172413793, + "grad_norm": 2.143751621246338, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6917750835418701, + "num_tokens": 129678599.0, + "step": 5181 + }, + { + "epoch": 0.569075334943993, + "grad_norm": 2.1138370037078857, + "learning_rate": 1e-06, + "loss": 1.0506, + "mean_token_accuracy": 0.6862921714782715, + "num_tokens": 129707138.0, + "step": 5182 + }, + { + "epoch": 0.5691851526466066, + "grad_norm": 1.9256283044815063, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7011956572532654, + "num_tokens": 129738814.0, + "step": 5183 + }, + { + "epoch": 0.5692949703492203, + "grad_norm": 2.3846945762634277, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.7000367641448975, + "num_tokens": 129763386.0, + "step": 5184 + }, + { + "epoch": 0.5694047880518339, + "grad_norm": 2.0093512535095215, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6995928287506104, + "num_tokens": 129793485.0, + "step": 5185 + }, + { + "epoch": 0.5695146057544476, + "grad_norm": 2.234813928604126, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.729537844657898, + "num_tokens": 129816552.0, + "step": 5186 + }, + { + "epoch": 0.5696244234570613, + "grad_norm": 2.3309078216552734, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6943277716636658, + "num_tokens": 129840962.0, + "step": 5187 + }, + { + "epoch": 
0.569734241159675, + "grad_norm": 2.1742775440216064, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7311615943908691, + "num_tokens": 129866692.0, + "step": 5188 + }, + { + "epoch": 0.5698440588622886, + "grad_norm": 2.5008468627929688, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7216683626174927, + "num_tokens": 129886932.0, + "step": 5189 + }, + { + "epoch": 0.5699538765649023, + "grad_norm": 2.283583402633667, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7070708274841309, + "num_tokens": 129909748.0, + "step": 5190 + }, + { + "epoch": 0.5700636942675159, + "grad_norm": 2.065051555633545, + "learning_rate": 1e-06, + "loss": 1.0282, + "mean_token_accuracy": 0.6900463700294495, + "num_tokens": 129938135.0, + "step": 5191 + }, + { + "epoch": 0.5701735119701296, + "grad_norm": 2.009296178817749, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6955065727233887, + "num_tokens": 129966102.0, + "step": 5192 + }, + { + "epoch": 0.5702833296727432, + "grad_norm": 2.286426067352295, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6994273662567139, + "num_tokens": 129989604.0, + "step": 5193 + }, + { + "epoch": 0.570393147375357, + "grad_norm": 2.4608049392700195, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7264391779899597, + "num_tokens": 130009021.0, + "step": 5194 + }, + { + "epoch": 0.5705029650779706, + "grad_norm": 1.9581891298294067, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6793912649154663, + "num_tokens": 130041939.0, + "step": 5195 + }, + { + "epoch": 0.5706127827805842, + "grad_norm": 2.310258150100708, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7111951112747192, + "num_tokens": 130064045.0, + "step": 5196 + }, + { + "epoch": 0.5707226004831979, + "grad_norm": 2.1670796871185303, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 
0.6952253580093384, + "num_tokens": 130089497.0, + "step": 5197 + }, + { + "epoch": 0.5708324181858115, + "grad_norm": 2.338780403137207, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7003946900367737, + "num_tokens": 130111239.0, + "step": 5198 + }, + { + "epoch": 0.5709422358884252, + "grad_norm": 2.374844789505005, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.6945568323135376, + "num_tokens": 130134276.0, + "step": 5199 + }, + { + "epoch": 0.5710520535910388, + "grad_norm": 2.197826623916626, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7177743315696716, + "num_tokens": 130158725.0, + "step": 5200 + }, + { + "epoch": 0.5711618712936526, + "grad_norm": 2.2414371967315674, + "learning_rate": 1e-06, + "loss": 1.0091, + "mean_token_accuracy": 0.6986644268035889, + "num_tokens": 130185469.0, + "step": 5201 + }, + { + "epoch": 0.5712716889962662, + "grad_norm": 1.9761548042297363, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6993987560272217, + "num_tokens": 130217828.0, + "step": 5202 + }, + { + "epoch": 0.5713815066988799, + "grad_norm": 2.106668710708618, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7088762521743774, + "num_tokens": 130242878.0, + "step": 5203 + }, + { + "epoch": 0.5714913244014935, + "grad_norm": 2.136885643005371, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7024338841438293, + "num_tokens": 130269406.0, + "step": 5204 + }, + { + "epoch": 0.5716011421041072, + "grad_norm": 2.5779294967651367, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7114591598510742, + "num_tokens": 130288940.0, + "step": 5205 + }, + { + "epoch": 0.5717109598067208, + "grad_norm": 2.227147102355957, + "learning_rate": 1e-06, + "loss": 1.0437, + "mean_token_accuracy": 0.6908360719680786, + "num_tokens": 130315468.0, + "step": 5206 + }, + { + "epoch": 0.5718207775093345, + "grad_norm": 
2.0603792667388916, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7306980490684509, + "num_tokens": 130342204.0, + "step": 5207 + }, + { + "epoch": 0.5719305952119481, + "grad_norm": 2.367114782333374, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6965839862823486, + "num_tokens": 130364777.0, + "step": 5208 + }, + { + "epoch": 0.5720404129145619, + "grad_norm": 2.2365236282348633, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6968548893928528, + "num_tokens": 130389823.0, + "step": 5209 + }, + { + "epoch": 0.5721502306171755, + "grad_norm": 2.156590461730957, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6965081691741943, + "num_tokens": 130416094.0, + "step": 5210 + }, + { + "epoch": 0.5722600483197892, + "grad_norm": 2.6151928901672363, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.7028401494026184, + "num_tokens": 130435638.0, + "step": 5211 + }, + { + "epoch": 0.5723698660224028, + "grad_norm": 2.4138240814208984, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7113805413246155, + "num_tokens": 130456937.0, + "step": 5212 + }, + { + "epoch": 0.5724796837250165, + "grad_norm": 2.119635820388794, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6981404423713684, + "num_tokens": 130485732.0, + "step": 5213 + }, + { + "epoch": 0.5725895014276301, + "grad_norm": 2.1282522678375244, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.713469386100769, + "num_tokens": 130511118.0, + "step": 5214 + }, + { + "epoch": 0.5726993191302437, + "grad_norm": 2.2850613594055176, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6930602192878723, + "num_tokens": 130535725.0, + "step": 5215 + }, + { + "epoch": 0.5728091368328575, + "grad_norm": 2.0837955474853516, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.6972769498825073, + "num_tokens": 
130563021.0, + "step": 5216 + }, + { + "epoch": 0.5729189545354711, + "grad_norm": 2.248051881790161, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6870793104171753, + "num_tokens": 130589403.0, + "step": 5217 + }, + { + "epoch": 0.5730287722380848, + "grad_norm": 2.0765154361724854, + "learning_rate": 1e-06, + "loss": 1.0508, + "mean_token_accuracy": 0.6900699734687805, + "num_tokens": 130619882.0, + "step": 5218 + }, + { + "epoch": 0.5731385899406984, + "grad_norm": 2.363140106201172, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.7052556276321411, + "num_tokens": 130643411.0, + "step": 5219 + }, + { + "epoch": 0.5732484076433121, + "grad_norm": 2.4321253299713135, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7222933769226074, + "num_tokens": 130663880.0, + "step": 5220 + }, + { + "epoch": 0.5733582253459257, + "grad_norm": 2.1873135566711426, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6999391317367554, + "num_tokens": 130689560.0, + "step": 5221 + }, + { + "epoch": 0.5734680430485394, + "grad_norm": 2.330018997192383, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 0.6892451047897339, + "num_tokens": 130712236.0, + "step": 5222 + }, + { + "epoch": 0.5735778607511531, + "grad_norm": 2.708162307739258, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.703869104385376, + "num_tokens": 130730562.0, + "step": 5223 + }, + { + "epoch": 0.5736876784537668, + "grad_norm": 2.2463841438293457, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6904692649841309, + "num_tokens": 130757684.0, + "step": 5224 + }, + { + "epoch": 0.5737974961563804, + "grad_norm": 2.015868663787842, + "learning_rate": 1e-06, + "loss": 1.0787, + "mean_token_accuracy": 0.676753580570221, + "num_tokens": 130788319.0, + "step": 5225 + }, + { + "epoch": 0.5739073138589941, + "grad_norm": 2.1851534843444824, + "learning_rate": 1e-06, + 
"loss": 0.9788, + "mean_token_accuracy": 0.7044147253036499, + "num_tokens": 130816281.0, + "step": 5226 + }, + { + "epoch": 0.5740171315616077, + "grad_norm": 2.3774397373199463, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7153124809265137, + "num_tokens": 130838300.0, + "step": 5227 + }, + { + "epoch": 0.5741269492642214, + "grad_norm": 2.113403797149658, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6915753483772278, + "num_tokens": 130867169.0, + "step": 5228 + }, + { + "epoch": 0.574236766966835, + "grad_norm": 2.484243631362915, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7055794596672058, + "num_tokens": 130887826.0, + "step": 5229 + }, + { + "epoch": 0.5743465846694488, + "grad_norm": 2.066253423690796, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6958463191986084, + "num_tokens": 130917738.0, + "step": 5230 + }, + { + "epoch": 0.5744564023720624, + "grad_norm": 2.249678611755371, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.7065669298171997, + "num_tokens": 130943074.0, + "step": 5231 + }, + { + "epoch": 0.5745662200746761, + "grad_norm": 2.3989641666412354, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7150281667709351, + "num_tokens": 130963704.0, + "step": 5232 + }, + { + "epoch": 0.5746760377772897, + "grad_norm": 2.301785945892334, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6827806234359741, + "num_tokens": 130987719.0, + "step": 5233 + }, + { + "epoch": 0.5747858554799034, + "grad_norm": 2.0435538291931152, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7067779898643494, + "num_tokens": 131017854.0, + "step": 5234 + }, + { + "epoch": 0.574895673182517, + "grad_norm": 2.295236825942993, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7112689018249512, + "num_tokens": 131041819.0, + "step": 5235 + }, + { + "epoch": 
0.5750054908851306, + "grad_norm": 2.04006028175354, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7137329578399658, + "num_tokens": 131069993.0, + "step": 5236 + }, + { + "epoch": 0.5751153085877443, + "grad_norm": 2.4978466033935547, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7135136127471924, + "num_tokens": 131089427.0, + "step": 5237 + }, + { + "epoch": 0.575225126290358, + "grad_norm": 1.9898625612258911, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7067795395851135, + "num_tokens": 131119268.0, + "step": 5238 + }, + { + "epoch": 0.5753349439929717, + "grad_norm": 2.2854197025299072, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.6980131268501282, + "num_tokens": 131143635.0, + "step": 5239 + }, + { + "epoch": 0.5754447616955853, + "grad_norm": 1.991560935974121, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.6975900530815125, + "num_tokens": 131174961.0, + "step": 5240 + }, + { + "epoch": 0.575554579398199, + "grad_norm": 2.427064895629883, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7076351642608643, + "num_tokens": 131196382.0, + "step": 5241 + }, + { + "epoch": 0.5756643971008126, + "grad_norm": 1.8947641849517822, + "learning_rate": 1e-06, + "loss": 1.0438, + "mean_token_accuracy": 0.6862680912017822, + "num_tokens": 131230560.0, + "step": 5242 + }, + { + "epoch": 0.5757742148034263, + "grad_norm": 2.3758816719055176, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7002255916595459, + "num_tokens": 131255251.0, + "step": 5243 + }, + { + "epoch": 0.5758840325060399, + "grad_norm": 2.4870848655700684, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.723110020160675, + "num_tokens": 131274840.0, + "step": 5244 + }, + { + "epoch": 0.5759938502086537, + "grad_norm": 2.355173349380493, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 
0.709958553314209, + "num_tokens": 131297473.0, + "step": 5245 + }, + { + "epoch": 0.5761036679112673, + "grad_norm": 2.2415759563446045, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7256849408149719, + "num_tokens": 131320917.0, + "step": 5246 + }, + { + "epoch": 0.576213485613881, + "grad_norm": 2.370269298553467, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7084411382675171, + "num_tokens": 131343078.0, + "step": 5247 + }, + { + "epoch": 0.5763233033164946, + "grad_norm": 1.9727727174758911, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6995517611503601, + "num_tokens": 131372452.0, + "step": 5248 + }, + { + "epoch": 0.5764331210191083, + "grad_norm": 1.9992552995681763, + "learning_rate": 1e-06, + "loss": 1.0277, + "mean_token_accuracy": 0.6886751055717468, + "num_tokens": 131404813.0, + "step": 5249 + }, + { + "epoch": 0.5765429387217219, + "grad_norm": 2.4029581546783447, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7037570476531982, + "num_tokens": 131428685.0, + "step": 5250 + }, + { + "epoch": 0.5766527564243356, + "grad_norm": 2.268962860107422, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7022705674171448, + "num_tokens": 131452068.0, + "step": 5251 + }, + { + "epoch": 0.5767625741269493, + "grad_norm": 2.2971982955932617, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.706442654132843, + "num_tokens": 131474318.0, + "step": 5252 + }, + { + "epoch": 0.576872391829563, + "grad_norm": 2.1285758018493652, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6960445046424866, + "num_tokens": 131502166.0, + "step": 5253 + }, + { + "epoch": 0.5769822095321766, + "grad_norm": 2.4000368118286133, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.7042495012283325, + "num_tokens": 131526518.0, + "step": 5254 + }, + { + "epoch": 0.5770920272347903, + "grad_norm": 
2.3395283222198486, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6886904239654541, + "num_tokens": 131549949.0, + "step": 5255 + }, + { + "epoch": 0.5772018449374039, + "grad_norm": 2.2208807468414307, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7019444704055786, + "num_tokens": 131575005.0, + "step": 5256 + }, + { + "epoch": 0.5773116626400175, + "grad_norm": 2.525573492050171, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7193612456321716, + "num_tokens": 131596437.0, + "step": 5257 + }, + { + "epoch": 0.5774214803426312, + "grad_norm": 2.3317484855651855, + "learning_rate": 1e-06, + "loss": 1.0084, + "mean_token_accuracy": 0.6938599348068237, + "num_tokens": 131621110.0, + "step": 5258 + }, + { + "epoch": 0.577531298045245, + "grad_norm": 2.2854413986206055, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.71772301197052, + "num_tokens": 131645219.0, + "step": 5259 + }, + { + "epoch": 0.5776411157478586, + "grad_norm": 2.008897066116333, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7164537906646729, + "num_tokens": 131673838.0, + "step": 5260 + }, + { + "epoch": 0.5777509334504722, + "grad_norm": 2.031764268875122, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6949266791343689, + "num_tokens": 131703244.0, + "step": 5261 + }, + { + "epoch": 0.5778607511530859, + "grad_norm": 2.8466296195983887, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7217397689819336, + "num_tokens": 131719596.0, + "step": 5262 + }, + { + "epoch": 0.5779705688556995, + "grad_norm": 2.172274112701416, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7172868251800537, + "num_tokens": 131745101.0, + "step": 5263 + }, + { + "epoch": 0.5780803865583132, + "grad_norm": 2.3632683753967285, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7041289806365967, + "num_tokens": 
131767680.0, + "step": 5264 + }, + { + "epoch": 0.5781902042609268, + "grad_norm": 2.6424853801727295, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7056309580802917, + "num_tokens": 131786892.0, + "step": 5265 + }, + { + "epoch": 0.5783000219635406, + "grad_norm": 2.1572790145874023, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7036858797073364, + "num_tokens": 131815591.0, + "step": 5266 + }, + { + "epoch": 0.5784098396661542, + "grad_norm": 2.3505587577819824, + "learning_rate": 1e-06, + "loss": 1.1217, + "mean_token_accuracy": 0.6751904487609863, + "num_tokens": 131841136.0, + "step": 5267 + }, + { + "epoch": 0.5785196573687679, + "grad_norm": 1.9520460367202759, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.7043180465698242, + "num_tokens": 131873212.0, + "step": 5268 + }, + { + "epoch": 0.5786294750713815, + "grad_norm": 2.6353862285614014, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6867074370384216, + "num_tokens": 131894250.0, + "step": 5269 + }, + { + "epoch": 0.5787392927739952, + "grad_norm": 2.3138043880462646, + "learning_rate": 1e-06, + "loss": 1.0697, + "mean_token_accuracy": 0.6825587749481201, + "num_tokens": 131917952.0, + "step": 5270 + }, + { + "epoch": 0.5788491104766088, + "grad_norm": 2.1622753143310547, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7000532150268555, + "num_tokens": 131944867.0, + "step": 5271 + }, + { + "epoch": 0.5789589281792225, + "grad_norm": 2.3370795249938965, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7152643203735352, + "num_tokens": 131968000.0, + "step": 5272 + }, + { + "epoch": 0.5790687458818361, + "grad_norm": 2.0983853340148926, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6870346069335938, + "num_tokens": 131994756.0, + "step": 5273 + }, + { + "epoch": 0.5791785635844499, + "grad_norm": 2.489691734313965, + "learning_rate": 1e-06, 
+ "loss": 0.9312, + "mean_token_accuracy": 0.7194762229919434, + "num_tokens": 132016147.0, + "step": 5274 + }, + { + "epoch": 0.5792883812870635, + "grad_norm": 2.3848676681518555, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7304493188858032, + "num_tokens": 132037449.0, + "step": 5275 + }, + { + "epoch": 0.5793981989896771, + "grad_norm": 2.4501078128814697, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7149380445480347, + "num_tokens": 132057444.0, + "step": 5276 + }, + { + "epoch": 0.5795080166922908, + "grad_norm": 2.3359010219573975, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6944223642349243, + "num_tokens": 132080675.0, + "step": 5277 + }, + { + "epoch": 0.5796178343949044, + "grad_norm": 2.385197162628174, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7171179056167603, + "num_tokens": 132102617.0, + "step": 5278 + }, + { + "epoch": 0.5797276520975181, + "grad_norm": 2.0471436977386475, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6929333209991455, + "num_tokens": 132130903.0, + "step": 5279 + }, + { + "epoch": 0.5798374698001317, + "grad_norm": 2.4026710987091064, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6966311931610107, + "num_tokens": 132153315.0, + "step": 5280 + }, + { + "epoch": 0.5799472875027455, + "grad_norm": 2.1429848670959473, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.708251953125, + "num_tokens": 132179382.0, + "step": 5281 + }, + { + "epoch": 0.5800571052053591, + "grad_norm": 2.2619097232818604, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7276967763900757, + "num_tokens": 132201712.0, + "step": 5282 + }, + { + "epoch": 0.5801669229079728, + "grad_norm": 2.566854238510132, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7099166512489319, + "num_tokens": 132222105.0, + "step": 5283 + }, + { + "epoch": 
0.5802767406105864, + "grad_norm": 2.276937246322632, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7058717012405396, + "num_tokens": 132246344.0, + "step": 5284 + }, + { + "epoch": 0.5803865583132001, + "grad_norm": 2.2775447368621826, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6932705044746399, + "num_tokens": 132271290.0, + "step": 5285 + }, + { + "epoch": 0.5804963760158137, + "grad_norm": 2.1402394771575928, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7111921310424805, + "num_tokens": 132296926.0, + "step": 5286 + }, + { + "epoch": 0.5806061937184274, + "grad_norm": 2.2700812816619873, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6934129595756531, + "num_tokens": 132322316.0, + "step": 5287 + }, + { + "epoch": 0.5807160114210411, + "grad_norm": 2.1164674758911133, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7013244032859802, + "num_tokens": 132349416.0, + "step": 5288 + }, + { + "epoch": 0.5808258291236548, + "grad_norm": 2.2851645946502686, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7284646034240723, + "num_tokens": 132373723.0, + "step": 5289 + }, + { + "epoch": 0.5809356468262684, + "grad_norm": 2.039400815963745, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6724580526351929, + "num_tokens": 132402349.0, + "step": 5290 + }, + { + "epoch": 0.5810454645288821, + "grad_norm": 2.095393419265747, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6972362399101257, + "num_tokens": 132429525.0, + "step": 5291 + }, + { + "epoch": 0.5811552822314957, + "grad_norm": 2.039628505706787, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7322367429733276, + "num_tokens": 132455970.0, + "step": 5292 + }, + { + "epoch": 0.5812650999341094, + "grad_norm": 2.4031660556793213, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 
0.708229124546051, + "num_tokens": 132478180.0, + "step": 5293 + }, + { + "epoch": 0.581374917636723, + "grad_norm": 2.0877695083618164, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7233843803405762, + "num_tokens": 132504863.0, + "step": 5294 + }, + { + "epoch": 0.5814847353393368, + "grad_norm": 2.1383190155029297, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6985498070716858, + "num_tokens": 132533540.0, + "step": 5295 + }, + { + "epoch": 0.5815945530419504, + "grad_norm": 2.129617929458618, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7087481021881104, + "num_tokens": 132558203.0, + "step": 5296 + }, + { + "epoch": 0.581704370744564, + "grad_norm": 2.076977252960205, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7120897769927979, + "num_tokens": 132586370.0, + "step": 5297 + }, + { + "epoch": 0.5818141884471777, + "grad_norm": 2.3820555210113525, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7151166200637817, + "num_tokens": 132608259.0, + "step": 5298 + }, + { + "epoch": 0.5819240061497913, + "grad_norm": 2.273617744445801, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6947931051254272, + "num_tokens": 132633876.0, + "step": 5299 + }, + { + "epoch": 0.582033823852405, + "grad_norm": 2.290200710296631, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6960253715515137, + "num_tokens": 132658572.0, + "step": 5300 + }, + { + "epoch": 0.5821436415550186, + "grad_norm": 1.9467520713806152, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.731178879737854, + "num_tokens": 132687677.0, + "step": 5301 + }, + { + "epoch": 0.5822534592576323, + "grad_norm": 2.007321834564209, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.6927932500839233, + "num_tokens": 132716128.0, + "step": 5302 + }, + { + "epoch": 0.582363276960246, + "grad_norm": 2.1612000465393066, 
+ "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.697786271572113, + "num_tokens": 132741969.0, + "step": 5303 + }, + { + "epoch": 0.5824730946628597, + "grad_norm": 2.1118736267089844, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7283986210823059, + "num_tokens": 132767679.0, + "step": 5304 + }, + { + "epoch": 0.5825829123654733, + "grad_norm": 2.136410713195801, + "learning_rate": 1e-06, + "loss": 1.0681, + "mean_token_accuracy": 0.6856882572174072, + "num_tokens": 132796015.0, + "step": 5305 + }, + { + "epoch": 0.582692730068087, + "grad_norm": 2.1922364234924316, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7013261318206787, + "num_tokens": 132820092.0, + "step": 5306 + }, + { + "epoch": 0.5828025477707006, + "grad_norm": 2.114337682723999, + "learning_rate": 1e-06, + "loss": 1.0553, + "mean_token_accuracy": 0.6781563758850098, + "num_tokens": 132846560.0, + "step": 5307 + }, + { + "epoch": 0.5829123654733143, + "grad_norm": 2.0367820262908936, + "learning_rate": 1e-06, + "loss": 1.0883, + "mean_token_accuracy": 0.6743167042732239, + "num_tokens": 132876404.0, + "step": 5308 + }, + { + "epoch": 0.5830221831759279, + "grad_norm": 2.1113333702087402, + "learning_rate": 1e-06, + "loss": 1.0839, + "mean_token_accuracy": 0.6777815818786621, + "num_tokens": 132904894.0, + "step": 5309 + }, + { + "epoch": 0.5831320008785417, + "grad_norm": 2.457817316055298, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7178247570991516, + "num_tokens": 132924389.0, + "step": 5310 + }, + { + "epoch": 0.5832418185811553, + "grad_norm": 2.123229742050171, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7012114524841309, + "num_tokens": 132950738.0, + "step": 5311 + }, + { + "epoch": 0.583351636283769, + "grad_norm": 2.025718927383423, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7049338817596436, + "num_tokens": 132979747.0, + "step": 5312 + }, 
+ { + "epoch": 0.5834614539863826, + "grad_norm": 2.348827362060547, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7084311842918396, + "num_tokens": 133002485.0, + "step": 5313 + }, + { + "epoch": 0.5835712716889963, + "grad_norm": 2.184140682220459, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6994414925575256, + "num_tokens": 133028186.0, + "step": 5314 + }, + { + "epoch": 0.5836810893916099, + "grad_norm": 2.0249524116516113, + "learning_rate": 1e-06, + "loss": 1.0468, + "mean_token_accuracy": 0.6883624792098999, + "num_tokens": 133056135.0, + "step": 5315 + }, + { + "epoch": 0.5837909070942235, + "grad_norm": 2.533191204071045, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6833848357200623, + "num_tokens": 133076141.0, + "step": 5316 + }, + { + "epoch": 0.5839007247968373, + "grad_norm": 2.3747060298919678, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7038531303405762, + "num_tokens": 133097174.0, + "step": 5317 + }, + { + "epoch": 0.584010542499451, + "grad_norm": 2.2088441848754883, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7054303288459778, + "num_tokens": 133122235.0, + "step": 5318 + }, + { + "epoch": 0.5841203602020646, + "grad_norm": 2.089473247528076, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7232016324996948, + "num_tokens": 133148169.0, + "step": 5319 + }, + { + "epoch": 0.5842301779046782, + "grad_norm": 2.389237642288208, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7020713090896606, + "num_tokens": 133168897.0, + "step": 5320 + }, + { + "epoch": 0.5843399956072919, + "grad_norm": 2.0534465312957764, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6971635222434998, + "num_tokens": 133197006.0, + "step": 5321 + }, + { + "epoch": 0.5844498133099055, + "grad_norm": 2.3049309253692627, + "learning_rate": 1e-06, + "loss": 1.0217, + 
"mean_token_accuracy": 0.6927184462547302, + "num_tokens": 133222850.0, + "step": 5322 + }, + { + "epoch": 0.5845596310125192, + "grad_norm": 1.8413666486740112, + "learning_rate": 1e-06, + "loss": 1.0821, + "mean_token_accuracy": 0.6785047650337219, + "num_tokens": 133257401.0, + "step": 5323 + }, + { + "epoch": 0.5846694487151329, + "grad_norm": 1.9799760580062866, + "learning_rate": 1e-06, + "loss": 1.0351, + "mean_token_accuracy": 0.6861563920974731, + "num_tokens": 133289574.0, + "step": 5324 + }, + { + "epoch": 0.5847792664177466, + "grad_norm": 1.973879337310791, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6922321915626526, + "num_tokens": 133321457.0, + "step": 5325 + }, + { + "epoch": 0.5848890841203602, + "grad_norm": 2.3174240589141846, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7150327563285828, + "num_tokens": 133343816.0, + "step": 5326 + }, + { + "epoch": 0.5849989018229739, + "grad_norm": 2.549459934234619, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7097804546356201, + "num_tokens": 133364059.0, + "step": 5327 + }, + { + "epoch": 0.5851087195255875, + "grad_norm": 2.459585428237915, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7058717012405396, + "num_tokens": 133385861.0, + "step": 5328 + }, + { + "epoch": 0.5852185372282012, + "grad_norm": 2.421499490737915, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7248704433441162, + "num_tokens": 133405927.0, + "step": 5329 + }, + { + "epoch": 0.5853283549308148, + "grad_norm": 2.4322593212127686, + "learning_rate": 1e-06, + "loss": 1.0307, + "mean_token_accuracy": 0.686281681060791, + "num_tokens": 133428780.0, + "step": 5330 + }, + { + "epoch": 0.5854381726334285, + "grad_norm": 2.4102084636688232, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7238770723342896, + "num_tokens": 133448848.0, + "step": 5331 + }, + { + "epoch": 0.5855479903360422, + 
"grad_norm": 2.273820161819458, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7364439964294434, + "num_tokens": 133472314.0, + "step": 5332 + }, + { + "epoch": 0.5856578080386559, + "grad_norm": 2.0394465923309326, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7007604837417603, + "num_tokens": 133500424.0, + "step": 5333 + }, + { + "epoch": 0.5857676257412695, + "grad_norm": 2.4221251010894775, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7287039756774902, + "num_tokens": 133519573.0, + "step": 5334 + }, + { + "epoch": 0.5858774434438832, + "grad_norm": 2.3369140625, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6926631927490234, + "num_tokens": 133544435.0, + "step": 5335 + }, + { + "epoch": 0.5859872611464968, + "grad_norm": 2.347694158554077, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6926826238632202, + "num_tokens": 133567770.0, + "step": 5336 + }, + { + "epoch": 0.5860970788491104, + "grad_norm": 2.044006824493408, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6741205453872681, + "num_tokens": 133597582.0, + "step": 5337 + }, + { + "epoch": 0.5862068965517241, + "grad_norm": 2.3145134449005127, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7324134707450867, + "num_tokens": 133620236.0, + "step": 5338 + }, + { + "epoch": 0.5863167142543378, + "grad_norm": 2.169199228286743, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6967012882232666, + "num_tokens": 133647305.0, + "step": 5339 + }, + { + "epoch": 0.5864265319569515, + "grad_norm": 1.9347913265228271, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7068777084350586, + "num_tokens": 133676538.0, + "step": 5340 + }, + { + "epoch": 0.5865363496595651, + "grad_norm": 2.1972527503967285, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7030894756317139, + "num_tokens": 
133702300.0, + "step": 5341 + }, + { + "epoch": 0.5866461673621788, + "grad_norm": 2.3050765991210938, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.68938809633255, + "num_tokens": 133724823.0, + "step": 5342 + }, + { + "epoch": 0.5867559850647924, + "grad_norm": 2.108461380004883, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.6999515295028687, + "num_tokens": 133753691.0, + "step": 5343 + }, + { + "epoch": 0.5868658027674061, + "grad_norm": 2.066986322402954, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6882615685462952, + "num_tokens": 133783392.0, + "step": 5344 + }, + { + "epoch": 0.5869756204700197, + "grad_norm": 2.1899149417877197, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7312435507774353, + "num_tokens": 133807676.0, + "step": 5345 + }, + { + "epoch": 0.5870854381726335, + "grad_norm": 2.5230836868286133, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7258989810943604, + "num_tokens": 133826199.0, + "step": 5346 + }, + { + "epoch": 0.5871952558752471, + "grad_norm": 2.4896562099456787, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7033575177192688, + "num_tokens": 133847298.0, + "step": 5347 + }, + { + "epoch": 0.5873050735778608, + "grad_norm": 2.0702567100524902, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6788996458053589, + "num_tokens": 133878005.0, + "step": 5348 + }, + { + "epoch": 0.5874148912804744, + "grad_norm": 2.0411291122436523, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6977806687355042, + "num_tokens": 133907102.0, + "step": 5349 + }, + { + "epoch": 0.5875247089830881, + "grad_norm": 2.0414109230041504, + "learning_rate": 1e-06, + "loss": 1.1103, + "mean_token_accuracy": 0.6670500040054321, + "num_tokens": 133937376.0, + "step": 5350 + }, + { + "epoch": 0.5876345266857017, + "grad_norm": 2.0576703548431396, + "learning_rate": 1e-06, + 
"loss": 0.9397, + "mean_token_accuracy": 0.7127150297164917, + "num_tokens": 133965698.0, + "step": 5351 + }, + { + "epoch": 0.5877443443883154, + "grad_norm": 2.359947681427002, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7169516086578369, + "num_tokens": 133986616.0, + "step": 5352 + }, + { + "epoch": 0.5878541620909291, + "grad_norm": 2.1468255519866943, + "learning_rate": 1e-06, + "loss": 1.0903, + "mean_token_accuracy": 0.6924005746841431, + "num_tokens": 134012508.0, + "step": 5353 + }, + { + "epoch": 0.5879639797935428, + "grad_norm": 2.0883288383483887, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.676224946975708, + "num_tokens": 134040576.0, + "step": 5354 + }, + { + "epoch": 0.5880737974961564, + "grad_norm": 2.1397030353546143, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7044212818145752, + "num_tokens": 134065869.0, + "step": 5355 + }, + { + "epoch": 0.58818361519877, + "grad_norm": 2.1902379989624023, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6971367001533508, + "num_tokens": 134090522.0, + "step": 5356 + }, + { + "epoch": 0.5882934329013837, + "grad_norm": 2.7522571086883545, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7267891764640808, + "num_tokens": 134107768.0, + "step": 5357 + }, + { + "epoch": 0.5884032506039973, + "grad_norm": 2.3698232173919678, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.7036814093589783, + "num_tokens": 134131566.0, + "step": 5358 + }, + { + "epoch": 0.588513068306611, + "grad_norm": 2.602043390274048, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7052768468856812, + "num_tokens": 134151915.0, + "step": 5359 + }, + { + "epoch": 0.5886228860092246, + "grad_norm": 2.138608694076538, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6943622827529907, + "num_tokens": 134179920.0, + "step": 5360 + }, + { + "epoch": 
0.5887327037118384, + "grad_norm": 2.0601892471313477, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7447412014007568, + "num_tokens": 134206270.0, + "step": 5361 + }, + { + "epoch": 0.588842521414452, + "grad_norm": 2.39218807220459, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.6975692510604858, + "num_tokens": 134227871.0, + "step": 5362 + }, + { + "epoch": 0.5889523391170657, + "grad_norm": 2.0973012447357178, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7150201797485352, + "num_tokens": 134254347.0, + "step": 5363 + }, + { + "epoch": 0.5890621568196793, + "grad_norm": 2.3808724880218506, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7120202779769897, + "num_tokens": 134273703.0, + "step": 5364 + }, + { + "epoch": 0.589171974522293, + "grad_norm": 2.378591537475586, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7264653444290161, + "num_tokens": 134294881.0, + "step": 5365 + }, + { + "epoch": 0.5892817922249066, + "grad_norm": 2.4214398860931396, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.737610936164856, + "num_tokens": 134315450.0, + "step": 5366 + }, + { + "epoch": 0.5893916099275203, + "grad_norm": 2.593083381652832, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.709328293800354, + "num_tokens": 134334394.0, + "step": 5367 + }, + { + "epoch": 0.589501427630134, + "grad_norm": 2.3007824420928955, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7185981869697571, + "num_tokens": 134356417.0, + "step": 5368 + }, + { + "epoch": 0.5896112453327477, + "grad_norm": 2.192873001098633, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6910492777824402, + "num_tokens": 134381474.0, + "step": 5369 + }, + { + "epoch": 0.5897210630353613, + "grad_norm": 2.5560450553894043, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 
0.713739275932312, + "num_tokens": 134401374.0, + "step": 5370 + }, + { + "epoch": 0.589830880737975, + "grad_norm": 2.0324912071228027, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.7015471458435059, + "num_tokens": 134429983.0, + "step": 5371 + }, + { + "epoch": 0.5899406984405886, + "grad_norm": 2.9321115016937256, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7228587865829468, + "num_tokens": 134444896.0, + "step": 5372 + }, + { + "epoch": 0.5900505161432023, + "grad_norm": 2.0884673595428467, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6978560090065002, + "num_tokens": 134474752.0, + "step": 5373 + }, + { + "epoch": 0.5901603338458159, + "grad_norm": 2.100025177001953, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.725227952003479, + "num_tokens": 134500034.0, + "step": 5374 + }, + { + "epoch": 0.5902701515484297, + "grad_norm": 2.1501970291137695, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6963826417922974, + "num_tokens": 134528077.0, + "step": 5375 + }, + { + "epoch": 0.5903799692510433, + "grad_norm": 2.3252155780792236, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6970405578613281, + "num_tokens": 134553870.0, + "step": 5376 + }, + { + "epoch": 0.590489786953657, + "grad_norm": 2.02958083152771, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7079030275344849, + "num_tokens": 134581272.0, + "step": 5377 + }, + { + "epoch": 0.5905996046562706, + "grad_norm": 2.0740370750427246, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7052047252655029, + "num_tokens": 134608592.0, + "step": 5378 + }, + { + "epoch": 0.5907094223588842, + "grad_norm": 2.1074750423431396, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.7010601758956909, + "num_tokens": 134636328.0, + "step": 5379 + }, + { + "epoch": 0.5908192400614979, + "grad_norm": 
2.089850425720215, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7155463695526123, + "num_tokens": 134664097.0, + "step": 5380 + }, + { + "epoch": 0.5909290577641115, + "grad_norm": 1.8685706853866577, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.7027764320373535, + "num_tokens": 134695936.0, + "step": 5381 + }, + { + "epoch": 0.5910388754667253, + "grad_norm": 2.118180274963379, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.7003283500671387, + "num_tokens": 134723457.0, + "step": 5382 + }, + { + "epoch": 0.5911486931693389, + "grad_norm": 2.1963562965393066, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6926385164260864, + "num_tokens": 134750384.0, + "step": 5383 + }, + { + "epoch": 0.5912585108719526, + "grad_norm": 2.42510724067688, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7075891494750977, + "num_tokens": 134771215.0, + "step": 5384 + }, + { + "epoch": 0.5913683285745662, + "grad_norm": 2.121551513671875, + "learning_rate": 1e-06, + "loss": 1.0435, + "mean_token_accuracy": 0.6858190894126892, + "num_tokens": 134800187.0, + "step": 5385 + }, + { + "epoch": 0.5914781462771799, + "grad_norm": 2.1882987022399902, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6948647499084473, + "num_tokens": 134826981.0, + "step": 5386 + }, + { + "epoch": 0.5915879639797935, + "grad_norm": 2.2592906951904297, + "learning_rate": 1e-06, + "loss": 1.0663, + "mean_token_accuracy": 0.6787415146827698, + "num_tokens": 134850016.0, + "step": 5387 + }, + { + "epoch": 0.5916977816824072, + "grad_norm": 2.1189801692962646, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6966025829315186, + "num_tokens": 134877211.0, + "step": 5388 + }, + { + "epoch": 0.5918075993850208, + "grad_norm": 2.3210129737854004, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6971057653427124, + "num_tokens": 
134901032.0, + "step": 5389 + }, + { + "epoch": 0.5919174170876346, + "grad_norm": 2.426837682723999, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6967268586158752, + "num_tokens": 134922584.0, + "step": 5390 + }, + { + "epoch": 0.5920272347902482, + "grad_norm": 2.489670991897583, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7158522605895996, + "num_tokens": 134944925.0, + "step": 5391 + }, + { + "epoch": 0.5921370524928619, + "grad_norm": 2.1353225708007812, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7108194231987, + "num_tokens": 134971739.0, + "step": 5392 + }, + { + "epoch": 0.5922468701954755, + "grad_norm": 1.966090202331543, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6857874393463135, + "num_tokens": 135004180.0, + "step": 5393 + }, + { + "epoch": 0.5923566878980892, + "grad_norm": 2.116281747817993, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.701683521270752, + "num_tokens": 135029685.0, + "step": 5394 + }, + { + "epoch": 0.5924665056007028, + "grad_norm": 2.0835750102996826, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7087258696556091, + "num_tokens": 135057908.0, + "step": 5395 + }, + { + "epoch": 0.5925763233033164, + "grad_norm": 2.5421595573425293, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6920974254608154, + "num_tokens": 135079359.0, + "step": 5396 + }, + { + "epoch": 0.5926861410059302, + "grad_norm": 2.330732822418213, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.714833676815033, + "num_tokens": 135102581.0, + "step": 5397 + }, + { + "epoch": 0.5927959587085438, + "grad_norm": 2.185218572616577, + "learning_rate": 1e-06, + "loss": 1.0182, + "mean_token_accuracy": 0.6874841451644897, + "num_tokens": 135128051.0, + "step": 5398 + }, + { + "epoch": 0.5929057764111575, + "grad_norm": 2.1382412910461426, + "learning_rate": 1e-06, + "loss": 
0.9631, + "mean_token_accuracy": 0.7128782272338867, + "num_tokens": 135155000.0, + "step": 5399 + }, + { + "epoch": 0.5930155941137711, + "grad_norm": 2.431117534637451, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6917396187782288, + "num_tokens": 135175955.0, + "step": 5400 + }, + { + "epoch": 0.5931254118163848, + "grad_norm": 2.5727615356445312, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.731905460357666, + "num_tokens": 135195369.0, + "step": 5401 + }, + { + "epoch": 0.5932352295189984, + "grad_norm": 2.0866646766662598, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7069960832595825, + "num_tokens": 135222349.0, + "step": 5402 + }, + { + "epoch": 0.5933450472216121, + "grad_norm": 2.4121298789978027, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7104389071464539, + "num_tokens": 135243827.0, + "step": 5403 + }, + { + "epoch": 0.5934548649242258, + "grad_norm": 2.0060958862304688, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.721034824848175, + "num_tokens": 135273541.0, + "step": 5404 + }, + { + "epoch": 0.5935646826268395, + "grad_norm": 2.1850457191467285, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6980760097503662, + "num_tokens": 135300479.0, + "step": 5405 + }, + { + "epoch": 0.5936745003294531, + "grad_norm": 2.4147863388061523, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.7022895812988281, + "num_tokens": 135322700.0, + "step": 5406 + }, + { + "epoch": 0.5937843180320668, + "grad_norm": 2.349256753921509, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7157513499259949, + "num_tokens": 135344910.0, + "step": 5407 + }, + { + "epoch": 0.5938941357346804, + "grad_norm": 2.2818827629089355, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7058802247047424, + "num_tokens": 135367945.0, + "step": 5408 + }, + { + "epoch": 
0.5940039534372941, + "grad_norm": 2.071965217590332, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7009562253952026, + "num_tokens": 135395073.0, + "step": 5409 + }, + { + "epoch": 0.5941137711399077, + "grad_norm": 2.2455687522888184, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.680793046951294, + "num_tokens": 135419211.0, + "step": 5410 + }, + { + "epoch": 0.5942235888425215, + "grad_norm": 2.4766910076141357, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7201697826385498, + "num_tokens": 135436792.0, + "step": 5411 + }, + { + "epoch": 0.5943334065451351, + "grad_norm": 2.004347085952759, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7080780863761902, + "num_tokens": 135463944.0, + "step": 5412 + }, + { + "epoch": 0.5944432242477488, + "grad_norm": 2.103285551071167, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6867778301239014, + "num_tokens": 135491464.0, + "step": 5413 + }, + { + "epoch": 0.5945530419503624, + "grad_norm": 2.1457595825195312, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6992207765579224, + "num_tokens": 135518107.0, + "step": 5414 + }, + { + "epoch": 0.594662859652976, + "grad_norm": 2.3150343894958496, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7148213982582092, + "num_tokens": 135541221.0, + "step": 5415 + }, + { + "epoch": 0.5947726773555897, + "grad_norm": 2.372438907623291, + "learning_rate": 1e-06, + "loss": 1.0561, + "mean_token_accuracy": 0.6921961903572083, + "num_tokens": 135565671.0, + "step": 5416 + }, + { + "epoch": 0.5948824950582033, + "grad_norm": 2.0797417163848877, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.7187145948410034, + "num_tokens": 135593845.0, + "step": 5417 + }, + { + "epoch": 0.5949923127608171, + "grad_norm": 2.2755463123321533, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 
0.7079684734344482, + "num_tokens": 135617138.0, + "step": 5418 + }, + { + "epoch": 0.5951021304634307, + "grad_norm": 1.9688457250595093, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7400189638137817, + "num_tokens": 135646377.0, + "step": 5419 + }, + { + "epoch": 0.5952119481660444, + "grad_norm": 2.104397773742676, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7092158794403076, + "num_tokens": 135672296.0, + "step": 5420 + }, + { + "epoch": 0.595321765868658, + "grad_norm": 1.988297462463379, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7005071640014648, + "num_tokens": 135701848.0, + "step": 5421 + }, + { + "epoch": 0.5954315835712717, + "grad_norm": 2.24627685546875, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6806973814964294, + "num_tokens": 135726741.0, + "step": 5422 + }, + { + "epoch": 0.5955414012738853, + "grad_norm": 2.545555830001831, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6973404288291931, + "num_tokens": 135746348.0, + "step": 5423 + }, + { + "epoch": 0.595651218976499, + "grad_norm": 2.2159268856048584, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7208163738250732, + "num_tokens": 135770281.0, + "step": 5424 + }, + { + "epoch": 0.5957610366791126, + "grad_norm": 2.479612112045288, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6930899620056152, + "num_tokens": 135791643.0, + "step": 5425 + }, + { + "epoch": 0.5958708543817264, + "grad_norm": 2.073570728302002, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6932150721549988, + "num_tokens": 135819848.0, + "step": 5426 + }, + { + "epoch": 0.59598067208434, + "grad_norm": 2.216045618057251, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7117702960968018, + "num_tokens": 135844051.0, + "step": 5427 + }, + { + "epoch": 0.5960904897869537, + "grad_norm": 2.1270902156829834, + 
"learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7046334743499756, + "num_tokens": 135873157.0, + "step": 5428 + }, + { + "epoch": 0.5962003074895673, + "grad_norm": 2.1743826866149902, + "learning_rate": 1e-06, + "loss": 1.0327, + "mean_token_accuracy": 0.6844785213470459, + "num_tokens": 135899479.0, + "step": 5429 + }, + { + "epoch": 0.596310125192181, + "grad_norm": 2.4293649196624756, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7126184701919556, + "num_tokens": 135922227.0, + "step": 5430 + }, + { + "epoch": 0.5964199428947946, + "grad_norm": 2.453658103942871, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.70734041929245, + "num_tokens": 135942872.0, + "step": 5431 + }, + { + "epoch": 0.5965297605974083, + "grad_norm": 2.1649839878082275, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7195091247558594, + "num_tokens": 135971048.0, + "step": 5432 + }, + { + "epoch": 0.596639578300022, + "grad_norm": 2.2595629692077637, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7006685733795166, + "num_tokens": 135995961.0, + "step": 5433 + }, + { + "epoch": 0.5967493960026357, + "grad_norm": 2.1368002891540527, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7153677344322205, + "num_tokens": 136022402.0, + "step": 5434 + }, + { + "epoch": 0.5968592137052493, + "grad_norm": 2.0056724548339844, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7237844467163086, + "num_tokens": 136048919.0, + "step": 5435 + }, + { + "epoch": 0.596969031407863, + "grad_norm": 2.3107826709747314, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6858522891998291, + "num_tokens": 136074341.0, + "step": 5436 + }, + { + "epoch": 0.5970788491104766, + "grad_norm": 2.263808488845825, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.709263801574707, + "num_tokens": 136098145.0, + "step": 5437 + }, 
+ { + "epoch": 0.5971886668130902, + "grad_norm": 2.342158079147339, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7052384614944458, + "num_tokens": 136120069.0, + "step": 5438 + }, + { + "epoch": 0.5972984845157039, + "grad_norm": 2.200390577316284, + "learning_rate": 1e-06, + "loss": 1.1154, + "mean_token_accuracy": 0.6651914119720459, + "num_tokens": 136147636.0, + "step": 5439 + }, + { + "epoch": 0.5974083022183176, + "grad_norm": 2.3388853073120117, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.700040876865387, + "num_tokens": 136171398.0, + "step": 5440 + }, + { + "epoch": 0.5975181199209313, + "grad_norm": 2.160576343536377, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7079851627349854, + "num_tokens": 136197176.0, + "step": 5441 + }, + { + "epoch": 0.5976279376235449, + "grad_norm": 2.2771382331848145, + "learning_rate": 1e-06, + "loss": 1.0427, + "mean_token_accuracy": 0.687721848487854, + "num_tokens": 136221121.0, + "step": 5442 + }, + { + "epoch": 0.5977377553261586, + "grad_norm": 2.063241720199585, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6961370706558228, + "num_tokens": 136247456.0, + "step": 5443 + }, + { + "epoch": 0.5978475730287722, + "grad_norm": 2.3786699771881104, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7016488313674927, + "num_tokens": 136269670.0, + "step": 5444 + }, + { + "epoch": 0.5979573907313859, + "grad_norm": 2.2469210624694824, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6843347549438477, + "num_tokens": 136295278.0, + "step": 5445 + }, + { + "epoch": 0.5980672084339995, + "grad_norm": 1.903687596321106, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6893108487129211, + "num_tokens": 136329155.0, + "step": 5446 + }, + { + "epoch": 0.5981770261366133, + "grad_norm": 2.23527455329895, + "learning_rate": 1e-06, + "loss": 1.0453, + 
"mean_token_accuracy": 0.6780250072479248, + "num_tokens": 136354429.0, + "step": 5447 + }, + { + "epoch": 0.5982868438392269, + "grad_norm": 2.3886396884918213, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7012240886688232, + "num_tokens": 136375397.0, + "step": 5448 + }, + { + "epoch": 0.5983966615418406, + "grad_norm": 2.2243921756744385, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6983743906021118, + "num_tokens": 136400603.0, + "step": 5449 + }, + { + "epoch": 0.5985064792444542, + "grad_norm": 2.4979286193847656, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7156333923339844, + "num_tokens": 136420804.0, + "step": 5450 + }, + { + "epoch": 0.5986162969470679, + "grad_norm": 2.418537139892578, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6960440874099731, + "num_tokens": 136441545.0, + "step": 5451 + }, + { + "epoch": 0.5987261146496815, + "grad_norm": 2.3992063999176025, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7191754579544067, + "num_tokens": 136462028.0, + "step": 5452 + }, + { + "epoch": 0.5988359323522952, + "grad_norm": 2.032512903213501, + "learning_rate": 1e-06, + "loss": 1.0205, + "mean_token_accuracy": 0.6983439326286316, + "num_tokens": 136493593.0, + "step": 5453 + }, + { + "epoch": 0.5989457500549088, + "grad_norm": 1.9779529571533203, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7038509845733643, + "num_tokens": 136522332.0, + "step": 5454 + }, + { + "epoch": 0.5990555677575226, + "grad_norm": 2.173006534576416, + "learning_rate": 1e-06, + "loss": 1.04, + "mean_token_accuracy": 0.6886644959449768, + "num_tokens": 136549612.0, + "step": 5455 + }, + { + "epoch": 0.5991653854601362, + "grad_norm": 2.0807032585144043, + "learning_rate": 1e-06, + "loss": 1.0745, + "mean_token_accuracy": 0.6791932582855225, + "num_tokens": 136576919.0, + "step": 5456 + }, + { + "epoch": 0.5992752031627498, + 
"grad_norm": 2.038872718811035, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7078261375427246, + "num_tokens": 136604706.0, + "step": 5457 + }, + { + "epoch": 0.5993850208653635, + "grad_norm": 2.493452310562134, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7007617950439453, + "num_tokens": 136625720.0, + "step": 5458 + }, + { + "epoch": 0.5994948385679771, + "grad_norm": 2.1086323261260986, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7014290690422058, + "num_tokens": 136652318.0, + "step": 5459 + }, + { + "epoch": 0.5996046562705908, + "grad_norm": 2.152509927749634, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7258697152137756, + "num_tokens": 136676570.0, + "step": 5460 + }, + { + "epoch": 0.5997144739732044, + "grad_norm": 1.9355758428573608, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6951947212219238, + "num_tokens": 136706592.0, + "step": 5461 + }, + { + "epoch": 0.5998242916758182, + "grad_norm": 2.1935856342315674, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6886316537857056, + "num_tokens": 136734229.0, + "step": 5462 + }, + { + "epoch": 0.5999341093784318, + "grad_norm": 2.4452006816864014, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7221819162368774, + "num_tokens": 136755111.0, + "step": 5463 + }, + { + "epoch": 0.6000439270810455, + "grad_norm": 2.427187204360962, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7009251117706299, + "num_tokens": 136776231.0, + "step": 5464 + }, + { + "epoch": 0.6001537447836591, + "grad_norm": 2.352992057800293, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6934561133384705, + "num_tokens": 136801170.0, + "step": 5465 + }, + { + "epoch": 0.6002635624862728, + "grad_norm": 2.1249232292175293, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7228571176528931, + 
"num_tokens": 136827396.0, + "step": 5466 + }, + { + "epoch": 0.6003733801888864, + "grad_norm": 2.1214067935943604, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7143970727920532, + "num_tokens": 136854548.0, + "step": 5467 + }, + { + "epoch": 0.6004831978915001, + "grad_norm": 2.5605108737945557, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.720209002494812, + "num_tokens": 136873646.0, + "step": 5468 + }, + { + "epoch": 0.6005930155941138, + "grad_norm": 2.355433225631714, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6870923042297363, + "num_tokens": 136898737.0, + "step": 5469 + }, + { + "epoch": 0.6007028332967275, + "grad_norm": 2.112771987915039, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6724931001663208, + "num_tokens": 136925911.0, + "step": 5470 + }, + { + "epoch": 0.6008126509993411, + "grad_norm": 2.34897780418396, + "learning_rate": 1e-06, + "loss": 1.0457, + "mean_token_accuracy": 0.6983730792999268, + "num_tokens": 136949640.0, + "step": 5471 + }, + { + "epoch": 0.6009224687019548, + "grad_norm": 2.127110719680786, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7108844518661499, + "num_tokens": 136975563.0, + "step": 5472 + }, + { + "epoch": 0.6010322864045684, + "grad_norm": 2.339672088623047, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6829972267150879, + "num_tokens": 137002429.0, + "step": 5473 + }, + { + "epoch": 0.601142104107182, + "grad_norm": 2.1050851345062256, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7104268670082092, + "num_tokens": 137027604.0, + "step": 5474 + }, + { + "epoch": 0.6012519218097957, + "grad_norm": 2.4293317794799805, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.69554603099823, + "num_tokens": 137049575.0, + "step": 5475 + }, + { + "epoch": 0.6013617395124095, + "grad_norm": 2.466836452484131, + "learning_rate": 
1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7144798040390015, + "num_tokens": 137070755.0, + "step": 5476 + }, + { + "epoch": 0.6014715572150231, + "grad_norm": 2.3570961952209473, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7155495882034302, + "num_tokens": 137093146.0, + "step": 5477 + }, + { + "epoch": 0.6015813749176367, + "grad_norm": 2.3049697875976562, + "learning_rate": 1e-06, + "loss": 1.0562, + "mean_token_accuracy": 0.6834362149238586, + "num_tokens": 137116026.0, + "step": 5478 + }, + { + "epoch": 0.6016911926202504, + "grad_norm": 2.068671703338623, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6933929920196533, + "num_tokens": 137144970.0, + "step": 5479 + }, + { + "epoch": 0.601801010322864, + "grad_norm": 2.3916244506835938, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6921476125717163, + "num_tokens": 137166007.0, + "step": 5480 + }, + { + "epoch": 0.6019108280254777, + "grad_norm": 2.1847734451293945, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7104064226150513, + "num_tokens": 137190658.0, + "step": 5481 + }, + { + "epoch": 0.6020206457280913, + "grad_norm": 2.305976390838623, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7198908925056458, + "num_tokens": 137212642.0, + "step": 5482 + }, + { + "epoch": 0.602130463430705, + "grad_norm": 2.5087432861328125, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7121588587760925, + "num_tokens": 137233601.0, + "step": 5483 + }, + { + "epoch": 0.6022402811333187, + "grad_norm": 2.9039385318756104, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7250694036483765, + "num_tokens": 137249077.0, + "step": 5484 + }, + { + "epoch": 0.6023500988359324, + "grad_norm": 2.298313856124878, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7051295042037964, + "num_tokens": 137273687.0, + "step": 5485 + }, + { + "epoch": 
0.602459916538546, + "grad_norm": 2.419628620147705, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7158935070037842, + "num_tokens": 137294483.0, + "step": 5486 + }, + { + "epoch": 0.6025697342411597, + "grad_norm": 2.344399929046631, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.714333713054657, + "num_tokens": 137316548.0, + "step": 5487 + }, + { + "epoch": 0.6026795519437733, + "grad_norm": 2.692186117172241, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7393965721130371, + "num_tokens": 137333811.0, + "step": 5488 + }, + { + "epoch": 0.602789369646387, + "grad_norm": 2.5185346603393555, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.704416036605835, + "num_tokens": 137355365.0, + "step": 5489 + }, + { + "epoch": 0.6028991873490006, + "grad_norm": 2.319291830062866, + "learning_rate": 1e-06, + "loss": 1.0751, + "mean_token_accuracy": 0.683280348777771, + "num_tokens": 137382946.0, + "step": 5490 + }, + { + "epoch": 0.6030090050516144, + "grad_norm": 2.4165637493133545, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7314118146896362, + "num_tokens": 137403779.0, + "step": 5491 + }, + { + "epoch": 0.603118822754228, + "grad_norm": 2.1800732612609863, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7055383920669556, + "num_tokens": 137429862.0, + "step": 5492 + }, + { + "epoch": 0.6032286404568417, + "grad_norm": 2.2465546131134033, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7069477438926697, + "num_tokens": 137453605.0, + "step": 5493 + }, + { + "epoch": 0.6033384581594553, + "grad_norm": 2.067941904067993, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7182044982910156, + "num_tokens": 137481367.0, + "step": 5494 + }, + { + "epoch": 0.603448275862069, + "grad_norm": 2.0295779705047607, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 
0.6848954558372498, + "num_tokens": 137514067.0, + "step": 5495 + }, + { + "epoch": 0.6035580935646826, + "grad_norm": 2.178992509841919, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6859477758407593, + "num_tokens": 137539433.0, + "step": 5496 + }, + { + "epoch": 0.6036679112672962, + "grad_norm": 2.3176016807556152, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7155774235725403, + "num_tokens": 137561311.0, + "step": 5497 + }, + { + "epoch": 0.60377772896991, + "grad_norm": 2.429410696029663, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7227072715759277, + "num_tokens": 137582186.0, + "step": 5498 + }, + { + "epoch": 0.6038875466725236, + "grad_norm": 2.1710290908813477, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7000476121902466, + "num_tokens": 137608339.0, + "step": 5499 + }, + { + "epoch": 0.6039973643751373, + "grad_norm": 2.124037504196167, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6856710910797119, + "num_tokens": 137634077.0, + "step": 5500 + }, + { + "epoch": 0.6041071820777509, + "grad_norm": 2.029139757156372, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7121278047561646, + "num_tokens": 137660929.0, + "step": 5501 + }, + { + "epoch": 0.6042169997803646, + "grad_norm": 1.9566818475723267, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7123434543609619, + "num_tokens": 137689497.0, + "step": 5502 + }, + { + "epoch": 0.6043268174829782, + "grad_norm": 2.196645736694336, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7179932594299316, + "num_tokens": 137713354.0, + "step": 5503 + }, + { + "epoch": 0.6044366351855919, + "grad_norm": 2.2844436168670654, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7047275900840759, + "num_tokens": 137736151.0, + "step": 5504 + }, + { + "epoch": 0.6045464528882056, + "grad_norm": 
1.952821969985962, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6910457015037537, + "num_tokens": 137770120.0, + "step": 5505 + }, + { + "epoch": 0.6046562705908193, + "grad_norm": 2.240834951400757, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.694171667098999, + "num_tokens": 137796174.0, + "step": 5506 + }, + { + "epoch": 0.6047660882934329, + "grad_norm": 2.1128430366516113, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7019941806793213, + "num_tokens": 137822928.0, + "step": 5507 + }, + { + "epoch": 0.6048759059960466, + "grad_norm": 2.196730613708496, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7190432548522949, + "num_tokens": 137847932.0, + "step": 5508 + }, + { + "epoch": 0.6049857236986602, + "grad_norm": 2.2293028831481934, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7161530256271362, + "num_tokens": 137872108.0, + "step": 5509 + }, + { + "epoch": 0.6050955414012739, + "grad_norm": 2.0754640102386475, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6971839666366577, + "num_tokens": 137899861.0, + "step": 5510 + }, + { + "epoch": 0.6052053591038875, + "grad_norm": 2.023221969604492, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7241906523704529, + "num_tokens": 137929779.0, + "step": 5511 + }, + { + "epoch": 0.6053151768065012, + "grad_norm": 2.323718547821045, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7072036266326904, + "num_tokens": 137951094.0, + "step": 5512 + }, + { + "epoch": 0.6054249945091149, + "grad_norm": 2.1318187713623047, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6977889537811279, + "num_tokens": 137979269.0, + "step": 5513 + }, + { + "epoch": 0.6055348122117286, + "grad_norm": 2.5722148418426514, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7121587991714478, + "num_tokens": 
137999532.0, + "step": 5514 + }, + { + "epoch": 0.6056446299143422, + "grad_norm": 2.1057114601135254, + "learning_rate": 1e-06, + "loss": 1.0285, + "mean_token_accuracy": 0.6854727268218994, + "num_tokens": 138028715.0, + "step": 5515 + }, + { + "epoch": 0.6057544476169558, + "grad_norm": 2.4538543224334717, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.710576057434082, + "num_tokens": 138049693.0, + "step": 5516 + }, + { + "epoch": 0.6058642653195695, + "grad_norm": 2.156323194503784, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7001079320907593, + "num_tokens": 138076466.0, + "step": 5517 + }, + { + "epoch": 0.6059740830221831, + "grad_norm": 2.1689517498016357, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7063418626785278, + "num_tokens": 138104576.0, + "step": 5518 + }, + { + "epoch": 0.6060839007247968, + "grad_norm": 2.0900444984436035, + "learning_rate": 1e-06, + "loss": 1.0096, + "mean_token_accuracy": 0.6906315088272095, + "num_tokens": 138132764.0, + "step": 5519 + }, + { + "epoch": 0.6061937184274105, + "grad_norm": 2.248948097229004, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6982535719871521, + "num_tokens": 138161306.0, + "step": 5520 + }, + { + "epoch": 0.6063035361300242, + "grad_norm": 2.3865456581115723, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6977822184562683, + "num_tokens": 138181633.0, + "step": 5521 + }, + { + "epoch": 0.6064133538326378, + "grad_norm": 2.2268154621124268, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6940226554870605, + "num_tokens": 138206104.0, + "step": 5522 + }, + { + "epoch": 0.6065231715352515, + "grad_norm": 2.6389412879943848, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7022857666015625, + "num_tokens": 138224545.0, + "step": 5523 + }, + { + "epoch": 0.6066329892378651, + "grad_norm": 1.9900168180465698, + "learning_rate": 1e-06, + 
"loss": 0.9844, + "mean_token_accuracy": 0.7064768671989441, + "num_tokens": 138253883.0, + "step": 5524 + }, + { + "epoch": 0.6067428069404788, + "grad_norm": 1.9513118267059326, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7054297924041748, + "num_tokens": 138285589.0, + "step": 5525 + }, + { + "epoch": 0.6068526246430924, + "grad_norm": 2.216646432876587, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6881219744682312, + "num_tokens": 138311302.0, + "step": 5526 + }, + { + "epoch": 0.6069624423457062, + "grad_norm": 2.228098154067993, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7060479521751404, + "num_tokens": 138336540.0, + "step": 5527 + }, + { + "epoch": 0.6070722600483198, + "grad_norm": 2.4837093353271484, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.6996548771858215, + "num_tokens": 138357809.0, + "step": 5528 + }, + { + "epoch": 0.6071820777509335, + "grad_norm": 2.419954299926758, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7143455147743225, + "num_tokens": 138378752.0, + "step": 5529 + }, + { + "epoch": 0.6072918954535471, + "grad_norm": 1.9467110633850098, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6979621648788452, + "num_tokens": 138409483.0, + "step": 5530 + }, + { + "epoch": 0.6074017131561608, + "grad_norm": 2.4510862827301025, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7079266905784607, + "num_tokens": 138428811.0, + "step": 5531 + }, + { + "epoch": 0.6075115308587744, + "grad_norm": 2.2012994289398193, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.7004602551460266, + "num_tokens": 138453295.0, + "step": 5532 + }, + { + "epoch": 0.607621348561388, + "grad_norm": 2.193873405456543, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.6979261636734009, + "num_tokens": 138480497.0, + "step": 5533 + }, + { + "epoch": 
0.6077311662640018, + "grad_norm": 2.2964112758636475, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7075082063674927, + "num_tokens": 138502758.0, + "step": 5534 + }, + { + "epoch": 0.6078409839666155, + "grad_norm": 2.1650829315185547, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.7013841271400452, + "num_tokens": 138529031.0, + "step": 5535 + }, + { + "epoch": 0.6079508016692291, + "grad_norm": 2.1793034076690674, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7073076963424683, + "num_tokens": 138556765.0, + "step": 5536 + }, + { + "epoch": 0.6080606193718427, + "grad_norm": 2.2070508003234863, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6976487636566162, + "num_tokens": 138580725.0, + "step": 5537 + }, + { + "epoch": 0.6081704370744564, + "grad_norm": 2.3052849769592285, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7246294021606445, + "num_tokens": 138602986.0, + "step": 5538 + }, + { + "epoch": 0.60828025477707, + "grad_norm": 2.1329283714294434, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.719807505607605, + "num_tokens": 138632697.0, + "step": 5539 + }, + { + "epoch": 0.6083900724796837, + "grad_norm": 2.381366491317749, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.6786562204360962, + "num_tokens": 138656309.0, + "step": 5540 + }, + { + "epoch": 0.6084998901822973, + "grad_norm": 1.9103091955184937, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.702842116355896, + "num_tokens": 138687225.0, + "step": 5541 + }, + { + "epoch": 0.6086097078849111, + "grad_norm": 2.064615488052368, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6928747892379761, + "num_tokens": 138717493.0, + "step": 5542 + }, + { + "epoch": 0.6087195255875247, + "grad_norm": 2.083320140838623, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 
0.7065007090568542, + "num_tokens": 138746476.0, + "step": 5543 + }, + { + "epoch": 0.6088293432901384, + "grad_norm": 2.362912893295288, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.715692400932312, + "num_tokens": 138769216.0, + "step": 5544 + }, + { + "epoch": 0.608939160992752, + "grad_norm": 1.9727697372436523, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7041628956794739, + "num_tokens": 138799193.0, + "step": 5545 + }, + { + "epoch": 0.6090489786953657, + "grad_norm": 2.1671814918518066, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6885848045349121, + "num_tokens": 138825853.0, + "step": 5546 + }, + { + "epoch": 0.6091587963979793, + "grad_norm": 2.1177148818969727, + "learning_rate": 1e-06, + "loss": 1.0163, + "mean_token_accuracy": 0.7007235884666443, + "num_tokens": 138853500.0, + "step": 5547 + }, + { + "epoch": 0.609268614100593, + "grad_norm": 2.414472818374634, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.730605959892273, + "num_tokens": 138873737.0, + "step": 5548 + }, + { + "epoch": 0.6093784318032067, + "grad_norm": 2.793443441390991, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.733208417892456, + "num_tokens": 138889669.0, + "step": 5549 + }, + { + "epoch": 0.6094882495058204, + "grad_norm": 2.436183452606201, + "learning_rate": 1e-06, + "loss": 1.0606, + "mean_token_accuracy": 0.6852100491523743, + "num_tokens": 138912958.0, + "step": 5550 + }, + { + "epoch": 0.609598067208434, + "grad_norm": 1.982358455657959, + "learning_rate": 1e-06, + "loss": 1.101, + "mean_token_accuracy": 0.6703277826309204, + "num_tokens": 138945157.0, + "step": 5551 + }, + { + "epoch": 0.6097078849110477, + "grad_norm": 1.9287608861923218, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6964935660362244, + "num_tokens": 138978536.0, + "step": 5552 + }, + { + "epoch": 0.6098177026136613, + "grad_norm": 2.3506133556365967, + 
"learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6996991038322449, + "num_tokens": 139002230.0, + "step": 5553 + }, + { + "epoch": 0.609927520316275, + "grad_norm": 2.016371726989746, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6948317289352417, + "num_tokens": 139032070.0, + "step": 5554 + }, + { + "epoch": 0.6100373380188886, + "grad_norm": 2.2364888191223145, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6876968145370483, + "num_tokens": 139058422.0, + "step": 5555 + }, + { + "epoch": 0.6101471557215024, + "grad_norm": 2.368420362472534, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7077438235282898, + "num_tokens": 139081191.0, + "step": 5556 + }, + { + "epoch": 0.610256973424116, + "grad_norm": 2.5531089305877686, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7348109483718872, + "num_tokens": 139098783.0, + "step": 5557 + }, + { + "epoch": 0.6103667911267296, + "grad_norm": 2.239060163497925, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6984464526176453, + "num_tokens": 139125755.0, + "step": 5558 + }, + { + "epoch": 0.6104766088293433, + "grad_norm": 2.0814242362976074, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6920723915100098, + "num_tokens": 139155380.0, + "step": 5559 + }, + { + "epoch": 0.6105864265319569, + "grad_norm": 2.4981894493103027, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.7106308341026306, + "num_tokens": 139178283.0, + "step": 5560 + }, + { + "epoch": 0.6106962442345706, + "grad_norm": 2.0753698348999023, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7085787653923035, + "num_tokens": 139205878.0, + "step": 5561 + }, + { + "epoch": 0.6108060619371842, + "grad_norm": 2.3776931762695312, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6996901631355286, + "num_tokens": 139228665.0, + "step": 5562 + 
}, + { + "epoch": 0.610915879639798, + "grad_norm": 2.208660125732422, + "learning_rate": 1e-06, + "loss": 1.0462, + "mean_token_accuracy": 0.6809802055358887, + "num_tokens": 139254955.0, + "step": 5563 + }, + { + "epoch": 0.6110256973424116, + "grad_norm": 2.4615626335144043, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6938889026641846, + "num_tokens": 139277005.0, + "step": 5564 + }, + { + "epoch": 0.6111355150450253, + "grad_norm": 2.1714999675750732, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6905895471572876, + "num_tokens": 139302504.0, + "step": 5565 + }, + { + "epoch": 0.6112453327476389, + "grad_norm": 2.214320659637451, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.680827796459198, + "num_tokens": 139329040.0, + "step": 5566 + }, + { + "epoch": 0.6113551504502526, + "grad_norm": 2.062270402908325, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7205239534378052, + "num_tokens": 139355997.0, + "step": 5567 + }, + { + "epoch": 0.6114649681528662, + "grad_norm": 2.294973850250244, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7170202732086182, + "num_tokens": 139379443.0, + "step": 5568 + }, + { + "epoch": 0.6115747858554799, + "grad_norm": 2.4329588413238525, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7171037197113037, + "num_tokens": 139400524.0, + "step": 5569 + }, + { + "epoch": 0.6116846035580935, + "grad_norm": 1.9344254732131958, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7047916650772095, + "num_tokens": 139433136.0, + "step": 5570 + }, + { + "epoch": 0.6117944212607073, + "grad_norm": 1.9048130512237549, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7001886367797852, + "num_tokens": 139465212.0, + "step": 5571 + }, + { + "epoch": 0.6119042389633209, + "grad_norm": 2.197969675064087, + "learning_rate": 1e-06, + "loss": 0.8523, + 
"mean_token_accuracy": 0.7275550365447998, + "num_tokens": 139489179.0, + "step": 5572 + }, + { + "epoch": 0.6120140566659346, + "grad_norm": 2.2200708389282227, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7129402756690979, + "num_tokens": 139513706.0, + "step": 5573 + }, + { + "epoch": 0.6121238743685482, + "grad_norm": 2.177152633666992, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7029399871826172, + "num_tokens": 139540007.0, + "step": 5574 + }, + { + "epoch": 0.6122336920711619, + "grad_norm": 2.3662469387054443, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6921555399894714, + "num_tokens": 139563958.0, + "step": 5575 + }, + { + "epoch": 0.6123435097737755, + "grad_norm": 2.600503921508789, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7116421461105347, + "num_tokens": 139583240.0, + "step": 5576 + }, + { + "epoch": 0.6124533274763891, + "grad_norm": 2.614288091659546, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7030459046363831, + "num_tokens": 139602588.0, + "step": 5577 + }, + { + "epoch": 0.6125631451790029, + "grad_norm": 2.0572268962860107, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7254782915115356, + "num_tokens": 139628913.0, + "step": 5578 + }, + { + "epoch": 0.6126729628816165, + "grad_norm": 2.0537970066070557, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7149186730384827, + "num_tokens": 139658316.0, + "step": 5579 + }, + { + "epoch": 0.6127827805842302, + "grad_norm": 2.299666404724121, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6945995092391968, + "num_tokens": 139683431.0, + "step": 5580 + }, + { + "epoch": 0.6128925982868438, + "grad_norm": 1.9028103351593018, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6857532858848572, + "num_tokens": 139715936.0, + "step": 5581 + }, + { + "epoch": 0.6130024159894575, + 
"grad_norm": 2.139796495437622, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6989112496376038, + "num_tokens": 139741684.0, + "step": 5582 + }, + { + "epoch": 0.6131122336920711, + "grad_norm": 2.3481175899505615, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7242313027381897, + "num_tokens": 139764030.0, + "step": 5583 + }, + { + "epoch": 0.6132220513946848, + "grad_norm": 2.325981378555298, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.689839243888855, + "num_tokens": 139786861.0, + "step": 5584 + }, + { + "epoch": 0.6133318690972985, + "grad_norm": 2.1666183471679688, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6921052932739258, + "num_tokens": 139814405.0, + "step": 5585 + }, + { + "epoch": 0.6134416867999122, + "grad_norm": 2.1722421646118164, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6857755184173584, + "num_tokens": 139840666.0, + "step": 5586 + }, + { + "epoch": 0.6135515045025258, + "grad_norm": 2.188265085220337, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7092493772506714, + "num_tokens": 139865646.0, + "step": 5587 + }, + { + "epoch": 0.6136613222051395, + "grad_norm": 2.0077311992645264, + "learning_rate": 1e-06, + "loss": 0.9887, + "mean_token_accuracy": 0.7058857679367065, + "num_tokens": 139894958.0, + "step": 5588 + }, + { + "epoch": 0.6137711399077531, + "grad_norm": 2.261733055114746, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7045320272445679, + "num_tokens": 139917122.0, + "step": 5589 + }, + { + "epoch": 0.6138809576103668, + "grad_norm": 2.237671136856079, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7288432717323303, + "num_tokens": 139941246.0, + "step": 5590 + }, + { + "epoch": 0.6139907753129804, + "grad_norm": 2.3699870109558105, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7108331322669983, + 
"num_tokens": 139962147.0, + "step": 5591 + }, + { + "epoch": 0.6141005930155942, + "grad_norm": 2.2928307056427, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7089565992355347, + "num_tokens": 139984872.0, + "step": 5592 + }, + { + "epoch": 0.6142104107182078, + "grad_norm": 2.2655789852142334, + "learning_rate": 1e-06, + "loss": 1.0312, + "mean_token_accuracy": 0.6911383867263794, + "num_tokens": 140007449.0, + "step": 5593 + }, + { + "epoch": 0.6143202284208215, + "grad_norm": 2.439136505126953, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7176946401596069, + "num_tokens": 140028456.0, + "step": 5594 + }, + { + "epoch": 0.6144300461234351, + "grad_norm": 2.342291831970215, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6967233419418335, + "num_tokens": 140053048.0, + "step": 5595 + }, + { + "epoch": 0.6145398638260487, + "grad_norm": 2.257087469100952, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6874499320983887, + "num_tokens": 140078650.0, + "step": 5596 + }, + { + "epoch": 0.6146496815286624, + "grad_norm": 2.273811101913452, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7196024656295776, + "num_tokens": 140100921.0, + "step": 5597 + }, + { + "epoch": 0.614759499231276, + "grad_norm": 2.245864152908325, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7328059673309326, + "num_tokens": 140126037.0, + "step": 5598 + }, + { + "epoch": 0.6148693169338898, + "grad_norm": 2.602139711380005, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7109158635139465, + "num_tokens": 140144080.0, + "step": 5599 + }, + { + "epoch": 0.6149791346365034, + "grad_norm": 2.0708937644958496, + "learning_rate": 1e-06, + "loss": 1.064, + "mean_token_accuracy": 0.6901743412017822, + "num_tokens": 140173633.0, + "step": 5600 + }, + { + "epoch": 0.6150889523391171, + "grad_norm": 2.140263319015503, + "learning_rate": 1e-06, 
+ "loss": 0.8911, + "mean_token_accuracy": 0.7308562994003296, + "num_tokens": 140197875.0, + "step": 5601 + }, + { + "epoch": 0.6151987700417307, + "grad_norm": 2.390307664871216, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.711871862411499, + "num_tokens": 140220307.0, + "step": 5602 + }, + { + "epoch": 0.6153085877443444, + "grad_norm": 2.6107747554779053, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7171792387962341, + "num_tokens": 140238710.0, + "step": 5603 + }, + { + "epoch": 0.615418405446958, + "grad_norm": 2.502686023712158, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.6927934885025024, + "num_tokens": 140260862.0, + "step": 5604 + }, + { + "epoch": 0.6155282231495717, + "grad_norm": 2.0611789226531982, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.6829269528388977, + "num_tokens": 140288745.0, + "step": 5605 + }, + { + "epoch": 0.6156380408521853, + "grad_norm": 2.0796027183532715, + "learning_rate": 1e-06, + "loss": 1.0848, + "mean_token_accuracy": 0.6758705377578735, + "num_tokens": 140316968.0, + "step": 5606 + }, + { + "epoch": 0.6157478585547991, + "grad_norm": 2.4084360599517822, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6914393901824951, + "num_tokens": 140338264.0, + "step": 5607 + }, + { + "epoch": 0.6158576762574127, + "grad_norm": 2.3720266819000244, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7102355360984802, + "num_tokens": 140360048.0, + "step": 5608 + }, + { + "epoch": 0.6159674939600264, + "grad_norm": 2.604203224182129, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7052140235900879, + "num_tokens": 140377333.0, + "step": 5609 + }, + { + "epoch": 0.61607731166264, + "grad_norm": 2.1421260833740234, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7196006774902344, + "num_tokens": 140403132.0, + "step": 5610 + }, + { + "epoch": 
0.6161871293652537, + "grad_norm": 2.3392386436462402, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.701716423034668, + "num_tokens": 140427819.0, + "step": 5611 + }, + { + "epoch": 0.6162969470678673, + "grad_norm": 2.3625941276550293, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6932110786437988, + "num_tokens": 140450076.0, + "step": 5612 + }, + { + "epoch": 0.616406764770481, + "grad_norm": 2.300819158554077, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7303744554519653, + "num_tokens": 140472415.0, + "step": 5613 + }, + { + "epoch": 0.6165165824730947, + "grad_norm": 2.1274609565734863, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7223873734474182, + "num_tokens": 140496908.0, + "step": 5614 + }, + { + "epoch": 0.6166264001757084, + "grad_norm": 2.286128520965576, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7029221653938293, + "num_tokens": 140522138.0, + "step": 5615 + }, + { + "epoch": 0.616736217878322, + "grad_norm": 2.134492874145508, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6912892460823059, + "num_tokens": 140548649.0, + "step": 5616 + }, + { + "epoch": 0.6168460355809356, + "grad_norm": 2.228133201599121, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6824707984924316, + "num_tokens": 140573127.0, + "step": 5617 + }, + { + "epoch": 0.6169558532835493, + "grad_norm": 2.16147518157959, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7114025354385376, + "num_tokens": 140597157.0, + "step": 5618 + }, + { + "epoch": 0.6170656709861629, + "grad_norm": 1.9549857378005981, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.6937909126281738, + "num_tokens": 140630228.0, + "step": 5619 + }, + { + "epoch": 0.6171754886887766, + "grad_norm": 2.497715473175049, + "learning_rate": 1e-06, + "loss": 1.0555, + "mean_token_accuracy": 
0.6807845830917358, + "num_tokens": 140653956.0, + "step": 5620 + }, + { + "epoch": 0.6172853063913903, + "grad_norm": 1.9545167684555054, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7077205181121826, + "num_tokens": 140683589.0, + "step": 5621 + }, + { + "epoch": 0.617395124094004, + "grad_norm": 2.066490650177002, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.6992703080177307, + "num_tokens": 140711396.0, + "step": 5622 + }, + { + "epoch": 0.6175049417966176, + "grad_norm": 2.1271579265594482, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7140955924987793, + "num_tokens": 140737248.0, + "step": 5623 + }, + { + "epoch": 0.6176147594992313, + "grad_norm": 2.1961312294006348, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7109420299530029, + "num_tokens": 140762262.0, + "step": 5624 + }, + { + "epoch": 0.6177245772018449, + "grad_norm": 2.0258255004882812, + "learning_rate": 1e-06, + "loss": 1.0744, + "mean_token_accuracy": 0.6782334446907043, + "num_tokens": 140792064.0, + "step": 5625 + }, + { + "epoch": 0.6178343949044586, + "grad_norm": 2.394530773162842, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7056068778038025, + "num_tokens": 140811964.0, + "step": 5626 + }, + { + "epoch": 0.6179442126070722, + "grad_norm": 2.1777119636535645, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6974685192108154, + "num_tokens": 140838577.0, + "step": 5627 + }, + { + "epoch": 0.618054030309686, + "grad_norm": 2.3041322231292725, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7255953550338745, + "num_tokens": 140863016.0, + "step": 5628 + }, + { + "epoch": 0.6181638480122996, + "grad_norm": 2.32126522064209, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7167695760726929, + "num_tokens": 140887927.0, + "step": 5629 + }, + { + "epoch": 0.6182736657149133, + "grad_norm": 
2.269601583480835, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7011165618896484, + "num_tokens": 140911912.0, + "step": 5630 + }, + { + "epoch": 0.6183834834175269, + "grad_norm": 2.2490670680999756, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7077445983886719, + "num_tokens": 140936030.0, + "step": 5631 + }, + { + "epoch": 0.6184933011201406, + "grad_norm": 1.979505181312561, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6886796951293945, + "num_tokens": 140967157.0, + "step": 5632 + }, + { + "epoch": 0.6186031188227542, + "grad_norm": 1.9970020055770874, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.700067937374115, + "num_tokens": 140998412.0, + "step": 5633 + }, + { + "epoch": 0.6187129365253679, + "grad_norm": 2.3523647785186768, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7023844122886658, + "num_tokens": 141019832.0, + "step": 5634 + }, + { + "epoch": 0.6188227542279815, + "grad_norm": 2.1566646099090576, + "learning_rate": 1e-06, + "loss": 1.0677, + "mean_token_accuracy": 0.687848687171936, + "num_tokens": 141046075.0, + "step": 5635 + }, + { + "epoch": 0.6189325719305953, + "grad_norm": 2.3634235858917236, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.711672306060791, + "num_tokens": 141068402.0, + "step": 5636 + }, + { + "epoch": 0.6190423896332089, + "grad_norm": 2.112292528152466, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.722761869430542, + "num_tokens": 141094400.0, + "step": 5637 + }, + { + "epoch": 0.6191522073358225, + "grad_norm": 2.2035489082336426, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.6990488767623901, + "num_tokens": 141120157.0, + "step": 5638 + }, + { + "epoch": 0.6192620250384362, + "grad_norm": 2.0224859714508057, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6846742630004883, + "num_tokens": 
141149510.0, + "step": 5639 + }, + { + "epoch": 0.6193718427410498, + "grad_norm": 2.4007890224456787, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7161715030670166, + "num_tokens": 141172989.0, + "step": 5640 + }, + { + "epoch": 0.6194816604436635, + "grad_norm": 2.0846776962280273, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7051684856414795, + "num_tokens": 141199373.0, + "step": 5641 + }, + { + "epoch": 0.6195914781462771, + "grad_norm": 2.504145622253418, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6988111138343811, + "num_tokens": 141222993.0, + "step": 5642 + }, + { + "epoch": 0.6197012958488909, + "grad_norm": 2.8196306228637695, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7104542851448059, + "num_tokens": 141240397.0, + "step": 5643 + }, + { + "epoch": 0.6198111135515045, + "grad_norm": 2.1612656116485596, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6971571445465088, + "num_tokens": 141267545.0, + "step": 5644 + }, + { + "epoch": 0.6199209312541182, + "grad_norm": 2.2748966217041016, + "learning_rate": 1e-06, + "loss": 1.0193, + "mean_token_accuracy": 0.6867581605911255, + "num_tokens": 141291863.0, + "step": 5645 + }, + { + "epoch": 0.6200307489567318, + "grad_norm": 2.351463556289673, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7023456692695618, + "num_tokens": 141315675.0, + "step": 5646 + }, + { + "epoch": 0.6201405666593455, + "grad_norm": 2.6321494579315186, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7307910919189453, + "num_tokens": 141334063.0, + "step": 5647 + }, + { + "epoch": 0.6202503843619591, + "grad_norm": 2.14811110496521, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.69828200340271, + "num_tokens": 141360197.0, + "step": 5648 + }, + { + "epoch": 0.6203602020645728, + "grad_norm": 2.3933966159820557, + "learning_rate": 1e-06, + 
"loss": 0.9436, + "mean_token_accuracy": 0.712194561958313, + "num_tokens": 141382077.0, + "step": 5649 + }, + { + "epoch": 0.6204700197671865, + "grad_norm": 2.6717417240142822, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7404175996780396, + "num_tokens": 141399125.0, + "step": 5650 + }, + { + "epoch": 0.6205798374698002, + "grad_norm": 2.2436537742614746, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7237477898597717, + "num_tokens": 141421481.0, + "step": 5651 + }, + { + "epoch": 0.6206896551724138, + "grad_norm": 2.2576992511749268, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7210210561752319, + "num_tokens": 141445080.0, + "step": 5652 + }, + { + "epoch": 0.6207994728750275, + "grad_norm": 2.0621554851531982, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7013674974441528, + "num_tokens": 141474133.0, + "step": 5653 + }, + { + "epoch": 0.6209092905776411, + "grad_norm": 2.331024408340454, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7078925967216492, + "num_tokens": 141497616.0, + "step": 5654 + }, + { + "epoch": 0.6210191082802548, + "grad_norm": 2.0417325496673584, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7064146995544434, + "num_tokens": 141525630.0, + "step": 5655 + }, + { + "epoch": 0.6211289259828684, + "grad_norm": 2.2522292137145996, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7052121162414551, + "num_tokens": 141549104.0, + "step": 5656 + }, + { + "epoch": 0.6212387436854822, + "grad_norm": 2.3879494667053223, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6949251294136047, + "num_tokens": 141572598.0, + "step": 5657 + }, + { + "epoch": 0.6213485613880958, + "grad_norm": 2.462454080581665, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7120196223258972, + "num_tokens": 141593224.0, + "step": 5658 + }, + { + "epoch": 
0.6214583790907094, + "grad_norm": 2.1160824298858643, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7103705406188965, + "num_tokens": 141619521.0, + "step": 5659 + }, + { + "epoch": 0.6215681967933231, + "grad_norm": 2.5378479957580566, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7436070442199707, + "num_tokens": 141638121.0, + "step": 5660 + }, + { + "epoch": 0.6216780144959367, + "grad_norm": 2.3518226146698, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7316321134567261, + "num_tokens": 141659743.0, + "step": 5661 + }, + { + "epoch": 0.6217878321985504, + "grad_norm": 2.620994806289673, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.722846269607544, + "num_tokens": 141677789.0, + "step": 5662 + }, + { + "epoch": 0.621897649901164, + "grad_norm": 2.4588546752929688, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7069481611251831, + "num_tokens": 141698301.0, + "step": 5663 + }, + { + "epoch": 0.6220074676037777, + "grad_norm": 1.8700242042541504, + "learning_rate": 1e-06, + "loss": 1.0342, + "mean_token_accuracy": 0.6890894770622253, + "num_tokens": 141731640.0, + "step": 5664 + }, + { + "epoch": 0.6221172853063914, + "grad_norm": 2.4721572399139404, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6935040950775146, + "num_tokens": 141753979.0, + "step": 5665 + }, + { + "epoch": 0.6222271030090051, + "grad_norm": 2.251434326171875, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.7070128321647644, + "num_tokens": 141778210.0, + "step": 5666 + }, + { + "epoch": 0.6223369207116187, + "grad_norm": 2.0725197792053223, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7219585180282593, + "num_tokens": 141804700.0, + "step": 5667 + }, + { + "epoch": 0.6224467384142324, + "grad_norm": 2.393556594848633, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 
0.7115394473075867, + "num_tokens": 141825989.0, + "step": 5668 + }, + { + "epoch": 0.622556556116846, + "grad_norm": 2.0908098220825195, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6913324594497681, + "num_tokens": 141854812.0, + "step": 5669 + }, + { + "epoch": 0.6226663738194597, + "grad_norm": 2.127495288848877, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7225137948989868, + "num_tokens": 141880230.0, + "step": 5670 + }, + { + "epoch": 0.6227761915220733, + "grad_norm": 2.1920299530029297, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.7265849113464355, + "num_tokens": 141903934.0, + "step": 5671 + }, + { + "epoch": 0.6228860092246871, + "grad_norm": 2.211301326751709, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7093201875686646, + "num_tokens": 141926753.0, + "step": 5672 + }, + { + "epoch": 0.6229958269273007, + "grad_norm": 2.362919807434082, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6917135119438171, + "num_tokens": 141950449.0, + "step": 5673 + }, + { + "epoch": 0.6231056446299144, + "grad_norm": 2.6339917182922363, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7296003103256226, + "num_tokens": 141969143.0, + "step": 5674 + }, + { + "epoch": 0.623215462332528, + "grad_norm": 2.163487195968628, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6999565362930298, + "num_tokens": 141993807.0, + "step": 5675 + }, + { + "epoch": 0.6233252800351416, + "grad_norm": 2.310554265975952, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.6813902258872986, + "num_tokens": 142019383.0, + "step": 5676 + }, + { + "epoch": 0.6234350977377553, + "grad_norm": 2.088500499725342, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7072668075561523, + "num_tokens": 142048244.0, + "step": 5677 + }, + { + "epoch": 0.6235449154403689, + "grad_norm": 
2.180086135864258, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7194544672966003, + "num_tokens": 142073239.0, + "step": 5678 + }, + { + "epoch": 0.6236547331429827, + "grad_norm": 2.335867404937744, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6970582604408264, + "num_tokens": 142098054.0, + "step": 5679 + }, + { + "epoch": 0.6237645508455963, + "grad_norm": 2.3548953533172607, + "learning_rate": 1e-06, + "loss": 0.9775, + "mean_token_accuracy": 0.7016571760177612, + "num_tokens": 142121030.0, + "step": 5680 + }, + { + "epoch": 0.62387436854821, + "grad_norm": 2.1157331466674805, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7120242714881897, + "num_tokens": 142148861.0, + "step": 5681 + }, + { + "epoch": 0.6239841862508236, + "grad_norm": 2.749577522277832, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7255081534385681, + "num_tokens": 142168301.0, + "step": 5682 + }, + { + "epoch": 0.6240940039534373, + "grad_norm": 2.012712001800537, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7074155211448669, + "num_tokens": 142197158.0, + "step": 5683 + }, + { + "epoch": 0.6242038216560509, + "grad_norm": 1.9207392930984497, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7047957181930542, + "num_tokens": 142228667.0, + "step": 5684 + }, + { + "epoch": 0.6243136393586646, + "grad_norm": 2.074599027633667, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7143476009368896, + "num_tokens": 142254463.0, + "step": 5685 + }, + { + "epoch": 0.6244234570612783, + "grad_norm": 2.2667577266693115, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.719014048576355, + "num_tokens": 142277010.0, + "step": 5686 + }, + { + "epoch": 0.624533274763892, + "grad_norm": 2.439695358276367, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7012128829956055, + "num_tokens": 142298870.0, 
+ "step": 5687 + }, + { + "epoch": 0.6246430924665056, + "grad_norm": 2.389129877090454, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6835379004478455, + "num_tokens": 142321903.0, + "step": 5688 + }, + { + "epoch": 0.6247529101691193, + "grad_norm": 2.341156244277954, + "learning_rate": 1e-06, + "loss": 1.0578, + "mean_token_accuracy": 0.6805339455604553, + "num_tokens": 142347452.0, + "step": 5689 + }, + { + "epoch": 0.6248627278717329, + "grad_norm": 2.1688637733459473, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7025526165962219, + "num_tokens": 142371332.0, + "step": 5690 + }, + { + "epoch": 0.6249725455743466, + "grad_norm": 2.3115460872650146, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6903036832809448, + "num_tokens": 142395970.0, + "step": 5691 + }, + { + "epoch": 0.6250823632769602, + "grad_norm": 2.245039939880371, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7080308198928833, + "num_tokens": 142420795.0, + "step": 5692 + }, + { + "epoch": 0.6251921809795739, + "grad_norm": 2.0533132553100586, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6976981163024902, + "num_tokens": 142451296.0, + "step": 5693 + }, + { + "epoch": 0.6253019986821876, + "grad_norm": 2.0751616954803467, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6915349960327148, + "num_tokens": 142481064.0, + "step": 5694 + }, + { + "epoch": 0.6254118163848013, + "grad_norm": 2.295297622680664, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.72727370262146, + "num_tokens": 142505659.0, + "step": 5695 + }, + { + "epoch": 0.6255216340874149, + "grad_norm": 2.269979953765869, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7055422067642212, + "num_tokens": 142528121.0, + "step": 5696 + }, + { + "epoch": 0.6256314517900285, + "grad_norm": 2.183448076248169, + "learning_rate": 1e-06, + "loss": 0.9699, + 
"mean_token_accuracy": 0.7065857648849487, + "num_tokens": 142553848.0, + "step": 5697 + }, + { + "epoch": 0.6257412694926422, + "grad_norm": 2.2895522117614746, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6980807781219482, + "num_tokens": 142577525.0, + "step": 5698 + }, + { + "epoch": 0.6258510871952558, + "grad_norm": 2.649672269821167, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7136742472648621, + "num_tokens": 142596430.0, + "step": 5699 + }, + { + "epoch": 0.6259609048978695, + "grad_norm": 2.095853805541992, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7041651010513306, + "num_tokens": 142624013.0, + "step": 5700 + }, + { + "epoch": 0.6260707226004832, + "grad_norm": 2.1665761470794678, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.6944522857666016, + "num_tokens": 142650945.0, + "step": 5701 + }, + { + "epoch": 0.6261805403030969, + "grad_norm": 2.2782142162323, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6868732571601868, + "num_tokens": 142677396.0, + "step": 5702 + }, + { + "epoch": 0.6262903580057105, + "grad_norm": 1.8980822563171387, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.68425452709198, + "num_tokens": 142710650.0, + "step": 5703 + }, + { + "epoch": 0.6264001757083242, + "grad_norm": 2.3393430709838867, + "learning_rate": 1e-06, + "loss": 1.0504, + "mean_token_accuracy": 0.6904458403587341, + "num_tokens": 142736309.0, + "step": 5704 + }, + { + "epoch": 0.6265099934109378, + "grad_norm": 2.33015775680542, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.706281304359436, + "num_tokens": 142761635.0, + "step": 5705 + }, + { + "epoch": 0.6266198111135515, + "grad_norm": 2.317260265350342, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7140288352966309, + "num_tokens": 142784383.0, + "step": 5706 + }, + { + "epoch": 0.6267296288161651, + 
"grad_norm": 2.140948534011841, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7350496053695679, + "num_tokens": 142809161.0, + "step": 5707 + }, + { + "epoch": 0.6268394465187789, + "grad_norm": 2.2755351066589355, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6961700320243835, + "num_tokens": 142833609.0, + "step": 5708 + }, + { + "epoch": 0.6269492642213925, + "grad_norm": 2.2519688606262207, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7011426687240601, + "num_tokens": 142856849.0, + "step": 5709 + }, + { + "epoch": 0.6270590819240062, + "grad_norm": 2.20017409324646, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7071573734283447, + "num_tokens": 142882277.0, + "step": 5710 + }, + { + "epoch": 0.6271688996266198, + "grad_norm": 2.262310028076172, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.6968013644218445, + "num_tokens": 142904561.0, + "step": 5711 + }, + { + "epoch": 0.6272787173292335, + "grad_norm": 2.159250020980835, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.7031217813491821, + "num_tokens": 142929833.0, + "step": 5712 + }, + { + "epoch": 0.6273885350318471, + "grad_norm": 2.2509357929229736, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7074085474014282, + "num_tokens": 142953006.0, + "step": 5713 + }, + { + "epoch": 0.6274983527344608, + "grad_norm": 2.0898382663726807, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7285405993461609, + "num_tokens": 142980315.0, + "step": 5714 + }, + { + "epoch": 0.6276081704370745, + "grad_norm": 2.4125425815582275, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.6975036263465881, + "num_tokens": 143002213.0, + "step": 5715 + }, + { + "epoch": 0.6277179881396882, + "grad_norm": 2.128182888031006, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7224062085151672, + 
"num_tokens": 143026890.0, + "step": 5716 + }, + { + "epoch": 0.6278278058423018, + "grad_norm": 2.1163620948791504, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6976956129074097, + "num_tokens": 143054013.0, + "step": 5717 + }, + { + "epoch": 0.6279376235449154, + "grad_norm": 2.2119433879852295, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7088350057601929, + "num_tokens": 143080084.0, + "step": 5718 + }, + { + "epoch": 0.6280474412475291, + "grad_norm": 2.545443058013916, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6988675594329834, + "num_tokens": 143100689.0, + "step": 5719 + }, + { + "epoch": 0.6281572589501427, + "grad_norm": 2.314945936203003, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.694585382938385, + "num_tokens": 143124571.0, + "step": 5720 + }, + { + "epoch": 0.6282670766527564, + "grad_norm": 2.3229851722717285, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6931138038635254, + "num_tokens": 143147545.0, + "step": 5721 + }, + { + "epoch": 0.62837689435537, + "grad_norm": 2.244015693664551, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7295867800712585, + "num_tokens": 143167895.0, + "step": 5722 + }, + { + "epoch": 0.6284867120579838, + "grad_norm": 2.5361955165863037, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6985201835632324, + "num_tokens": 143187934.0, + "step": 5723 + }, + { + "epoch": 0.6285965297605974, + "grad_norm": 2.036525249481201, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6896178722381592, + "num_tokens": 143217648.0, + "step": 5724 + }, + { + "epoch": 0.6287063474632111, + "grad_norm": 2.4887185096740723, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.714127779006958, + "num_tokens": 143240657.0, + "step": 5725 + }, + { + "epoch": 0.6288161651658247, + "grad_norm": 2.3912513256073, + "learning_rate": 
1e-06, + "loss": 1.0172, + "mean_token_accuracy": 0.695247232913971, + "num_tokens": 143265627.0, + "step": 5726 + }, + { + "epoch": 0.6289259828684384, + "grad_norm": 2.3423144817352295, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7118262052536011, + "num_tokens": 143286973.0, + "step": 5727 + }, + { + "epoch": 0.629035800571052, + "grad_norm": 1.9841201305389404, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6728769540786743, + "num_tokens": 143318761.0, + "step": 5728 + }, + { + "epoch": 0.6291456182736657, + "grad_norm": 2.220956802368164, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6911464333534241, + "num_tokens": 143343560.0, + "step": 5729 + }, + { + "epoch": 0.6292554359762794, + "grad_norm": 2.420976161956787, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7205750942230225, + "num_tokens": 143363484.0, + "step": 5730 + }, + { + "epoch": 0.6293652536788931, + "grad_norm": 2.372746467590332, + "learning_rate": 1e-06, + "loss": 1.0673, + "mean_token_accuracy": 0.6784453392028809, + "num_tokens": 143388251.0, + "step": 5731 + }, + { + "epoch": 0.6294750713815067, + "grad_norm": 2.2640295028686523, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7191129922866821, + "num_tokens": 143412099.0, + "step": 5732 + }, + { + "epoch": 0.6295848890841204, + "grad_norm": 2.6163389682769775, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7160711884498596, + "num_tokens": 143431571.0, + "step": 5733 + }, + { + "epoch": 0.629694706786734, + "grad_norm": 2.2568418979644775, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.709057092666626, + "num_tokens": 143457128.0, + "step": 5734 + }, + { + "epoch": 0.6298045244893477, + "grad_norm": 2.216550350189209, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7068784236907959, + "num_tokens": 143481518.0, + "step": 5735 + }, + { + "epoch": 
0.6299143421919613, + "grad_norm": 1.9605392217636108, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6944999694824219, + "num_tokens": 143513010.0, + "step": 5736 + }, + { + "epoch": 0.630024159894575, + "grad_norm": 2.358947992324829, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7061744928359985, + "num_tokens": 143534342.0, + "step": 5737 + }, + { + "epoch": 0.6301339775971887, + "grad_norm": 2.382141351699829, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7153723239898682, + "num_tokens": 143555938.0, + "step": 5738 + }, + { + "epoch": 0.6302437952998023, + "grad_norm": 2.0473010540008545, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7135518789291382, + "num_tokens": 143582885.0, + "step": 5739 + }, + { + "epoch": 0.630353613002416, + "grad_norm": 2.3773934841156006, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7364954948425293, + "num_tokens": 143603220.0, + "step": 5740 + }, + { + "epoch": 0.6304634307050296, + "grad_norm": 2.1684067249298096, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7109255790710449, + "num_tokens": 143628498.0, + "step": 5741 + }, + { + "epoch": 0.6305732484076433, + "grad_norm": 2.264988660812378, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6963530778884888, + "num_tokens": 143651904.0, + "step": 5742 + }, + { + "epoch": 0.6306830661102569, + "grad_norm": 2.7127606868743896, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7132382392883301, + "num_tokens": 143668726.0, + "step": 5743 + }, + { + "epoch": 0.6307928838128707, + "grad_norm": 1.8869128227233887, + "learning_rate": 1e-06, + "loss": 1.0476, + "mean_token_accuracy": 0.6846905946731567, + "num_tokens": 143700162.0, + "step": 5744 + }, + { + "epoch": 0.6309027015154843, + "grad_norm": 2.1823227405548096, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 
0.7035484910011292, + "num_tokens": 143726506.0, + "step": 5745 + }, + { + "epoch": 0.631012519218098, + "grad_norm": 2.1834895610809326, + "learning_rate": 1e-06, + "loss": 1.0217, + "mean_token_accuracy": 0.6990987062454224, + "num_tokens": 143750792.0, + "step": 5746 + }, + { + "epoch": 0.6311223369207116, + "grad_norm": 2.054701566696167, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7039063572883606, + "num_tokens": 143777580.0, + "step": 5747 + }, + { + "epoch": 0.6312321546233253, + "grad_norm": 2.4342641830444336, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6999668478965759, + "num_tokens": 143799224.0, + "step": 5748 + }, + { + "epoch": 0.6313419723259389, + "grad_norm": 2.1139261722564697, + "learning_rate": 1e-06, + "loss": 1.065, + "mean_token_accuracy": 0.6820292472839355, + "num_tokens": 143828408.0, + "step": 5749 + }, + { + "epoch": 0.6314517900285526, + "grad_norm": 2.0974531173706055, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7139852046966553, + "num_tokens": 143854736.0, + "step": 5750 + }, + { + "epoch": 0.6315616077311663, + "grad_norm": 2.2051730155944824, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7060372829437256, + "num_tokens": 143877964.0, + "step": 5751 + }, + { + "epoch": 0.63167142543378, + "grad_norm": 2.003490924835205, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7168242931365967, + "num_tokens": 143907923.0, + "step": 5752 + }, + { + "epoch": 0.6317812431363936, + "grad_norm": 2.5743038654327393, + "learning_rate": 1e-06, + "loss": 0.7377, + "mean_token_accuracy": 0.7648487091064453, + "num_tokens": 143924992.0, + "step": 5753 + }, + { + "epoch": 0.6318910608390073, + "grad_norm": 2.4148128032684326, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7124279737472534, + "num_tokens": 143947839.0, + "step": 5754 + }, + { + "epoch": 0.6320008785416209, + "grad_norm": 
2.3165369033813477, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6896854043006897, + "num_tokens": 143970489.0, + "step": 5755 + }, + { + "epoch": 0.6321106962442345, + "grad_norm": 2.5624632835388184, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.699000895023346, + "num_tokens": 143989860.0, + "step": 5756 + }, + { + "epoch": 0.6322205139468482, + "grad_norm": 2.2807154655456543, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6972249746322632, + "num_tokens": 144012303.0, + "step": 5757 + }, + { + "epoch": 0.6323303316494618, + "grad_norm": 2.3626718521118164, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7084628343582153, + "num_tokens": 144036312.0, + "step": 5758 + }, + { + "epoch": 0.6324401493520756, + "grad_norm": 2.1590795516967773, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7114531993865967, + "num_tokens": 144059168.0, + "step": 5759 + }, + { + "epoch": 0.6325499670546892, + "grad_norm": 2.1296472549438477, + "learning_rate": 1e-06, + "loss": 1.0204, + "mean_token_accuracy": 0.7009432315826416, + "num_tokens": 144085415.0, + "step": 5760 + }, + { + "epoch": 0.6326597847573029, + "grad_norm": 2.0867183208465576, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.703861653804779, + "num_tokens": 144111649.0, + "step": 5761 + }, + { + "epoch": 0.6327696024599165, + "grad_norm": 2.5552096366882324, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7297042608261108, + "num_tokens": 144129790.0, + "step": 5762 + }, + { + "epoch": 0.6328794201625302, + "grad_norm": 2.2552285194396973, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7036421895027161, + "num_tokens": 144152577.0, + "step": 5763 + }, + { + "epoch": 0.6329892378651438, + "grad_norm": 2.4387946128845215, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.732513427734375, + "num_tokens": 
144172808.0, + "step": 5764 + }, + { + "epoch": 0.6330990555677575, + "grad_norm": 2.184913158416748, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7138111591339111, + "num_tokens": 144197421.0, + "step": 5765 + }, + { + "epoch": 0.6332088732703712, + "grad_norm": 2.034641981124878, + "learning_rate": 1e-06, + "loss": 1.0814, + "mean_token_accuracy": 0.6754226684570312, + "num_tokens": 144225363.0, + "step": 5766 + }, + { + "epoch": 0.6333186909729849, + "grad_norm": 2.251734733581543, + "learning_rate": 1e-06, + "loss": 1.0703, + "mean_token_accuracy": 0.675500750541687, + "num_tokens": 144250714.0, + "step": 5767 + }, + { + "epoch": 0.6334285086755985, + "grad_norm": 1.9960256814956665, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7068818807601929, + "num_tokens": 144278745.0, + "step": 5768 + }, + { + "epoch": 0.6335383263782122, + "grad_norm": 1.9759492874145508, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7051386833190918, + "num_tokens": 144308414.0, + "step": 5769 + }, + { + "epoch": 0.6336481440808258, + "grad_norm": 2.2155094146728516, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6916026473045349, + "num_tokens": 144332801.0, + "step": 5770 + }, + { + "epoch": 0.6337579617834395, + "grad_norm": 2.3374805450439453, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7062863707542419, + "num_tokens": 144356142.0, + "step": 5771 + }, + { + "epoch": 0.6338677794860531, + "grad_norm": 2.2258477210998535, + "learning_rate": 1e-06, + "loss": 1.0556, + "mean_token_accuracy": 0.6844398975372314, + "num_tokens": 144380984.0, + "step": 5772 + }, + { + "epoch": 0.6339775971886669, + "grad_norm": 2.5054447650909424, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.7028727531433105, + "num_tokens": 144401057.0, + "step": 5773 + }, + { + "epoch": 0.6340874148912805, + "grad_norm": 2.207245111465454, + "learning_rate": 1e-06, + 
"loss": 0.9041, + "mean_token_accuracy": 0.718157947063446, + "num_tokens": 144424275.0, + "step": 5774 + }, + { + "epoch": 0.6341972325938942, + "grad_norm": 2.26389741897583, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7000706195831299, + "num_tokens": 144447731.0, + "step": 5775 + }, + { + "epoch": 0.6343070502965078, + "grad_norm": 2.1040236949920654, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.7046194076538086, + "num_tokens": 144475519.0, + "step": 5776 + }, + { + "epoch": 0.6344168679991214, + "grad_norm": 2.278198480606079, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7069193124771118, + "num_tokens": 144498041.0, + "step": 5777 + }, + { + "epoch": 0.6345266857017351, + "grad_norm": 2.50978946685791, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7280559539794922, + "num_tokens": 144517520.0, + "step": 5778 + }, + { + "epoch": 0.6346365034043487, + "grad_norm": 2.373049736022949, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7112261056900024, + "num_tokens": 144539603.0, + "step": 5779 + }, + { + "epoch": 0.6347463211069625, + "grad_norm": 2.255272626876831, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7006179094314575, + "num_tokens": 144563994.0, + "step": 5780 + }, + { + "epoch": 0.6348561388095761, + "grad_norm": 2.4277842044830322, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7173597812652588, + "num_tokens": 144585216.0, + "step": 5781 + }, + { + "epoch": 0.6349659565121898, + "grad_norm": 2.2901601791381836, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6917694807052612, + "num_tokens": 144610211.0, + "step": 5782 + }, + { + "epoch": 0.6350757742148034, + "grad_norm": 2.4378044605255127, + "learning_rate": 1e-06, + "loss": 1.0712, + "mean_token_accuracy": 0.6808345317840576, + "num_tokens": 144632311.0, + "step": 5783 + }, + { + "epoch": 
0.6351855919174171, + "grad_norm": 2.053138494491577, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.6944599151611328, + "num_tokens": 144660964.0, + "step": 5784 + }, + { + "epoch": 0.6352954096200307, + "grad_norm": 2.2309768199920654, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7191135883331299, + "num_tokens": 144683414.0, + "step": 5785 + }, + { + "epoch": 0.6354052273226444, + "grad_norm": 2.122671604156494, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6928442716598511, + "num_tokens": 144712679.0, + "step": 5786 + }, + { + "epoch": 0.635515045025258, + "grad_norm": 1.9072009325027466, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.6962102055549622, + "num_tokens": 144749216.0, + "step": 5787 + }, + { + "epoch": 0.6356248627278718, + "grad_norm": 2.0990564823150635, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7025720477104187, + "num_tokens": 144776821.0, + "step": 5788 + }, + { + "epoch": 0.6357346804304854, + "grad_norm": 2.276249885559082, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7064098119735718, + "num_tokens": 144801394.0, + "step": 5789 + }, + { + "epoch": 0.6358444981330991, + "grad_norm": 2.0795350074768066, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7080281972885132, + "num_tokens": 144829983.0, + "step": 5790 + }, + { + "epoch": 0.6359543158357127, + "grad_norm": 2.304080009460449, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7320044040679932, + "num_tokens": 144851762.0, + "step": 5791 + }, + { + "epoch": 0.6360641335383264, + "grad_norm": 2.2637956142425537, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.720777690410614, + "num_tokens": 144875399.0, + "step": 5792 + }, + { + "epoch": 0.63617395124094, + "grad_norm": 2.3468410968780518, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 
0.7186557054519653, + "num_tokens": 144896681.0, + "step": 5793 + }, + { + "epoch": 0.6362837689435537, + "grad_norm": 2.1899380683898926, + "learning_rate": 1e-06, + "loss": 1.0891, + "mean_token_accuracy": 0.6732497215270996, + "num_tokens": 144924779.0, + "step": 5794 + }, + { + "epoch": 0.6363935866461674, + "grad_norm": 2.212869167327881, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7020921111106873, + "num_tokens": 144949469.0, + "step": 5795 + }, + { + "epoch": 0.636503404348781, + "grad_norm": 1.919615626335144, + "learning_rate": 1e-06, + "loss": 1.018, + "mean_token_accuracy": 0.6959185600280762, + "num_tokens": 144984123.0, + "step": 5796 + }, + { + "epoch": 0.6366132220513947, + "grad_norm": 1.989371418952942, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7139138579368591, + "num_tokens": 145016095.0, + "step": 5797 + }, + { + "epoch": 0.6367230397540083, + "grad_norm": 2.1738438606262207, + "learning_rate": 1e-06, + "loss": 1.0308, + "mean_token_accuracy": 0.6847659945487976, + "num_tokens": 145044955.0, + "step": 5798 + }, + { + "epoch": 0.636832857456622, + "grad_norm": 2.4174842834472656, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7113790512084961, + "num_tokens": 145068275.0, + "step": 5799 + }, + { + "epoch": 0.6369426751592356, + "grad_norm": 1.9438248872756958, + "learning_rate": 1e-06, + "loss": 1.0522, + "mean_token_accuracy": 0.6783645153045654, + "num_tokens": 145099605.0, + "step": 5800 + }, + { + "epoch": 0.6370524928618493, + "grad_norm": 2.135859489440918, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7106570601463318, + "num_tokens": 145125495.0, + "step": 5801 + }, + { + "epoch": 0.637162310564463, + "grad_norm": 2.230245351791382, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7021678686141968, + "num_tokens": 145149235.0, + "step": 5802 + }, + { + "epoch": 0.6372721282670767, + "grad_norm": 
2.260586977005005, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7012374997138977, + "num_tokens": 145172556.0, + "step": 5803 + }, + { + "epoch": 0.6373819459696903, + "grad_norm": 2.472813844680786, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7201001644134521, + "num_tokens": 145191473.0, + "step": 5804 + }, + { + "epoch": 0.637491763672304, + "grad_norm": 2.2428033351898193, + "learning_rate": 1e-06, + "loss": 1.0108, + "mean_token_accuracy": 0.6934430003166199, + "num_tokens": 145217185.0, + "step": 5805 + }, + { + "epoch": 0.6376015813749176, + "grad_norm": 2.0610716342926025, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6891981363296509, + "num_tokens": 145246724.0, + "step": 5806 + }, + { + "epoch": 0.6377113990775313, + "grad_norm": 2.4372828006744385, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7031593322753906, + "num_tokens": 145268355.0, + "step": 5807 + }, + { + "epoch": 0.6378212167801449, + "grad_norm": 2.48625111579895, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7317498326301575, + "num_tokens": 145287658.0, + "step": 5808 + }, + { + "epoch": 0.6379310344827587, + "grad_norm": 2.577150821685791, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7345923185348511, + "num_tokens": 145307130.0, + "step": 5809 + }, + { + "epoch": 0.6380408521853723, + "grad_norm": 2.4272279739379883, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6978476643562317, + "num_tokens": 145329241.0, + "step": 5810 + }, + { + "epoch": 0.638150669887986, + "grad_norm": 2.6628081798553467, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7047278881072998, + "num_tokens": 145347925.0, + "step": 5811 + }, + { + "epoch": 0.6382604875905996, + "grad_norm": 2.3594586849212646, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.7027807831764221, + "num_tokens": 
145375252.0, + "step": 5812 + }, + { + "epoch": 0.6383703052932133, + "grad_norm": 2.3311927318573, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7165015935897827, + "num_tokens": 145398428.0, + "step": 5813 + }, + { + "epoch": 0.6384801229958269, + "grad_norm": 2.141282320022583, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6953257918357849, + "num_tokens": 145425170.0, + "step": 5814 + }, + { + "epoch": 0.6385899406984406, + "grad_norm": 2.0903775691986084, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.703972339630127, + "num_tokens": 145453318.0, + "step": 5815 + }, + { + "epoch": 0.6386997584010542, + "grad_norm": 2.335170269012451, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.706787109375, + "num_tokens": 145476061.0, + "step": 5816 + }, + { + "epoch": 0.638809576103668, + "grad_norm": 2.367745876312256, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7209769487380981, + "num_tokens": 145496858.0, + "step": 5817 + }, + { + "epoch": 0.6389193938062816, + "grad_norm": 2.361664056777954, + "learning_rate": 1e-06, + "loss": 1.0894, + "mean_token_accuracy": 0.6823785305023193, + "num_tokens": 145519245.0, + "step": 5818 + }, + { + "epoch": 0.6390292115088952, + "grad_norm": 2.015733003616333, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6815974116325378, + "num_tokens": 145550614.0, + "step": 5819 + }, + { + "epoch": 0.6391390292115089, + "grad_norm": 2.1853749752044678, + "learning_rate": 1e-06, + "loss": 1.0113, + "mean_token_accuracy": 0.7053850889205933, + "num_tokens": 145577809.0, + "step": 5820 + }, + { + "epoch": 0.6392488469141225, + "grad_norm": 2.2212271690368652, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7065899968147278, + "num_tokens": 145601408.0, + "step": 5821 + }, + { + "epoch": 0.6393586646167362, + "grad_norm": 2.2783377170562744, + "learning_rate": 1e-06, + "loss": 
1.0294, + "mean_token_accuracy": 0.6965600252151489, + "num_tokens": 145626543.0, + "step": 5822 + }, + { + "epoch": 0.6394684823193498, + "grad_norm": 2.507913112640381, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6879810094833374, + "num_tokens": 145647374.0, + "step": 5823 + }, + { + "epoch": 0.6395783000219636, + "grad_norm": 2.2631585597991943, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6923936605453491, + "num_tokens": 145672974.0, + "step": 5824 + }, + { + "epoch": 0.6396881177245772, + "grad_norm": 2.514310359954834, + "learning_rate": 1e-06, + "loss": 1.0409, + "mean_token_accuracy": 0.6923882961273193, + "num_tokens": 145699853.0, + "step": 5825 + }, + { + "epoch": 0.6397979354271909, + "grad_norm": 2.240522861480713, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7096083760261536, + "num_tokens": 145724590.0, + "step": 5826 + }, + { + "epoch": 0.6399077531298045, + "grad_norm": 2.339916229248047, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7056913375854492, + "num_tokens": 145748985.0, + "step": 5827 + }, + { + "epoch": 0.6400175708324182, + "grad_norm": 2.331608533859253, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7187804579734802, + "num_tokens": 145773437.0, + "step": 5828 + }, + { + "epoch": 0.6401273885350318, + "grad_norm": 2.4538967609405518, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7166036367416382, + "num_tokens": 145793115.0, + "step": 5829 + }, + { + "epoch": 0.6402372062376455, + "grad_norm": 2.212507963180542, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7182532548904419, + "num_tokens": 145817747.0, + "step": 5830 + }, + { + "epoch": 0.6403470239402592, + "grad_norm": 2.181736469268799, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.6988033652305603, + "num_tokens": 145843980.0, + "step": 5831 + }, + { + "epoch": 
0.6404568416428729, + "grad_norm": 2.0402488708496094, + "learning_rate": 1e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7491533756256104, + "num_tokens": 145869110.0, + "step": 5832 + }, + { + "epoch": 0.6405666593454865, + "grad_norm": 2.359175443649292, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6920357346534729, + "num_tokens": 145894128.0, + "step": 5833 + }, + { + "epoch": 0.6406764770481002, + "grad_norm": 2.4078309535980225, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6949545741081238, + "num_tokens": 145918930.0, + "step": 5834 + }, + { + "epoch": 0.6407862947507138, + "grad_norm": 2.3796727657318115, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6840861439704895, + "num_tokens": 145942685.0, + "step": 5835 + }, + { + "epoch": 0.6408961124533274, + "grad_norm": 2.5870471000671387, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7181254625320435, + "num_tokens": 145962143.0, + "step": 5836 + }, + { + "epoch": 0.6410059301559411, + "grad_norm": 2.1682798862457275, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7246038317680359, + "num_tokens": 145987748.0, + "step": 5837 + }, + { + "epoch": 0.6411157478585549, + "grad_norm": 1.9045686721801758, + "learning_rate": 1e-06, + "loss": 1.0527, + "mean_token_accuracy": 0.6870805025100708, + "num_tokens": 146021198.0, + "step": 5838 + }, + { + "epoch": 0.6412255655611685, + "grad_norm": 2.402973175048828, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7099319696426392, + "num_tokens": 146041910.0, + "step": 5839 + }, + { + "epoch": 0.6413353832637821, + "grad_norm": 2.140432596206665, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7090902924537659, + "num_tokens": 146067014.0, + "step": 5840 + }, + { + "epoch": 0.6414452009663958, + "grad_norm": 2.022301435470581, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 
0.7235338091850281, + "num_tokens": 146095240.0, + "step": 5841 + }, + { + "epoch": 0.6415550186690094, + "grad_norm": 2.0579843521118164, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.695645809173584, + "num_tokens": 146124476.0, + "step": 5842 + }, + { + "epoch": 0.6416648363716231, + "grad_norm": 2.578834295272827, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.687317967414856, + "num_tokens": 146146042.0, + "step": 5843 + }, + { + "epoch": 0.6417746540742367, + "grad_norm": 2.1776528358459473, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6973356008529663, + "num_tokens": 146172411.0, + "step": 5844 + }, + { + "epoch": 0.6418844717768504, + "grad_norm": 2.177257537841797, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6967310905456543, + "num_tokens": 146198477.0, + "step": 5845 + }, + { + "epoch": 0.6419942894794641, + "grad_norm": 2.531179428100586, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6959884762763977, + "num_tokens": 146220329.0, + "step": 5846 + }, + { + "epoch": 0.6421041071820778, + "grad_norm": 2.0357370376586914, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7309163808822632, + "num_tokens": 146246581.0, + "step": 5847 + }, + { + "epoch": 0.6422139248846914, + "grad_norm": 2.4627344608306885, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7060171961784363, + "num_tokens": 146268950.0, + "step": 5848 + }, + { + "epoch": 0.6423237425873051, + "grad_norm": 2.085735321044922, + "learning_rate": 1e-06, + "loss": 1.0866, + "mean_token_accuracy": 0.6735039949417114, + "num_tokens": 146302651.0, + "step": 5849 + }, + { + "epoch": 0.6424335602899187, + "grad_norm": 2.0342307090759277, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.6908043622970581, + "num_tokens": 146330816.0, + "step": 5850 + }, + { + "epoch": 0.6425433779925324, + "grad_norm": 
2.4776203632354736, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7132951021194458, + "num_tokens": 146351233.0, + "step": 5851 + }, + { + "epoch": 0.642653195695146, + "grad_norm": 1.9550039768218994, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7182449102401733, + "num_tokens": 146379276.0, + "step": 5852 + }, + { + "epoch": 0.6427630133977598, + "grad_norm": 2.1157751083374023, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7065250873565674, + "num_tokens": 146405724.0, + "step": 5853 + }, + { + "epoch": 0.6428728311003734, + "grad_norm": 2.3569469451904297, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7176036834716797, + "num_tokens": 146426528.0, + "step": 5854 + }, + { + "epoch": 0.6429826488029871, + "grad_norm": 2.1286096572875977, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7144545316696167, + "num_tokens": 146452607.0, + "step": 5855 + }, + { + "epoch": 0.6430924665056007, + "grad_norm": 2.150118589401245, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6995422840118408, + "num_tokens": 146481102.0, + "step": 5856 + }, + { + "epoch": 0.6432022842082143, + "grad_norm": 2.0042781829833984, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7124495506286621, + "num_tokens": 146507420.0, + "step": 5857 + }, + { + "epoch": 0.643312101910828, + "grad_norm": 2.024256706237793, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.6887332201004028, + "num_tokens": 146536292.0, + "step": 5858 + }, + { + "epoch": 0.6434219196134416, + "grad_norm": 2.1524105072021484, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6910725831985474, + "num_tokens": 146563745.0, + "step": 5859 + }, + { + "epoch": 0.6435317373160554, + "grad_norm": 2.6330347061157227, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6905398368835449, + "num_tokens": 
146583721.0, + "step": 5860 + }, + { + "epoch": 0.643641555018669, + "grad_norm": 2.2686033248901367, + "learning_rate": 1e-06, + "loss": 1.0482, + "mean_token_accuracy": 0.6895003318786621, + "num_tokens": 146608839.0, + "step": 5861 + }, + { + "epoch": 0.6437513727212827, + "grad_norm": 2.0500168800354004, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6917780637741089, + "num_tokens": 146639347.0, + "step": 5862 + }, + { + "epoch": 0.6438611904238963, + "grad_norm": 2.272475004196167, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7099823951721191, + "num_tokens": 146664547.0, + "step": 5863 + }, + { + "epoch": 0.64397100812651, + "grad_norm": 2.102250337600708, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7088158130645752, + "num_tokens": 146692702.0, + "step": 5864 + }, + { + "epoch": 0.6440808258291236, + "grad_norm": 1.9573683738708496, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7147994041442871, + "num_tokens": 146723020.0, + "step": 5865 + }, + { + "epoch": 0.6441906435317373, + "grad_norm": 2.2488842010498047, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6937990784645081, + "num_tokens": 146748381.0, + "step": 5866 + }, + { + "epoch": 0.644300461234351, + "grad_norm": 2.0400350093841553, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.7046447396278381, + "num_tokens": 146777695.0, + "step": 5867 + }, + { + "epoch": 0.6444102789369647, + "grad_norm": 2.3648200035095215, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7234065532684326, + "num_tokens": 146800258.0, + "step": 5868 + }, + { + "epoch": 0.6445200966395783, + "grad_norm": 2.298707962036133, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7056021690368652, + "num_tokens": 146822561.0, + "step": 5869 + }, + { + "epoch": 0.644629914342192, + "grad_norm": 2.4210119247436523, + "learning_rate": 1e-06, + 
"loss": 0.9666, + "mean_token_accuracy": 0.7022857666015625, + "num_tokens": 146844537.0, + "step": 5870 + }, + { + "epoch": 0.6447397320448056, + "grad_norm": 2.225644111633301, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7088717818260193, + "num_tokens": 146869846.0, + "step": 5871 + }, + { + "epoch": 0.6448495497474193, + "grad_norm": 2.293816089630127, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7145169973373413, + "num_tokens": 146892088.0, + "step": 5872 + }, + { + "epoch": 0.6449593674500329, + "grad_norm": 2.456541061401367, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7164576053619385, + "num_tokens": 146913928.0, + "step": 5873 + }, + { + "epoch": 0.6450691851526466, + "grad_norm": 2.2800137996673584, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7068167328834534, + "num_tokens": 146938698.0, + "step": 5874 + }, + { + "epoch": 0.6451790028552603, + "grad_norm": 2.2156879901885986, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7208398580551147, + "num_tokens": 146964329.0, + "step": 5875 + }, + { + "epoch": 0.645288820557874, + "grad_norm": 2.2749063968658447, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.730100154876709, + "num_tokens": 146986484.0, + "step": 5876 + }, + { + "epoch": 0.6453986382604876, + "grad_norm": 2.1682798862457275, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7061141729354858, + "num_tokens": 147010735.0, + "step": 5877 + }, + { + "epoch": 0.6455084559631012, + "grad_norm": 2.498398542404175, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7387102246284485, + "num_tokens": 147030021.0, + "step": 5878 + }, + { + "epoch": 0.6456182736657149, + "grad_norm": 1.9159526824951172, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6906804442405701, + "num_tokens": 147061181.0, + "step": 5879 + }, + { + "epoch": 
0.6457280913683285, + "grad_norm": 2.4458699226379395, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7426579594612122, + "num_tokens": 147081037.0, + "step": 5880 + }, + { + "epoch": 0.6458379090709422, + "grad_norm": 2.06477952003479, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.683591365814209, + "num_tokens": 147109127.0, + "step": 5881 + }, + { + "epoch": 0.6459477267735559, + "grad_norm": 2.1505680084228516, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6858916282653809, + "num_tokens": 147137000.0, + "step": 5882 + }, + { + "epoch": 0.6460575444761696, + "grad_norm": 2.277172088623047, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6928279995918274, + "num_tokens": 147160731.0, + "step": 5883 + }, + { + "epoch": 0.6461673621787832, + "grad_norm": 2.0788416862487793, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7040255665779114, + "num_tokens": 147189562.0, + "step": 5884 + }, + { + "epoch": 0.6462771798813969, + "grad_norm": 2.351065158843994, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.744030237197876, + "num_tokens": 147211351.0, + "step": 5885 + }, + { + "epoch": 0.6463869975840105, + "grad_norm": 2.100938081741333, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7259386777877808, + "num_tokens": 147236945.0, + "step": 5886 + }, + { + "epoch": 0.6464968152866242, + "grad_norm": 2.1760013103485107, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6984824538230896, + "num_tokens": 147265285.0, + "step": 5887 + }, + { + "epoch": 0.6466066329892378, + "grad_norm": 2.3351778984069824, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7020114660263062, + "num_tokens": 147286780.0, + "step": 5888 + }, + { + "epoch": 0.6467164506918516, + "grad_norm": 2.1791861057281494, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 
0.7100369930267334, + "num_tokens": 147312285.0, + "step": 5889 + }, + { + "epoch": 0.6468262683944652, + "grad_norm": 2.580944299697876, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6849905848503113, + "num_tokens": 147333368.0, + "step": 5890 + }, + { + "epoch": 0.6469360860970789, + "grad_norm": 2.2650201320648193, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7088548541069031, + "num_tokens": 147356316.0, + "step": 5891 + }, + { + "epoch": 0.6470459037996925, + "grad_norm": 1.949734091758728, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7034924626350403, + "num_tokens": 147386413.0, + "step": 5892 + }, + { + "epoch": 0.6471557215023062, + "grad_norm": 2.0569887161254883, + "learning_rate": 1e-06, + "loss": 1.0123, + "mean_token_accuracy": 0.6942273378372192, + "num_tokens": 147415282.0, + "step": 5893 + }, + { + "epoch": 0.6472655392049198, + "grad_norm": 2.471132278442383, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7172601819038391, + "num_tokens": 147434807.0, + "step": 5894 + }, + { + "epoch": 0.6473753569075335, + "grad_norm": 1.996907114982605, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7170325517654419, + "num_tokens": 147463377.0, + "step": 5895 + }, + { + "epoch": 0.6474851746101472, + "grad_norm": 2.0834403038024902, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6812945604324341, + "num_tokens": 147491225.0, + "step": 5896 + }, + { + "epoch": 0.6475949923127609, + "grad_norm": 2.4151451587677, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7273265719413757, + "num_tokens": 147513374.0, + "step": 5897 + }, + { + "epoch": 0.6477048100153745, + "grad_norm": 2.1597836017608643, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6961032152175903, + "num_tokens": 147539234.0, + "step": 5898 + }, + { + "epoch": 0.6478146277179881, + "grad_norm": 
2.032663345336914, + "learning_rate": 1e-06, + "loss": 1.0749, + "mean_token_accuracy": 0.6782655715942383, + "num_tokens": 147569555.0, + "step": 5899 + }, + { + "epoch": 0.6479244454206018, + "grad_norm": 1.9558701515197754, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.709562361240387, + "num_tokens": 147599766.0, + "step": 5900 + }, + { + "epoch": 0.6480342631232154, + "grad_norm": 2.441462516784668, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7184324860572815, + "num_tokens": 147619902.0, + "step": 5901 + }, + { + "epoch": 0.6481440808258291, + "grad_norm": 2.1906464099884033, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7011730670928955, + "num_tokens": 147644878.0, + "step": 5902 + }, + { + "epoch": 0.6482538985284428, + "grad_norm": 2.2793102264404297, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7232809066772461, + "num_tokens": 147667247.0, + "step": 5903 + }, + { + "epoch": 0.6483637162310565, + "grad_norm": 1.8273811340332031, + "learning_rate": 1e-06, + "loss": 1.0596, + "mean_token_accuracy": 0.6786532402038574, + "num_tokens": 147704381.0, + "step": 5904 + }, + { + "epoch": 0.6484735339336701, + "grad_norm": 2.1143054962158203, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6980038285255432, + "num_tokens": 147730962.0, + "step": 5905 + }, + { + "epoch": 0.6485833516362838, + "grad_norm": 2.4765055179595947, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7179462909698486, + "num_tokens": 147750282.0, + "step": 5906 + }, + { + "epoch": 0.6486931693388974, + "grad_norm": 2.3330588340759277, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6921646595001221, + "num_tokens": 147776538.0, + "step": 5907 + }, + { + "epoch": 0.6488029870415111, + "grad_norm": 2.2413759231567383, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7148897647857666, + "num_tokens": 
147800244.0, + "step": 5908 + }, + { + "epoch": 0.6489128047441247, + "grad_norm": 2.1522951126098633, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7089362144470215, + "num_tokens": 147826437.0, + "step": 5909 + }, + { + "epoch": 0.6490226224467384, + "grad_norm": 2.3411126136779785, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7370284199714661, + "num_tokens": 147848523.0, + "step": 5910 + }, + { + "epoch": 0.6491324401493521, + "grad_norm": 2.183293581008911, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6818424463272095, + "num_tokens": 147875347.0, + "step": 5911 + }, + { + "epoch": 0.6492422578519658, + "grad_norm": 2.2412431240081787, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.69388747215271, + "num_tokens": 147899306.0, + "step": 5912 + }, + { + "epoch": 0.6493520755545794, + "grad_norm": 1.7931439876556396, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7046632170677185, + "num_tokens": 147933086.0, + "step": 5913 + }, + { + "epoch": 0.6494618932571931, + "grad_norm": 1.9269245862960815, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7150213718414307, + "num_tokens": 147963101.0, + "step": 5914 + }, + { + "epoch": 0.6495717109598067, + "grad_norm": 2.202390432357788, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7254925966262817, + "num_tokens": 147989212.0, + "step": 5915 + }, + { + "epoch": 0.6496815286624203, + "grad_norm": 2.2227237224578857, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7030177116394043, + "num_tokens": 148012665.0, + "step": 5916 + }, + { + "epoch": 0.649791346365034, + "grad_norm": 2.024423122406006, + "learning_rate": 1e-06, + "loss": 0.9966, + "mean_token_accuracy": 0.696958065032959, + "num_tokens": 148040964.0, + "step": 5917 + }, + { + "epoch": 0.6499011640676478, + "grad_norm": 2.1600687503814697, + "learning_rate": 1e-06, + 
"loss": 1.0061, + "mean_token_accuracy": 0.6977807283401489, + "num_tokens": 148068999.0, + "step": 5918 + }, + { + "epoch": 0.6500109817702614, + "grad_norm": 2.4960789680480957, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7121186852455139, + "num_tokens": 148090086.0, + "step": 5919 + }, + { + "epoch": 0.650120799472875, + "grad_norm": 2.1616480350494385, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.705034613609314, + "num_tokens": 148115840.0, + "step": 5920 + }, + { + "epoch": 0.6502306171754887, + "grad_norm": 2.3282017707824707, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7116727828979492, + "num_tokens": 148140570.0, + "step": 5921 + }, + { + "epoch": 0.6503404348781023, + "grad_norm": 2.12497878074646, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6934502124786377, + "num_tokens": 148166940.0, + "step": 5922 + }, + { + "epoch": 0.650450252580716, + "grad_norm": 2.1305789947509766, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7045038342475891, + "num_tokens": 148194157.0, + "step": 5923 + }, + { + "epoch": 0.6505600702833296, + "grad_norm": 2.250730514526367, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7379034757614136, + "num_tokens": 148216097.0, + "step": 5924 + }, + { + "epoch": 0.6506698879859434, + "grad_norm": 2.1482276916503906, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.698381245136261, + "num_tokens": 148241617.0, + "step": 5925 + }, + { + "epoch": 0.650779705688557, + "grad_norm": 2.2953014373779297, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6958818435668945, + "num_tokens": 148266957.0, + "step": 5926 + }, + { + "epoch": 0.6508895233911707, + "grad_norm": 2.0674116611480713, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7143020033836365, + "num_tokens": 148293080.0, + "step": 5927 + }, + { + "epoch": 
0.6509993410937843, + "grad_norm": 2.0622754096984863, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6879914402961731, + "num_tokens": 148321360.0, + "step": 5928 + }, + { + "epoch": 0.651109158796398, + "grad_norm": 1.992060899734497, + "learning_rate": 1e-06, + "loss": 1.0492, + "mean_token_accuracy": 0.6822397112846375, + "num_tokens": 148350999.0, + "step": 5929 + }, + { + "epoch": 0.6512189764990116, + "grad_norm": 2.077009916305542, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6931244730949402, + "num_tokens": 148377898.0, + "step": 5930 + }, + { + "epoch": 0.6513287942016253, + "grad_norm": 2.2116305828094482, + "learning_rate": 1e-06, + "loss": 1.1025, + "mean_token_accuracy": 0.6681835055351257, + "num_tokens": 148404955.0, + "step": 5931 + }, + { + "epoch": 0.651438611904239, + "grad_norm": 1.943182110786438, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6979846954345703, + "num_tokens": 148435496.0, + "step": 5932 + }, + { + "epoch": 0.6515484296068527, + "grad_norm": 2.5229287147521973, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7133888006210327, + "num_tokens": 148453907.0, + "step": 5933 + }, + { + "epoch": 0.6516582473094663, + "grad_norm": 2.0012047290802, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7086955308914185, + "num_tokens": 148481475.0, + "step": 5934 + }, + { + "epoch": 0.65176806501208, + "grad_norm": 2.2081692218780518, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6941016912460327, + "num_tokens": 148506595.0, + "step": 5935 + }, + { + "epoch": 0.6518778827146936, + "grad_norm": 2.588454008102417, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7273749709129333, + "num_tokens": 148524623.0, + "step": 5936 + }, + { + "epoch": 0.6519877004173072, + "grad_norm": 2.245365858078003, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 
0.7174550294876099, + "num_tokens": 148546322.0, + "step": 5937 + }, + { + "epoch": 0.6520975181199209, + "grad_norm": 2.1751887798309326, + "learning_rate": 1e-06, + "loss": 1.0459, + "mean_token_accuracy": 0.676096498966217, + "num_tokens": 148572721.0, + "step": 5938 + }, + { + "epoch": 0.6522073358225345, + "grad_norm": 2.6445670127868652, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7235907316207886, + "num_tokens": 148590476.0, + "step": 5939 + }, + { + "epoch": 0.6523171535251483, + "grad_norm": 2.215759515762329, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6978570222854614, + "num_tokens": 148616638.0, + "step": 5940 + }, + { + "epoch": 0.6524269712277619, + "grad_norm": 2.3755381107330322, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7052173614501953, + "num_tokens": 148640670.0, + "step": 5941 + }, + { + "epoch": 0.6525367889303756, + "grad_norm": 2.5154454708099365, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6811830997467041, + "num_tokens": 148661823.0, + "step": 5942 + }, + { + "epoch": 0.6526466066329892, + "grad_norm": 2.1986777782440186, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6892930269241333, + "num_tokens": 148689414.0, + "step": 5943 + }, + { + "epoch": 0.6527564243356029, + "grad_norm": 2.0772552490234375, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7063690423965454, + "num_tokens": 148717422.0, + "step": 5944 + }, + { + "epoch": 0.6528662420382165, + "grad_norm": 2.0252809524536133, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7088527083396912, + "num_tokens": 148744233.0, + "step": 5945 + }, + { + "epoch": 0.6529760597408302, + "grad_norm": 2.1262807846069336, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6769536733627319, + "num_tokens": 148771286.0, + "step": 5946 + }, + { + "epoch": 0.6530858774434439, + "grad_norm": 
2.1845555305480957, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7081332802772522, + "num_tokens": 148796195.0, + "step": 5947 + }, + { + "epoch": 0.6531956951460576, + "grad_norm": 2.3816897869110107, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7159814834594727, + "num_tokens": 148818565.0, + "step": 5948 + }, + { + "epoch": 0.6533055128486712, + "grad_norm": 2.1738269329071045, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.697587251663208, + "num_tokens": 148843250.0, + "step": 5949 + }, + { + "epoch": 0.6534153305512849, + "grad_norm": 2.4070255756378174, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7051165103912354, + "num_tokens": 148867762.0, + "step": 5950 + }, + { + "epoch": 0.6535251482538985, + "grad_norm": 2.5202202796936035, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7183504104614258, + "num_tokens": 148888396.0, + "step": 5951 + }, + { + "epoch": 0.6536349659565122, + "grad_norm": 2.0771310329437256, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7167572975158691, + "num_tokens": 148915535.0, + "step": 5952 + }, + { + "epoch": 0.6537447836591258, + "grad_norm": 2.2770049571990967, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7242633700370789, + "num_tokens": 148939879.0, + "step": 5953 + }, + { + "epoch": 0.6538546013617396, + "grad_norm": 2.2546873092651367, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7134672403335571, + "num_tokens": 148963863.0, + "step": 5954 + }, + { + "epoch": 0.6539644190643532, + "grad_norm": 2.2024307250976562, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7105768322944641, + "num_tokens": 148990146.0, + "step": 5955 + }, + { + "epoch": 0.6540742367669669, + "grad_norm": 2.5106043815612793, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7142683267593384, + "num_tokens": 
149010400.0, + "step": 5956 + }, + { + "epoch": 0.6541840544695805, + "grad_norm": 2.153712034225464, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7311873435974121, + "num_tokens": 149034308.0, + "step": 5957 + }, + { + "epoch": 0.6542938721721941, + "grad_norm": 2.1212825775146484, + "learning_rate": 1e-06, + "loss": 1.0868, + "mean_token_accuracy": 0.6825616359710693, + "num_tokens": 149060636.0, + "step": 5958 + }, + { + "epoch": 0.6544036898748078, + "grad_norm": 2.214216947555542, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7069743871688843, + "num_tokens": 149085917.0, + "step": 5959 + }, + { + "epoch": 0.6545135075774214, + "grad_norm": 2.538280725479126, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7345243096351624, + "num_tokens": 149104501.0, + "step": 5960 + }, + { + "epoch": 0.6546233252800352, + "grad_norm": 2.2422869205474854, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.7009307742118835, + "num_tokens": 149128349.0, + "step": 5961 + }, + { + "epoch": 0.6547331429826488, + "grad_norm": 2.149054765701294, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.692969799041748, + "num_tokens": 149154214.0, + "step": 5962 + }, + { + "epoch": 0.6548429606852625, + "grad_norm": 2.314382553100586, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7408809065818787, + "num_tokens": 149177044.0, + "step": 5963 + }, + { + "epoch": 0.6549527783878761, + "grad_norm": 2.3320062160491943, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7035149931907654, + "num_tokens": 149198279.0, + "step": 5964 + }, + { + "epoch": 0.6550625960904898, + "grad_norm": 1.902334451675415, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7028127908706665, + "num_tokens": 149231078.0, + "step": 5965 + }, + { + "epoch": 0.6551724137931034, + "grad_norm": 2.152710437774658, + "learning_rate": 1e-06, + 
"loss": 0.9285, + "mean_token_accuracy": 0.7162096500396729, + "num_tokens": 149257158.0, + "step": 5966 + }, + { + "epoch": 0.6552822314957171, + "grad_norm": 1.990615725517273, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.705285906791687, + "num_tokens": 149284046.0, + "step": 5967 + }, + { + "epoch": 0.6553920491983307, + "grad_norm": 2.206613063812256, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.7030649185180664, + "num_tokens": 149309355.0, + "step": 5968 + }, + { + "epoch": 0.6555018669009445, + "grad_norm": 2.3068554401397705, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.73084557056427, + "num_tokens": 149331496.0, + "step": 5969 + }, + { + "epoch": 0.6556116846035581, + "grad_norm": 2.4574801921844482, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7168800830841064, + "num_tokens": 149350072.0, + "step": 5970 + }, + { + "epoch": 0.6557215023061718, + "grad_norm": 2.437443256378174, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7074558138847351, + "num_tokens": 149373051.0, + "step": 5971 + }, + { + "epoch": 0.6558313200087854, + "grad_norm": 2.188197612762451, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7370331883430481, + "num_tokens": 149395321.0, + "step": 5972 + }, + { + "epoch": 0.6559411377113991, + "grad_norm": 2.2358806133270264, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6997988224029541, + "num_tokens": 149420848.0, + "step": 5973 + }, + { + "epoch": 0.6560509554140127, + "grad_norm": 2.2046000957489014, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.6993876695632935, + "num_tokens": 149446089.0, + "step": 5974 + }, + { + "epoch": 0.6561607731166264, + "grad_norm": 2.228364944458008, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7169427871704102, + "num_tokens": 149468949.0, + "step": 5975 + }, + { + "epoch": 
0.6562705908192401, + "grad_norm": 2.3761115074157715, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.682518720626831, + "num_tokens": 149492014.0, + "step": 5976 + }, + { + "epoch": 0.6563804085218538, + "grad_norm": 2.076364517211914, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7245934009552002, + "num_tokens": 149517178.0, + "step": 5977 + }, + { + "epoch": 0.6564902262244674, + "grad_norm": 2.0700323581695557, + "learning_rate": 1e-06, + "loss": 1.0365, + "mean_token_accuracy": 0.6856530904769897, + "num_tokens": 149544136.0, + "step": 5978 + }, + { + "epoch": 0.656600043927081, + "grad_norm": 2.1432182788848877, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7019640207290649, + "num_tokens": 149570397.0, + "step": 5979 + }, + { + "epoch": 0.6567098616296947, + "grad_norm": 2.141049861907959, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7262337803840637, + "num_tokens": 149595247.0, + "step": 5980 + }, + { + "epoch": 0.6568196793323083, + "grad_norm": 2.4665017127990723, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6953818798065186, + "num_tokens": 149616044.0, + "step": 5981 + }, + { + "epoch": 0.656929497034922, + "grad_norm": 2.3052761554718018, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.719746470451355, + "num_tokens": 149638049.0, + "step": 5982 + }, + { + "epoch": 0.6570393147375357, + "grad_norm": 2.06972074508667, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7071820497512817, + "num_tokens": 149666006.0, + "step": 5983 + }, + { + "epoch": 0.6571491324401494, + "grad_norm": 2.391155242919922, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6876615285873413, + "num_tokens": 149688266.0, + "step": 5984 + }, + { + "epoch": 0.657258950142763, + "grad_norm": 2.3296356201171875, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 
0.6748048067092896, + "num_tokens": 149712315.0, + "step": 5985 + }, + { + "epoch": 0.6573687678453767, + "grad_norm": 2.608994245529175, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.735016942024231, + "num_tokens": 149730849.0, + "step": 5986 + }, + { + "epoch": 0.6574785855479903, + "grad_norm": 2.290743589401245, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7342343926429749, + "num_tokens": 149753100.0, + "step": 5987 + }, + { + "epoch": 0.657588403250604, + "grad_norm": 2.232064723968506, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.723527193069458, + "num_tokens": 149777220.0, + "step": 5988 + }, + { + "epoch": 0.6576982209532176, + "grad_norm": 2.3519463539123535, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7092229723930359, + "num_tokens": 149798957.0, + "step": 5989 + }, + { + "epoch": 0.6578080386558314, + "grad_norm": 2.441068649291992, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7087874412536621, + "num_tokens": 149820235.0, + "step": 5990 + }, + { + "epoch": 0.657917856358445, + "grad_norm": 2.397120714187622, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6992959380149841, + "num_tokens": 149843889.0, + "step": 5991 + }, + { + "epoch": 0.6580276740610587, + "grad_norm": 2.2669899463653564, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6958198547363281, + "num_tokens": 149868318.0, + "step": 5992 + }, + { + "epoch": 0.6581374917636723, + "grad_norm": 2.205641746520996, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7496845722198486, + "num_tokens": 149892050.0, + "step": 5993 + }, + { + "epoch": 0.658247309466286, + "grad_norm": 2.6042377948760986, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7059975266456604, + "num_tokens": 149909991.0, + "step": 5994 + }, + { + "epoch": 0.6583571271688996, + "grad_norm": 2.1287128925323486, 
+ "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.703241229057312, + "num_tokens": 149935771.0, + "step": 5995 + }, + { + "epoch": 0.6584669448715132, + "grad_norm": 2.0704851150512695, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7013325691223145, + "num_tokens": 149963540.0, + "step": 5996 + }, + { + "epoch": 0.6585767625741269, + "grad_norm": 2.183162212371826, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.727819561958313, + "num_tokens": 149988331.0, + "step": 5997 + }, + { + "epoch": 0.6586865802767407, + "grad_norm": 2.386253833770752, + "learning_rate": 1e-06, + "loss": 1.0206, + "mean_token_accuracy": 0.7014561295509338, + "num_tokens": 150010458.0, + "step": 5998 + }, + { + "epoch": 0.6587963979793543, + "grad_norm": 1.9103591442108154, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7116513252258301, + "num_tokens": 150042492.0, + "step": 5999 + }, + { + "epoch": 0.6589062156819679, + "grad_norm": 1.995840311050415, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7209187150001526, + "num_tokens": 150069131.0, + "step": 6000 + }, + { + "epoch": 0.6590160333845816, + "grad_norm": 2.139535665512085, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.6838592290878296, + "num_tokens": 150095241.0, + "step": 6001 + }, + { + "epoch": 0.6591258510871952, + "grad_norm": 2.4570045471191406, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.712347149848938, + "num_tokens": 150115907.0, + "step": 6002 + }, + { + "epoch": 0.6592356687898089, + "grad_norm": 2.4052298069000244, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.712647557258606, + "num_tokens": 150137065.0, + "step": 6003 + }, + { + "epoch": 0.6593454864924225, + "grad_norm": 2.400949001312256, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7054125070571899, + "num_tokens": 150157830.0, + "step": 6004 + 
}, + { + "epoch": 0.6594553041950363, + "grad_norm": 2.217883348464966, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7171609401702881, + "num_tokens": 150181194.0, + "step": 6005 + }, + { + "epoch": 0.6595651218976499, + "grad_norm": 2.3578264713287354, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7249358892440796, + "num_tokens": 150202126.0, + "step": 6006 + }, + { + "epoch": 0.6596749396002636, + "grad_norm": 2.2148873805999756, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7045605778694153, + "num_tokens": 150227348.0, + "step": 6007 + }, + { + "epoch": 0.6597847573028772, + "grad_norm": 2.111504077911377, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6959397792816162, + "num_tokens": 150255177.0, + "step": 6008 + }, + { + "epoch": 0.6598945750054909, + "grad_norm": 2.4692494869232178, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7096730470657349, + "num_tokens": 150276437.0, + "step": 6009 + }, + { + "epoch": 0.6600043927081045, + "grad_norm": 2.358321189880371, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6917029619216919, + "num_tokens": 150299721.0, + "step": 6010 + }, + { + "epoch": 0.6601142104107182, + "grad_norm": 2.0527265071868896, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7226611375808716, + "num_tokens": 150325334.0, + "step": 6011 + }, + { + "epoch": 0.6602240281133319, + "grad_norm": 2.3056387901306152, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6871582269668579, + "num_tokens": 150348806.0, + "step": 6012 + }, + { + "epoch": 0.6603338458159456, + "grad_norm": 2.4426968097686768, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7052965760231018, + "num_tokens": 150371242.0, + "step": 6013 + }, + { + "epoch": 0.6604436635185592, + "grad_norm": 2.030482769012451, + "learning_rate": 1e-06, + "loss": 1.0195, + 
"mean_token_accuracy": 0.6892824172973633, + "num_tokens": 150403306.0, + "step": 6014 + }, + { + "epoch": 0.6605534812211729, + "grad_norm": 2.237025260925293, + "learning_rate": 1e-06, + "loss": 0.9968, + "mean_token_accuracy": 0.6988934874534607, + "num_tokens": 150428917.0, + "step": 6015 + }, + { + "epoch": 0.6606632989237865, + "grad_norm": 2.2630693912506104, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6950811147689819, + "num_tokens": 150454424.0, + "step": 6016 + }, + { + "epoch": 0.6607731166264001, + "grad_norm": 2.3949131965637207, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.7022075057029724, + "num_tokens": 150475828.0, + "step": 6017 + }, + { + "epoch": 0.6608829343290138, + "grad_norm": 2.228032112121582, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7119783759117126, + "num_tokens": 150499075.0, + "step": 6018 + }, + { + "epoch": 0.6609927520316276, + "grad_norm": 2.4094738960266113, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7191541194915771, + "num_tokens": 150520933.0, + "step": 6019 + }, + { + "epoch": 0.6611025697342412, + "grad_norm": 1.955098032951355, + "learning_rate": 1e-06, + "loss": 1.1061, + "mean_token_accuracy": 0.671757698059082, + "num_tokens": 150555458.0, + "step": 6020 + }, + { + "epoch": 0.6612123874368548, + "grad_norm": 2.0055980682373047, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7065086960792542, + "num_tokens": 150583620.0, + "step": 6021 + }, + { + "epoch": 0.6613222051394685, + "grad_norm": 2.2272708415985107, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.723393440246582, + "num_tokens": 150607158.0, + "step": 6022 + }, + { + "epoch": 0.6614320228420821, + "grad_norm": 2.300614356994629, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7065941095352173, + "num_tokens": 150630471.0, + "step": 6023 + }, + { + "epoch": 0.6615418405446958, + 
"grad_norm": 2.153684377670288, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7009195685386658, + "num_tokens": 150657346.0, + "step": 6024 + }, + { + "epoch": 0.6616516582473094, + "grad_norm": 2.2507734298706055, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7277733087539673, + "num_tokens": 150680331.0, + "step": 6025 + }, + { + "epoch": 0.6617614759499231, + "grad_norm": 2.1243820190429688, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7183347940444946, + "num_tokens": 150705835.0, + "step": 6026 + }, + { + "epoch": 0.6618712936525368, + "grad_norm": 2.4837286472320557, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7007744312286377, + "num_tokens": 150726044.0, + "step": 6027 + }, + { + "epoch": 0.6619811113551505, + "grad_norm": 2.4046759605407715, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.695184588432312, + "num_tokens": 150748665.0, + "step": 6028 + }, + { + "epoch": 0.6620909290577641, + "grad_norm": 2.186720609664917, + "learning_rate": 1e-06, + "loss": 1.0919, + "mean_token_accuracy": 0.6751376390457153, + "num_tokens": 150776080.0, + "step": 6029 + }, + { + "epoch": 0.6622007467603778, + "grad_norm": 2.1803090572357178, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7017019987106323, + "num_tokens": 150801460.0, + "step": 6030 + }, + { + "epoch": 0.6623105644629914, + "grad_norm": 2.596745014190674, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7265175580978394, + "num_tokens": 150819383.0, + "step": 6031 + }, + { + "epoch": 0.6624203821656051, + "grad_norm": 2.20448637008667, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7156269550323486, + "num_tokens": 150843217.0, + "step": 6032 + }, + { + "epoch": 0.6625301998682187, + "grad_norm": 2.281007766723633, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6896799802780151, + "num_tokens": 
150866043.0, + "step": 6033 + }, + { + "epoch": 0.6626400175708325, + "grad_norm": 1.8673120737075806, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7084817290306091, + "num_tokens": 150899140.0, + "step": 6034 + }, + { + "epoch": 0.6627498352734461, + "grad_norm": 2.4415993690490723, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7132701873779297, + "num_tokens": 150920061.0, + "step": 6035 + }, + { + "epoch": 0.6628596529760598, + "grad_norm": 2.1104252338409424, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7285604476928711, + "num_tokens": 150946145.0, + "step": 6036 + }, + { + "epoch": 0.6629694706786734, + "grad_norm": 2.203721761703491, + "learning_rate": 1e-06, + "loss": 1.0316, + "mean_token_accuracy": 0.6954160332679749, + "num_tokens": 150971939.0, + "step": 6037 + }, + { + "epoch": 0.663079288381287, + "grad_norm": 2.0086777210235596, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7252774238586426, + "num_tokens": 151000059.0, + "step": 6038 + }, + { + "epoch": 0.6631891060839007, + "grad_norm": 2.3551836013793945, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7355803847312927, + "num_tokens": 151021597.0, + "step": 6039 + }, + { + "epoch": 0.6632989237865143, + "grad_norm": 2.2499258518218994, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.6844315528869629, + "num_tokens": 151047541.0, + "step": 6040 + }, + { + "epoch": 0.6634087414891281, + "grad_norm": 2.612313985824585, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.726962685585022, + "num_tokens": 151064366.0, + "step": 6041 + }, + { + "epoch": 0.6635185591917417, + "grad_norm": 2.1016345024108887, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7093084454536438, + "num_tokens": 151094153.0, + "step": 6042 + }, + { + "epoch": 0.6636283768943554, + "grad_norm": 2.687288999557495, + "learning_rate": 1e-06, + 
"loss": 0.8808, + "mean_token_accuracy": 0.7295867204666138, + "num_tokens": 151111762.0, + "step": 6043 + }, + { + "epoch": 0.663738194596969, + "grad_norm": 2.517784833908081, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7054464221000671, + "num_tokens": 151131937.0, + "step": 6044 + }, + { + "epoch": 0.6638480122995827, + "grad_norm": 2.6093592643737793, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7195425033569336, + "num_tokens": 151151069.0, + "step": 6045 + }, + { + "epoch": 0.6639578300021963, + "grad_norm": 2.169814348220825, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.7155874967575073, + "num_tokens": 151177715.0, + "step": 6046 + }, + { + "epoch": 0.66406764770481, + "grad_norm": 2.2695136070251465, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7182718515396118, + "num_tokens": 151201093.0, + "step": 6047 + }, + { + "epoch": 0.6641774654074237, + "grad_norm": 2.223386526107788, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7082645893096924, + "num_tokens": 151225625.0, + "step": 6048 + }, + { + "epoch": 0.6642872831100374, + "grad_norm": 2.8505990505218506, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7224102020263672, + "num_tokens": 151242648.0, + "step": 6049 + }, + { + "epoch": 0.664397100812651, + "grad_norm": 2.192685842514038, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6927624940872192, + "num_tokens": 151268512.0, + "step": 6050 + }, + { + "epoch": 0.6645069185152647, + "grad_norm": 2.193037986755371, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7143258452415466, + "num_tokens": 151294057.0, + "step": 6051 + }, + { + "epoch": 0.6646167362178783, + "grad_norm": 2.0244171619415283, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7017976641654968, + "num_tokens": 151323117.0, + "step": 6052 + }, + { + "epoch": 
0.664726553920492, + "grad_norm": 2.1199839115142822, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7145275473594666, + "num_tokens": 151348976.0, + "step": 6053 + }, + { + "epoch": 0.6648363716231056, + "grad_norm": 2.158369541168213, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7221311330795288, + "num_tokens": 151372011.0, + "step": 6054 + }, + { + "epoch": 0.6649461893257194, + "grad_norm": 2.237934112548828, + "learning_rate": 1e-06, + "loss": 1.0388, + "mean_token_accuracy": 0.683323323726654, + "num_tokens": 151398701.0, + "step": 6055 + }, + { + "epoch": 0.665056007028333, + "grad_norm": 1.8508803844451904, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6997270584106445, + "num_tokens": 151433990.0, + "step": 6056 + }, + { + "epoch": 0.6651658247309467, + "grad_norm": 2.507150650024414, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7263140678405762, + "num_tokens": 151452560.0, + "step": 6057 + }, + { + "epoch": 0.6652756424335603, + "grad_norm": 2.2850711345672607, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7256331443786621, + "num_tokens": 151473596.0, + "step": 6058 + }, + { + "epoch": 0.665385460136174, + "grad_norm": 2.2383553981781006, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7102248072624207, + "num_tokens": 151496972.0, + "step": 6059 + }, + { + "epoch": 0.6654952778387876, + "grad_norm": 2.252923011779785, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7264721393585205, + "num_tokens": 151519178.0, + "step": 6060 + }, + { + "epoch": 0.6656050955414012, + "grad_norm": 2.0579352378845215, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.724475622177124, + "num_tokens": 151545596.0, + "step": 6061 + }, + { + "epoch": 0.6657149132440149, + "grad_norm": 1.964116096496582, + "learning_rate": 1e-06, + "loss": 1.0517, + "mean_token_accuracy": 
0.6788569092750549, + "num_tokens": 151576076.0, + "step": 6062 + }, + { + "epoch": 0.6658247309466286, + "grad_norm": 2.1537675857543945, + "learning_rate": 1e-06, + "loss": 1.0417, + "mean_token_accuracy": 0.6910449862480164, + "num_tokens": 151603298.0, + "step": 6063 + }, + { + "epoch": 0.6659345486492423, + "grad_norm": 2.342010259628296, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7121254205703735, + "num_tokens": 151625419.0, + "step": 6064 + }, + { + "epoch": 0.6660443663518559, + "grad_norm": 2.5351970195770264, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7291042804718018, + "num_tokens": 151646364.0, + "step": 6065 + }, + { + "epoch": 0.6661541840544696, + "grad_norm": 2.1863553524017334, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7269010543823242, + "num_tokens": 151670492.0, + "step": 6066 + }, + { + "epoch": 0.6662640017570832, + "grad_norm": 2.3145084381103516, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7084414958953857, + "num_tokens": 151693812.0, + "step": 6067 + }, + { + "epoch": 0.6663738194596969, + "grad_norm": 2.4548308849334717, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7374626398086548, + "num_tokens": 151714347.0, + "step": 6068 + }, + { + "epoch": 0.6664836371623105, + "grad_norm": 2.402193069458008, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6860153675079346, + "num_tokens": 151736665.0, + "step": 6069 + }, + { + "epoch": 0.6665934548649243, + "grad_norm": 1.9265936613082886, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6971365213394165, + "num_tokens": 151765907.0, + "step": 6070 + }, + { + "epoch": 0.6667032725675379, + "grad_norm": 2.255152702331543, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7263573408126831, + "num_tokens": 151788946.0, + "step": 6071 + }, + { + "epoch": 0.6668130902701516, + "grad_norm": 
1.9781447649002075, + "learning_rate": 1e-06, + "loss": 1.0297, + "mean_token_accuracy": 0.6935408115386963, + "num_tokens": 151819680.0, + "step": 6072 + }, + { + "epoch": 0.6669229079727652, + "grad_norm": 1.9515124559402466, + "learning_rate": 1e-06, + "loss": 1.1135, + "mean_token_accuracy": 0.6624770760536194, + "num_tokens": 151853466.0, + "step": 6073 + }, + { + "epoch": 0.6670327256753789, + "grad_norm": 2.143357515335083, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7076992988586426, + "num_tokens": 151881455.0, + "step": 6074 + }, + { + "epoch": 0.6671425433779925, + "grad_norm": 2.2804789543151855, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6970857381820679, + "num_tokens": 151905671.0, + "step": 6075 + }, + { + "epoch": 0.6672523610806061, + "grad_norm": 2.2592360973358154, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7116305828094482, + "num_tokens": 151930452.0, + "step": 6076 + }, + { + "epoch": 0.6673621787832199, + "grad_norm": 2.1244118213653564, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7144348621368408, + "num_tokens": 151956837.0, + "step": 6077 + }, + { + "epoch": 0.6674719964858336, + "grad_norm": 2.4057884216308594, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.695493221282959, + "num_tokens": 151979639.0, + "step": 6078 + }, + { + "epoch": 0.6675818141884472, + "grad_norm": 2.319441318511963, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7031351923942566, + "num_tokens": 152001951.0, + "step": 6079 + }, + { + "epoch": 0.6676916318910608, + "grad_norm": 2.1523547172546387, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6897017359733582, + "num_tokens": 152027799.0, + "step": 6080 + }, + { + "epoch": 0.6678014495936745, + "grad_norm": 2.3882062435150146, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.709389328956604, + "num_tokens": 
152048889.0, + "step": 6081 + }, + { + "epoch": 0.6679112672962881, + "grad_norm": 2.3503646850585938, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6980308890342712, + "num_tokens": 152072402.0, + "step": 6082 + }, + { + "epoch": 0.6680210849989018, + "grad_norm": 2.906851053237915, + "learning_rate": 1e-06, + "loss": 1.075, + "mean_token_accuracy": 0.6871612071990967, + "num_tokens": 152088822.0, + "step": 6083 + }, + { + "epoch": 0.6681309027015155, + "grad_norm": 2.343388080596924, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7190769910812378, + "num_tokens": 152111639.0, + "step": 6084 + }, + { + "epoch": 0.6682407204041292, + "grad_norm": 2.1474642753601074, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6905232667922974, + "num_tokens": 152139088.0, + "step": 6085 + }, + { + "epoch": 0.6683505381067428, + "grad_norm": 2.2596802711486816, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7176173329353333, + "num_tokens": 152163797.0, + "step": 6086 + }, + { + "epoch": 0.6684603558093565, + "grad_norm": 1.8910154104232788, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.7014082670211792, + "num_tokens": 152195627.0, + "step": 6087 + }, + { + "epoch": 0.6685701735119701, + "grad_norm": 2.018165111541748, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7141430974006653, + "num_tokens": 152223283.0, + "step": 6088 + }, + { + "epoch": 0.6686799912145838, + "grad_norm": 2.3762869834899902, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7103221416473389, + "num_tokens": 152244720.0, + "step": 6089 + }, + { + "epoch": 0.6687898089171974, + "grad_norm": 2.0017013549804688, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7002217173576355, + "num_tokens": 152275781.0, + "step": 6090 + }, + { + "epoch": 0.6688996266198111, + "grad_norm": 2.0228054523468018, + "learning_rate": 1e-06, + 
"loss": 1.0019, + "mean_token_accuracy": 0.6914629936218262, + "num_tokens": 152306197.0, + "step": 6091 + }, + { + "epoch": 0.6690094443224248, + "grad_norm": 2.2145347595214844, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7098832130432129, + "num_tokens": 152331081.0, + "step": 6092 + }, + { + "epoch": 0.6691192620250385, + "grad_norm": 2.3387537002563477, + "learning_rate": 1e-06, + "loss": 1.0486, + "mean_token_accuracy": 0.6870096921920776, + "num_tokens": 152354544.0, + "step": 6093 + }, + { + "epoch": 0.6692290797276521, + "grad_norm": 2.0277976989746094, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7217808961868286, + "num_tokens": 152382072.0, + "step": 6094 + }, + { + "epoch": 0.6693388974302658, + "grad_norm": 2.439847230911255, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7304965257644653, + "num_tokens": 152401463.0, + "step": 6095 + }, + { + "epoch": 0.6694487151328794, + "grad_norm": 2.1181042194366455, + "learning_rate": 1e-06, + "loss": 1.0401, + "mean_token_accuracy": 0.6880810856819153, + "num_tokens": 152430727.0, + "step": 6096 + }, + { + "epoch": 0.669558532835493, + "grad_norm": 2.051054000854492, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7104249000549316, + "num_tokens": 152458324.0, + "step": 6097 + }, + { + "epoch": 0.6696683505381067, + "grad_norm": 1.98252534866333, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 0.6965839862823486, + "num_tokens": 152488163.0, + "step": 6098 + }, + { + "epoch": 0.6697781682407205, + "grad_norm": 2.0269153118133545, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.6884496808052063, + "num_tokens": 152518456.0, + "step": 6099 + }, + { + "epoch": 0.6698879859433341, + "grad_norm": 2.2904751300811768, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6766296625137329, + "num_tokens": 152544326.0, + "step": 6100 + }, + { + "epoch": 
0.6699978036459477, + "grad_norm": 2.106410503387451, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6982420682907104, + "num_tokens": 152570542.0, + "step": 6101 + }, + { + "epoch": 0.6701076213485614, + "grad_norm": 2.555969715118408, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7281069755554199, + "num_tokens": 152589662.0, + "step": 6102 + }, + { + "epoch": 0.670217439051175, + "grad_norm": 2.3669373989105225, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7156774997711182, + "num_tokens": 152611053.0, + "step": 6103 + }, + { + "epoch": 0.6703272567537887, + "grad_norm": 2.5187504291534424, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7059336304664612, + "num_tokens": 152636181.0, + "step": 6104 + }, + { + "epoch": 0.6704370744564023, + "grad_norm": 2.5364184379577637, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7399900555610657, + "num_tokens": 152656038.0, + "step": 6105 + }, + { + "epoch": 0.6705468921590161, + "grad_norm": 2.3991448879241943, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7129559516906738, + "num_tokens": 152676198.0, + "step": 6106 + }, + { + "epoch": 0.6706567098616297, + "grad_norm": 2.409472703933716, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.719605565071106, + "num_tokens": 152697888.0, + "step": 6107 + }, + { + "epoch": 0.6707665275642434, + "grad_norm": 2.331676959991455, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7044451832771301, + "num_tokens": 152720619.0, + "step": 6108 + }, + { + "epoch": 0.670876345266857, + "grad_norm": 2.25366473197937, + "learning_rate": 1e-06, + "loss": 1.0079, + "mean_token_accuracy": 0.694726824760437, + "num_tokens": 152745365.0, + "step": 6109 + }, + { + "epoch": 0.6709861629694707, + "grad_norm": 2.415898561477661, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 
0.6848915815353394, + "num_tokens": 152767550.0, + "step": 6110 + }, + { + "epoch": 0.6710959806720843, + "grad_norm": 2.0593795776367188, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6965271234512329, + "num_tokens": 152796002.0, + "step": 6111 + }, + { + "epoch": 0.671205798374698, + "grad_norm": 2.186217784881592, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6996946334838867, + "num_tokens": 152823307.0, + "step": 6112 + }, + { + "epoch": 0.6713156160773117, + "grad_norm": 2.1726267337799072, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6936701536178589, + "num_tokens": 152849644.0, + "step": 6113 + }, + { + "epoch": 0.6714254337799254, + "grad_norm": 2.345106601715088, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7168023586273193, + "num_tokens": 152870684.0, + "step": 6114 + }, + { + "epoch": 0.671535251482539, + "grad_norm": 2.4265758991241455, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7201758027076721, + "num_tokens": 152891239.0, + "step": 6115 + }, + { + "epoch": 0.6716450691851527, + "grad_norm": 2.2571327686309814, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7142746448516846, + "num_tokens": 152914635.0, + "step": 6116 + }, + { + "epoch": 0.6717548868877663, + "grad_norm": 2.5764312744140625, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7168671488761902, + "num_tokens": 152932534.0, + "step": 6117 + }, + { + "epoch": 0.67186470459038, + "grad_norm": 2.2311959266662598, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.7056401968002319, + "num_tokens": 152957822.0, + "step": 6118 + }, + { + "epoch": 0.6719745222929936, + "grad_norm": 2.1453092098236084, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7055633664131165, + "num_tokens": 152982627.0, + "step": 6119 + }, + { + "epoch": 0.6720843399956072, + "grad_norm": 
1.9124220609664917, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7102981805801392, + "num_tokens": 153014442.0, + "step": 6120 + }, + { + "epoch": 0.672194157698221, + "grad_norm": 2.11153507232666, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.710738480091095, + "num_tokens": 153040319.0, + "step": 6121 + }, + { + "epoch": 0.6723039754008346, + "grad_norm": 2.192216396331787, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7175883650779724, + "num_tokens": 153063892.0, + "step": 6122 + }, + { + "epoch": 0.6724137931034483, + "grad_norm": 2.4161486625671387, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.702113151550293, + "num_tokens": 153086988.0, + "step": 6123 + }, + { + "epoch": 0.6725236108060619, + "grad_norm": 2.427642822265625, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7406275272369385, + "num_tokens": 153105997.0, + "step": 6124 + }, + { + "epoch": 0.6726334285086756, + "grad_norm": 2.092305898666382, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7025405168533325, + "num_tokens": 153133382.0, + "step": 6125 + }, + { + "epoch": 0.6727432462112892, + "grad_norm": 2.5290908813476562, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7269052267074585, + "num_tokens": 153151988.0, + "step": 6126 + }, + { + "epoch": 0.6728530639139029, + "grad_norm": 2.217992067337036, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7292830944061279, + "num_tokens": 153175311.0, + "step": 6127 + }, + { + "epoch": 0.6729628816165166, + "grad_norm": 2.3927700519561768, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.732266902923584, + "num_tokens": 153196604.0, + "step": 6128 + }, + { + "epoch": 0.6730726993191303, + "grad_norm": 2.08477520942688, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.707856297492981, + "num_tokens": 153224463.0, + 
"step": 6129 + }, + { + "epoch": 0.6731825170217439, + "grad_norm": 2.257766008377075, + "learning_rate": 1e-06, + "loss": 1.0114, + "mean_token_accuracy": 0.6959177851676941, + "num_tokens": 153249434.0, + "step": 6130 + }, + { + "epoch": 0.6732923347243576, + "grad_norm": 2.345609664916992, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.740946352481842, + "num_tokens": 153269395.0, + "step": 6131 + }, + { + "epoch": 0.6734021524269712, + "grad_norm": 2.410513162612915, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6991149187088013, + "num_tokens": 153291426.0, + "step": 6132 + }, + { + "epoch": 0.6735119701295849, + "grad_norm": 2.6115453243255615, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7197483777999878, + "num_tokens": 153310123.0, + "step": 6133 + }, + { + "epoch": 0.6736217878321985, + "grad_norm": 2.1464202404022217, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6919736862182617, + "num_tokens": 153338646.0, + "step": 6134 + }, + { + "epoch": 0.6737316055348123, + "grad_norm": 2.098076343536377, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7200050354003906, + "num_tokens": 153366061.0, + "step": 6135 + }, + { + "epoch": 0.6738414232374259, + "grad_norm": 2.478710889816284, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7147655487060547, + "num_tokens": 153387457.0, + "step": 6136 + }, + { + "epoch": 0.6739512409400396, + "grad_norm": 2.516134262084961, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6823782920837402, + "num_tokens": 153408647.0, + "step": 6137 + }, + { + "epoch": 0.6740610586426532, + "grad_norm": 2.4716179370880127, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7230708003044128, + "num_tokens": 153429129.0, + "step": 6138 + }, + { + "epoch": 0.6741708763452668, + "grad_norm": 2.314911127090454, + "learning_rate": 1e-06, + "loss": 1.0975, + 
"mean_token_accuracy": 0.674086332321167, + "num_tokens": 153452375.0, + "step": 6139 + }, + { + "epoch": 0.6742806940478805, + "grad_norm": 2.4378137588500977, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.7031263113021851, + "num_tokens": 153474504.0, + "step": 6140 + }, + { + "epoch": 0.6743905117504941, + "grad_norm": 2.166990041732788, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7306035161018372, + "num_tokens": 153500382.0, + "step": 6141 + }, + { + "epoch": 0.6745003294531079, + "grad_norm": 2.1292290687561035, + "learning_rate": 1e-06, + "loss": 1.0498, + "mean_token_accuracy": 0.6818100810050964, + "num_tokens": 153530141.0, + "step": 6142 + }, + { + "epoch": 0.6746101471557215, + "grad_norm": 2.1563990116119385, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6850389838218689, + "num_tokens": 153555627.0, + "step": 6143 + }, + { + "epoch": 0.6747199648583352, + "grad_norm": 2.416468381881714, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7023392915725708, + "num_tokens": 153578090.0, + "step": 6144 + }, + { + "epoch": 0.6748297825609488, + "grad_norm": 2.5419328212738037, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7179675698280334, + "num_tokens": 153596498.0, + "step": 6145 + }, + { + "epoch": 0.6749396002635625, + "grad_norm": 2.395833730697632, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7105110287666321, + "num_tokens": 153619012.0, + "step": 6146 + }, + { + "epoch": 0.6750494179661761, + "grad_norm": 2.0167057514190674, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.700928270816803, + "num_tokens": 153649769.0, + "step": 6147 + }, + { + "epoch": 0.6751592356687898, + "grad_norm": 2.344529151916504, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6994915008544922, + "num_tokens": 153671607.0, + "step": 6148 + }, + { + "epoch": 0.6752690533714034, + 
"grad_norm": 1.9871853590011597, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.714033305644989, + "num_tokens": 153701600.0, + "step": 6149 + }, + { + "epoch": 0.6753788710740172, + "grad_norm": 2.371577262878418, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7288916110992432, + "num_tokens": 153723939.0, + "step": 6150 + }, + { + "epoch": 0.6754886887766308, + "grad_norm": 2.333216905593872, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6870307922363281, + "num_tokens": 153747128.0, + "step": 6151 + }, + { + "epoch": 0.6755985064792445, + "grad_norm": 1.8892625570297241, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7152513265609741, + "num_tokens": 153778652.0, + "step": 6152 + }, + { + "epoch": 0.6757083241818581, + "grad_norm": 2.28283953666687, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7047612071037292, + "num_tokens": 153802174.0, + "step": 6153 + }, + { + "epoch": 0.6758181418844718, + "grad_norm": 2.0053627490997314, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6894640326499939, + "num_tokens": 153831578.0, + "step": 6154 + }, + { + "epoch": 0.6759279595870854, + "grad_norm": 2.615598440170288, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7137206196784973, + "num_tokens": 153850243.0, + "step": 6155 + }, + { + "epoch": 0.676037777289699, + "grad_norm": 2.1491940021514893, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7065826654434204, + "num_tokens": 153876962.0, + "step": 6156 + }, + { + "epoch": 0.6761475949923128, + "grad_norm": 2.3660881519317627, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7011833190917969, + "num_tokens": 153900235.0, + "step": 6157 + }, + { + "epoch": 0.6762574126949265, + "grad_norm": 1.9709569215774536, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6883237957954407, + 
"num_tokens": 153932843.0, + "step": 6158 + }, + { + "epoch": 0.6763672303975401, + "grad_norm": 2.4056293964385986, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7183976173400879, + "num_tokens": 153952825.0, + "step": 6159 + }, + { + "epoch": 0.6764770481001537, + "grad_norm": 2.4792723655700684, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7134457230567932, + "num_tokens": 153973327.0, + "step": 6160 + }, + { + "epoch": 0.6765868658027674, + "grad_norm": 2.1946921348571777, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7026577591896057, + "num_tokens": 153998192.0, + "step": 6161 + }, + { + "epoch": 0.676696683505381, + "grad_norm": 1.948293924331665, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7112094759941101, + "num_tokens": 154028280.0, + "step": 6162 + }, + { + "epoch": 0.6768065012079947, + "grad_norm": 2.336879014968872, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7075381875038147, + "num_tokens": 154050881.0, + "step": 6163 + }, + { + "epoch": 0.6769163189106084, + "grad_norm": 2.3534700870513916, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7166669368743896, + "num_tokens": 154074104.0, + "step": 6164 + }, + { + "epoch": 0.6770261366132221, + "grad_norm": 2.357471227645874, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6940521001815796, + "num_tokens": 154097122.0, + "step": 6165 + }, + { + "epoch": 0.6771359543158357, + "grad_norm": 2.3131070137023926, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6953353881835938, + "num_tokens": 154120808.0, + "step": 6166 + }, + { + "epoch": 0.6772457720184494, + "grad_norm": 1.886293649673462, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7161177396774292, + "num_tokens": 154153713.0, + "step": 6167 + }, + { + "epoch": 0.677355589721063, + "grad_norm": 2.0729892253875732, + 
"learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6811655163764954, + "num_tokens": 154184030.0, + "step": 6168 + }, + { + "epoch": 0.6774654074236767, + "grad_norm": 1.9451452493667603, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.689789354801178, + "num_tokens": 154216388.0, + "step": 6169 + }, + { + "epoch": 0.6775752251262903, + "grad_norm": 2.299757719039917, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7243483066558838, + "num_tokens": 154238766.0, + "step": 6170 + }, + { + "epoch": 0.6776850428289041, + "grad_norm": 2.1509780883789062, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7174947261810303, + "num_tokens": 154263349.0, + "step": 6171 + }, + { + "epoch": 0.6777948605315177, + "grad_norm": 1.8674659729003906, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7031744122505188, + "num_tokens": 154296498.0, + "step": 6172 + }, + { + "epoch": 0.6779046782341314, + "grad_norm": 2.290663480758667, + "learning_rate": 1e-06, + "loss": 1.0518, + "mean_token_accuracy": 0.6838608980178833, + "num_tokens": 154320618.0, + "step": 6173 + }, + { + "epoch": 0.678014495936745, + "grad_norm": 2.0293242931365967, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.7018262147903442, + "num_tokens": 154350509.0, + "step": 6174 + }, + { + "epoch": 0.6781243136393587, + "grad_norm": 1.9783639907836914, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7111265063285828, + "num_tokens": 154381606.0, + "step": 6175 + }, + { + "epoch": 0.6782341313419723, + "grad_norm": 2.232919692993164, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7239154577255249, + "num_tokens": 154405283.0, + "step": 6176 + }, + { + "epoch": 0.678343949044586, + "grad_norm": 1.9200291633605957, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7045828104019165, + "num_tokens": 154436609.0, + "step": 6177 + 
}, + { + "epoch": 0.6784537667471996, + "grad_norm": 2.173523426055908, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7078726887702942, + "num_tokens": 154460384.0, + "step": 6178 + }, + { + "epoch": 0.6785635844498134, + "grad_norm": 2.2107269763946533, + "learning_rate": 1e-06, + "loss": 1.0513, + "mean_token_accuracy": 0.6835665106773376, + "num_tokens": 154487812.0, + "step": 6179 + }, + { + "epoch": 0.678673402152427, + "grad_norm": 2.5629334449768066, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.730093240737915, + "num_tokens": 154507007.0, + "step": 6180 + }, + { + "epoch": 0.6787832198550406, + "grad_norm": 1.9121556282043457, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7117813229560852, + "num_tokens": 154537739.0, + "step": 6181 + }, + { + "epoch": 0.6788930375576543, + "grad_norm": 2.254704475402832, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7056127786636353, + "num_tokens": 154560936.0, + "step": 6182 + }, + { + "epoch": 0.6790028552602679, + "grad_norm": 2.181267499923706, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7064008712768555, + "num_tokens": 154586759.0, + "step": 6183 + }, + { + "epoch": 0.6791126729628816, + "grad_norm": 2.360398054122925, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7267546653747559, + "num_tokens": 154608326.0, + "step": 6184 + }, + { + "epoch": 0.6792224906654952, + "grad_norm": 2.0458786487579346, + "learning_rate": 1e-06, + "loss": 1.0466, + "mean_token_accuracy": 0.6930809020996094, + "num_tokens": 154638877.0, + "step": 6185 + }, + { + "epoch": 0.679332308368109, + "grad_norm": 2.1397511959075928, + "learning_rate": 1e-06, + "loss": 1.0386, + "mean_token_accuracy": 0.686299741268158, + "num_tokens": 154667668.0, + "step": 6186 + }, + { + "epoch": 0.6794421260707226, + "grad_norm": 2.1768670082092285, + "learning_rate": 1e-06, + "loss": 0.9028, + 
"mean_token_accuracy": 0.7159613370895386, + "num_tokens": 154691127.0, + "step": 6187 + }, + { + "epoch": 0.6795519437733363, + "grad_norm": 2.212857484817505, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7527316212654114, + "num_tokens": 154712993.0, + "step": 6188 + }, + { + "epoch": 0.6796617614759499, + "grad_norm": 2.1194698810577393, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7169994711875916, + "num_tokens": 154739745.0, + "step": 6189 + }, + { + "epoch": 0.6797715791785636, + "grad_norm": 2.3452935218811035, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7190823554992676, + "num_tokens": 154762757.0, + "step": 6190 + }, + { + "epoch": 0.6798813968811772, + "grad_norm": 2.56951904296875, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7077535390853882, + "num_tokens": 154782548.0, + "step": 6191 + }, + { + "epoch": 0.6799912145837909, + "grad_norm": 2.343689203262329, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7075260877609253, + "num_tokens": 154804677.0, + "step": 6192 + }, + { + "epoch": 0.6801010322864046, + "grad_norm": 2.2857346534729004, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7160746455192566, + "num_tokens": 154827804.0, + "step": 6193 + }, + { + "epoch": 0.6802108499890183, + "grad_norm": 2.146789073944092, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.694869875907898, + "num_tokens": 154853800.0, + "step": 6194 + }, + { + "epoch": 0.6803206676916319, + "grad_norm": 2.099043846130371, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6980833411216736, + "num_tokens": 154881878.0, + "step": 6195 + }, + { + "epoch": 0.6804304853942456, + "grad_norm": 2.0999443531036377, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.69788658618927, + "num_tokens": 154910943.0, + "step": 6196 + }, + { + "epoch": 0.6805403030968592, + 
"grad_norm": 2.164041042327881, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6960048675537109, + "num_tokens": 154935417.0, + "step": 6197 + }, + { + "epoch": 0.6806501207994728, + "grad_norm": 2.1475014686584473, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7089734077453613, + "num_tokens": 154961629.0, + "step": 6198 + }, + { + "epoch": 0.6807599385020865, + "grad_norm": 2.843337059020996, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.698418140411377, + "num_tokens": 154980256.0, + "step": 6199 + }, + { + "epoch": 0.6808697562047002, + "grad_norm": 2.276981830596924, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.696976363658905, + "num_tokens": 155005006.0, + "step": 6200 + }, + { + "epoch": 0.6809795739073139, + "grad_norm": 1.9741853475570679, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6893701553344727, + "num_tokens": 155035966.0, + "step": 6201 + }, + { + "epoch": 0.6810893916099275, + "grad_norm": 2.336684226989746, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7295883297920227, + "num_tokens": 155058073.0, + "step": 6202 + }, + { + "epoch": 0.6811992093125412, + "grad_norm": 2.127683639526367, + "learning_rate": 1e-06, + "loss": 1.0237, + "mean_token_accuracy": 0.6908273696899414, + "num_tokens": 155086659.0, + "step": 6203 + }, + { + "epoch": 0.6813090270151548, + "grad_norm": 2.1889305114746094, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7107945084571838, + "num_tokens": 155111134.0, + "step": 6204 + }, + { + "epoch": 0.6814188447177685, + "grad_norm": 2.203707695007324, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.70054030418396, + "num_tokens": 155134887.0, + "step": 6205 + }, + { + "epoch": 0.6815286624203821, + "grad_norm": 2.009202718734741, + "learning_rate": 1e-06, + "loss": 1.0718, + "mean_token_accuracy": 0.678653359413147, + "num_tokens": 
155163920.0, + "step": 6206 + }, + { + "epoch": 0.6816384801229959, + "grad_norm": 2.1761767864227295, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6847079992294312, + "num_tokens": 155189770.0, + "step": 6207 + }, + { + "epoch": 0.6817482978256095, + "grad_norm": 2.1487278938293457, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7120745182037354, + "num_tokens": 155216883.0, + "step": 6208 + }, + { + "epoch": 0.6818581155282232, + "grad_norm": 2.1797964572906494, + "learning_rate": 1e-06, + "loss": 0.816, + "mean_token_accuracy": 0.7380945682525635, + "num_tokens": 155241233.0, + "step": 6209 + }, + { + "epoch": 0.6819679332308368, + "grad_norm": 2.0966951847076416, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6938257813453674, + "num_tokens": 155267786.0, + "step": 6210 + }, + { + "epoch": 0.6820777509334505, + "grad_norm": 2.163022994995117, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6832190752029419, + "num_tokens": 155294062.0, + "step": 6211 + }, + { + "epoch": 0.6821875686360641, + "grad_norm": 2.302229881286621, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6914357542991638, + "num_tokens": 155318588.0, + "step": 6212 + }, + { + "epoch": 0.6822973863386778, + "grad_norm": 2.3283228874206543, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6873818039894104, + "num_tokens": 155341783.0, + "step": 6213 + }, + { + "epoch": 0.6824072040412914, + "grad_norm": 2.4366495609283447, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7120733261108398, + "num_tokens": 155362852.0, + "step": 6214 + }, + { + "epoch": 0.6825170217439052, + "grad_norm": 2.481968641281128, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7321842908859253, + "num_tokens": 155381638.0, + "step": 6215 + }, + { + "epoch": 0.6826268394465188, + "grad_norm": 2.5977799892425537, + "learning_rate": 1e-06, + 
"loss": 0.8721, + "mean_token_accuracy": 0.7291666269302368, + "num_tokens": 155399670.0, + "step": 6216 + }, + { + "epoch": 0.6827366571491325, + "grad_norm": 2.553799629211426, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.713758111000061, + "num_tokens": 155420674.0, + "step": 6217 + }, + { + "epoch": 0.6828464748517461, + "grad_norm": 2.01863169670105, + "learning_rate": 1e-06, + "loss": 1.0698, + "mean_token_accuracy": 0.680152416229248, + "num_tokens": 155452012.0, + "step": 6218 + }, + { + "epoch": 0.6829562925543597, + "grad_norm": 2.3572628498077393, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7153101563453674, + "num_tokens": 155474068.0, + "step": 6219 + }, + { + "epoch": 0.6830661102569734, + "grad_norm": 2.3422703742980957, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7030815482139587, + "num_tokens": 155496624.0, + "step": 6220 + }, + { + "epoch": 0.683175927959587, + "grad_norm": 1.9374390840530396, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6955177187919617, + "num_tokens": 155528163.0, + "step": 6221 + }, + { + "epoch": 0.6832857456622008, + "grad_norm": 2.1472842693328857, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7174081802368164, + "num_tokens": 155555862.0, + "step": 6222 + }, + { + "epoch": 0.6833955633648144, + "grad_norm": 2.384321451187134, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7146614789962769, + "num_tokens": 155576778.0, + "step": 6223 + }, + { + "epoch": 0.6835053810674281, + "grad_norm": 2.379415988922119, + "learning_rate": 1e-06, + "loss": 0.9868, + "mean_token_accuracy": 0.7130823731422424, + "num_tokens": 155599753.0, + "step": 6224 + }, + { + "epoch": 0.6836151987700417, + "grad_norm": 2.3600754737854004, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.710869312286377, + "num_tokens": 155621377.0, + "step": 6225 + }, + { + "epoch": 
0.6837250164726554, + "grad_norm": 2.2952663898468018, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.698304295539856, + "num_tokens": 155645034.0, + "step": 6226 + }, + { + "epoch": 0.683834834175269, + "grad_norm": 2.137817144393921, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.684175968170166, + "num_tokens": 155673634.0, + "step": 6227 + }, + { + "epoch": 0.6839446518778827, + "grad_norm": 2.2391676902770996, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7207048535346985, + "num_tokens": 155696751.0, + "step": 6228 + }, + { + "epoch": 0.6840544695804964, + "grad_norm": 2.099853754043579, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7101360559463501, + "num_tokens": 155723586.0, + "step": 6229 + }, + { + "epoch": 0.6841642872831101, + "grad_norm": 1.9889521598815918, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7191998958587646, + "num_tokens": 155752087.0, + "step": 6230 + }, + { + "epoch": 0.6842741049857237, + "grad_norm": 2.201660633087158, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.720629870891571, + "num_tokens": 155778547.0, + "step": 6231 + }, + { + "epoch": 0.6843839226883374, + "grad_norm": 1.9600365161895752, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6910316944122314, + "num_tokens": 155811104.0, + "step": 6232 + }, + { + "epoch": 0.684493740390951, + "grad_norm": 2.5434765815734863, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7300060987472534, + "num_tokens": 155830433.0, + "step": 6233 + }, + { + "epoch": 0.6846035580935647, + "grad_norm": 2.2552073001861572, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7175362706184387, + "num_tokens": 155853689.0, + "step": 6234 + }, + { + "epoch": 0.6847133757961783, + "grad_norm": 2.4015109539031982, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 
0.7022008895874023, + "num_tokens": 155876164.0, + "step": 6235 + }, + { + "epoch": 0.6848231934987921, + "grad_norm": 2.223830461502075, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7172914743423462, + "num_tokens": 155899506.0, + "step": 6236 + }, + { + "epoch": 0.6849330112014057, + "grad_norm": 2.334063768386841, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7241113781929016, + "num_tokens": 155921410.0, + "step": 6237 + }, + { + "epoch": 0.6850428289040194, + "grad_norm": 2.19559645652771, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7271722555160522, + "num_tokens": 155945648.0, + "step": 6238 + }, + { + "epoch": 0.685152646606633, + "grad_norm": 2.448306083679199, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.709723711013794, + "num_tokens": 155967700.0, + "step": 6239 + }, + { + "epoch": 0.6852624643092466, + "grad_norm": 2.2671666145324707, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7106083035469055, + "num_tokens": 155992405.0, + "step": 6240 + }, + { + "epoch": 0.6853722820118603, + "grad_norm": 2.028134346008301, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7122145295143127, + "num_tokens": 156020005.0, + "step": 6241 + }, + { + "epoch": 0.6854820997144739, + "grad_norm": 2.1187002658843994, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7070595026016235, + "num_tokens": 156045749.0, + "step": 6242 + }, + { + "epoch": 0.6855919174170876, + "grad_norm": 1.9448370933532715, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6932002902030945, + "num_tokens": 156077146.0, + "step": 6243 + }, + { + "epoch": 0.6857017351197013, + "grad_norm": 2.1968300342559814, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7261388301849365, + "num_tokens": 156101151.0, + "step": 6244 + }, + { + "epoch": 0.685811552822315, + "grad_norm": 2.298856258392334, + 
"learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.6790823936462402, + "num_tokens": 156124912.0, + "step": 6245 + }, + { + "epoch": 0.6859213705249286, + "grad_norm": 2.0838358402252197, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.708173394203186, + "num_tokens": 156154951.0, + "step": 6246 + }, + { + "epoch": 0.6860311882275423, + "grad_norm": 2.4388129711151123, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7182297706604004, + "num_tokens": 156176185.0, + "step": 6247 + }, + { + "epoch": 0.6861410059301559, + "grad_norm": 2.1110920906066895, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.7318031787872314, + "num_tokens": 156200560.0, + "step": 6248 + }, + { + "epoch": 0.6862508236327696, + "grad_norm": 2.038128137588501, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.704024612903595, + "num_tokens": 156227882.0, + "step": 6249 + }, + { + "epoch": 0.6863606413353832, + "grad_norm": 2.1474266052246094, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7122898697853088, + "num_tokens": 156252262.0, + "step": 6250 + }, + { + "epoch": 0.686470459037997, + "grad_norm": 1.9524555206298828, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6797166466712952, + "num_tokens": 156286337.0, + "step": 6251 + }, + { + "epoch": 0.6865802767406106, + "grad_norm": 2.05179500579834, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6957725286483765, + "num_tokens": 156315642.0, + "step": 6252 + }, + { + "epoch": 0.6866900944432243, + "grad_norm": 2.392021656036377, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.701109766960144, + "num_tokens": 156337322.0, + "step": 6253 + }, + { + "epoch": 0.6867999121458379, + "grad_norm": 2.209261655807495, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6783540844917297, + "num_tokens": 156364281.0, + "step": 6254 + }, + 
{ + "epoch": 0.6869097298484516, + "grad_norm": 2.2207119464874268, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6964859962463379, + "num_tokens": 156391028.0, + "step": 6255 + }, + { + "epoch": 0.6870195475510652, + "grad_norm": 2.298022508621216, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7223333120346069, + "num_tokens": 156413486.0, + "step": 6256 + }, + { + "epoch": 0.6871293652536788, + "grad_norm": 2.080019950866699, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.6917215585708618, + "num_tokens": 156441834.0, + "step": 6257 + }, + { + "epoch": 0.6872391829562926, + "grad_norm": 2.025259017944336, + "learning_rate": 1e-06, + "loss": 1.0257, + "mean_token_accuracy": 0.6837440729141235, + "num_tokens": 156471359.0, + "step": 6258 + }, + { + "epoch": 0.6873490006589063, + "grad_norm": 2.004650115966797, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.696861743927002, + "num_tokens": 156501049.0, + "step": 6259 + }, + { + "epoch": 0.6874588183615199, + "grad_norm": 2.226548194885254, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.739813506603241, + "num_tokens": 156524748.0, + "step": 6260 + }, + { + "epoch": 0.6875686360641335, + "grad_norm": 2.092629909515381, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6922481060028076, + "num_tokens": 156553233.0, + "step": 6261 + }, + { + "epoch": 0.6876784537667472, + "grad_norm": 2.3453750610351562, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.714057207107544, + "num_tokens": 156575930.0, + "step": 6262 + }, + { + "epoch": 0.6877882714693608, + "grad_norm": 2.1269538402557373, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7150353193283081, + "num_tokens": 156602056.0, + "step": 6263 + }, + { + "epoch": 0.6878980891719745, + "grad_norm": 2.8682801723480225, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 
0.7382380962371826, + "num_tokens": 156618604.0, + "step": 6264 + }, + { + "epoch": 0.6880079068745882, + "grad_norm": 2.4362878799438477, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7082967162132263, + "num_tokens": 156639414.0, + "step": 6265 + }, + { + "epoch": 0.6881177245772019, + "grad_norm": 2.0490596294403076, + "learning_rate": 1e-06, + "loss": 1.0559, + "mean_token_accuracy": 0.6966254115104675, + "num_tokens": 156668944.0, + "step": 6266 + }, + { + "epoch": 0.6882275422798155, + "grad_norm": 2.1595773696899414, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7244802117347717, + "num_tokens": 156693679.0, + "step": 6267 + }, + { + "epoch": 0.6883373599824292, + "grad_norm": 2.377223253250122, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7204632759094238, + "num_tokens": 156716562.0, + "step": 6268 + }, + { + "epoch": 0.6884471776850428, + "grad_norm": 2.2462360858917236, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7004519701004028, + "num_tokens": 156741235.0, + "step": 6269 + }, + { + "epoch": 0.6885569953876565, + "grad_norm": 2.292032480239868, + "learning_rate": 1e-06, + "loss": 1.0403, + "mean_token_accuracy": 0.6858609914779663, + "num_tokens": 156765311.0, + "step": 6270 + }, + { + "epoch": 0.6886668130902701, + "grad_norm": 2.3466241359710693, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7156223654747009, + "num_tokens": 156787631.0, + "step": 6271 + }, + { + "epoch": 0.6887766307928838, + "grad_norm": 2.3153982162475586, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7226365804672241, + "num_tokens": 156810370.0, + "step": 6272 + }, + { + "epoch": 0.6888864484954975, + "grad_norm": 1.9107975959777832, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6920768618583679, + "num_tokens": 156841409.0, + "step": 6273 + }, + { + "epoch": 0.6889962661981112, + "grad_norm": 
2.251487970352173, + "learning_rate": 1e-06, + "loss": 1.005, + "mean_token_accuracy": 0.7008653879165649, + "num_tokens": 156865895.0, + "step": 6274 + }, + { + "epoch": 0.6891060839007248, + "grad_norm": 2.422048568725586, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7193897366523743, + "num_tokens": 156888291.0, + "step": 6275 + }, + { + "epoch": 0.6892159016033385, + "grad_norm": 1.921676516532898, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7190563678741455, + "num_tokens": 156917622.0, + "step": 6276 + }, + { + "epoch": 0.6893257193059521, + "grad_norm": 2.055147647857666, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.6928815841674805, + "num_tokens": 156946026.0, + "step": 6277 + }, + { + "epoch": 0.6894355370085657, + "grad_norm": 2.0512635707855225, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7042217254638672, + "num_tokens": 156974411.0, + "step": 6278 + }, + { + "epoch": 0.6895453547111794, + "grad_norm": 2.0451653003692627, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.700512707233429, + "num_tokens": 157005140.0, + "step": 6279 + }, + { + "epoch": 0.6896551724137931, + "grad_norm": 2.2879061698913574, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7166070938110352, + "num_tokens": 157027192.0, + "step": 6280 + }, + { + "epoch": 0.6897649901164068, + "grad_norm": 2.5630741119384766, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.6985441446304321, + "num_tokens": 157047045.0, + "step": 6281 + }, + { + "epoch": 0.6898748078190204, + "grad_norm": 2.1056511402130127, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7061139345169067, + "num_tokens": 157072218.0, + "step": 6282 + }, + { + "epoch": 0.6899846255216341, + "grad_norm": 1.7726719379425049, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6831042170524597, + "num_tokens": 
157111458.0, + "step": 6283 + }, + { + "epoch": 0.6900944432242477, + "grad_norm": 2.4695889949798584, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7310428619384766, + "num_tokens": 157131531.0, + "step": 6284 + }, + { + "epoch": 0.6902042609268614, + "grad_norm": 2.4866225719451904, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7241733074188232, + "num_tokens": 157151819.0, + "step": 6285 + }, + { + "epoch": 0.690314078629475, + "grad_norm": 2.2205376625061035, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7056986093521118, + "num_tokens": 157174944.0, + "step": 6286 + }, + { + "epoch": 0.6904238963320888, + "grad_norm": 2.1217827796936035, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7242141962051392, + "num_tokens": 157202627.0, + "step": 6287 + }, + { + "epoch": 0.6905337140347024, + "grad_norm": 2.258561134338379, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7097258567810059, + "num_tokens": 157224503.0, + "step": 6288 + }, + { + "epoch": 0.6906435317373161, + "grad_norm": 2.3085289001464844, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7134267091751099, + "num_tokens": 157249103.0, + "step": 6289 + }, + { + "epoch": 0.6907533494399297, + "grad_norm": 1.829785943031311, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7130129337310791, + "num_tokens": 157281429.0, + "step": 6290 + }, + { + "epoch": 0.6908631671425434, + "grad_norm": 2.119450807571411, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.706015408039093, + "num_tokens": 157310030.0, + "step": 6291 + }, + { + "epoch": 0.690972984845157, + "grad_norm": 2.0765693187713623, + "learning_rate": 1e-06, + "loss": 1.029, + "mean_token_accuracy": 0.6865421533584595, + "num_tokens": 157341522.0, + "step": 6292 + }, + { + "epoch": 0.6910828025477707, + "grad_norm": 2.293330669403076, + "learning_rate": 1e-06, + 
"loss": 0.8873, + "mean_token_accuracy": 0.7293257117271423, + "num_tokens": 157364792.0, + "step": 6293 + }, + { + "epoch": 0.6911926202503844, + "grad_norm": 2.3036556243896484, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7130558490753174, + "num_tokens": 157389174.0, + "step": 6294 + }, + { + "epoch": 0.6913024379529981, + "grad_norm": 2.369983434677124, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7200063467025757, + "num_tokens": 157410073.0, + "step": 6295 + }, + { + "epoch": 0.6914122556556117, + "grad_norm": 2.25106143951416, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.71098792552948, + "num_tokens": 157435928.0, + "step": 6296 + }, + { + "epoch": 0.6915220733582254, + "grad_norm": 2.32292103767395, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7041852474212646, + "num_tokens": 157458588.0, + "step": 6297 + }, + { + "epoch": 0.691631891060839, + "grad_norm": 2.386691093444824, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.7045655250549316, + "num_tokens": 157481126.0, + "step": 6298 + }, + { + "epoch": 0.6917417087634526, + "grad_norm": 2.101536273956299, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7378820180892944, + "num_tokens": 157507380.0, + "step": 6299 + }, + { + "epoch": 0.6918515264660663, + "grad_norm": 2.1903505325317383, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.6988382339477539, + "num_tokens": 157533210.0, + "step": 6300 + }, + { + "epoch": 0.6919613441686799, + "grad_norm": 1.7366684675216675, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7142382860183716, + "num_tokens": 157571986.0, + "step": 6301 + }, + { + "epoch": 0.6920711618712937, + "grad_norm": 2.542111396789551, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7329318523406982, + "num_tokens": 157592244.0, + "step": 6302 + }, + { + "epoch": 
0.6921809795739073, + "grad_norm": 2.203233003616333, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7179975509643555, + "num_tokens": 157616855.0, + "step": 6303 + }, + { + "epoch": 0.692290797276521, + "grad_norm": 1.865794062614441, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6983740329742432, + "num_tokens": 157648956.0, + "step": 6304 + }, + { + "epoch": 0.6924006149791346, + "grad_norm": 1.9872608184814453, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7230489253997803, + "num_tokens": 157677444.0, + "step": 6305 + }, + { + "epoch": 0.6925104326817483, + "grad_norm": 2.2061078548431396, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7206329107284546, + "num_tokens": 157703230.0, + "step": 6306 + }, + { + "epoch": 0.6926202503843619, + "grad_norm": 2.241049289703369, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.71378493309021, + "num_tokens": 157729364.0, + "step": 6307 + }, + { + "epoch": 0.6927300680869756, + "grad_norm": 2.4528605937957764, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7004840970039368, + "num_tokens": 157750800.0, + "step": 6308 + }, + { + "epoch": 0.6928398857895893, + "grad_norm": 2.4400622844696045, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7057545185089111, + "num_tokens": 157776224.0, + "step": 6309 + }, + { + "epoch": 0.692949703492203, + "grad_norm": 2.447139263153076, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6828141808509827, + "num_tokens": 157800093.0, + "step": 6310 + }, + { + "epoch": 0.6930595211948166, + "grad_norm": 2.5667755603790283, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7209391593933105, + "num_tokens": 157818698.0, + "step": 6311 + }, + { + "epoch": 0.6931693388974303, + "grad_norm": 2.2468955516815186, + "learning_rate": 1e-06, + "loss": 1.1025, + "mean_token_accuracy": 
0.6805002689361572, + "num_tokens": 157846087.0, + "step": 6312 + }, + { + "epoch": 0.6932791566000439, + "grad_norm": 2.2970714569091797, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7064403295516968, + "num_tokens": 157869246.0, + "step": 6313 + }, + { + "epoch": 0.6933889743026576, + "grad_norm": 2.298945665359497, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.705866277217865, + "num_tokens": 157893701.0, + "step": 6314 + }, + { + "epoch": 0.6934987920052712, + "grad_norm": 2.193444013595581, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6976379156112671, + "num_tokens": 157918796.0, + "step": 6315 + }, + { + "epoch": 0.693608609707885, + "grad_norm": 2.2208728790283203, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6971081495285034, + "num_tokens": 157944812.0, + "step": 6316 + }, + { + "epoch": 0.6937184274104986, + "grad_norm": 2.2236292362213135, + "learning_rate": 1e-06, + "loss": 1.0332, + "mean_token_accuracy": 0.6875571012496948, + "num_tokens": 157969116.0, + "step": 6317 + }, + { + "epoch": 0.6938282451131123, + "grad_norm": 2.1314892768859863, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7009652853012085, + "num_tokens": 157995515.0, + "step": 6318 + }, + { + "epoch": 0.6939380628157259, + "grad_norm": 2.0447354316711426, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6889516115188599, + "num_tokens": 158025553.0, + "step": 6319 + }, + { + "epoch": 0.6940478805183395, + "grad_norm": 2.3581833839416504, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.697504997253418, + "num_tokens": 158048720.0, + "step": 6320 + }, + { + "epoch": 0.6941576982209532, + "grad_norm": 2.2400741577148438, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7069269418716431, + "num_tokens": 158073411.0, + "step": 6321 + }, + { + "epoch": 0.6942675159235668, + "grad_norm": 
2.2426915168762207, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7083237171173096, + "num_tokens": 158097403.0, + "step": 6322 + }, + { + "epoch": 0.6943773336261806, + "grad_norm": 2.1642322540283203, + "learning_rate": 1e-06, + "loss": 0.8368, + "mean_token_accuracy": 0.7313927412033081, + "num_tokens": 158120593.0, + "step": 6323 + }, + { + "epoch": 0.6944871513287942, + "grad_norm": 2.4198060035705566, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7161335945129395, + "num_tokens": 158142770.0, + "step": 6324 + }, + { + "epoch": 0.6945969690314079, + "grad_norm": 2.0941178798675537, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7416404485702515, + "num_tokens": 158168964.0, + "step": 6325 + }, + { + "epoch": 0.6947067867340215, + "grad_norm": 2.3279175758361816, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7100955247879028, + "num_tokens": 158192768.0, + "step": 6326 + }, + { + "epoch": 0.6948166044366352, + "grad_norm": 2.2095258235931396, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6983113884925842, + "num_tokens": 158218409.0, + "step": 6327 + }, + { + "epoch": 0.6949264221392488, + "grad_norm": 2.279627799987793, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.712462306022644, + "num_tokens": 158242721.0, + "step": 6328 + }, + { + "epoch": 0.6950362398418625, + "grad_norm": 2.1394691467285156, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.6957319378852844, + "num_tokens": 158269744.0, + "step": 6329 + }, + { + "epoch": 0.6951460575444761, + "grad_norm": 2.0297675132751465, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6937258243560791, + "num_tokens": 158297655.0, + "step": 6330 + }, + { + "epoch": 0.6952558752470899, + "grad_norm": 2.050426959991455, + "learning_rate": 1e-06, + "loss": 1.1328, + "mean_token_accuracy": 0.6659284234046936, + "num_tokens": 
158329025.0, + "step": 6331 + }, + { + "epoch": 0.6953656929497035, + "grad_norm": 2.2263381481170654, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7054054737091064, + "num_tokens": 158354265.0, + "step": 6332 + }, + { + "epoch": 0.6954755106523172, + "grad_norm": 2.088613271713257, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6882963180541992, + "num_tokens": 158383100.0, + "step": 6333 + }, + { + "epoch": 0.6955853283549308, + "grad_norm": 2.111708164215088, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7140322923660278, + "num_tokens": 158410725.0, + "step": 6334 + }, + { + "epoch": 0.6956951460575445, + "grad_norm": 2.272413969039917, + "learning_rate": 1e-06, + "loss": 1.06, + "mean_token_accuracy": 0.6813478469848633, + "num_tokens": 158436496.0, + "step": 6335 + }, + { + "epoch": 0.6958049637601581, + "grad_norm": 2.2180233001708984, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7195807695388794, + "num_tokens": 158459306.0, + "step": 6336 + }, + { + "epoch": 0.6959147814627717, + "grad_norm": 2.172928810119629, + "learning_rate": 1e-06, + "loss": 1.0764, + "mean_token_accuracy": 0.6815932989120483, + "num_tokens": 158487243.0, + "step": 6337 + }, + { + "epoch": 0.6960245991653855, + "grad_norm": 2.3761162757873535, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7228628396987915, + "num_tokens": 158509115.0, + "step": 6338 + }, + { + "epoch": 0.6961344168679992, + "grad_norm": 2.316437005996704, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7300065755844116, + "num_tokens": 158530362.0, + "step": 6339 + }, + { + "epoch": 0.6962442345706128, + "grad_norm": 2.1078193187713623, + "learning_rate": 1e-06, + "loss": 1.0721, + "mean_token_accuracy": 0.6756505370140076, + "num_tokens": 158559546.0, + "step": 6340 + }, + { + "epoch": 0.6963540522732264, + "grad_norm": 1.984739899635315, + "learning_rate": 1e-06, + 
"loss": 0.9609, + "mean_token_accuracy": 0.7198070287704468, + "num_tokens": 158588841.0, + "step": 6341 + }, + { + "epoch": 0.6964638699758401, + "grad_norm": 2.217247247695923, + "learning_rate": 1e-06, + "loss": 1.0219, + "mean_token_accuracy": 0.6925854682922363, + "num_tokens": 158613096.0, + "step": 6342 + }, + { + "epoch": 0.6965736876784537, + "grad_norm": 2.227109670639038, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7097725868225098, + "num_tokens": 158637349.0, + "step": 6343 + }, + { + "epoch": 0.6966835053810674, + "grad_norm": 2.650153875350952, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7181521654129028, + "num_tokens": 158655564.0, + "step": 6344 + }, + { + "epoch": 0.6967933230836811, + "grad_norm": 2.1305899620056152, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6973223090171814, + "num_tokens": 158682287.0, + "step": 6345 + }, + { + "epoch": 0.6969031407862948, + "grad_norm": 2.583942174911499, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7145986557006836, + "num_tokens": 158701895.0, + "step": 6346 + }, + { + "epoch": 0.6970129584889084, + "grad_norm": 2.2995028495788574, + "learning_rate": 1e-06, + "loss": 1.0428, + "mean_token_accuracy": 0.679105281829834, + "num_tokens": 158726713.0, + "step": 6347 + }, + { + "epoch": 0.6971227761915221, + "grad_norm": 2.2756757736206055, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7066003084182739, + "num_tokens": 158752388.0, + "step": 6348 + }, + { + "epoch": 0.6972325938941357, + "grad_norm": 2.181885242462158, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.72805255651474, + "num_tokens": 158777678.0, + "step": 6349 + }, + { + "epoch": 0.6973424115967494, + "grad_norm": 2.384143352508545, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.6987969875335693, + "num_tokens": 158799520.0, + "step": 6350 + }, + { + "epoch": 
0.697452229299363, + "grad_norm": 2.261620283126831, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6992345452308655, + "num_tokens": 158823198.0, + "step": 6351 + }, + { + "epoch": 0.6975620470019768, + "grad_norm": 2.1178274154663086, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6909284591674805, + "num_tokens": 158851453.0, + "step": 6352 + }, + { + "epoch": 0.6976718647045904, + "grad_norm": 2.3026294708251953, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7219308018684387, + "num_tokens": 158873920.0, + "step": 6353 + }, + { + "epoch": 0.6977816824072041, + "grad_norm": 2.970684289932251, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7143110036849976, + "num_tokens": 158892452.0, + "step": 6354 + }, + { + "epoch": 0.6978915001098177, + "grad_norm": 2.2212400436401367, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7016956210136414, + "num_tokens": 158919656.0, + "step": 6355 + }, + { + "epoch": 0.6980013178124314, + "grad_norm": 2.3120832443237305, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7185271978378296, + "num_tokens": 158942431.0, + "step": 6356 + }, + { + "epoch": 0.698111135515045, + "grad_norm": 2.243922710418701, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.71620774269104, + "num_tokens": 158966572.0, + "step": 6357 + }, + { + "epoch": 0.6982209532176586, + "grad_norm": 2.1428706645965576, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.704008936882019, + "num_tokens": 158994573.0, + "step": 6358 + }, + { + "epoch": 0.6983307709202724, + "grad_norm": 2.366654872894287, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6972546577453613, + "num_tokens": 159017730.0, + "step": 6359 + }, + { + "epoch": 0.698440588622886, + "grad_norm": 2.241554021835327, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 
0.7015889883041382, + "num_tokens": 159042621.0, + "step": 6360 + }, + { + "epoch": 0.6985504063254997, + "grad_norm": 2.3971762657165527, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7174276113510132, + "num_tokens": 159065180.0, + "step": 6361 + }, + { + "epoch": 0.6986602240281133, + "grad_norm": 2.168210744857788, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7060229778289795, + "num_tokens": 159092589.0, + "step": 6362 + }, + { + "epoch": 0.698770041730727, + "grad_norm": 2.2930920124053955, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7212536334991455, + "num_tokens": 159116902.0, + "step": 6363 + }, + { + "epoch": 0.6988798594333406, + "grad_norm": 2.1385393142700195, + "learning_rate": 1e-06, + "loss": 1.0023, + "mean_token_accuracy": 0.6948859691619873, + "num_tokens": 159144293.0, + "step": 6364 + }, + { + "epoch": 0.6989896771359543, + "grad_norm": 2.166952133178711, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7094411849975586, + "num_tokens": 159168781.0, + "step": 6365 + }, + { + "epoch": 0.6990994948385679, + "grad_norm": 2.455214023590088, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7170045971870422, + "num_tokens": 159189980.0, + "step": 6366 + }, + { + "epoch": 0.6992093125411817, + "grad_norm": 2.3682785034179688, + "learning_rate": 1e-06, + "loss": 1.0408, + "mean_token_accuracy": 0.698255181312561, + "num_tokens": 159213438.0, + "step": 6367 + }, + { + "epoch": 0.6993191302437953, + "grad_norm": 2.195556640625, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7215028405189514, + "num_tokens": 159236549.0, + "step": 6368 + }, + { + "epoch": 0.699428947946409, + "grad_norm": 2.258714437484741, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7155603170394897, + "num_tokens": 159262374.0, + "step": 6369 + }, + { + "epoch": 0.6995387656490226, + "grad_norm": 2.1761887073516846, + 
"learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.712256669998169, + "num_tokens": 159288782.0, + "step": 6370 + }, + { + "epoch": 0.6996485833516363, + "grad_norm": 1.9568202495574951, + "learning_rate": 1e-06, + "loss": 1.0405, + "mean_token_accuracy": 0.6815706491470337, + "num_tokens": 159320799.0, + "step": 6371 + }, + { + "epoch": 0.6997584010542499, + "grad_norm": 2.414104461669922, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7142989635467529, + "num_tokens": 159342409.0, + "step": 6372 + }, + { + "epoch": 0.6998682187568636, + "grad_norm": 2.823853015899658, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.730428159236908, + "num_tokens": 159359085.0, + "step": 6373 + }, + { + "epoch": 0.6999780364594773, + "grad_norm": 2.185772657394409, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7131147384643555, + "num_tokens": 159383773.0, + "step": 6374 + }, + { + "epoch": 0.700087854162091, + "grad_norm": 2.0885581970214844, + "learning_rate": 1e-06, + "loss": 1.046, + "mean_token_accuracy": 0.6996998190879822, + "num_tokens": 159409867.0, + "step": 6375 + }, + { + "epoch": 0.7001976718647046, + "grad_norm": 2.525768995285034, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7097403407096863, + "num_tokens": 159429000.0, + "step": 6376 + }, + { + "epoch": 0.7003074895673183, + "grad_norm": 2.747889757156372, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7119721174240112, + "num_tokens": 159447000.0, + "step": 6377 + }, + { + "epoch": 0.7004173072699319, + "grad_norm": 2.235903024673462, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7127273678779602, + "num_tokens": 159470435.0, + "step": 6378 + }, + { + "epoch": 0.7005271249725455, + "grad_norm": 2.002687931060791, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7075505256652832, + "num_tokens": 159503398.0, + "step": 6379 + }, + { 
+ "epoch": 0.7006369426751592, + "grad_norm": 2.2032222747802734, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6885709166526794, + "num_tokens": 159530355.0, + "step": 6380 + }, + { + "epoch": 0.700746760377773, + "grad_norm": 2.1021296977996826, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7505553960800171, + "num_tokens": 159558666.0, + "step": 6381 + }, + { + "epoch": 0.7008565780803866, + "grad_norm": 2.4282305240631104, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7050745487213135, + "num_tokens": 159580724.0, + "step": 6382 + }, + { + "epoch": 0.7009663957830002, + "grad_norm": 2.0041542053222656, + "learning_rate": 1e-06, + "loss": 1.0064, + "mean_token_accuracy": 0.6934717893600464, + "num_tokens": 159610299.0, + "step": 6383 + }, + { + "epoch": 0.7010762134856139, + "grad_norm": 2.1290087699890137, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.7018595933914185, + "num_tokens": 159637960.0, + "step": 6384 + }, + { + "epoch": 0.7011860311882275, + "grad_norm": 2.2641096115112305, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6917794346809387, + "num_tokens": 159665862.0, + "step": 6385 + }, + { + "epoch": 0.7012958488908412, + "grad_norm": 2.092154026031494, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7015444040298462, + "num_tokens": 159692452.0, + "step": 6386 + }, + { + "epoch": 0.7014056665934548, + "grad_norm": 2.202515125274658, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7102319598197937, + "num_tokens": 159716370.0, + "step": 6387 + }, + { + "epoch": 0.7015154842960686, + "grad_norm": 2.1319921016693115, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7058624029159546, + "num_tokens": 159741932.0, + "step": 6388 + }, + { + "epoch": 0.7016253019986822, + "grad_norm": 2.4177663326263428, + "learning_rate": 1e-06, + "loss": 0.9466, + 
"mean_token_accuracy": 0.7153863310813904, + "num_tokens": 159763564.0, + "step": 6389 + }, + { + "epoch": 0.7017351197012959, + "grad_norm": 2.3282206058502197, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7375422716140747, + "num_tokens": 159786219.0, + "step": 6390 + }, + { + "epoch": 0.7018449374039095, + "grad_norm": 2.0225400924682617, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7201609015464783, + "num_tokens": 159814301.0, + "step": 6391 + }, + { + "epoch": 0.7019547551065232, + "grad_norm": 2.001255989074707, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6933861970901489, + "num_tokens": 159844925.0, + "step": 6392 + }, + { + "epoch": 0.7020645728091368, + "grad_norm": 2.1784720420837402, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7085874080657959, + "num_tokens": 159871255.0, + "step": 6393 + }, + { + "epoch": 0.7021743905117505, + "grad_norm": 2.1664297580718994, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7000624537467957, + "num_tokens": 159898088.0, + "step": 6394 + }, + { + "epoch": 0.7022842082143641, + "grad_norm": 2.0872316360473633, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.7002949714660645, + "num_tokens": 159924688.0, + "step": 6395 + }, + { + "epoch": 0.7023940259169779, + "grad_norm": 2.1726114749908447, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6946667432785034, + "num_tokens": 159951586.0, + "step": 6396 + }, + { + "epoch": 0.7025038436195915, + "grad_norm": 2.0620946884155273, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7066774368286133, + "num_tokens": 159978184.0, + "step": 6397 + }, + { + "epoch": 0.7026136613222052, + "grad_norm": 1.8819775581359863, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6999129056930542, + "num_tokens": 160012200.0, + "step": 6398 + }, + { + "epoch": 0.7027234790248188, + 
"grad_norm": 2.284623861312866, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7078746557235718, + "num_tokens": 160035729.0, + "step": 6399 + }, + { + "epoch": 0.7028332967274324, + "grad_norm": 1.9717135429382324, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7170908451080322, + "num_tokens": 160064023.0, + "step": 6400 + }, + { + "epoch": 0.7029431144300461, + "grad_norm": 2.354979991912842, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.689536452293396, + "num_tokens": 160087095.0, + "step": 6401 + }, + { + "epoch": 0.7030529321326597, + "grad_norm": 2.073631763458252, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.7082509994506836, + "num_tokens": 160114595.0, + "step": 6402 + }, + { + "epoch": 0.7031627498352735, + "grad_norm": 2.745680332183838, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7222504019737244, + "num_tokens": 160131611.0, + "step": 6403 + }, + { + "epoch": 0.7032725675378871, + "grad_norm": 2.247166633605957, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6946219205856323, + "num_tokens": 160157328.0, + "step": 6404 + }, + { + "epoch": 0.7033823852405008, + "grad_norm": 2.2035200595855713, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7135999202728271, + "num_tokens": 160182079.0, + "step": 6405 + }, + { + "epoch": 0.7034922029431144, + "grad_norm": 2.5416419506073, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7076371908187866, + "num_tokens": 160200514.0, + "step": 6406 + }, + { + "epoch": 0.7036020206457281, + "grad_norm": 2.2370805740356445, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7040432095527649, + "num_tokens": 160224326.0, + "step": 6407 + }, + { + "epoch": 0.7037118383483417, + "grad_norm": 2.371480941772461, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.713903546333313, + "num_tokens": 
160245968.0, + "step": 6408 + }, + { + "epoch": 0.7038216560509554, + "grad_norm": 2.1638529300689697, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6907705664634705, + "num_tokens": 160272814.0, + "step": 6409 + }, + { + "epoch": 0.7039314737535691, + "grad_norm": 2.1092402935028076, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7048365473747253, + "num_tokens": 160299126.0, + "step": 6410 + }, + { + "epoch": 0.7040412914561828, + "grad_norm": 2.5036509037017822, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7101467847824097, + "num_tokens": 160320257.0, + "step": 6411 + }, + { + "epoch": 0.7041511091587964, + "grad_norm": 2.1790060997009277, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7219049334526062, + "num_tokens": 160345312.0, + "step": 6412 + }, + { + "epoch": 0.7042609268614101, + "grad_norm": 2.255659818649292, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7352222204208374, + "num_tokens": 160367774.0, + "step": 6413 + }, + { + "epoch": 0.7043707445640237, + "grad_norm": 2.1724884510040283, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7190842032432556, + "num_tokens": 160391875.0, + "step": 6414 + }, + { + "epoch": 0.7044805622666374, + "grad_norm": 2.1109540462493896, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7203104496002197, + "num_tokens": 160418249.0, + "step": 6415 + }, + { + "epoch": 0.704590379969251, + "grad_norm": 2.2143406867980957, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6965988278388977, + "num_tokens": 160442798.0, + "step": 6416 + }, + { + "epoch": 0.7047001976718648, + "grad_norm": 1.815018653869629, + "learning_rate": 1e-06, + "loss": 1.1128, + "mean_token_accuracy": 0.6704233288764954, + "num_tokens": 160480379.0, + "step": 6417 + }, + { + "epoch": 0.7048100153744784, + "grad_norm": 2.381826162338257, + "learning_rate": 1e-06, + 
"loss": 0.9879, + "mean_token_accuracy": 0.7007753252983093, + "num_tokens": 160502609.0, + "step": 6418 + }, + { + "epoch": 0.704919833077092, + "grad_norm": 2.3415141105651855, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7157167196273804, + "num_tokens": 160523597.0, + "step": 6419 + }, + { + "epoch": 0.7050296507797057, + "grad_norm": 2.5068905353546143, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6871112585067749, + "num_tokens": 160546888.0, + "step": 6420 + }, + { + "epoch": 0.7051394684823193, + "grad_norm": 2.257430076599121, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.6995660066604614, + "num_tokens": 160572283.0, + "step": 6421 + }, + { + "epoch": 0.705249286184933, + "grad_norm": 2.057699203491211, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7186540365219116, + "num_tokens": 160602018.0, + "step": 6422 + }, + { + "epoch": 0.7053591038875466, + "grad_norm": 2.253957986831665, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7117284536361694, + "num_tokens": 160627588.0, + "step": 6423 + }, + { + "epoch": 0.7054689215901603, + "grad_norm": 2.4282689094543457, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7100672721862793, + "num_tokens": 160648995.0, + "step": 6424 + }, + { + "epoch": 0.705578739292774, + "grad_norm": 2.50126051902771, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6974338293075562, + "num_tokens": 160669736.0, + "step": 6425 + }, + { + "epoch": 0.7056885569953877, + "grad_norm": 2.3348119258880615, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7218450307846069, + "num_tokens": 160690905.0, + "step": 6426 + }, + { + "epoch": 0.7057983746980013, + "grad_norm": 2.0662455558776855, + "learning_rate": 1e-06, + "loss": 1.0669, + "mean_token_accuracy": 0.6774204969406128, + "num_tokens": 160720298.0, + "step": 6427 + }, + { + "epoch": 
0.705908192400615, + "grad_norm": 2.0740489959716797, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.7046116590499878, + "num_tokens": 160749634.0, + "step": 6428 + }, + { + "epoch": 0.7060180101032286, + "grad_norm": 2.1903843879699707, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7041614055633545, + "num_tokens": 160773290.0, + "step": 6429 + }, + { + "epoch": 0.7061278278058423, + "grad_norm": 2.2112932205200195, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7307978272438049, + "num_tokens": 160797118.0, + "step": 6430 + }, + { + "epoch": 0.7062376455084559, + "grad_norm": 2.107534170150757, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7288192510604858, + "num_tokens": 160821975.0, + "step": 6431 + }, + { + "epoch": 0.7063474632110697, + "grad_norm": 2.11568546295166, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7063284516334534, + "num_tokens": 160850539.0, + "step": 6432 + }, + { + "epoch": 0.7064572809136833, + "grad_norm": 2.2099759578704834, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7001043558120728, + "num_tokens": 160875061.0, + "step": 6433 + }, + { + "epoch": 0.706567098616297, + "grad_norm": 2.0858020782470703, + "learning_rate": 1e-06, + "loss": 1.0496, + "mean_token_accuracy": 0.7031765580177307, + "num_tokens": 160902455.0, + "step": 6434 + }, + { + "epoch": 0.7066769163189106, + "grad_norm": 2.295527219772339, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7062385082244873, + "num_tokens": 160925032.0, + "step": 6435 + }, + { + "epoch": 0.7067867340215243, + "grad_norm": 1.9873560667037964, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6859014630317688, + "num_tokens": 160957748.0, + "step": 6436 + }, + { + "epoch": 0.7068965517241379, + "grad_norm": 2.0579934120178223, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 
0.695249617099762, + "num_tokens": 160987621.0, + "step": 6437 + }, + { + "epoch": 0.7070063694267515, + "grad_norm": 2.3722755908966064, + "learning_rate": 1e-06, + "loss": 1.0542, + "mean_token_accuracy": 0.6819935441017151, + "num_tokens": 161012695.0, + "step": 6438 + }, + { + "epoch": 0.7071161871293653, + "grad_norm": 2.222087860107422, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6902819871902466, + "num_tokens": 161038749.0, + "step": 6439 + }, + { + "epoch": 0.707226004831979, + "grad_norm": 1.9356088638305664, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7073394060134888, + "num_tokens": 161069458.0, + "step": 6440 + }, + { + "epoch": 0.7073358225345926, + "grad_norm": 2.2209198474884033, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7354438304901123, + "num_tokens": 161091092.0, + "step": 6441 + }, + { + "epoch": 0.7074456402372062, + "grad_norm": 2.094782590866089, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7128074169158936, + "num_tokens": 161117099.0, + "step": 6442 + }, + { + "epoch": 0.7075554579398199, + "grad_norm": 2.450582265853882, + "learning_rate": 1e-06, + "loss": 1.0063, + "mean_token_accuracy": 0.6982169151306152, + "num_tokens": 161138837.0, + "step": 6443 + }, + { + "epoch": 0.7076652756424335, + "grad_norm": 2.1368823051452637, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.697494387626648, + "num_tokens": 161165264.0, + "step": 6444 + }, + { + "epoch": 0.7077750933450472, + "grad_norm": 2.370858669281006, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6896432042121887, + "num_tokens": 161187315.0, + "step": 6445 + }, + { + "epoch": 0.7078849110476609, + "grad_norm": 2.4011337757110596, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.6950269937515259, + "num_tokens": 161209584.0, + "step": 6446 + }, + { + "epoch": 0.7079947287502746, + "grad_norm": 
1.9468802213668823, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6815296411514282, + "num_tokens": 161243422.0, + "step": 6447 + }, + { + "epoch": 0.7081045464528882, + "grad_norm": 1.9701801538467407, + "learning_rate": 1e-06, + "loss": 1.0528, + "mean_token_accuracy": 0.6839419603347778, + "num_tokens": 161276526.0, + "step": 6448 + }, + { + "epoch": 0.7082143641555019, + "grad_norm": 2.424365520477295, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7399436235427856, + "num_tokens": 161296924.0, + "step": 6449 + }, + { + "epoch": 0.7083241818581155, + "grad_norm": 2.148775100708008, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.7076963186264038, + "num_tokens": 161321246.0, + "step": 6450 + }, + { + "epoch": 0.7084339995607292, + "grad_norm": 2.401184558868408, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7153145670890808, + "num_tokens": 161341709.0, + "step": 6451 + }, + { + "epoch": 0.7085438172633428, + "grad_norm": 2.0767784118652344, + "learning_rate": 1e-06, + "loss": 1.0443, + "mean_token_accuracy": 0.6999735236167908, + "num_tokens": 161370949.0, + "step": 6452 + }, + { + "epoch": 0.7086536349659565, + "grad_norm": 2.368382215499878, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7037680149078369, + "num_tokens": 161395177.0, + "step": 6453 + }, + { + "epoch": 0.7087634526685702, + "grad_norm": 2.6340315341949463, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7095155119895935, + "num_tokens": 161415423.0, + "step": 6454 + }, + { + "epoch": 0.7088732703711839, + "grad_norm": 2.7160329818725586, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7255396842956543, + "num_tokens": 161432592.0, + "step": 6455 + }, + { + "epoch": 0.7089830880737975, + "grad_norm": 2.2108213901519775, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.7007611989974976, + "num_tokens": 
161456518.0, + "step": 6456 + }, + { + "epoch": 0.7090929057764112, + "grad_norm": 2.1887881755828857, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7220460176467896, + "num_tokens": 161481498.0, + "step": 6457 + }, + { + "epoch": 0.7092027234790248, + "grad_norm": 2.089812755584717, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.71198970079422, + "num_tokens": 161509355.0, + "step": 6458 + }, + { + "epoch": 0.7093125411816384, + "grad_norm": 2.2046878337860107, + "learning_rate": 1e-06, + "loss": 1.0127, + "mean_token_accuracy": 0.6954180002212524, + "num_tokens": 161535071.0, + "step": 6459 + }, + { + "epoch": 0.7094223588842521, + "grad_norm": 2.250415802001953, + "learning_rate": 1e-06, + "loss": 1.0772, + "mean_token_accuracy": 0.6751012802124023, + "num_tokens": 161560424.0, + "step": 6460 + }, + { + "epoch": 0.7095321765868658, + "grad_norm": 2.188652753829956, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7078101634979248, + "num_tokens": 161584688.0, + "step": 6461 + }, + { + "epoch": 0.7096419942894795, + "grad_norm": 2.07136869430542, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6926195621490479, + "num_tokens": 161611526.0, + "step": 6462 + }, + { + "epoch": 0.7097518119920931, + "grad_norm": 2.5177390575408936, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7014089822769165, + "num_tokens": 161634259.0, + "step": 6463 + }, + { + "epoch": 0.7098616296947068, + "grad_norm": 2.0363900661468506, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6954545378684998, + "num_tokens": 161664230.0, + "step": 6464 + }, + { + "epoch": 0.7099714473973204, + "grad_norm": 2.3008904457092285, + "learning_rate": 1e-06, + "loss": 1.0855, + "mean_token_accuracy": 0.6653944849967957, + "num_tokens": 161686991.0, + "step": 6465 + }, + { + "epoch": 0.7100812650999341, + "grad_norm": 2.075470209121704, + "learning_rate": 1e-06, + 
"loss": 1.0455, + "mean_token_accuracy": 0.6891728639602661, + "num_tokens": 161715206.0, + "step": 6466 + }, + { + "epoch": 0.7101910828025477, + "grad_norm": 2.1789488792419434, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6948059797286987, + "num_tokens": 161742186.0, + "step": 6467 + }, + { + "epoch": 0.7103009005051615, + "grad_norm": 2.0501341819763184, + "learning_rate": 1e-06, + "loss": 1.0899, + "mean_token_accuracy": 0.6779620051383972, + "num_tokens": 161773028.0, + "step": 6468 + }, + { + "epoch": 0.7104107182077751, + "grad_norm": 2.0278661251068115, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6877939701080322, + "num_tokens": 161804845.0, + "step": 6469 + }, + { + "epoch": 0.7105205359103888, + "grad_norm": 1.9021472930908203, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6991481781005859, + "num_tokens": 161837643.0, + "step": 6470 + }, + { + "epoch": 0.7106303536130024, + "grad_norm": 2.230287790298462, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7034256458282471, + "num_tokens": 161862899.0, + "step": 6471 + }, + { + "epoch": 0.7107401713156161, + "grad_norm": 2.389617681503296, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.704076886177063, + "num_tokens": 161884608.0, + "step": 6472 + }, + { + "epoch": 0.7108499890182297, + "grad_norm": 2.00639271736145, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7274377346038818, + "num_tokens": 161910962.0, + "step": 6473 + }, + { + "epoch": 0.7109598067208434, + "grad_norm": 2.1035826206207275, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6852878332138062, + "num_tokens": 161939531.0, + "step": 6474 + }, + { + "epoch": 0.7110696244234571, + "grad_norm": 2.228098154067993, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7082229852676392, + "num_tokens": 161962766.0, + "step": 6475 + }, + { + "epoch": 
0.7111794421260708, + "grad_norm": 2.031496047973633, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7131097316741943, + "num_tokens": 161990121.0, + "step": 6476 + }, + { + "epoch": 0.7112892598286844, + "grad_norm": 2.0535614490509033, + "learning_rate": 1e-06, + "loss": 1.0885, + "mean_token_accuracy": 0.6819161772727966, + "num_tokens": 162020299.0, + "step": 6477 + }, + { + "epoch": 0.711399077531298, + "grad_norm": 2.3508551120758057, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7005807757377625, + "num_tokens": 162045266.0, + "step": 6478 + }, + { + "epoch": 0.7115088952339117, + "grad_norm": 1.9421236515045166, + "learning_rate": 1e-06, + "loss": 1.0298, + "mean_token_accuracy": 0.683993935585022, + "num_tokens": 162077240.0, + "step": 6479 + }, + { + "epoch": 0.7116187129365253, + "grad_norm": 2.247941493988037, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7087380886077881, + "num_tokens": 162103512.0, + "step": 6480 + }, + { + "epoch": 0.711728530639139, + "grad_norm": 2.0117926597595215, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.6950122117996216, + "num_tokens": 162134298.0, + "step": 6481 + }, + { + "epoch": 0.7118383483417526, + "grad_norm": 2.0940945148468018, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7007306814193726, + "num_tokens": 162161627.0, + "step": 6482 + }, + { + "epoch": 0.7119481660443664, + "grad_norm": 2.270914316177368, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7044081687927246, + "num_tokens": 162185087.0, + "step": 6483 + }, + { + "epoch": 0.71205798374698, + "grad_norm": 2.267725706100464, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6958715915679932, + "num_tokens": 162207573.0, + "step": 6484 + }, + { + "epoch": 0.7121678014495937, + "grad_norm": 2.1246607303619385, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 
0.6982331871986389, + "num_tokens": 162236081.0, + "step": 6485 + }, + { + "epoch": 0.7122776191522073, + "grad_norm": 2.1019883155822754, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7011145353317261, + "num_tokens": 162261619.0, + "step": 6486 + }, + { + "epoch": 0.712387436854821, + "grad_norm": 2.355518341064453, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7162567377090454, + "num_tokens": 162284089.0, + "step": 6487 + }, + { + "epoch": 0.7124972545574346, + "grad_norm": 2.369868755340576, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7171486616134644, + "num_tokens": 162306417.0, + "step": 6488 + }, + { + "epoch": 0.7126070722600483, + "grad_norm": 2.5809454917907715, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7263103723526001, + "num_tokens": 162325725.0, + "step": 6489 + }, + { + "epoch": 0.712716889962662, + "grad_norm": 2.272678852081299, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7076930999755859, + "num_tokens": 162349344.0, + "step": 6490 + }, + { + "epoch": 0.7128267076652757, + "grad_norm": 2.2261698246002197, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7186421155929565, + "num_tokens": 162375507.0, + "step": 6491 + }, + { + "epoch": 0.7129365253678893, + "grad_norm": 2.452298879623413, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7194564342498779, + "num_tokens": 162395210.0, + "step": 6492 + }, + { + "epoch": 0.713046343070503, + "grad_norm": 2.031400442123413, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.7032289505004883, + "num_tokens": 162427114.0, + "step": 6493 + }, + { + "epoch": 0.7131561607731166, + "grad_norm": 2.1393826007843018, + "learning_rate": 1e-06, + "loss": 1.0896, + "mean_token_accuracy": 0.6747639179229736, + "num_tokens": 162454389.0, + "step": 6494 + }, + { + "epoch": 0.7132659784757303, + "grad_norm": 2.2214481830596924, 
+ "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6957250833511353, + "num_tokens": 162480341.0, + "step": 6495 + }, + { + "epoch": 0.7133757961783439, + "grad_norm": 2.1354873180389404, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6961219310760498, + "num_tokens": 162508930.0, + "step": 6496 + }, + { + "epoch": 0.7134856138809577, + "grad_norm": 2.2555437088012695, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7256069183349609, + "num_tokens": 162533335.0, + "step": 6497 + }, + { + "epoch": 0.7135954315835713, + "grad_norm": 2.341231346130371, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7039055228233337, + "num_tokens": 162557945.0, + "step": 6498 + }, + { + "epoch": 0.713705249286185, + "grad_norm": 2.2806344032287598, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.6991089582443237, + "num_tokens": 162582515.0, + "step": 6499 + }, + { + "epoch": 0.7138150669887986, + "grad_norm": 1.9931377172470093, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7120999693870544, + "num_tokens": 162610747.0, + "step": 6500 + }, + { + "epoch": 0.7139248846914122, + "grad_norm": 2.121124267578125, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6981832981109619, + "num_tokens": 162637980.0, + "step": 6501 + }, + { + "epoch": 0.7140347023940259, + "grad_norm": 2.143219470977783, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.6979601979255676, + "num_tokens": 162664737.0, + "step": 6502 + }, + { + "epoch": 0.7141445200966395, + "grad_norm": 2.168703079223633, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7084478735923767, + "num_tokens": 162689088.0, + "step": 6503 + }, + { + "epoch": 0.7142543377992533, + "grad_norm": 2.380051851272583, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7134865522384644, + "num_tokens": 162711110.0, + "step": 6504 
+ }, + { + "epoch": 0.7143641555018669, + "grad_norm": 1.874224066734314, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.708798885345459, + "num_tokens": 162742511.0, + "step": 6505 + }, + { + "epoch": 0.7144739732044806, + "grad_norm": 2.4793877601623535, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7337737679481506, + "num_tokens": 162762435.0, + "step": 6506 + }, + { + "epoch": 0.7145837909070942, + "grad_norm": 2.147409200668335, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7045617699623108, + "num_tokens": 162788586.0, + "step": 6507 + }, + { + "epoch": 0.7146936086097079, + "grad_norm": 2.011094808578491, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6938405632972717, + "num_tokens": 162816678.0, + "step": 6508 + }, + { + "epoch": 0.7148034263123215, + "grad_norm": 2.0843048095703125, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.7004803419113159, + "num_tokens": 162844021.0, + "step": 6509 + }, + { + "epoch": 0.7149132440149352, + "grad_norm": 1.8696035146713257, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7059069871902466, + "num_tokens": 162877041.0, + "step": 6510 + }, + { + "epoch": 0.7150230617175488, + "grad_norm": 2.328670024871826, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7033341526985168, + "num_tokens": 162901128.0, + "step": 6511 + }, + { + "epoch": 0.7151328794201626, + "grad_norm": 2.291571617126465, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6801930665969849, + "num_tokens": 162928570.0, + "step": 6512 + }, + { + "epoch": 0.7152426971227762, + "grad_norm": 2.278857469558716, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6919108629226685, + "num_tokens": 162955034.0, + "step": 6513 + }, + { + "epoch": 0.7153525148253899, + "grad_norm": 2.3250033855438232, + "learning_rate": 1e-06, + "loss": 1.0419, + 
"mean_token_accuracy": 0.6811248064041138, + "num_tokens": 162980326.0, + "step": 6514 + }, + { + "epoch": 0.7154623325280035, + "grad_norm": 2.1655147075653076, + "learning_rate": 1e-06, + "loss": 1.0317, + "mean_token_accuracy": 0.6921021938323975, + "num_tokens": 163006965.0, + "step": 6515 + }, + { + "epoch": 0.7155721502306172, + "grad_norm": 2.271695613861084, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7174596786499023, + "num_tokens": 163028769.0, + "step": 6516 + }, + { + "epoch": 0.7156819679332308, + "grad_norm": 2.0741255283355713, + "learning_rate": 1e-06, + "loss": 1.122, + "mean_token_accuracy": 0.6600469946861267, + "num_tokens": 163057972.0, + "step": 6517 + }, + { + "epoch": 0.7157917856358444, + "grad_norm": 2.234633684158325, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7091162800788879, + "num_tokens": 163081256.0, + "step": 6518 + }, + { + "epoch": 0.7159016033384582, + "grad_norm": 2.342219591140747, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.7158449292182922, + "num_tokens": 163103791.0, + "step": 6519 + }, + { + "epoch": 0.7160114210410718, + "grad_norm": 2.47189998626709, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7192578315734863, + "num_tokens": 163124874.0, + "step": 6520 + }, + { + "epoch": 0.7161212387436855, + "grad_norm": 1.904091477394104, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7237069010734558, + "num_tokens": 163155891.0, + "step": 6521 + }, + { + "epoch": 0.7162310564462991, + "grad_norm": 2.3028838634490967, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7152121663093567, + "num_tokens": 163178864.0, + "step": 6522 + }, + { + "epoch": 0.7163408741489128, + "grad_norm": 2.0144214630126953, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7185919284820557, + "num_tokens": 163206588.0, + "step": 6523 + }, + { + "epoch": 0.7164506918515264, + 
"grad_norm": 2.228356122970581, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7105405926704407, + "num_tokens": 163232026.0, + "step": 6524 + }, + { + "epoch": 0.7165605095541401, + "grad_norm": 2.541257858276367, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7099448442459106, + "num_tokens": 163251566.0, + "step": 6525 + }, + { + "epoch": 0.7166703272567538, + "grad_norm": 2.530261278152466, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7233508229255676, + "num_tokens": 163270689.0, + "step": 6526 + }, + { + "epoch": 0.7167801449593675, + "grad_norm": 2.225090265274048, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.702691376209259, + "num_tokens": 163294114.0, + "step": 6527 + }, + { + "epoch": 0.7168899626619811, + "grad_norm": 2.3182454109191895, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7079668641090393, + "num_tokens": 163316750.0, + "step": 6528 + }, + { + "epoch": 0.7169997803645948, + "grad_norm": 1.941823959350586, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.683441162109375, + "num_tokens": 163348914.0, + "step": 6529 + }, + { + "epoch": 0.7171095980672084, + "grad_norm": 2.225285053253174, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7039618492126465, + "num_tokens": 163372500.0, + "step": 6530 + }, + { + "epoch": 0.7172194157698221, + "grad_norm": 2.1990747451782227, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7131546139717102, + "num_tokens": 163396165.0, + "step": 6531 + }, + { + "epoch": 0.7173292334724357, + "grad_norm": 2.367058277130127, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6958482265472412, + "num_tokens": 163418930.0, + "step": 6532 + }, + { + "epoch": 0.7174390511750495, + "grad_norm": 2.4644405841827393, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7130327224731445, + 
"num_tokens": 163441561.0, + "step": 6533 + }, + { + "epoch": 0.7175488688776631, + "grad_norm": 2.290356397628784, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.704651415348053, + "num_tokens": 163465951.0, + "step": 6534 + }, + { + "epoch": 0.7176586865802768, + "grad_norm": 2.268507719039917, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7411209344863892, + "num_tokens": 163488768.0, + "step": 6535 + }, + { + "epoch": 0.7177685042828904, + "grad_norm": 2.2987942695617676, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7115514278411865, + "num_tokens": 163512739.0, + "step": 6536 + }, + { + "epoch": 0.717878321985504, + "grad_norm": 2.2952306270599365, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7066280841827393, + "num_tokens": 163536667.0, + "step": 6537 + }, + { + "epoch": 0.7179881396881177, + "grad_norm": 2.569692373275757, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7178540825843811, + "num_tokens": 163554801.0, + "step": 6538 + }, + { + "epoch": 0.7180979573907313, + "grad_norm": 2.3170325756073, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7013673186302185, + "num_tokens": 163578283.0, + "step": 6539 + }, + { + "epoch": 0.7182077750933451, + "grad_norm": 2.7488925457000732, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7221387028694153, + "num_tokens": 163593859.0, + "step": 6540 + }, + { + "epoch": 0.7183175927959587, + "grad_norm": 2.241562604904175, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.689286470413208, + "num_tokens": 163619358.0, + "step": 6541 + }, + { + "epoch": 0.7184274104985724, + "grad_norm": 2.238373279571533, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.7133314609527588, + "num_tokens": 163643464.0, + "step": 6542 + }, + { + "epoch": 0.718537228201186, + "grad_norm": 2.136626720428467, + "learning_rate": 
1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7354098558425903, + "num_tokens": 163667564.0, + "step": 6543 + }, + { + "epoch": 0.7186470459037997, + "grad_norm": 2.4977834224700928, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7121521234512329, + "num_tokens": 163689200.0, + "step": 6544 + }, + { + "epoch": 0.7187568636064133, + "grad_norm": 2.3912830352783203, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7308892607688904, + "num_tokens": 163709776.0, + "step": 6545 + }, + { + "epoch": 0.718866681309027, + "grad_norm": 2.168966293334961, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.677390456199646, + "num_tokens": 163734865.0, + "step": 6546 + }, + { + "epoch": 0.7189764990116406, + "grad_norm": 2.7247018814086914, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7181193232536316, + "num_tokens": 163753402.0, + "step": 6547 + }, + { + "epoch": 0.7190863167142544, + "grad_norm": 2.163566827774048, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6901075839996338, + "num_tokens": 163780958.0, + "step": 6548 + }, + { + "epoch": 0.719196134416868, + "grad_norm": 2.3856561183929443, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7196963429450989, + "num_tokens": 163803487.0, + "step": 6549 + }, + { + "epoch": 0.7193059521194817, + "grad_norm": 2.0759079456329346, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.6995556950569153, + "num_tokens": 163831982.0, + "step": 6550 + }, + { + "epoch": 0.7194157698220953, + "grad_norm": 2.135885238647461, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7135376334190369, + "num_tokens": 163858060.0, + "step": 6551 + }, + { + "epoch": 0.719525587524709, + "grad_norm": 2.2489378452301025, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7043143510818481, + "num_tokens": 163882887.0, + "step": 6552 + }, + { + "epoch": 
0.7196354052273226, + "grad_norm": 2.0135600566864014, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6956338882446289, + "num_tokens": 163911173.0, + "step": 6553 + }, + { + "epoch": 0.7197452229299363, + "grad_norm": 2.2058470249176025, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6948386430740356, + "num_tokens": 163936818.0, + "step": 6554 + }, + { + "epoch": 0.71985504063255, + "grad_norm": 2.4304609298706055, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7007139325141907, + "num_tokens": 163959314.0, + "step": 6555 + }, + { + "epoch": 0.7199648583351637, + "grad_norm": 2.2046396732330322, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7150081396102905, + "num_tokens": 163983454.0, + "step": 6556 + }, + { + "epoch": 0.7200746760377773, + "grad_norm": 2.1615750789642334, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7186551690101624, + "num_tokens": 164009597.0, + "step": 6557 + }, + { + "epoch": 0.720184493740391, + "grad_norm": 2.4236271381378174, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7106363773345947, + "num_tokens": 164031785.0, + "step": 6558 + }, + { + "epoch": 0.7202943114430046, + "grad_norm": 2.5312256813049316, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7195262908935547, + "num_tokens": 164050623.0, + "step": 6559 + }, + { + "epoch": 0.7204041291456182, + "grad_norm": 2.331605911254883, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7172302007675171, + "num_tokens": 164074197.0, + "step": 6560 + }, + { + "epoch": 0.7205139468482319, + "grad_norm": 2.1725635528564453, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7174355983734131, + "num_tokens": 164098067.0, + "step": 6561 + }, + { + "epoch": 0.7206237645508456, + "grad_norm": 2.237802028656006, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 
0.72565758228302, + "num_tokens": 164119706.0, + "step": 6562 + }, + { + "epoch": 0.7207335822534593, + "grad_norm": 2.3809022903442383, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7027549147605896, + "num_tokens": 164141095.0, + "step": 6563 + }, + { + "epoch": 0.7208433999560729, + "grad_norm": 2.029334306716919, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7212036848068237, + "num_tokens": 164169976.0, + "step": 6564 + }, + { + "epoch": 0.7209532176586866, + "grad_norm": 2.0883712768554688, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7261629104614258, + "num_tokens": 164197506.0, + "step": 6565 + }, + { + "epoch": 0.7210630353613002, + "grad_norm": 2.3131747245788574, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7022927403450012, + "num_tokens": 164220996.0, + "step": 6566 + }, + { + "epoch": 0.7211728530639139, + "grad_norm": 2.1297171115875244, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7097253799438477, + "num_tokens": 164247369.0, + "step": 6567 + }, + { + "epoch": 0.7212826707665275, + "grad_norm": 2.154331922531128, + "learning_rate": 1e-06, + "loss": 1.0311, + "mean_token_accuracy": 0.6911820769309998, + "num_tokens": 164276474.0, + "step": 6568 + }, + { + "epoch": 0.7213924884691413, + "grad_norm": 2.436152696609497, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7228842973709106, + "num_tokens": 164297080.0, + "step": 6569 + }, + { + "epoch": 0.7215023061717549, + "grad_norm": 2.1696572303771973, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6916728615760803, + "num_tokens": 164324700.0, + "step": 6570 + }, + { + "epoch": 0.7216121238743686, + "grad_norm": 2.2343969345092773, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7098323702812195, + "num_tokens": 164350270.0, + "step": 6571 + }, + { + "epoch": 0.7217219415769822, + "grad_norm": 
2.1941637992858887, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7098152041435242, + "num_tokens": 164375273.0, + "step": 6572 + }, + { + "epoch": 0.7218317592795959, + "grad_norm": 2.414608955383301, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7118976712226868, + "num_tokens": 164396663.0, + "step": 6573 + }, + { + "epoch": 0.7219415769822095, + "grad_norm": 1.953901767730713, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7090499401092529, + "num_tokens": 164427407.0, + "step": 6574 + }, + { + "epoch": 0.7220513946848232, + "grad_norm": 2.345519781112671, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7006735801696777, + "num_tokens": 164450133.0, + "step": 6575 + }, + { + "epoch": 0.7221612123874368, + "grad_norm": 2.3219873905181885, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7074583768844604, + "num_tokens": 164474824.0, + "step": 6576 + }, + { + "epoch": 0.7222710300900506, + "grad_norm": 2.108535051345825, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.7014391422271729, + "num_tokens": 164502923.0, + "step": 6577 + }, + { + "epoch": 0.7223808477926642, + "grad_norm": 2.398735284805298, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7123546600341797, + "num_tokens": 164526605.0, + "step": 6578 + }, + { + "epoch": 0.7224906654952779, + "grad_norm": 2.3045568466186523, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.697424054145813, + "num_tokens": 164551945.0, + "step": 6579 + }, + { + "epoch": 0.7226004831978915, + "grad_norm": 2.098954677581787, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7143468856811523, + "num_tokens": 164579772.0, + "step": 6580 + }, + { + "epoch": 0.7227103009005051, + "grad_norm": 2.317650556564331, + "learning_rate": 1e-06, + "loss": 1.0569, + "mean_token_accuracy": 0.6790637969970703, + "num_tokens": 
164604654.0, + "step": 6581 + }, + { + "epoch": 0.7228201186031188, + "grad_norm": 2.331843614578247, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7048512101173401, + "num_tokens": 164627304.0, + "step": 6582 + }, + { + "epoch": 0.7229299363057324, + "grad_norm": 2.4388318061828613, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7042301893234253, + "num_tokens": 164646903.0, + "step": 6583 + }, + { + "epoch": 0.7230397540083462, + "grad_norm": 2.163475513458252, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7025731205940247, + "num_tokens": 164673294.0, + "step": 6584 + }, + { + "epoch": 0.7231495717109598, + "grad_norm": 2.005427360534668, + "learning_rate": 1e-06, + "loss": 1.0355, + "mean_token_accuracy": 0.6875468492507935, + "num_tokens": 164702958.0, + "step": 6585 + }, + { + "epoch": 0.7232593894135735, + "grad_norm": 2.407702684402466, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6971701383590698, + "num_tokens": 164725075.0, + "step": 6586 + }, + { + "epoch": 0.7233692071161871, + "grad_norm": 2.0310001373291016, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6951864361763, + "num_tokens": 164753380.0, + "step": 6587 + }, + { + "epoch": 0.7234790248188008, + "grad_norm": 2.3060853481292725, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7199676036834717, + "num_tokens": 164776810.0, + "step": 6588 + }, + { + "epoch": 0.7235888425214144, + "grad_norm": 2.2001805305480957, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7078486680984497, + "num_tokens": 164802687.0, + "step": 6589 + }, + { + "epoch": 0.7236986602240281, + "grad_norm": 2.2981040477752686, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7125486731529236, + "num_tokens": 164823642.0, + "step": 6590 + }, + { + "epoch": 0.7238084779266418, + "grad_norm": 2.3679964542388916, + "learning_rate": 1e-06, + 
"loss": 0.9763, + "mean_token_accuracy": 0.7017734050750732, + "num_tokens": 164847072.0, + "step": 6591 + }, + { + "epoch": 0.7239182956292555, + "grad_norm": 2.2010276317596436, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7021679282188416, + "num_tokens": 164871716.0, + "step": 6592 + }, + { + "epoch": 0.7240281133318691, + "grad_norm": 2.2730398178100586, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7239047884941101, + "num_tokens": 164895461.0, + "step": 6593 + }, + { + "epoch": 0.7241379310344828, + "grad_norm": 2.0826644897460938, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7049800157546997, + "num_tokens": 164922556.0, + "step": 6594 + }, + { + "epoch": 0.7242477487370964, + "grad_norm": 2.7942872047424316, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7086213827133179, + "num_tokens": 164938957.0, + "step": 6595 + }, + { + "epoch": 0.72435756643971, + "grad_norm": 2.281694173812866, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.7030887603759766, + "num_tokens": 164961621.0, + "step": 6596 + }, + { + "epoch": 0.7244673841423237, + "grad_norm": 2.310422897338867, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7110903263092041, + "num_tokens": 164985099.0, + "step": 6597 + }, + { + "epoch": 0.7245772018449375, + "grad_norm": 2.0293173789978027, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.709774374961853, + "num_tokens": 165013015.0, + "step": 6598 + }, + { + "epoch": 0.7246870195475511, + "grad_norm": 2.022104024887085, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6929192543029785, + "num_tokens": 165042536.0, + "step": 6599 + }, + { + "epoch": 0.7247968372501647, + "grad_norm": 2.26906418800354, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7207468748092651, + "num_tokens": 165066648.0, + "step": 6600 + }, + { + "epoch": 
0.7249066549527784, + "grad_norm": 2.323158025741577, + "learning_rate": 1e-06, + "loss": 0.9801, + "mean_token_accuracy": 0.7003318071365356, + "num_tokens": 165092075.0, + "step": 6601 + }, + { + "epoch": 0.725016472655392, + "grad_norm": 2.234470844268799, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7085883617401123, + "num_tokens": 165115856.0, + "step": 6602 + }, + { + "epoch": 0.7251262903580057, + "grad_norm": 2.141150712966919, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6868624091148376, + "num_tokens": 165142311.0, + "step": 6603 + }, + { + "epoch": 0.7252361080606193, + "grad_norm": 2.169914484024048, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7351828217506409, + "num_tokens": 165164941.0, + "step": 6604 + }, + { + "epoch": 0.725345925763233, + "grad_norm": 2.379682779312134, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7262964844703674, + "num_tokens": 165185992.0, + "step": 6605 + }, + { + "epoch": 0.7254557434658467, + "grad_norm": 2.0458545684814453, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7086201906204224, + "num_tokens": 165215347.0, + "step": 6606 + }, + { + "epoch": 0.7255655611684604, + "grad_norm": 2.462658643722534, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7209836840629578, + "num_tokens": 165236182.0, + "step": 6607 + }, + { + "epoch": 0.725675378871074, + "grad_norm": 2.2296676635742188, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7188842296600342, + "num_tokens": 165261515.0, + "step": 6608 + }, + { + "epoch": 0.7257851965736877, + "grad_norm": 2.0442211627960205, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6941332817077637, + "num_tokens": 165291837.0, + "step": 6609 + }, + { + "epoch": 0.7258950142763013, + "grad_norm": 2.4777755737304688, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 
0.725845456123352, + "num_tokens": 165309706.0, + "step": 6610 + }, + { + "epoch": 0.726004831978915, + "grad_norm": 2.1482088565826416, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7261137366294861, + "num_tokens": 165333814.0, + "step": 6611 + }, + { + "epoch": 0.7261146496815286, + "grad_norm": 2.278144121170044, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7041928768157959, + "num_tokens": 165357383.0, + "step": 6612 + }, + { + "epoch": 0.7262244673841424, + "grad_norm": 2.253018379211426, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.695443332195282, + "num_tokens": 165382682.0, + "step": 6613 + }, + { + "epoch": 0.726334285086756, + "grad_norm": 2.145996332168579, + "learning_rate": 1e-06, + "loss": 1.038, + "mean_token_accuracy": 0.6792431473731995, + "num_tokens": 165408138.0, + "step": 6614 + }, + { + "epoch": 0.7264441027893697, + "grad_norm": 2.6140003204345703, + "learning_rate": 1e-06, + "loss": 0.9689, + "mean_token_accuracy": 0.7057472467422485, + "num_tokens": 165428515.0, + "step": 6615 + }, + { + "epoch": 0.7265539204919833, + "grad_norm": 2.072065591812134, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7141268253326416, + "num_tokens": 165456929.0, + "step": 6616 + }, + { + "epoch": 0.726663738194597, + "grad_norm": 2.4325900077819824, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7195392847061157, + "num_tokens": 165478125.0, + "step": 6617 + }, + { + "epoch": 0.7267735558972106, + "grad_norm": 2.2791459560394287, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6888384222984314, + "num_tokens": 165501352.0, + "step": 6618 + }, + { + "epoch": 0.7268833735998242, + "grad_norm": 2.3495123386383057, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6909147500991821, + "num_tokens": 165524698.0, + "step": 6619 + }, + { + "epoch": 0.726993191302438, + "grad_norm": 2.0076756477355957, 
+ "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7188855409622192, + "num_tokens": 165553069.0, + "step": 6620 + }, + { + "epoch": 0.7271030090050516, + "grad_norm": 1.9844703674316406, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7182143926620483, + "num_tokens": 165584426.0, + "step": 6621 + }, + { + "epoch": 0.7272128267076653, + "grad_norm": 2.0711421966552734, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7120150327682495, + "num_tokens": 165612095.0, + "step": 6622 + }, + { + "epoch": 0.7273226444102789, + "grad_norm": 2.3045551776885986, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6935350894927979, + "num_tokens": 165638340.0, + "step": 6623 + }, + { + "epoch": 0.7274324621128926, + "grad_norm": 2.2008249759674072, + "learning_rate": 1e-06, + "loss": 1.0851, + "mean_token_accuracy": 0.6722745895385742, + "num_tokens": 165662902.0, + "step": 6624 + }, + { + "epoch": 0.7275422798155062, + "grad_norm": 2.412274122238159, + "learning_rate": 1e-06, + "loss": 1.0131, + "mean_token_accuracy": 0.6942225694656372, + "num_tokens": 165684805.0, + "step": 6625 + }, + { + "epoch": 0.7276520975181199, + "grad_norm": 2.365811824798584, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7275470495223999, + "num_tokens": 165705334.0, + "step": 6626 + }, + { + "epoch": 0.7277619152207336, + "grad_norm": 1.9637181758880615, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6971986293792725, + "num_tokens": 165735726.0, + "step": 6627 + }, + { + "epoch": 0.7278717329233473, + "grad_norm": 2.4218835830688477, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7219122052192688, + "num_tokens": 165757895.0, + "step": 6628 + }, + { + "epoch": 0.7279815506259609, + "grad_norm": 2.072589874267578, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.6980569362640381, + "num_tokens": 165786503.0, + "step": 
6629 + }, + { + "epoch": 0.7280913683285746, + "grad_norm": 2.371325731277466, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.707430362701416, + "num_tokens": 165807663.0, + "step": 6630 + }, + { + "epoch": 0.7282011860311882, + "grad_norm": 1.9292069673538208, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7059741020202637, + "num_tokens": 165838359.0, + "step": 6631 + }, + { + "epoch": 0.7283110037338019, + "grad_norm": 2.155613422393799, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6941171884536743, + "num_tokens": 165861677.0, + "step": 6632 + }, + { + "epoch": 0.7284208214364155, + "grad_norm": 2.2903122901916504, + "learning_rate": 1e-06, + "loss": 0.812, + "mean_token_accuracy": 0.743336021900177, + "num_tokens": 165882507.0, + "step": 6633 + }, + { + "epoch": 0.7285306391390292, + "grad_norm": 2.3782575130462646, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7159720063209534, + "num_tokens": 165905040.0, + "step": 6634 + }, + { + "epoch": 0.7286404568416429, + "grad_norm": 2.3527450561523438, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7196593284606934, + "num_tokens": 165926674.0, + "step": 6635 + }, + { + "epoch": 0.7287502745442566, + "grad_norm": 1.9484257698059082, + "learning_rate": 1e-06, + "loss": 1.0294, + "mean_token_accuracy": 0.6952380537986755, + "num_tokens": 165958253.0, + "step": 6636 + }, + { + "epoch": 0.7288600922468702, + "grad_norm": 2.4095914363861084, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7123265266418457, + "num_tokens": 165979421.0, + "step": 6637 + }, + { + "epoch": 0.7289699099494839, + "grad_norm": 1.9714082479476929, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7149247527122498, + "num_tokens": 166008256.0, + "step": 6638 + }, + { + "epoch": 0.7290797276520975, + "grad_norm": 2.173011541366577, + "learning_rate": 1e-06, + "loss": 0.8887, + 
"mean_token_accuracy": 0.7218800783157349, + "num_tokens": 166031952.0, + "step": 6639 + }, + { + "epoch": 0.7291895453547111, + "grad_norm": 2.2114648818969727, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7015490531921387, + "num_tokens": 166055842.0, + "step": 6640 + }, + { + "epoch": 0.7292993630573248, + "grad_norm": 2.2942323684692383, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.712683916091919, + "num_tokens": 166079279.0, + "step": 6641 + }, + { + "epoch": 0.7294091807599385, + "grad_norm": 2.1243984699249268, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6896207928657532, + "num_tokens": 166105296.0, + "step": 6642 + }, + { + "epoch": 0.7295189984625522, + "grad_norm": 1.9793007373809814, + "learning_rate": 1e-06, + "loss": 0.9953, + "mean_token_accuracy": 0.6979461908340454, + "num_tokens": 166134912.0, + "step": 6643 + }, + { + "epoch": 0.7296288161651658, + "grad_norm": 2.0438473224639893, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7012041807174683, + "num_tokens": 166162043.0, + "step": 6644 + }, + { + "epoch": 0.7297386338677795, + "grad_norm": 2.1023459434509277, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7033904194831848, + "num_tokens": 166190629.0, + "step": 6645 + }, + { + "epoch": 0.7298484515703931, + "grad_norm": 2.007368326187134, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.7124123573303223, + "num_tokens": 166218896.0, + "step": 6646 + }, + { + "epoch": 0.7299582692730068, + "grad_norm": 2.1548733711242676, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7026832699775696, + "num_tokens": 166242486.0, + "step": 6647 + }, + { + "epoch": 0.7300680869756204, + "grad_norm": 2.5503339767456055, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7137510776519775, + "num_tokens": 166262383.0, + "step": 6648 + }, + { + "epoch": 0.7301779046782342, + 
"grad_norm": 2.4565212726593018, + "learning_rate": 1e-06, + "loss": 0.8284, + "mean_token_accuracy": 0.7410005927085876, + "num_tokens": 166280891.0, + "step": 6649 + }, + { + "epoch": 0.7302877223808478, + "grad_norm": 2.2521517276763916, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7195433378219604, + "num_tokens": 166302635.0, + "step": 6650 + }, + { + "epoch": 0.7303975400834615, + "grad_norm": 2.007223606109619, + "learning_rate": 1e-06, + "loss": 1.0737, + "mean_token_accuracy": 0.6789003610610962, + "num_tokens": 166335232.0, + "step": 6651 + }, + { + "epoch": 0.7305073577860751, + "grad_norm": 2.174193859100342, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7206266522407532, + "num_tokens": 166358323.0, + "step": 6652 + }, + { + "epoch": 0.7306171754886888, + "grad_norm": 2.3290390968322754, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7123765349388123, + "num_tokens": 166379716.0, + "step": 6653 + }, + { + "epoch": 0.7307269931913024, + "grad_norm": 2.1321370601654053, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7267786264419556, + "num_tokens": 166404109.0, + "step": 6654 + }, + { + "epoch": 0.7308368108939161, + "grad_norm": 2.3261806964874268, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.6979459524154663, + "num_tokens": 166427400.0, + "step": 6655 + }, + { + "epoch": 0.7309466285965298, + "grad_norm": 2.300818920135498, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.706125020980835, + "num_tokens": 166449745.0, + "step": 6656 + }, + { + "epoch": 0.7310564462991435, + "grad_norm": 2.310656785964966, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.6906787157058716, + "num_tokens": 166474881.0, + "step": 6657 + }, + { + "epoch": 0.7311662640017571, + "grad_norm": 2.300598621368408, + "learning_rate": 1e-06, + "loss": 1.0589, + "mean_token_accuracy": 0.6883570551872253, + 
"num_tokens": 166500549.0, + "step": 6658 + }, + { + "epoch": 0.7312760817043708, + "grad_norm": 2.1020760536193848, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.704397976398468, + "num_tokens": 166530247.0, + "step": 6659 + }, + { + "epoch": 0.7313858994069844, + "grad_norm": 2.12612247467041, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6962357759475708, + "num_tokens": 166558049.0, + "step": 6660 + }, + { + "epoch": 0.731495717109598, + "grad_norm": 2.0808703899383545, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6988698244094849, + "num_tokens": 166587528.0, + "step": 6661 + }, + { + "epoch": 0.7316055348122117, + "grad_norm": 2.424661159515381, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7221052646636963, + "num_tokens": 166608839.0, + "step": 6662 + }, + { + "epoch": 0.7317153525148253, + "grad_norm": 2.1130740642547607, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6935731768608093, + "num_tokens": 166635345.0, + "step": 6663 + }, + { + "epoch": 0.7318251702174391, + "grad_norm": 2.21240234375, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7273982763290405, + "num_tokens": 166659368.0, + "step": 6664 + }, + { + "epoch": 0.7319349879200527, + "grad_norm": 2.1939656734466553, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6987342834472656, + "num_tokens": 166687619.0, + "step": 6665 + }, + { + "epoch": 0.7320448056226664, + "grad_norm": 2.561678647994995, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7394207715988159, + "num_tokens": 166706102.0, + "step": 6666 + }, + { + "epoch": 0.73215462332528, + "grad_norm": 2.7128360271453857, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7298464775085449, + "num_tokens": 166722403.0, + "step": 6667 + }, + { + "epoch": 0.7322644410278937, + "grad_norm": 1.9536502361297607, + "learning_rate": 
1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6980938911437988, + "num_tokens": 166751275.0, + "step": 6668 + }, + { + "epoch": 0.7323742587305073, + "grad_norm": 2.150012731552124, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6892625093460083, + "num_tokens": 166778700.0, + "step": 6669 + }, + { + "epoch": 0.732484076433121, + "grad_norm": 2.222822904586792, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7164784669876099, + "num_tokens": 166801884.0, + "step": 6670 + }, + { + "epoch": 0.7325938941357347, + "grad_norm": 2.1247222423553467, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.702605128288269, + "num_tokens": 166828352.0, + "step": 6671 + }, + { + "epoch": 0.7327037118383484, + "grad_norm": 2.255439043045044, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.714944064617157, + "num_tokens": 166851994.0, + "step": 6672 + }, + { + "epoch": 0.732813529540962, + "grad_norm": 1.9926061630249023, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.728803277015686, + "num_tokens": 166880810.0, + "step": 6673 + }, + { + "epoch": 0.7329233472435757, + "grad_norm": 2.0445456504821777, + "learning_rate": 1e-06, + "loss": 1.0627, + "mean_token_accuracy": 0.6770437359809875, + "num_tokens": 166911073.0, + "step": 6674 + }, + { + "epoch": 0.7330331649461893, + "grad_norm": 2.5103840827941895, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.7109690308570862, + "num_tokens": 166931879.0, + "step": 6675 + }, + { + "epoch": 0.733142982648803, + "grad_norm": 2.3555469512939453, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7149533629417419, + "num_tokens": 166954104.0, + "step": 6676 + }, + { + "epoch": 0.7332528003514166, + "grad_norm": 2.1171982288360596, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7134425044059753, + "num_tokens": 166980356.0, + "step": 6677 + }, + { + "epoch": 
0.7333626180540304, + "grad_norm": 2.112560272216797, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.725459098815918, + "num_tokens": 167006270.0, + "step": 6678 + }, + { + "epoch": 0.733472435756644, + "grad_norm": 2.4386537075042725, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.710974395275116, + "num_tokens": 167026013.0, + "step": 6679 + }, + { + "epoch": 0.7335822534592576, + "grad_norm": 2.1083567142486572, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.6815396547317505, + "num_tokens": 167053559.0, + "step": 6680 + }, + { + "epoch": 0.7336920711618713, + "grad_norm": 2.079836130142212, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.6831722259521484, + "num_tokens": 167082577.0, + "step": 6681 + }, + { + "epoch": 0.7338018888644849, + "grad_norm": 1.909541368484497, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7115678787231445, + "num_tokens": 167113756.0, + "step": 6682 + }, + { + "epoch": 0.7339117065670986, + "grad_norm": 1.8496239185333252, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6928859949111938, + "num_tokens": 167149135.0, + "step": 6683 + }, + { + "epoch": 0.7340215242697122, + "grad_norm": 2.2947723865509033, + "learning_rate": 1e-06, + "loss": 1.0293, + "mean_token_accuracy": 0.6939047574996948, + "num_tokens": 167173401.0, + "step": 6684 + }, + { + "epoch": 0.734131341972326, + "grad_norm": 2.222604751586914, + "learning_rate": 1e-06, + "loss": 1.0487, + "mean_token_accuracy": 0.6962791681289673, + "num_tokens": 167198754.0, + "step": 6685 + }, + { + "epoch": 0.7342411596749396, + "grad_norm": 2.0285418033599854, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7229524850845337, + "num_tokens": 167227980.0, + "step": 6686 + }, + { + "epoch": 0.7343509773775533, + "grad_norm": 2.0808708667755127, + "learning_rate": 1e-06, + "loss": 1.0747, + "mean_token_accuracy": 
0.6699548363685608, + "num_tokens": 167257366.0, + "step": 6687 + }, + { + "epoch": 0.7344607950801669, + "grad_norm": 2.3556177616119385, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7089889049530029, + "num_tokens": 167282184.0, + "step": 6688 + }, + { + "epoch": 0.7345706127827806, + "grad_norm": 2.0295348167419434, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6921789646148682, + "num_tokens": 167310356.0, + "step": 6689 + }, + { + "epoch": 0.7346804304853942, + "grad_norm": 2.228039026260376, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.707526445388794, + "num_tokens": 167337055.0, + "step": 6690 + }, + { + "epoch": 0.7347902481880079, + "grad_norm": 2.307943344116211, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7201052904129028, + "num_tokens": 167359167.0, + "step": 6691 + }, + { + "epoch": 0.7349000658906216, + "grad_norm": 2.132459878921509, + "learning_rate": 1e-06, + "loss": 1.0565, + "mean_token_accuracy": 0.6868144273757935, + "num_tokens": 167386627.0, + "step": 6692 + }, + { + "epoch": 0.7350098835932353, + "grad_norm": 2.0363657474517822, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.7042208313941956, + "num_tokens": 167415155.0, + "step": 6693 + }, + { + "epoch": 0.7351197012958489, + "grad_norm": 2.092646837234497, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.721899151802063, + "num_tokens": 167442835.0, + "step": 6694 + }, + { + "epoch": 0.7352295189984626, + "grad_norm": 2.4947657585144043, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7088906168937683, + "num_tokens": 167461795.0, + "step": 6695 + }, + { + "epoch": 0.7353393367010762, + "grad_norm": 2.3184220790863037, + "learning_rate": 1e-06, + "loss": 1.0362, + "mean_token_accuracy": 0.6863549947738647, + "num_tokens": 167486915.0, + "step": 6696 + }, + { + "epoch": 0.7354491544036899, + "grad_norm": 
2.2421677112579346, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7017161846160889, + "num_tokens": 167511201.0, + "step": 6697 + }, + { + "epoch": 0.7355589721063035, + "grad_norm": 2.2038466930389404, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6972285509109497, + "num_tokens": 167538697.0, + "step": 6698 + }, + { + "epoch": 0.7356687898089171, + "grad_norm": 2.2449939250946045, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7141735553741455, + "num_tokens": 167563569.0, + "step": 6699 + }, + { + "epoch": 0.7357786075115309, + "grad_norm": 2.356053113937378, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7237973213195801, + "num_tokens": 167584063.0, + "step": 6700 + }, + { + "epoch": 0.7358884252141445, + "grad_norm": 2.3031198978424072, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.70820152759552, + "num_tokens": 167606280.0, + "step": 6701 + }, + { + "epoch": 0.7359982429167582, + "grad_norm": 1.9062020778656006, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7168474793434143, + "num_tokens": 167638041.0, + "step": 6702 + }, + { + "epoch": 0.7361080606193718, + "grad_norm": 1.855599284172058, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.712088406085968, + "num_tokens": 167669588.0, + "step": 6703 + }, + { + "epoch": 0.7362178783219855, + "grad_norm": 2.4508557319641113, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7318998575210571, + "num_tokens": 167690146.0, + "step": 6704 + }, + { + "epoch": 0.7363276960245991, + "grad_norm": 2.589020252227783, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7334305047988892, + "num_tokens": 167708761.0, + "step": 6705 + }, + { + "epoch": 0.7364375137272128, + "grad_norm": 2.5937817096710205, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7060237526893616, + "num_tokens": 
167728109.0, + "step": 6706 + }, + { + "epoch": 0.7365473314298265, + "grad_norm": 2.244903326034546, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7236186265945435, + "num_tokens": 167751351.0, + "step": 6707 + }, + { + "epoch": 0.7366571491324402, + "grad_norm": 2.3426225185394287, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7176588177680969, + "num_tokens": 167772494.0, + "step": 6708 + }, + { + "epoch": 0.7367669668350538, + "grad_norm": 2.1192052364349365, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6920012831687927, + "num_tokens": 167801673.0, + "step": 6709 + }, + { + "epoch": 0.7368767845376675, + "grad_norm": 2.4789206981658936, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6919928193092346, + "num_tokens": 167823254.0, + "step": 6710 + }, + { + "epoch": 0.7369866022402811, + "grad_norm": 2.439995288848877, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7205816507339478, + "num_tokens": 167843393.0, + "step": 6711 + }, + { + "epoch": 0.7370964199428948, + "grad_norm": 2.096071720123291, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6893633604049683, + "num_tokens": 167871735.0, + "step": 6712 + }, + { + "epoch": 0.7372062376455084, + "grad_norm": 2.156028985977173, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7022438049316406, + "num_tokens": 167896179.0, + "step": 6713 + }, + { + "epoch": 0.7373160553481222, + "grad_norm": 2.0934722423553467, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7168875932693481, + "num_tokens": 167922119.0, + "step": 6714 + }, + { + "epoch": 0.7374258730507358, + "grad_norm": 2.0163826942443848, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7020843029022217, + "num_tokens": 167952084.0, + "step": 6715 + }, + { + "epoch": 0.7375356907533495, + "grad_norm": 2.064157009124756, + "learning_rate": 1e-06, + 
"loss": 1.001, + "mean_token_accuracy": 0.6986007690429688, + "num_tokens": 167979203.0, + "step": 6716 + }, + { + "epoch": 0.7376455084559631, + "grad_norm": 2.4961187839508057, + "learning_rate": 1e-06, + "loss": 1.0276, + "mean_token_accuracy": 0.6877722144126892, + "num_tokens": 168001005.0, + "step": 6717 + }, + { + "epoch": 0.7377553261585768, + "grad_norm": 2.3217029571533203, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.711081326007843, + "num_tokens": 168022850.0, + "step": 6718 + }, + { + "epoch": 0.7378651438611904, + "grad_norm": 2.323110342025757, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7081825137138367, + "num_tokens": 168046104.0, + "step": 6719 + }, + { + "epoch": 0.737974961563804, + "grad_norm": 2.3163435459136963, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7246072292327881, + "num_tokens": 168068742.0, + "step": 6720 + }, + { + "epoch": 0.7380847792664178, + "grad_norm": 2.262221574783325, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7101027965545654, + "num_tokens": 168092032.0, + "step": 6721 + }, + { + "epoch": 0.7381945969690314, + "grad_norm": 2.2611868381500244, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6997539401054382, + "num_tokens": 168114894.0, + "step": 6722 + }, + { + "epoch": 0.7383044146716451, + "grad_norm": 2.2596375942230225, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7111973762512207, + "num_tokens": 168139159.0, + "step": 6723 + }, + { + "epoch": 0.7384142323742587, + "grad_norm": 2.3109657764434814, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7124137878417969, + "num_tokens": 168162499.0, + "step": 6724 + }, + { + "epoch": 0.7385240500768724, + "grad_norm": 2.379889726638794, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7373846769332886, + "num_tokens": 168180538.0, + "step": 6725 + }, + { + "epoch": 
0.738633867779486, + "grad_norm": 2.1963703632354736, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6981942653656006, + "num_tokens": 168206281.0, + "step": 6726 + }, + { + "epoch": 0.7387436854820997, + "grad_norm": 2.2760424613952637, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7228450775146484, + "num_tokens": 168228961.0, + "step": 6727 + }, + { + "epoch": 0.7388535031847133, + "grad_norm": 1.949248194694519, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7159728407859802, + "num_tokens": 168259035.0, + "step": 6728 + }, + { + "epoch": 0.7389633208873271, + "grad_norm": 2.566678762435913, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.720895528793335, + "num_tokens": 168278405.0, + "step": 6729 + }, + { + "epoch": 0.7390731385899407, + "grad_norm": 2.129232883453369, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7027386426925659, + "num_tokens": 168304906.0, + "step": 6730 + }, + { + "epoch": 0.7391829562925544, + "grad_norm": 2.1299846172332764, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7122016549110413, + "num_tokens": 168330490.0, + "step": 6731 + }, + { + "epoch": 0.739292773995168, + "grad_norm": 2.482088327407837, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7040209770202637, + "num_tokens": 168350335.0, + "step": 6732 + }, + { + "epoch": 0.7394025916977817, + "grad_norm": 2.320970058441162, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7228432893753052, + "num_tokens": 168372647.0, + "step": 6733 + }, + { + "epoch": 0.7395124094003953, + "grad_norm": 2.3504421710968018, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6912333965301514, + "num_tokens": 168396659.0, + "step": 6734 + }, + { + "epoch": 0.739622227103009, + "grad_norm": 2.4919328689575195, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 
0.7284082770347595, + "num_tokens": 168422310.0, + "step": 6735 + }, + { + "epoch": 0.7397320448056227, + "grad_norm": 2.244398832321167, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6925957202911377, + "num_tokens": 168450740.0, + "step": 6736 + }, + { + "epoch": 0.7398418625082364, + "grad_norm": 2.816603660583496, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.704513430595398, + "num_tokens": 168467367.0, + "step": 6737 + }, + { + "epoch": 0.73995168021085, + "grad_norm": 2.3640193939208984, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7048046588897705, + "num_tokens": 168489430.0, + "step": 6738 + }, + { + "epoch": 0.7400614979134637, + "grad_norm": 2.2195072174072266, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7121484279632568, + "num_tokens": 168515427.0, + "step": 6739 + }, + { + "epoch": 0.7401713156160773, + "grad_norm": 2.1942391395568848, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7204816341400146, + "num_tokens": 168541212.0, + "step": 6740 + }, + { + "epoch": 0.7402811333186909, + "grad_norm": 2.390728235244751, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7417051792144775, + "num_tokens": 168561291.0, + "step": 6741 + }, + { + "epoch": 0.7403909510213046, + "grad_norm": 2.4717957973480225, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7538428902626038, + "num_tokens": 168581636.0, + "step": 6742 + }, + { + "epoch": 0.7405007687239183, + "grad_norm": 1.9847458600997925, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7124056220054626, + "num_tokens": 168611238.0, + "step": 6743 + }, + { + "epoch": 0.740610586426532, + "grad_norm": 1.9716441631317139, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.697126030921936, + "num_tokens": 168642211.0, + "step": 6744 + }, + { + "epoch": 0.7407204041291456, + "grad_norm": 
2.425980567932129, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7281885743141174, + "num_tokens": 168660980.0, + "step": 6745 + }, + { + "epoch": 0.7408302218317593, + "grad_norm": 2.315244674682617, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.711895227432251, + "num_tokens": 168685535.0, + "step": 6746 + }, + { + "epoch": 0.7409400395343729, + "grad_norm": 2.2552757263183594, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7079710960388184, + "num_tokens": 168711031.0, + "step": 6747 + }, + { + "epoch": 0.7410498572369866, + "grad_norm": 2.026546001434326, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6842032670974731, + "num_tokens": 168742238.0, + "step": 6748 + }, + { + "epoch": 0.7411596749396002, + "grad_norm": 2.055323839187622, + "learning_rate": 1e-06, + "loss": 1.0187, + "mean_token_accuracy": 0.690815806388855, + "num_tokens": 168771551.0, + "step": 6749 + }, + { + "epoch": 0.741269492642214, + "grad_norm": 2.3677449226379395, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.703027069568634, + "num_tokens": 168793422.0, + "step": 6750 + }, + { + "epoch": 0.7413793103448276, + "grad_norm": 2.3256328105926514, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6927622556686401, + "num_tokens": 168816668.0, + "step": 6751 + }, + { + "epoch": 0.7414891280474413, + "grad_norm": 2.345008373260498, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.735322117805481, + "num_tokens": 168837624.0, + "step": 6752 + }, + { + "epoch": 0.7415989457500549, + "grad_norm": 2.3858730792999268, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7072292566299438, + "num_tokens": 168859196.0, + "step": 6753 + }, + { + "epoch": 0.7417087634526686, + "grad_norm": 2.5785577297210693, + "learning_rate": 1e-06, + "loss": 1.0491, + "mean_token_accuracy": 0.6819629669189453, + "num_tokens": 168881358.0, 
+ "step": 6754 + }, + { + "epoch": 0.7418185811552822, + "grad_norm": 2.1277740001678467, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7017616033554077, + "num_tokens": 168907075.0, + "step": 6755 + }, + { + "epoch": 0.7419283988578959, + "grad_norm": 2.0868051052093506, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7138763666152954, + "num_tokens": 168933327.0, + "step": 6756 + }, + { + "epoch": 0.7420382165605095, + "grad_norm": 2.1187744140625, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6966134309768677, + "num_tokens": 168961101.0, + "step": 6757 + }, + { + "epoch": 0.7421480342631233, + "grad_norm": 2.196610689163208, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6886662244796753, + "num_tokens": 168987229.0, + "step": 6758 + }, + { + "epoch": 0.7422578519657369, + "grad_norm": 2.1672921180725098, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6847774386405945, + "num_tokens": 169012285.0, + "step": 6759 + }, + { + "epoch": 0.7423676696683505, + "grad_norm": 2.094862699508667, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7051925659179688, + "num_tokens": 169039432.0, + "step": 6760 + }, + { + "epoch": 0.7424774873709642, + "grad_norm": 2.55324387550354, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6966708898544312, + "num_tokens": 169060066.0, + "step": 6761 + }, + { + "epoch": 0.7425873050735778, + "grad_norm": 1.9920017719268799, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6983739733695984, + "num_tokens": 169088765.0, + "step": 6762 + }, + { + "epoch": 0.7426971227761915, + "grad_norm": 2.134053945541382, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7008887529373169, + "num_tokens": 169114289.0, + "step": 6763 + }, + { + "epoch": 0.7428069404788051, + "grad_norm": 2.2335259914398193, + "learning_rate": 1e-06, + "loss": 1.0357, + 
"mean_token_accuracy": 0.6809206008911133, + "num_tokens": 169139556.0, + "step": 6764 + }, + { + "epoch": 0.7429167581814189, + "grad_norm": 2.127814769744873, + "learning_rate": 1e-06, + "loss": 1.0733, + "mean_token_accuracy": 0.6788390874862671, + "num_tokens": 169167856.0, + "step": 6765 + }, + { + "epoch": 0.7430265758840325, + "grad_norm": 2.2195916175842285, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7070115804672241, + "num_tokens": 169191840.0, + "step": 6766 + }, + { + "epoch": 0.7431363935866462, + "grad_norm": 2.0794837474823, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.692118763923645, + "num_tokens": 169219739.0, + "step": 6767 + }, + { + "epoch": 0.7432462112892598, + "grad_norm": 2.097255229949951, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6967767477035522, + "num_tokens": 169247377.0, + "step": 6768 + }, + { + "epoch": 0.7433560289918735, + "grad_norm": 2.302792549133301, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7207626104354858, + "num_tokens": 169269604.0, + "step": 6769 + }, + { + "epoch": 0.7434658466944871, + "grad_norm": 2.2696337699890137, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.6937313079833984, + "num_tokens": 169294174.0, + "step": 6770 + }, + { + "epoch": 0.7435756643971008, + "grad_norm": 2.245757579803467, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7248839139938354, + "num_tokens": 169317531.0, + "step": 6771 + }, + { + "epoch": 0.7436854820997145, + "grad_norm": 2.0674614906311035, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7285127639770508, + "num_tokens": 169343575.0, + "step": 6772 + }, + { + "epoch": 0.7437952998023282, + "grad_norm": 2.2124922275543213, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7254638671875, + "num_tokens": 169367018.0, + "step": 6773 + }, + { + "epoch": 0.7439051175049418, + 
"grad_norm": 1.9820671081542969, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7104886770248413, + "num_tokens": 169396287.0, + "step": 6774 + }, + { + "epoch": 0.7440149352075555, + "grad_norm": 2.245590925216675, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7155776023864746, + "num_tokens": 169418475.0, + "step": 6775 + }, + { + "epoch": 0.7441247529101691, + "grad_norm": 2.1709578037261963, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.7008306980133057, + "num_tokens": 169443801.0, + "step": 6776 + }, + { + "epoch": 0.7442345706127828, + "grad_norm": 2.3000285625457764, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7075662612915039, + "num_tokens": 169466316.0, + "step": 6777 + }, + { + "epoch": 0.7443443883153964, + "grad_norm": 2.4860715866088867, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7094175815582275, + "num_tokens": 169486506.0, + "step": 6778 + }, + { + "epoch": 0.7444542060180102, + "grad_norm": 2.489319324493408, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7141209840774536, + "num_tokens": 169506263.0, + "step": 6779 + }, + { + "epoch": 0.7445640237206238, + "grad_norm": 2.223886728286743, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7339423298835754, + "num_tokens": 169530696.0, + "step": 6780 + }, + { + "epoch": 0.7446738414232374, + "grad_norm": 2.189709186553955, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6976046562194824, + "num_tokens": 169556679.0, + "step": 6781 + }, + { + "epoch": 0.7447836591258511, + "grad_norm": 2.313908338546753, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6885249018669128, + "num_tokens": 169578973.0, + "step": 6782 + }, + { + "epoch": 0.7448934768284647, + "grad_norm": 2.577653408050537, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7308135628700256, + 
"num_tokens": 169597483.0, + "step": 6783 + }, + { + "epoch": 0.7450032945310784, + "grad_norm": 2.221473455429077, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6933043599128723, + "num_tokens": 169622852.0, + "step": 6784 + }, + { + "epoch": 0.745113112233692, + "grad_norm": 2.1309468746185303, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.69612056016922, + "num_tokens": 169648876.0, + "step": 6785 + }, + { + "epoch": 0.7452229299363057, + "grad_norm": 2.233302354812622, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.685657799243927, + "num_tokens": 169674232.0, + "step": 6786 + }, + { + "epoch": 0.7453327476389194, + "grad_norm": 2.5730676651000977, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6991353034973145, + "num_tokens": 169693635.0, + "step": 6787 + }, + { + "epoch": 0.7454425653415331, + "grad_norm": 1.929254412651062, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6933819055557251, + "num_tokens": 169725149.0, + "step": 6788 + }, + { + "epoch": 0.7455523830441467, + "grad_norm": 2.2116777896881104, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7022794485092163, + "num_tokens": 169749229.0, + "step": 6789 + }, + { + "epoch": 0.7456622007467604, + "grad_norm": 2.550821304321289, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7219712734222412, + "num_tokens": 169768259.0, + "step": 6790 + }, + { + "epoch": 0.745772018449374, + "grad_norm": 2.5576012134552, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.727576494216919, + "num_tokens": 169787331.0, + "step": 6791 + }, + { + "epoch": 0.7458818361519877, + "grad_norm": 2.095745801925659, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7077666521072388, + "num_tokens": 169814685.0, + "step": 6792 + }, + { + "epoch": 0.7459916538546013, + "grad_norm": 1.8846577405929565, + "learning_rate": 
1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6957782506942749, + "num_tokens": 169849263.0, + "step": 6793 + }, + { + "epoch": 0.7461014715572151, + "grad_norm": 2.1753430366516113, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6913341283798218, + "num_tokens": 169876690.0, + "step": 6794 + }, + { + "epoch": 0.7462112892598287, + "grad_norm": 2.4177792072296143, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.69370436668396, + "num_tokens": 169900694.0, + "step": 6795 + }, + { + "epoch": 0.7463211069624424, + "grad_norm": 2.4384829998016357, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7161241769790649, + "num_tokens": 169923186.0, + "step": 6796 + }, + { + "epoch": 0.746430924665056, + "grad_norm": 2.0272796154022217, + "learning_rate": 1e-06, + "loss": 0.7743, + "mean_token_accuracy": 0.7475153207778931, + "num_tokens": 169948076.0, + "step": 6797 + }, + { + "epoch": 0.7465407423676697, + "grad_norm": 2.207590103149414, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7340987324714661, + "num_tokens": 169971421.0, + "step": 6798 + }, + { + "epoch": 0.7466505600702833, + "grad_norm": 2.0677764415740967, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7195115089416504, + "num_tokens": 169998890.0, + "step": 6799 + }, + { + "epoch": 0.7467603777728969, + "grad_norm": 2.21032977104187, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7333032488822937, + "num_tokens": 170022519.0, + "step": 6800 + }, + { + "epoch": 0.7468701954755107, + "grad_norm": 2.3578040599823, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.6954030990600586, + "num_tokens": 170044697.0, + "step": 6801 + }, + { + "epoch": 0.7469800131781243, + "grad_norm": 2.410855531692505, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.7034281492233276, + "num_tokens": 170068205.0, + "step": 6802 + }, + { + "epoch": 
0.747089830880738, + "grad_norm": 2.163637161254883, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.727013885974884, + "num_tokens": 170092347.0, + "step": 6803 + }, + { + "epoch": 0.7471996485833516, + "grad_norm": 2.53989577293396, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7172693610191345, + "num_tokens": 170112832.0, + "step": 6804 + }, + { + "epoch": 0.7473094662859653, + "grad_norm": 2.2873027324676514, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.6940737962722778, + "num_tokens": 170136393.0, + "step": 6805 + }, + { + "epoch": 0.7474192839885789, + "grad_norm": 2.5479912757873535, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7202399969100952, + "num_tokens": 170156878.0, + "step": 6806 + }, + { + "epoch": 0.7475291016911926, + "grad_norm": 2.0803442001342773, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6988177299499512, + "num_tokens": 170185200.0, + "step": 6807 + }, + { + "epoch": 0.7476389193938063, + "grad_norm": 2.2232306003570557, + "learning_rate": 1e-06, + "loss": 1.0479, + "mean_token_accuracy": 0.6797267198562622, + "num_tokens": 170211175.0, + "step": 6808 + }, + { + "epoch": 0.74774873709642, + "grad_norm": 2.197796583175659, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.727094292640686, + "num_tokens": 170235970.0, + "step": 6809 + }, + { + "epoch": 0.7478585547990336, + "grad_norm": 2.012922763824463, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7113417387008667, + "num_tokens": 170265492.0, + "step": 6810 + }, + { + "epoch": 0.7479683725016473, + "grad_norm": 2.576784372329712, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.724923849105835, + "num_tokens": 170283530.0, + "step": 6811 + }, + { + "epoch": 0.7480781902042609, + "grad_norm": 2.4447426795959473, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 
0.7159197330474854, + "num_tokens": 170303374.0, + "step": 6812 + }, + { + "epoch": 0.7481880079068746, + "grad_norm": 2.2081549167633057, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7066076993942261, + "num_tokens": 170330545.0, + "step": 6813 + }, + { + "epoch": 0.7482978256094882, + "grad_norm": 2.089921474456787, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7172419428825378, + "num_tokens": 170356806.0, + "step": 6814 + }, + { + "epoch": 0.7484076433121019, + "grad_norm": 2.038090705871582, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7103862762451172, + "num_tokens": 170384727.0, + "step": 6815 + }, + { + "epoch": 0.7485174610147156, + "grad_norm": 2.4115657806396484, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7028529644012451, + "num_tokens": 170406630.0, + "step": 6816 + }, + { + "epoch": 0.7486272787173293, + "grad_norm": 2.4550626277923584, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6912976503372192, + "num_tokens": 170427638.0, + "step": 6817 + }, + { + "epoch": 0.7487370964199429, + "grad_norm": 2.0211238861083984, + "learning_rate": 1e-06, + "loss": 1.0395, + "mean_token_accuracy": 0.6928187012672424, + "num_tokens": 170456064.0, + "step": 6818 + }, + { + "epoch": 0.7488469141225566, + "grad_norm": 2.28281569480896, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7142066359519958, + "num_tokens": 170477409.0, + "step": 6819 + }, + { + "epoch": 0.7489567318251702, + "grad_norm": 2.3853299617767334, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7070214152336121, + "num_tokens": 170499442.0, + "step": 6820 + }, + { + "epoch": 0.7490665495277838, + "grad_norm": 2.519526720046997, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7003564834594727, + "num_tokens": 170520284.0, + "step": 6821 + }, + { + "epoch": 0.7491763672303975, + "grad_norm": 
2.1931519508361816, + "learning_rate": 1e-06, + "loss": 1.0377, + "mean_token_accuracy": 0.6877515316009521, + "num_tokens": 170549160.0, + "step": 6822 + }, + { + "epoch": 0.7492861849330112, + "grad_norm": 2.358156442642212, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6930117607116699, + "num_tokens": 170572331.0, + "step": 6823 + }, + { + "epoch": 0.7493960026356249, + "grad_norm": 2.054847478866577, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.716557502746582, + "num_tokens": 170599557.0, + "step": 6824 + }, + { + "epoch": 0.7495058203382385, + "grad_norm": 2.3416988849639893, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7176818251609802, + "num_tokens": 170623165.0, + "step": 6825 + }, + { + "epoch": 0.7496156380408522, + "grad_norm": 2.1921725273132324, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7057540416717529, + "num_tokens": 170650181.0, + "step": 6826 + }, + { + "epoch": 0.7497254557434658, + "grad_norm": 2.1803550720214844, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7090104818344116, + "num_tokens": 170676579.0, + "step": 6827 + }, + { + "epoch": 0.7498352734460795, + "grad_norm": 2.148057460784912, + "learning_rate": 1e-06, + "loss": 0.9799, + "mean_token_accuracy": 0.7030856013298035, + "num_tokens": 170703018.0, + "step": 6828 + }, + { + "epoch": 0.7499450911486931, + "grad_norm": 2.1722183227539062, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7198430299758911, + "num_tokens": 170727752.0, + "step": 6829 + }, + { + "epoch": 0.7500549088513069, + "grad_norm": 2.012758493423462, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7261221408843994, + "num_tokens": 170754687.0, + "step": 6830 + }, + { + "epoch": 0.7501647265539205, + "grad_norm": 2.3680901527404785, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6785155534744263, + "num_tokens": 
170777589.0, + "step": 6831 + }, + { + "epoch": 0.7502745442565342, + "grad_norm": 1.9646409749984741, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7097309827804565, + "num_tokens": 170805493.0, + "step": 6832 + }, + { + "epoch": 0.7503843619591478, + "grad_norm": 2.318918228149414, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7093195915222168, + "num_tokens": 170828385.0, + "step": 6833 + }, + { + "epoch": 0.7504941796617615, + "grad_norm": 2.2528955936431885, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7361434698104858, + "num_tokens": 170851163.0, + "step": 6834 + }, + { + "epoch": 0.7506039973643751, + "grad_norm": 2.3449552059173584, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.707276463508606, + "num_tokens": 170873240.0, + "step": 6835 + }, + { + "epoch": 0.7507138150669888, + "grad_norm": 2.1881470680236816, + "learning_rate": 1e-06, + "loss": 1.0323, + "mean_token_accuracy": 0.6896451711654663, + "num_tokens": 170901918.0, + "step": 6836 + }, + { + "epoch": 0.7508236327696025, + "grad_norm": 2.303734064102173, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6866150498390198, + "num_tokens": 170926689.0, + "step": 6837 + }, + { + "epoch": 0.7509334504722162, + "grad_norm": 2.095931053161621, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.720035970211029, + "num_tokens": 170954728.0, + "step": 6838 + }, + { + "epoch": 0.7510432681748298, + "grad_norm": 2.04789400100708, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7170027494430542, + "num_tokens": 170983647.0, + "step": 6839 + }, + { + "epoch": 0.7511530858774434, + "grad_norm": 2.1293904781341553, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7102060914039612, + "num_tokens": 171008042.0, + "step": 6840 + }, + { + "epoch": 0.7512629035800571, + "grad_norm": 2.2866134643554688, + "learning_rate": 1e-06, + 
"loss": 0.9208, + "mean_token_accuracy": 0.713344395160675, + "num_tokens": 171030511.0, + "step": 6841 + }, + { + "epoch": 0.7513727212826707, + "grad_norm": 2.300294876098633, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7297185659408569, + "num_tokens": 171053077.0, + "step": 6842 + }, + { + "epoch": 0.7514825389852844, + "grad_norm": 1.8358094692230225, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.706257700920105, + "num_tokens": 171087059.0, + "step": 6843 + }, + { + "epoch": 0.7515923566878981, + "grad_norm": 2.3346095085144043, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7064496874809265, + "num_tokens": 171110551.0, + "step": 6844 + }, + { + "epoch": 0.7517021743905118, + "grad_norm": 2.159886598587036, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.7086973786354065, + "num_tokens": 171137026.0, + "step": 6845 + }, + { + "epoch": 0.7518119920931254, + "grad_norm": 1.8407721519470215, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7138158082962036, + "num_tokens": 171169729.0, + "step": 6846 + }, + { + "epoch": 0.7519218097957391, + "grad_norm": 2.051284074783325, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6954066753387451, + "num_tokens": 171198753.0, + "step": 6847 + }, + { + "epoch": 0.7520316274983527, + "grad_norm": 2.083754539489746, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7012771368026733, + "num_tokens": 171225772.0, + "step": 6848 + }, + { + "epoch": 0.7521414452009664, + "grad_norm": 1.9581209421157837, + "learning_rate": 1e-06, + "loss": 1.0611, + "mean_token_accuracy": 0.6853622198104858, + "num_tokens": 171256575.0, + "step": 6849 + }, + { + "epoch": 0.75225126290358, + "grad_norm": 2.0021042823791504, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7027910947799683, + "num_tokens": 171285213.0, + "step": 6850 + }, + { + "epoch": 
0.7523610806061937, + "grad_norm": 2.0352184772491455, + "learning_rate": 1e-06, + "loss": 1.0359, + "mean_token_accuracy": 0.6851115822792053, + "num_tokens": 171314586.0, + "step": 6851 + }, + { + "epoch": 0.7524708983088074, + "grad_norm": 2.277257204055786, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7023556232452393, + "num_tokens": 171338977.0, + "step": 6852 + }, + { + "epoch": 0.7525807160114211, + "grad_norm": 2.3402152061462402, + "learning_rate": 1e-06, + "loss": 1.0444, + "mean_token_accuracy": 0.7104208469390869, + "num_tokens": 171363846.0, + "step": 6853 + }, + { + "epoch": 0.7526905337140347, + "grad_norm": 2.11332368850708, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.7042500376701355, + "num_tokens": 171389345.0, + "step": 6854 + }, + { + "epoch": 0.7528003514166484, + "grad_norm": 2.0518996715545654, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7230156660079956, + "num_tokens": 171415895.0, + "step": 6855 + }, + { + "epoch": 0.752910169119262, + "grad_norm": 2.135747194290161, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7094599604606628, + "num_tokens": 171444607.0, + "step": 6856 + }, + { + "epoch": 0.7530199868218757, + "grad_norm": 2.2772223949432373, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7002118229866028, + "num_tokens": 171467459.0, + "step": 6857 + }, + { + "epoch": 0.7531298045244893, + "grad_norm": 2.1821770668029785, + "learning_rate": 1e-06, + "loss": 1.0531, + "mean_token_accuracy": 0.677043080329895, + "num_tokens": 171494602.0, + "step": 6858 + }, + { + "epoch": 0.7532396222271031, + "grad_norm": 1.9212379455566406, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7071475982666016, + "num_tokens": 171524736.0, + "step": 6859 + }, + { + "epoch": 0.7533494399297167, + "grad_norm": 2.2698614597320557, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 
0.7100303769111633, + "num_tokens": 171547610.0, + "step": 6860 + }, + { + "epoch": 0.7534592576323303, + "grad_norm": 2.2939226627349854, + "learning_rate": 1e-06, + "loss": 1.0328, + "mean_token_accuracy": 0.6925830245018005, + "num_tokens": 171572850.0, + "step": 6861 + }, + { + "epoch": 0.753569075334944, + "grad_norm": 2.119608163833618, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7105740308761597, + "num_tokens": 171600773.0, + "step": 6862 + }, + { + "epoch": 0.7536788930375576, + "grad_norm": 1.94105863571167, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7230024337768555, + "num_tokens": 171631800.0, + "step": 6863 + }, + { + "epoch": 0.7537887107401713, + "grad_norm": 2.1889402866363525, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6949883699417114, + "num_tokens": 171656941.0, + "step": 6864 + }, + { + "epoch": 0.7538985284427849, + "grad_norm": 2.0644166469573975, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.7095468044281006, + "num_tokens": 171683815.0, + "step": 6865 + }, + { + "epoch": 0.7540083461453987, + "grad_norm": 2.1408185958862305, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7194008231163025, + "num_tokens": 171711100.0, + "step": 6866 + }, + { + "epoch": 0.7541181638480123, + "grad_norm": 2.561399459838867, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7003675699234009, + "num_tokens": 171731055.0, + "step": 6867 + }, + { + "epoch": 0.754227981550626, + "grad_norm": 2.1552019119262695, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7263450622558594, + "num_tokens": 171755642.0, + "step": 6868 + }, + { + "epoch": 0.7543377992532396, + "grad_norm": 2.0346245765686035, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7040746212005615, + "num_tokens": 171785443.0, + "step": 6869 + }, + { + "epoch": 0.7544476169558533, + "grad_norm": 
1.9638324975967407, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7045261263847351, + "num_tokens": 171814359.0, + "step": 6870 + }, + { + "epoch": 0.7545574346584669, + "grad_norm": 2.589921236038208, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7114681005477905, + "num_tokens": 171832474.0, + "step": 6871 + }, + { + "epoch": 0.7546672523610806, + "grad_norm": 2.251804828643799, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7118682861328125, + "num_tokens": 171858446.0, + "step": 6872 + }, + { + "epoch": 0.7547770700636943, + "grad_norm": 2.173680305480957, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7023242712020874, + "num_tokens": 171883424.0, + "step": 6873 + }, + { + "epoch": 0.754886887766308, + "grad_norm": 2.0110647678375244, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7142698168754578, + "num_tokens": 171911422.0, + "step": 6874 + }, + { + "epoch": 0.7549967054689216, + "grad_norm": 2.250623941421509, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7034882307052612, + "num_tokens": 171934978.0, + "step": 6875 + }, + { + "epoch": 0.7551065231715353, + "grad_norm": 2.293208122253418, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6983789801597595, + "num_tokens": 171960660.0, + "step": 6876 + }, + { + "epoch": 0.7552163408741489, + "grad_norm": 2.444408893585205, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7015074491500854, + "num_tokens": 171981322.0, + "step": 6877 + }, + { + "epoch": 0.7553261585767626, + "grad_norm": 2.2861812114715576, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.683579683303833, + "num_tokens": 172005867.0, + "step": 6878 + }, + { + "epoch": 0.7554359762793762, + "grad_norm": 1.9659146070480347, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7209224104881287, + "num_tokens": 
172036807.0, + "step": 6879 + }, + { + "epoch": 0.7555457939819898, + "grad_norm": 2.8024120330810547, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7134087681770325, + "num_tokens": 172052505.0, + "step": 6880 + }, + { + "epoch": 0.7556556116846036, + "grad_norm": 2.175346612930298, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.7025023698806763, + "num_tokens": 172080284.0, + "step": 6881 + }, + { + "epoch": 0.7557654293872172, + "grad_norm": 2.019249200820923, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7158803939819336, + "num_tokens": 172107593.0, + "step": 6882 + }, + { + "epoch": 0.7558752470898309, + "grad_norm": 2.2524843215942383, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.7035964131355286, + "num_tokens": 172133884.0, + "step": 6883 + }, + { + "epoch": 0.7559850647924445, + "grad_norm": 2.1882169246673584, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7090053558349609, + "num_tokens": 172157835.0, + "step": 6884 + }, + { + "epoch": 0.7560948824950582, + "grad_norm": 2.4381814002990723, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7164583206176758, + "num_tokens": 172177963.0, + "step": 6885 + }, + { + "epoch": 0.7562047001976718, + "grad_norm": 1.9086546897888184, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.6960482597351074, + "num_tokens": 172210451.0, + "step": 6886 + }, + { + "epoch": 0.7563145179002855, + "grad_norm": 2.2317276000976562, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7195791602134705, + "num_tokens": 172233692.0, + "step": 6887 + }, + { + "epoch": 0.7564243356028992, + "grad_norm": 2.0238072872161865, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6984188556671143, + "num_tokens": 172263489.0, + "step": 6888 + }, + { + "epoch": 0.7565341533055129, + "grad_norm": 2.0262036323547363, + "learning_rate": 1e-06, + 
"loss": 1.0133, + "mean_token_accuracy": 0.6918637752532959, + "num_tokens": 172291669.0, + "step": 6889 + }, + { + "epoch": 0.7566439710081265, + "grad_norm": 2.3298802375793457, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7078973650932312, + "num_tokens": 172316101.0, + "step": 6890 + }, + { + "epoch": 0.7567537887107402, + "grad_norm": 1.9536776542663574, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7304449677467346, + "num_tokens": 172344648.0, + "step": 6891 + }, + { + "epoch": 0.7568636064133538, + "grad_norm": 2.421243667602539, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7072120904922485, + "num_tokens": 172364471.0, + "step": 6892 + }, + { + "epoch": 0.7569734241159675, + "grad_norm": 2.389655351638794, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7014309763908386, + "num_tokens": 172387146.0, + "step": 6893 + }, + { + "epoch": 0.7570832418185811, + "grad_norm": 2.5644357204437256, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7205154299736023, + "num_tokens": 172405049.0, + "step": 6894 + }, + { + "epoch": 0.7571930595211949, + "grad_norm": 2.146944046020508, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7202893495559692, + "num_tokens": 172428815.0, + "step": 6895 + }, + { + "epoch": 0.7573028772238085, + "grad_norm": 2.5136778354644775, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7273329496383667, + "num_tokens": 172447989.0, + "step": 6896 + }, + { + "epoch": 0.7574126949264222, + "grad_norm": 2.2619080543518066, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6986242532730103, + "num_tokens": 172470937.0, + "step": 6897 + }, + { + "epoch": 0.7575225126290358, + "grad_norm": 2.364659547805786, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7133128046989441, + "num_tokens": 172491940.0, + "step": 6898 + }, + { + "epoch": 
0.7576323303316495, + "grad_norm": 1.9317810535430908, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6862025260925293, + "num_tokens": 172524048.0, + "step": 6899 + }, + { + "epoch": 0.7577421480342631, + "grad_norm": 2.090850353240967, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.696613073348999, + "num_tokens": 172551434.0, + "step": 6900 + }, + { + "epoch": 0.7578519657368767, + "grad_norm": 1.9137951135635376, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7081268429756165, + "num_tokens": 172584438.0, + "step": 6901 + }, + { + "epoch": 0.7579617834394905, + "grad_norm": 2.6140975952148438, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7024877071380615, + "num_tokens": 172601806.0, + "step": 6902 + }, + { + "epoch": 0.7580716011421041, + "grad_norm": 2.066227436065674, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7110486030578613, + "num_tokens": 172627842.0, + "step": 6903 + }, + { + "epoch": 0.7581814188447178, + "grad_norm": 2.1232564449310303, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7072302103042603, + "num_tokens": 172654015.0, + "step": 6904 + }, + { + "epoch": 0.7582912365473314, + "grad_norm": 2.038889169692993, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7172080278396606, + "num_tokens": 172682592.0, + "step": 6905 + }, + { + "epoch": 0.7584010542499451, + "grad_norm": 2.0999248027801514, + "learning_rate": 1e-06, + "loss": 1.0954, + "mean_token_accuracy": 0.6717300415039062, + "num_tokens": 172709362.0, + "step": 6906 + }, + { + "epoch": 0.7585108719525587, + "grad_norm": 2.397742509841919, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7368283271789551, + "num_tokens": 172730598.0, + "step": 6907 + }, + { + "epoch": 0.7586206896551724, + "grad_norm": 2.121731996536255, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 
0.6938663125038147, + "num_tokens": 172758314.0, + "step": 6908 + }, + { + "epoch": 0.758730507357786, + "grad_norm": 2.1733601093292236, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7115520238876343, + "num_tokens": 172782926.0, + "step": 6909 + }, + { + "epoch": 0.7588403250603998, + "grad_norm": 2.381775140762329, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7241538763046265, + "num_tokens": 172803641.0, + "step": 6910 + }, + { + "epoch": 0.7589501427630134, + "grad_norm": 1.9679462909698486, + "learning_rate": 1e-06, + "loss": 0.8277, + "mean_token_accuracy": 0.7403090000152588, + "num_tokens": 172830218.0, + "step": 6911 + }, + { + "epoch": 0.7590599604656271, + "grad_norm": 2.0347955226898193, + "learning_rate": 1e-06, + "loss": 1.1211, + "mean_token_accuracy": 0.6634044647216797, + "num_tokens": 172861948.0, + "step": 6912 + }, + { + "epoch": 0.7591697781682407, + "grad_norm": 2.565922975540161, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7081667184829712, + "num_tokens": 172882146.0, + "step": 6913 + }, + { + "epoch": 0.7592795958708544, + "grad_norm": 2.108172655105591, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7079892158508301, + "num_tokens": 172909043.0, + "step": 6914 + }, + { + "epoch": 0.759389413573468, + "grad_norm": 2.1898245811462402, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7056610584259033, + "num_tokens": 172934628.0, + "step": 6915 + }, + { + "epoch": 0.7594992312760817, + "grad_norm": 2.3287668228149414, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6962607502937317, + "num_tokens": 172957868.0, + "step": 6916 + }, + { + "epoch": 0.7596090489786954, + "grad_norm": 1.9524251222610474, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.701015055179596, + "num_tokens": 172988865.0, + "step": 6917 + }, + { + "epoch": 0.7597188666813091, + "grad_norm": 
2.1684536933898926, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6876226663589478, + "num_tokens": 173014996.0, + "step": 6918 + }, + { + "epoch": 0.7598286843839227, + "grad_norm": 2.1752519607543945, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7312343120574951, + "num_tokens": 173040202.0, + "step": 6919 + }, + { + "epoch": 0.7599385020865363, + "grad_norm": 2.063401937484741, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7029596567153931, + "num_tokens": 173067618.0, + "step": 6920 + }, + { + "epoch": 0.76004831978915, + "grad_norm": 1.9638994932174683, + "learning_rate": 1e-06, + "loss": 1.098, + "mean_token_accuracy": 0.6728554964065552, + "num_tokens": 173100393.0, + "step": 6921 + }, + { + "epoch": 0.7601581374917636, + "grad_norm": 2.24896502494812, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7318121194839478, + "num_tokens": 173125146.0, + "step": 6922 + }, + { + "epoch": 0.7602679551943773, + "grad_norm": 2.246203899383545, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7352641224861145, + "num_tokens": 173147967.0, + "step": 6923 + }, + { + "epoch": 0.760377772896991, + "grad_norm": 2.2654478549957275, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6931746602058411, + "num_tokens": 173175145.0, + "step": 6924 + }, + { + "epoch": 0.7604875905996047, + "grad_norm": 2.6712305545806885, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7176691293716431, + "num_tokens": 173194313.0, + "step": 6925 + }, + { + "epoch": 0.7605974083022183, + "grad_norm": 2.4502713680267334, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7232692241668701, + "num_tokens": 173214493.0, + "step": 6926 + }, + { + "epoch": 0.760707226004832, + "grad_norm": 2.219055652618408, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7025792002677917, + "num_tokens": 173238616.0, 
+ "step": 6927 + }, + { + "epoch": 0.7608170437074456, + "grad_norm": 1.907373070716858, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6947471499443054, + "num_tokens": 173272313.0, + "step": 6928 + }, + { + "epoch": 0.7609268614100593, + "grad_norm": 2.6388375759124756, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7217046022415161, + "num_tokens": 173289427.0, + "step": 6929 + }, + { + "epoch": 0.7610366791126729, + "grad_norm": 2.5421762466430664, + "learning_rate": 1e-06, + "loss": 1.0399, + "mean_token_accuracy": 0.6893117427825928, + "num_tokens": 173310583.0, + "step": 6930 + }, + { + "epoch": 0.7611464968152867, + "grad_norm": 2.3274333477020264, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7082029581069946, + "num_tokens": 173334183.0, + "step": 6931 + }, + { + "epoch": 0.7612563145179003, + "grad_norm": 2.164667844772339, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6938903331756592, + "num_tokens": 173360489.0, + "step": 6932 + }, + { + "epoch": 0.761366132220514, + "grad_norm": 2.1955020427703857, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7114260196685791, + "num_tokens": 173383577.0, + "step": 6933 + }, + { + "epoch": 0.7614759499231276, + "grad_norm": 2.202678680419922, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.723076343536377, + "num_tokens": 173406758.0, + "step": 6934 + }, + { + "epoch": 0.7615857676257413, + "grad_norm": 2.218052625656128, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7077685594558716, + "num_tokens": 173430762.0, + "step": 6935 + }, + { + "epoch": 0.7616955853283549, + "grad_norm": 1.800408124923706, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6967591643333435, + "num_tokens": 173466575.0, + "step": 6936 + }, + { + "epoch": 0.7618054030309686, + "grad_norm": 2.4738898277282715, + "learning_rate": 1e-06, + "loss": 0.9495, + 
"mean_token_accuracy": 0.7150972485542297, + "num_tokens": 173487080.0, + "step": 6937 + }, + { + "epoch": 0.7619152207335822, + "grad_norm": 2.407026767730713, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7359504699707031, + "num_tokens": 173507039.0, + "step": 6938 + }, + { + "epoch": 0.762025038436196, + "grad_norm": 2.050762891769409, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.726250171661377, + "num_tokens": 173534623.0, + "step": 6939 + }, + { + "epoch": 0.7621348561388096, + "grad_norm": 2.2000951766967773, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7056044340133667, + "num_tokens": 173560602.0, + "step": 6940 + }, + { + "epoch": 0.7622446738414232, + "grad_norm": 2.3074893951416016, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7264052033424377, + "num_tokens": 173582372.0, + "step": 6941 + }, + { + "epoch": 0.7623544915440369, + "grad_norm": 2.0768208503723145, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7351205348968506, + "num_tokens": 173608367.0, + "step": 6942 + }, + { + "epoch": 0.7624643092466505, + "grad_norm": 2.095914840698242, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7439046502113342, + "num_tokens": 173633928.0, + "step": 6943 + }, + { + "epoch": 0.7625741269492642, + "grad_norm": 2.254944086074829, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6937788128852844, + "num_tokens": 173658202.0, + "step": 6944 + }, + { + "epoch": 0.7626839446518778, + "grad_norm": 2.0890705585479736, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7119324803352356, + "num_tokens": 173684501.0, + "step": 6945 + }, + { + "epoch": 0.7627937623544916, + "grad_norm": 2.107374906539917, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7120035886764526, + "num_tokens": 173712775.0, + "step": 6946 + }, + { + "epoch": 0.7629035800571052, + 
"grad_norm": 2.5660767555236816, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.712296187877655, + "num_tokens": 173731297.0, + "step": 6947 + }, + { + "epoch": 0.7630133977597189, + "grad_norm": 2.147204637527466, + "learning_rate": 1e-06, + "loss": 1.0939, + "mean_token_accuracy": 0.6757014989852905, + "num_tokens": 173759884.0, + "step": 6948 + }, + { + "epoch": 0.7631232154623325, + "grad_norm": 2.220949649810791, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.724746823310852, + "num_tokens": 173783124.0, + "step": 6949 + }, + { + "epoch": 0.7632330331649462, + "grad_norm": 2.2939047813415527, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7134311199188232, + "num_tokens": 173805768.0, + "step": 6950 + }, + { + "epoch": 0.7633428508675598, + "grad_norm": 2.0162932872772217, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6898308992385864, + "num_tokens": 173837123.0, + "step": 6951 + }, + { + "epoch": 0.7634526685701735, + "grad_norm": 2.1463568210601807, + "learning_rate": 1e-06, + "loss": 1.0214, + "mean_token_accuracy": 0.6924679279327393, + "num_tokens": 173864139.0, + "step": 6952 + }, + { + "epoch": 0.7635624862727872, + "grad_norm": 2.4737119674682617, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7278412580490112, + "num_tokens": 173884652.0, + "step": 6953 + }, + { + "epoch": 0.7636723039754009, + "grad_norm": 2.2178292274475098, + "learning_rate": 1e-06, + "loss": 1.0236, + "mean_token_accuracy": 0.687131404876709, + "num_tokens": 173910366.0, + "step": 6954 + }, + { + "epoch": 0.7637821216780145, + "grad_norm": 2.04146146774292, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7360602617263794, + "num_tokens": 173935683.0, + "step": 6955 + }, + { + "epoch": 0.7638919393806282, + "grad_norm": 2.002372980117798, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6982582211494446, + "num_tokens": 
173963762.0, + "step": 6956 + }, + { + "epoch": 0.7640017570832418, + "grad_norm": 2.3945953845977783, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.6993975639343262, + "num_tokens": 173984754.0, + "step": 6957 + }, + { + "epoch": 0.7641115747858555, + "grad_norm": 2.0714614391326904, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6934270262718201, + "num_tokens": 174012519.0, + "step": 6958 + }, + { + "epoch": 0.7642213924884691, + "grad_norm": 2.1627042293548584, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7252590656280518, + "num_tokens": 174036513.0, + "step": 6959 + }, + { + "epoch": 0.7643312101910829, + "grad_norm": 2.5002477169036865, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7171486616134644, + "num_tokens": 174055410.0, + "step": 6960 + }, + { + "epoch": 0.7644410278936965, + "grad_norm": 2.750864267349243, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7106400728225708, + "num_tokens": 174073550.0, + "step": 6961 + }, + { + "epoch": 0.7645508455963101, + "grad_norm": 2.149353504180908, + "learning_rate": 1e-06, + "loss": 1.0209, + "mean_token_accuracy": 0.6916757822036743, + "num_tokens": 174099470.0, + "step": 6962 + }, + { + "epoch": 0.7646606632989238, + "grad_norm": 1.984251856803894, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.7024518251419067, + "num_tokens": 174130343.0, + "step": 6963 + }, + { + "epoch": 0.7647704810015374, + "grad_norm": 2.2901787757873535, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7078843116760254, + "num_tokens": 174153104.0, + "step": 6964 + }, + { + "epoch": 0.7648802987041511, + "grad_norm": 2.262331485748291, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7115222215652466, + "num_tokens": 174176728.0, + "step": 6965 + }, + { + "epoch": 0.7649901164067647, + "grad_norm": 2.1635258197784424, + "learning_rate": 1e-06, + 
"loss": 0.9549, + "mean_token_accuracy": 0.7048846483230591, + "num_tokens": 174206368.0, + "step": 6966 + }, + { + "epoch": 0.7650999341093784, + "grad_norm": 2.418257713317871, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7289594411849976, + "num_tokens": 174227671.0, + "step": 6967 + }, + { + "epoch": 0.7652097518119921, + "grad_norm": 2.2097554206848145, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.6967474818229675, + "num_tokens": 174253639.0, + "step": 6968 + }, + { + "epoch": 0.7653195695146058, + "grad_norm": 2.092189311981201, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7171280980110168, + "num_tokens": 174280084.0, + "step": 6969 + }, + { + "epoch": 0.7654293872172194, + "grad_norm": 1.9306048154830933, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6873393654823303, + "num_tokens": 174310109.0, + "step": 6970 + }, + { + "epoch": 0.7655392049198331, + "grad_norm": 2.2606608867645264, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.6966050863265991, + "num_tokens": 174333768.0, + "step": 6971 + }, + { + "epoch": 0.7656490226224467, + "grad_norm": 2.369307279586792, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.7148538827896118, + "num_tokens": 174354836.0, + "step": 6972 + }, + { + "epoch": 0.7657588403250604, + "grad_norm": 2.304230213165283, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7276473045349121, + "num_tokens": 174376038.0, + "step": 6973 + }, + { + "epoch": 0.765868658027674, + "grad_norm": 2.3408243656158447, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7032468914985657, + "num_tokens": 174398179.0, + "step": 6974 + }, + { + "epoch": 0.7659784757302878, + "grad_norm": 1.9823291301727295, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7104834914207458, + "num_tokens": 174429253.0, + "step": 6975 + }, + { + "epoch": 
0.7660882934329014, + "grad_norm": 2.2684166431427, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6939219236373901, + "num_tokens": 174453160.0, + "step": 6976 + }, + { + "epoch": 0.7661981111355151, + "grad_norm": 2.2803282737731934, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7041025161743164, + "num_tokens": 174476663.0, + "step": 6977 + }, + { + "epoch": 0.7663079288381287, + "grad_norm": 2.5707294940948486, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7292789220809937, + "num_tokens": 174494104.0, + "step": 6978 + }, + { + "epoch": 0.7664177465407424, + "grad_norm": 2.492774486541748, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.6976885795593262, + "num_tokens": 174514703.0, + "step": 6979 + }, + { + "epoch": 0.766527564243356, + "grad_norm": 2.1048271656036377, + "learning_rate": 1e-06, + "loss": 1.0387, + "mean_token_accuracy": 0.6857109665870667, + "num_tokens": 174542249.0, + "step": 6980 + }, + { + "epoch": 0.7666373819459696, + "grad_norm": 2.2553296089172363, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7147599458694458, + "num_tokens": 174566431.0, + "step": 6981 + }, + { + "epoch": 0.7667471996485834, + "grad_norm": 2.1283018589019775, + "learning_rate": 1e-06, + "loss": 1.0705, + "mean_token_accuracy": 0.6746538281440735, + "num_tokens": 174594264.0, + "step": 6982 + }, + { + "epoch": 0.766857017351197, + "grad_norm": 2.68902850151062, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7100390195846558, + "num_tokens": 174612494.0, + "step": 6983 + }, + { + "epoch": 0.7669668350538107, + "grad_norm": 2.011903762817383, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7058030366897583, + "num_tokens": 174642585.0, + "step": 6984 + }, + { + "epoch": 0.7670766527564243, + "grad_norm": 2.08776593208313, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 
0.7303807735443115, + "num_tokens": 174668718.0, + "step": 6985 + }, + { + "epoch": 0.767186470459038, + "grad_norm": 2.4215452671051025, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7161043882369995, + "num_tokens": 174689172.0, + "step": 6986 + }, + { + "epoch": 0.7672962881616516, + "grad_norm": 2.368849992752075, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.708463191986084, + "num_tokens": 174711097.0, + "step": 6987 + }, + { + "epoch": 0.7674061058642653, + "grad_norm": 1.9166736602783203, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6956249475479126, + "num_tokens": 174743500.0, + "step": 6988 + }, + { + "epoch": 0.767515923566879, + "grad_norm": 2.3305912017822266, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7038896083831787, + "num_tokens": 174766603.0, + "step": 6989 + }, + { + "epoch": 0.7676257412694927, + "grad_norm": 2.0536627769470215, + "learning_rate": 1e-06, + "loss": 1.0035, + "mean_token_accuracy": 0.7025245428085327, + "num_tokens": 174797812.0, + "step": 6990 + }, + { + "epoch": 0.7677355589721063, + "grad_norm": 2.3105289936065674, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7004002332687378, + "num_tokens": 174821696.0, + "step": 6991 + }, + { + "epoch": 0.76784537667472, + "grad_norm": 1.9588474035263062, + "learning_rate": 1e-06, + "loss": 1.0554, + "mean_token_accuracy": 0.6830679178237915, + "num_tokens": 174855551.0, + "step": 6992 + }, + { + "epoch": 0.7679551943773336, + "grad_norm": 2.140397310256958, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7121421098709106, + "num_tokens": 174882084.0, + "step": 6993 + }, + { + "epoch": 0.7680650120799473, + "grad_norm": 2.235919237136841, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7329689264297485, + "num_tokens": 174904566.0, + "step": 6994 + }, + { + "epoch": 0.7681748297825609, + "grad_norm": 
2.1010916233062744, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7045397162437439, + "num_tokens": 174931289.0, + "step": 6995 + }, + { + "epoch": 0.7682846474851747, + "grad_norm": 2.0162317752838135, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.6910586953163147, + "num_tokens": 174961054.0, + "step": 6996 + }, + { + "epoch": 0.7683944651877883, + "grad_norm": 2.2729573249816895, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.6959058046340942, + "num_tokens": 174985116.0, + "step": 6997 + }, + { + "epoch": 0.768504282890402, + "grad_norm": 2.2287161350250244, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6872265338897705, + "num_tokens": 175010264.0, + "step": 6998 + }, + { + "epoch": 0.7686141005930156, + "grad_norm": 2.1005032062530518, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7143694162368774, + "num_tokens": 175038370.0, + "step": 6999 + }, + { + "epoch": 0.7687239182956292, + "grad_norm": 2.5427966117858887, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7286238074302673, + "num_tokens": 175059718.0, + "step": 7000 + }, + { + "epoch": 0.7688337359982429, + "grad_norm": 2.3235538005828857, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7099472284317017, + "num_tokens": 175080518.0, + "step": 7001 + }, + { + "epoch": 0.7689435537008565, + "grad_norm": 2.169325351715088, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7040815353393555, + "num_tokens": 175106787.0, + "step": 7002 + }, + { + "epoch": 0.7690533714034702, + "grad_norm": 2.0503854751586914, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7056599855422974, + "num_tokens": 175133468.0, + "step": 7003 + }, + { + "epoch": 0.7691631891060839, + "grad_norm": 2.13340163230896, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7283236384391785, + "num_tokens": 
175160329.0, + "step": 7004 + }, + { + "epoch": 0.7692730068086976, + "grad_norm": 2.330533504486084, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7128201723098755, + "num_tokens": 175181605.0, + "step": 7005 + }, + { + "epoch": 0.7693828245113112, + "grad_norm": 2.433432102203369, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7166277766227722, + "num_tokens": 175201865.0, + "step": 7006 + }, + { + "epoch": 0.7694926422139249, + "grad_norm": 2.0450727939605713, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7230260372161865, + "num_tokens": 175228745.0, + "step": 7007 + }, + { + "epoch": 0.7696024599165385, + "grad_norm": 2.2415802478790283, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7028535604476929, + "num_tokens": 175253942.0, + "step": 7008 + }, + { + "epoch": 0.7697122776191522, + "grad_norm": 2.2503693103790283, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7288990020751953, + "num_tokens": 175277273.0, + "step": 7009 + }, + { + "epoch": 0.7698220953217658, + "grad_norm": 1.945626139640808, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6943622827529907, + "num_tokens": 175308012.0, + "step": 7010 + }, + { + "epoch": 0.7699319130243796, + "grad_norm": 2.186828851699829, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7020952105522156, + "num_tokens": 175332601.0, + "step": 7011 + }, + { + "epoch": 0.7700417307269932, + "grad_norm": 2.229285955429077, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7022335529327393, + "num_tokens": 175357999.0, + "step": 7012 + }, + { + "epoch": 0.7701515484296069, + "grad_norm": 2.457987070083618, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.727645993232727, + "num_tokens": 175377530.0, + "step": 7013 + }, + { + "epoch": 0.7702613661322205, + "grad_norm": 2.5342793464660645, + "learning_rate": 1e-06, + 
"loss": 0.8946, + "mean_token_accuracy": 0.7172268033027649, + "num_tokens": 175396918.0, + "step": 7014 + }, + { + "epoch": 0.7703711838348342, + "grad_norm": 2.031768798828125, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6941977739334106, + "num_tokens": 175424341.0, + "step": 7015 + }, + { + "epoch": 0.7704810015374478, + "grad_norm": 2.1899898052215576, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7166814804077148, + "num_tokens": 175449173.0, + "step": 7016 + }, + { + "epoch": 0.7705908192400615, + "grad_norm": 2.3308889865875244, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7103044986724854, + "num_tokens": 175471203.0, + "step": 7017 + }, + { + "epoch": 0.7707006369426752, + "grad_norm": 2.392788887023926, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7131339311599731, + "num_tokens": 175491166.0, + "step": 7018 + }, + { + "epoch": 0.7708104546452889, + "grad_norm": 2.087982177734375, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.692063570022583, + "num_tokens": 175519379.0, + "step": 7019 + }, + { + "epoch": 0.7709202723479025, + "grad_norm": 2.0462071895599365, + "learning_rate": 1e-06, + "loss": 1.0412, + "mean_token_accuracy": 0.6832034587860107, + "num_tokens": 175548714.0, + "step": 7020 + }, + { + "epoch": 0.7710300900505161, + "grad_norm": 2.0536043643951416, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7202266454696655, + "num_tokens": 175573894.0, + "step": 7021 + }, + { + "epoch": 0.7711399077531298, + "grad_norm": 2.1766035556793213, + "learning_rate": 1e-06, + "loss": 1.0474, + "mean_token_accuracy": 0.6869760751724243, + "num_tokens": 175601493.0, + "step": 7022 + }, + { + "epoch": 0.7712497254557434, + "grad_norm": 2.4337329864501953, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.7109270095825195, + "num_tokens": 175622308.0, + "step": 7023 + }, + { + "epoch": 
0.7713595431583571, + "grad_norm": 2.2663660049438477, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7180348634719849, + "num_tokens": 175646456.0, + "step": 7024 + }, + { + "epoch": 0.7714693608609708, + "grad_norm": 2.125413179397583, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6968750953674316, + "num_tokens": 175674858.0, + "step": 7025 + }, + { + "epoch": 0.7715791785635845, + "grad_norm": 2.2217137813568115, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7008728384971619, + "num_tokens": 175699914.0, + "step": 7026 + }, + { + "epoch": 0.7716889962661981, + "grad_norm": 2.3339900970458984, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7114264965057373, + "num_tokens": 175725806.0, + "step": 7027 + }, + { + "epoch": 0.7717988139688118, + "grad_norm": 2.501673460006714, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.6995970010757446, + "num_tokens": 175746886.0, + "step": 7028 + }, + { + "epoch": 0.7719086316714254, + "grad_norm": 2.338702917098999, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7218090295791626, + "num_tokens": 175771498.0, + "step": 7029 + }, + { + "epoch": 0.7720184493740391, + "grad_norm": 2.5371310710906982, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.7116001844406128, + "num_tokens": 175791981.0, + "step": 7030 + }, + { + "epoch": 0.7721282670766527, + "grad_norm": 2.2445645332336426, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7268171906471252, + "num_tokens": 175816143.0, + "step": 7031 + }, + { + "epoch": 0.7722380847792664, + "grad_norm": 2.3487389087677, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6899131536483765, + "num_tokens": 175840409.0, + "step": 7032 + }, + { + "epoch": 0.7723479024818801, + "grad_norm": 2.445786952972412, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 
0.7091518640518188, + "num_tokens": 175862110.0, + "step": 7033 + }, + { + "epoch": 0.7724577201844938, + "grad_norm": 2.229323387145996, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.6927988529205322, + "num_tokens": 175887227.0, + "step": 7034 + }, + { + "epoch": 0.7725675378871074, + "grad_norm": 2.407228946685791, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7276914119720459, + "num_tokens": 175908179.0, + "step": 7035 + }, + { + "epoch": 0.7726773555897211, + "grad_norm": 2.03145170211792, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7256465554237366, + "num_tokens": 175934794.0, + "step": 7036 + }, + { + "epoch": 0.7727871732923347, + "grad_norm": 2.3072197437286377, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6955536603927612, + "num_tokens": 175961735.0, + "step": 7037 + }, + { + "epoch": 0.7728969909949484, + "grad_norm": 2.5728838443756104, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7113720178604126, + "num_tokens": 175982804.0, + "step": 7038 + }, + { + "epoch": 0.773006808697562, + "grad_norm": 2.2645609378814697, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7054386734962463, + "num_tokens": 176006331.0, + "step": 7039 + }, + { + "epoch": 0.7731166264001758, + "grad_norm": 2.392517566680908, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6942083239555359, + "num_tokens": 176029586.0, + "step": 7040 + }, + { + "epoch": 0.7732264441027894, + "grad_norm": 2.355412483215332, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7048609256744385, + "num_tokens": 176052704.0, + "step": 7041 + }, + { + "epoch": 0.773336261805403, + "grad_norm": 2.0295605659484863, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7147308588027954, + "num_tokens": 176080661.0, + "step": 7042 + }, + { + "epoch": 0.7734460795080167, + "grad_norm": 2.204759120941162, 
+ "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7208065986633301, + "num_tokens": 176103682.0, + "step": 7043 + }, + { + "epoch": 0.7735558972106303, + "grad_norm": 2.2512564659118652, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7262782454490662, + "num_tokens": 176128703.0, + "step": 7044 + }, + { + "epoch": 0.773665714913244, + "grad_norm": 2.0907230377197266, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.686560869216919, + "num_tokens": 176157086.0, + "step": 7045 + }, + { + "epoch": 0.7737755326158576, + "grad_norm": 2.010498046875, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7023383378982544, + "num_tokens": 176183685.0, + "step": 7046 + }, + { + "epoch": 0.7738853503184714, + "grad_norm": 2.1589577198028564, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.712734043598175, + "num_tokens": 176210270.0, + "step": 7047 + }, + { + "epoch": 0.773995168021085, + "grad_norm": 2.301581382751465, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6857420206069946, + "num_tokens": 176235991.0, + "step": 7048 + }, + { + "epoch": 0.7741049857236987, + "grad_norm": 2.384242057800293, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7147054672241211, + "num_tokens": 176259064.0, + "step": 7049 + }, + { + "epoch": 0.7742148034263123, + "grad_norm": 2.3861045837402344, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7329734563827515, + "num_tokens": 176279561.0, + "step": 7050 + }, + { + "epoch": 0.774324621128926, + "grad_norm": 2.2404465675354004, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7100374698638916, + "num_tokens": 176303972.0, + "step": 7051 + }, + { + "epoch": 0.7744344388315396, + "grad_norm": 2.2216267585754395, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7184001207351685, + "num_tokens": 176329007.0, + "step": 7052 + }, 
+ { + "epoch": 0.7745442565341533, + "grad_norm": 2.648360252380371, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7156557440757751, + "num_tokens": 176347626.0, + "step": 7053 + }, + { + "epoch": 0.774654074236767, + "grad_norm": 2.3157296180725098, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7128610610961914, + "num_tokens": 176371831.0, + "step": 7054 + }, + { + "epoch": 0.7747638919393807, + "grad_norm": 2.340956449508667, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7209459543228149, + "num_tokens": 176395318.0, + "step": 7055 + }, + { + "epoch": 0.7748737096419943, + "grad_norm": 2.252847909927368, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7091645002365112, + "num_tokens": 176420859.0, + "step": 7056 + }, + { + "epoch": 0.774983527344608, + "grad_norm": 2.36663818359375, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7106809020042419, + "num_tokens": 176442562.0, + "step": 7057 + }, + { + "epoch": 0.7750933450472216, + "grad_norm": 2.3850350379943848, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7008705139160156, + "num_tokens": 176465567.0, + "step": 7058 + }, + { + "epoch": 0.7752031627498353, + "grad_norm": 2.0052671432495117, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6946668028831482, + "num_tokens": 176496104.0, + "step": 7059 + }, + { + "epoch": 0.7753129804524489, + "grad_norm": 2.560960531234741, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7113940119743347, + "num_tokens": 176516041.0, + "step": 7060 + }, + { + "epoch": 0.7754227981550625, + "grad_norm": 2.805513858795166, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7305748462677002, + "num_tokens": 176533278.0, + "step": 7061 + }, + { + "epoch": 0.7755326158576763, + "grad_norm": 2.329925298690796, + "learning_rate": 1e-06, + "loss": 0.8878, + 
"mean_token_accuracy": 0.7213811874389648, + "num_tokens": 176555460.0, + "step": 7062 + }, + { + "epoch": 0.77564243356029, + "grad_norm": 2.5060360431671143, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.732040286064148, + "num_tokens": 176574116.0, + "step": 7063 + }, + { + "epoch": 0.7757522512629036, + "grad_norm": 2.0066044330596924, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7087655663490295, + "num_tokens": 176602109.0, + "step": 7064 + }, + { + "epoch": 0.7758620689655172, + "grad_norm": 2.1880264282226562, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7051209807395935, + "num_tokens": 176626214.0, + "step": 7065 + }, + { + "epoch": 0.7759718866681309, + "grad_norm": 2.1527321338653564, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7001128792762756, + "num_tokens": 176650501.0, + "step": 7066 + }, + { + "epoch": 0.7760817043707445, + "grad_norm": 2.3282015323638916, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.73389732837677, + "num_tokens": 176672511.0, + "step": 7067 + }, + { + "epoch": 0.7761915220733582, + "grad_norm": 1.9874051809310913, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7002594470977783, + "num_tokens": 176702327.0, + "step": 7068 + }, + { + "epoch": 0.7763013397759719, + "grad_norm": 2.2012665271759033, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7089172005653381, + "num_tokens": 176726893.0, + "step": 7069 + }, + { + "epoch": 0.7764111574785856, + "grad_norm": 2.472285270690918, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7205125093460083, + "num_tokens": 176746068.0, + "step": 7070 + }, + { + "epoch": 0.7765209751811992, + "grad_norm": 2.1991469860076904, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7104814648628235, + "num_tokens": 176769543.0, + "step": 7071 + }, + { + "epoch": 0.7766307928838129, + 
"grad_norm": 2.589493989944458, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7070467472076416, + "num_tokens": 176790156.0, + "step": 7072 + }, + { + "epoch": 0.7767406105864265, + "grad_norm": 2.3850600719451904, + "learning_rate": 1e-06, + "loss": 1.027, + "mean_token_accuracy": 0.6888898611068726, + "num_tokens": 176813523.0, + "step": 7073 + }, + { + "epoch": 0.7768504282890402, + "grad_norm": 2.1965131759643555, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7199193835258484, + "num_tokens": 176837248.0, + "step": 7074 + }, + { + "epoch": 0.7769602459916538, + "grad_norm": 2.4714555740356445, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7127692103385925, + "num_tokens": 176856856.0, + "step": 7075 + }, + { + "epoch": 0.7770700636942676, + "grad_norm": 2.0297505855560303, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7232347726821899, + "num_tokens": 176884823.0, + "step": 7076 + }, + { + "epoch": 0.7771798813968812, + "grad_norm": 2.159367084503174, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6933159828186035, + "num_tokens": 176909499.0, + "step": 7077 + }, + { + "epoch": 0.7772896990994949, + "grad_norm": 1.9453057050704956, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6886353492736816, + "num_tokens": 176942051.0, + "step": 7078 + }, + { + "epoch": 0.7773995168021085, + "grad_norm": 2.266655921936035, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7216190695762634, + "num_tokens": 176963841.0, + "step": 7079 + }, + { + "epoch": 0.7775093345047221, + "grad_norm": 2.1207520961761475, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6880672574043274, + "num_tokens": 176991050.0, + "step": 7080 + }, + { + "epoch": 0.7776191522073358, + "grad_norm": 2.0394325256347656, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7095451354980469, + 
"num_tokens": 177018419.0, + "step": 7081 + }, + { + "epoch": 0.7777289699099494, + "grad_norm": 2.4570868015289307, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7034940719604492, + "num_tokens": 177039514.0, + "step": 7082 + }, + { + "epoch": 0.7778387876125632, + "grad_norm": 1.8760324716567993, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7083001136779785, + "num_tokens": 177070928.0, + "step": 7083 + }, + { + "epoch": 0.7779486053151768, + "grad_norm": 2.138242483139038, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6918203830718994, + "num_tokens": 177098178.0, + "step": 7084 + }, + { + "epoch": 0.7780584230177905, + "grad_norm": 2.7042858600616455, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.733475923538208, + "num_tokens": 177114884.0, + "step": 7085 + }, + { + "epoch": 0.7781682407204041, + "grad_norm": 2.255159854888916, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7109062671661377, + "num_tokens": 177138953.0, + "step": 7086 + }, + { + "epoch": 0.7782780584230178, + "grad_norm": 2.321269989013672, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.702795684337616, + "num_tokens": 177163180.0, + "step": 7087 + }, + { + "epoch": 0.7783878761256314, + "grad_norm": 2.0116865634918213, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6802916526794434, + "num_tokens": 177194148.0, + "step": 7088 + }, + { + "epoch": 0.7784976938282451, + "grad_norm": 2.4613780975341797, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.709949254989624, + "num_tokens": 177215070.0, + "step": 7089 + }, + { + "epoch": 0.7786075115308587, + "grad_norm": 2.7604944705963135, + "learning_rate": 1e-06, + "loss": 0.7869, + "mean_token_accuracy": 0.7499730587005615, + "num_tokens": 177229572.0, + "step": 7090 + }, + { + "epoch": 0.7787173292334725, + "grad_norm": 2.4455597400665283, + 
"learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.701378345489502, + "num_tokens": 177252064.0, + "step": 7091 + }, + { + "epoch": 0.7788271469360861, + "grad_norm": 2.1585662364959717, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6799607276916504, + "num_tokens": 177279250.0, + "step": 7092 + }, + { + "epoch": 0.7789369646386998, + "grad_norm": 1.9102822542190552, + "learning_rate": 1e-06, + "loss": 1.0516, + "mean_token_accuracy": 0.6810775995254517, + "num_tokens": 177313233.0, + "step": 7093 + }, + { + "epoch": 0.7790467823413134, + "grad_norm": 2.425544261932373, + "learning_rate": 1e-06, + "loss": 1.0284, + "mean_token_accuracy": 0.687606692314148, + "num_tokens": 177334295.0, + "step": 7094 + }, + { + "epoch": 0.7791566000439271, + "grad_norm": 2.3821258544921875, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7170820236206055, + "num_tokens": 177356025.0, + "step": 7095 + }, + { + "epoch": 0.7792664177465407, + "grad_norm": 2.4259989261627197, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6907341480255127, + "num_tokens": 177380135.0, + "step": 7096 + }, + { + "epoch": 0.7793762354491544, + "grad_norm": 2.2819631099700928, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7071219682693481, + "num_tokens": 177402044.0, + "step": 7097 + }, + { + "epoch": 0.7794860531517681, + "grad_norm": 2.29777193069458, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7205948829650879, + "num_tokens": 177424386.0, + "step": 7098 + }, + { + "epoch": 0.7795958708543818, + "grad_norm": 2.049882650375366, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7045577764511108, + "num_tokens": 177453542.0, + "step": 7099 + }, + { + "epoch": 0.7797056885569954, + "grad_norm": 2.1328155994415283, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7120263576507568, + "num_tokens": 177480425.0, + "step": 7100 + 
}, + { + "epoch": 0.779815506259609, + "grad_norm": 2.185770273208618, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6920849680900574, + "num_tokens": 177507359.0, + "step": 7101 + }, + { + "epoch": 0.7799253239622227, + "grad_norm": 2.067288398742676, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.713131308555603, + "num_tokens": 177535323.0, + "step": 7102 + }, + { + "epoch": 0.7800351416648363, + "grad_norm": 2.2941601276397705, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.713137149810791, + "num_tokens": 177558187.0, + "step": 7103 + }, + { + "epoch": 0.78014495936745, + "grad_norm": 2.2164077758789062, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.7018916606903076, + "num_tokens": 177582507.0, + "step": 7104 + }, + { + "epoch": 0.7802547770700637, + "grad_norm": 2.0593862533569336, + "learning_rate": 1e-06, + "loss": 1.0789, + "mean_token_accuracy": 0.6751655340194702, + "num_tokens": 177612821.0, + "step": 7105 + }, + { + "epoch": 0.7803645947726774, + "grad_norm": 1.9093061685562134, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7043244242668152, + "num_tokens": 177643912.0, + "step": 7106 + }, + { + "epoch": 0.780474412475291, + "grad_norm": 2.3086259365081787, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6930010318756104, + "num_tokens": 177666973.0, + "step": 7107 + }, + { + "epoch": 0.7805842301779047, + "grad_norm": 1.9675296545028687, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.7135728597640991, + "num_tokens": 177692340.0, + "step": 7108 + }, + { + "epoch": 0.7806940478805183, + "grad_norm": 2.02675724029541, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7154847979545593, + "num_tokens": 177718755.0, + "step": 7109 + }, + { + "epoch": 0.780803865583132, + "grad_norm": 2.2881417274475098, + "learning_rate": 1e-06, + "loss": 0.9198, + 
"mean_token_accuracy": 0.7131072282791138, + "num_tokens": 177741501.0, + "step": 7110 + }, + { + "epoch": 0.7809136832857456, + "grad_norm": 2.1816723346710205, + "learning_rate": 1e-06, + "loss": 1.0268, + "mean_token_accuracy": 0.6922373175621033, + "num_tokens": 177767107.0, + "step": 7111 + }, + { + "epoch": 0.7810235009883594, + "grad_norm": 1.9605281352996826, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7012348175048828, + "num_tokens": 177797012.0, + "step": 7112 + }, + { + "epoch": 0.781133318690973, + "grad_norm": 2.2160263061523438, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7200252413749695, + "num_tokens": 177822269.0, + "step": 7113 + }, + { + "epoch": 0.7812431363935867, + "grad_norm": 2.2745068073272705, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7189555168151855, + "num_tokens": 177845302.0, + "step": 7114 + }, + { + "epoch": 0.7813529540962003, + "grad_norm": 2.2915701866149902, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7098117470741272, + "num_tokens": 177868670.0, + "step": 7115 + }, + { + "epoch": 0.781462771798814, + "grad_norm": 2.1635491847991943, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7103187441825867, + "num_tokens": 177893328.0, + "step": 7116 + }, + { + "epoch": 0.7815725895014276, + "grad_norm": 2.189640760421753, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6957195997238159, + "num_tokens": 177919233.0, + "step": 7117 + }, + { + "epoch": 0.7816824072040413, + "grad_norm": 2.4017882347106934, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7177766561508179, + "num_tokens": 177940474.0, + "step": 7118 + }, + { + "epoch": 0.7817922249066549, + "grad_norm": 2.062088966369629, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7053025960922241, + "num_tokens": 177969021.0, + "step": 7119 + }, + { + "epoch": 0.7819020426092687, + 
"grad_norm": 2.156378746032715, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7074831128120422, + "num_tokens": 177995437.0, + "step": 7120 + }, + { + "epoch": 0.7820118603118823, + "grad_norm": 2.3172054290771484, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7051539421081543, + "num_tokens": 178018313.0, + "step": 7121 + }, + { + "epoch": 0.782121678014496, + "grad_norm": 2.3430778980255127, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6969794034957886, + "num_tokens": 178041869.0, + "step": 7122 + }, + { + "epoch": 0.7822314957171096, + "grad_norm": 2.030384063720703, + "learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.7012227773666382, + "num_tokens": 178072370.0, + "step": 7123 + }, + { + "epoch": 0.7823413134197232, + "grad_norm": 2.0991053581237793, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7029504776000977, + "num_tokens": 178100589.0, + "step": 7124 + }, + { + "epoch": 0.7824511311223369, + "grad_norm": 2.0976405143737793, + "learning_rate": 1e-06, + "loss": 1.036, + "mean_token_accuracy": 0.6913509964942932, + "num_tokens": 178130185.0, + "step": 7125 + }, + { + "epoch": 0.7825609488249505, + "grad_norm": 2.0621750354766846, + "learning_rate": 1e-06, + "loss": 1.0941, + "mean_token_accuracy": 0.6717851758003235, + "num_tokens": 178157064.0, + "step": 7126 + }, + { + "epoch": 0.7826707665275643, + "grad_norm": 2.342773675918579, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7298204898834229, + "num_tokens": 178177519.0, + "step": 7127 + }, + { + "epoch": 0.7827805842301779, + "grad_norm": 2.125393867492676, + "learning_rate": 1e-06, + "loss": 1.0831, + "mean_token_accuracy": 0.6731966733932495, + "num_tokens": 178204705.0, + "step": 7128 + }, + { + "epoch": 0.7828904019327916, + "grad_norm": 2.1797616481781006, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6957176923751831, + 
"num_tokens": 178229898.0, + "step": 7129 + }, + { + "epoch": 0.7830002196354052, + "grad_norm": 2.2454068660736084, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7331117391586304, + "num_tokens": 178251078.0, + "step": 7130 + }, + { + "epoch": 0.7831100373380189, + "grad_norm": 2.1572930812835693, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6924346685409546, + "num_tokens": 178278421.0, + "step": 7131 + }, + { + "epoch": 0.7832198550406325, + "grad_norm": 2.1915464401245117, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.705660343170166, + "num_tokens": 178304833.0, + "step": 7132 + }, + { + "epoch": 0.7833296727432462, + "grad_norm": 2.157531976699829, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.7009119987487793, + "num_tokens": 178330915.0, + "step": 7133 + }, + { + "epoch": 0.7834394904458599, + "grad_norm": 2.2964015007019043, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7083295583724976, + "num_tokens": 178354803.0, + "step": 7134 + }, + { + "epoch": 0.7835493081484736, + "grad_norm": 2.3587870597839355, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7150772213935852, + "num_tokens": 178375764.0, + "step": 7135 + }, + { + "epoch": 0.7836591258510872, + "grad_norm": 2.336312770843506, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6881482601165771, + "num_tokens": 178397138.0, + "step": 7136 + }, + { + "epoch": 0.7837689435537009, + "grad_norm": 2.0412039756774902, + "learning_rate": 1e-06, + "loss": 1.0536, + "mean_token_accuracy": 0.689475953578949, + "num_tokens": 178427295.0, + "step": 7137 + }, + { + "epoch": 0.7838787612563145, + "grad_norm": 2.799027681350708, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7091797590255737, + "num_tokens": 178446784.0, + "step": 7138 + }, + { + "epoch": 0.7839885789589282, + "grad_norm": 2.372453451156616, + "learning_rate": 
1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7351878881454468, + "num_tokens": 178466969.0, + "step": 7139 + }, + { + "epoch": 0.7840983966615418, + "grad_norm": 2.1231982707977295, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7286691069602966, + "num_tokens": 178492494.0, + "step": 7140 + }, + { + "epoch": 0.7842082143641556, + "grad_norm": 1.933988332748413, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6898424625396729, + "num_tokens": 178524989.0, + "step": 7141 + }, + { + "epoch": 0.7843180320667692, + "grad_norm": 2.0952460765838623, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7075901031494141, + "num_tokens": 178551149.0, + "step": 7142 + }, + { + "epoch": 0.7844278497693828, + "grad_norm": 2.131521463394165, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.6958613395690918, + "num_tokens": 178578797.0, + "step": 7143 + }, + { + "epoch": 0.7845376674719965, + "grad_norm": 2.2245285511016846, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7080608010292053, + "num_tokens": 178602991.0, + "step": 7144 + }, + { + "epoch": 0.7846474851746101, + "grad_norm": 2.013336181640625, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7057864665985107, + "num_tokens": 178630319.0, + "step": 7145 + }, + { + "epoch": 0.7847573028772238, + "grad_norm": 2.35306453704834, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7039451599121094, + "num_tokens": 178653015.0, + "step": 7146 + }, + { + "epoch": 0.7848671205798374, + "grad_norm": 2.5282390117645264, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7127182483673096, + "num_tokens": 178676238.0, + "step": 7147 + }, + { + "epoch": 0.7849769382824512, + "grad_norm": 2.274545192718506, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.7083985805511475, + "num_tokens": 178700300.0, + "step": 7148 + }, + { + "epoch": 
0.7850867559850648, + "grad_norm": 1.9802912473678589, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.7022552490234375, + "num_tokens": 178728952.0, + "step": 7149 + }, + { + "epoch": 0.7851965736876785, + "grad_norm": 2.1231606006622314, + "learning_rate": 1e-06, + "loss": 0.9969, + "mean_token_accuracy": 0.7014681696891785, + "num_tokens": 178756488.0, + "step": 7150 + }, + { + "epoch": 0.7853063913902921, + "grad_norm": 2.209111213684082, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6842002868652344, + "num_tokens": 178781698.0, + "step": 7151 + }, + { + "epoch": 0.7854162090929058, + "grad_norm": 2.4318904876708984, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7231175899505615, + "num_tokens": 178802289.0, + "step": 7152 + }, + { + "epoch": 0.7855260267955194, + "grad_norm": 2.0312516689300537, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7043886184692383, + "num_tokens": 178830489.0, + "step": 7153 + }, + { + "epoch": 0.7856358444981331, + "grad_norm": 2.187164545059204, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7122880220413208, + "num_tokens": 178857062.0, + "step": 7154 + }, + { + "epoch": 0.7857456622007467, + "grad_norm": 2.525710105895996, + "learning_rate": 1e-06, + "loss": 1.0526, + "mean_token_accuracy": 0.6865220069885254, + "num_tokens": 178877319.0, + "step": 7155 + }, + { + "epoch": 0.7858554799033605, + "grad_norm": 2.208474636077881, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7140381932258606, + "num_tokens": 178902212.0, + "step": 7156 + }, + { + "epoch": 0.7859652976059741, + "grad_norm": 2.193326234817505, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.7020432353019714, + "num_tokens": 178928508.0, + "step": 7157 + }, + { + "epoch": 0.7860751153085878, + "grad_norm": 2.254882335662842, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 
0.7486361265182495, + "num_tokens": 178948516.0, + "step": 7158 + }, + { + "epoch": 0.7861849330112014, + "grad_norm": 2.591111660003662, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7030963897705078, + "num_tokens": 178968452.0, + "step": 7159 + }, + { + "epoch": 0.786294750713815, + "grad_norm": 2.3619863986968994, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7212862968444824, + "num_tokens": 178991063.0, + "step": 7160 + }, + { + "epoch": 0.7864045684164287, + "grad_norm": 2.1423745155334473, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.695289134979248, + "num_tokens": 179016577.0, + "step": 7161 + }, + { + "epoch": 0.7865143861190423, + "grad_norm": 2.20143985748291, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7355891466140747, + "num_tokens": 179039491.0, + "step": 7162 + }, + { + "epoch": 0.7866242038216561, + "grad_norm": 2.3299200534820557, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6838966608047485, + "num_tokens": 179064236.0, + "step": 7163 + }, + { + "epoch": 0.7867340215242697, + "grad_norm": 2.1022963523864746, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.7052526473999023, + "num_tokens": 179092293.0, + "step": 7164 + }, + { + "epoch": 0.7868438392268834, + "grad_norm": 2.0557641983032227, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7094639539718628, + "num_tokens": 179119969.0, + "step": 7165 + }, + { + "epoch": 0.786953656929497, + "grad_norm": 2.068329334259033, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7238013744354248, + "num_tokens": 179144906.0, + "step": 7166 + }, + { + "epoch": 0.7870634746321107, + "grad_norm": 2.4832260608673096, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7052692174911499, + "num_tokens": 179167774.0, + "step": 7167 + }, + { + "epoch": 0.7871732923347243, + "grad_norm": 
2.499228000640869, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7297797799110413, + "num_tokens": 179187290.0, + "step": 7168 + }, + { + "epoch": 0.787283110037338, + "grad_norm": 2.1250531673431396, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7067413926124573, + "num_tokens": 179213451.0, + "step": 7169 + }, + { + "epoch": 0.7873929277399517, + "grad_norm": 2.1039435863494873, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7081795930862427, + "num_tokens": 179240373.0, + "step": 7170 + }, + { + "epoch": 0.7875027454425654, + "grad_norm": 2.2520904541015625, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7116398811340332, + "num_tokens": 179263451.0, + "step": 7171 + }, + { + "epoch": 0.787612563145179, + "grad_norm": 2.5718443393707275, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7217093706130981, + "num_tokens": 179281889.0, + "step": 7172 + }, + { + "epoch": 0.7877223808477927, + "grad_norm": 2.0530600547790527, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7023125886917114, + "num_tokens": 179309778.0, + "step": 7173 + }, + { + "epoch": 0.7878321985504063, + "grad_norm": 2.2684574127197266, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.6979957222938538, + "num_tokens": 179334855.0, + "step": 7174 + }, + { + "epoch": 0.78794201625302, + "grad_norm": 2.5664994716644287, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6995087265968323, + "num_tokens": 179356030.0, + "step": 7175 + }, + { + "epoch": 0.7880518339556336, + "grad_norm": 2.373438835144043, + "learning_rate": 1e-06, + "loss": 1.0867, + "mean_token_accuracy": 0.6699849367141724, + "num_tokens": 179380119.0, + "step": 7176 + }, + { + "epoch": 0.7881616516582474, + "grad_norm": 2.4753096103668213, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7075304985046387, + "num_tokens": 
179400582.0, + "step": 7177 + }, + { + "epoch": 0.788271469360861, + "grad_norm": 2.146599292755127, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7120315432548523, + "num_tokens": 179426286.0, + "step": 7178 + }, + { + "epoch": 0.7883812870634747, + "grad_norm": 2.2257282733917236, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6963351964950562, + "num_tokens": 179453454.0, + "step": 7179 + }, + { + "epoch": 0.7884911047660883, + "grad_norm": 2.374626636505127, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7094429731369019, + "num_tokens": 179476336.0, + "step": 7180 + }, + { + "epoch": 0.788600922468702, + "grad_norm": 2.0518360137939453, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7283706068992615, + "num_tokens": 179500982.0, + "step": 7181 + }, + { + "epoch": 0.7887107401713156, + "grad_norm": 2.1909172534942627, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7131252288818359, + "num_tokens": 179525331.0, + "step": 7182 + }, + { + "epoch": 0.7888205578739292, + "grad_norm": 2.318345308303833, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7031034827232361, + "num_tokens": 179547915.0, + "step": 7183 + }, + { + "epoch": 0.7889303755765429, + "grad_norm": 2.481248617172241, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7084408402442932, + "num_tokens": 179569663.0, + "step": 7184 + }, + { + "epoch": 0.7890401932791566, + "grad_norm": 2.327667713165283, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6944656372070312, + "num_tokens": 179594162.0, + "step": 7185 + }, + { + "epoch": 0.7891500109817703, + "grad_norm": 1.8971655368804932, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7015699744224548, + "num_tokens": 179625411.0, + "step": 7186 + }, + { + "epoch": 0.7892598286843839, + "grad_norm": 2.2802019119262695, + "learning_rate": 1e-06, + 
"loss": 0.9211, + "mean_token_accuracy": 0.7175472378730774, + "num_tokens": 179647798.0, + "step": 7187 + }, + { + "epoch": 0.7893696463869976, + "grad_norm": 2.483828067779541, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.712587833404541, + "num_tokens": 179667901.0, + "step": 7188 + }, + { + "epoch": 0.7894794640896112, + "grad_norm": 2.3015449047088623, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7145277261734009, + "num_tokens": 179690591.0, + "step": 7189 + }, + { + "epoch": 0.7895892817922249, + "grad_norm": 2.252401828765869, + "learning_rate": 1e-06, + "loss": 1.0449, + "mean_token_accuracy": 0.6932278871536255, + "num_tokens": 179715813.0, + "step": 7190 + }, + { + "epoch": 0.7896990994948385, + "grad_norm": 2.348497152328491, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7065205574035645, + "num_tokens": 179738252.0, + "step": 7191 + }, + { + "epoch": 0.7898089171974523, + "grad_norm": 1.9787395000457764, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7090255618095398, + "num_tokens": 179769025.0, + "step": 7192 + }, + { + "epoch": 0.7899187349000659, + "grad_norm": 2.020935535430908, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.703751802444458, + "num_tokens": 179796854.0, + "step": 7193 + }, + { + "epoch": 0.7900285526026796, + "grad_norm": 2.244828939437866, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7271097898483276, + "num_tokens": 179819525.0, + "step": 7194 + }, + { + "epoch": 0.7901383703052932, + "grad_norm": 1.9983536005020142, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.7011138200759888, + "num_tokens": 179850675.0, + "step": 7195 + }, + { + "epoch": 0.7902481880079069, + "grad_norm": 2.5286967754364014, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7307008504867554, + "num_tokens": 179870018.0, + "step": 7196 + }, + { + "epoch": 
0.7903580057105205, + "grad_norm": 2.1269783973693848, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7114953994750977, + "num_tokens": 179895221.0, + "step": 7197 + }, + { + "epoch": 0.7904678234131342, + "grad_norm": 2.2773892879486084, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7172273993492126, + "num_tokens": 179917645.0, + "step": 7198 + }, + { + "epoch": 0.7905776411157479, + "grad_norm": 2.2634596824645996, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7119696140289307, + "num_tokens": 179941139.0, + "step": 7199 + }, + { + "epoch": 0.7906874588183616, + "grad_norm": 2.29965877532959, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7197614312171936, + "num_tokens": 179963540.0, + "step": 7200 + }, + { + "epoch": 0.7907972765209752, + "grad_norm": 2.316467761993408, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7287906408309937, + "num_tokens": 179986576.0, + "step": 7201 + }, + { + "epoch": 0.7909070942235888, + "grad_norm": 2.092768669128418, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7118656635284424, + "num_tokens": 180011419.0, + "step": 7202 + }, + { + "epoch": 0.7910169119262025, + "grad_norm": 2.1075358390808105, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7214770317077637, + "num_tokens": 180035131.0, + "step": 7203 + }, + { + "epoch": 0.7911267296288161, + "grad_norm": 2.2798244953155518, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7111069560050964, + "num_tokens": 180058887.0, + "step": 7204 + }, + { + "epoch": 0.7912365473314298, + "grad_norm": 2.294748306274414, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6893054246902466, + "num_tokens": 180082349.0, + "step": 7205 + }, + { + "epoch": 0.7913463650340435, + "grad_norm": 2.042973756790161, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 
0.7223510146141052, + "num_tokens": 180107511.0, + "step": 7206 + }, + { + "epoch": 0.7914561827366572, + "grad_norm": 2.192502498626709, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.687117338180542, + "num_tokens": 180133789.0, + "step": 7207 + }, + { + "epoch": 0.7915660004392708, + "grad_norm": 2.2469496726989746, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7135888338088989, + "num_tokens": 180158801.0, + "step": 7208 + }, + { + "epoch": 0.7916758181418845, + "grad_norm": 2.303492546081543, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6896730661392212, + "num_tokens": 180183662.0, + "step": 7209 + }, + { + "epoch": 0.7917856358444981, + "grad_norm": 2.330343246459961, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7255100607872009, + "num_tokens": 180204979.0, + "step": 7210 + }, + { + "epoch": 0.7918954535471118, + "grad_norm": 2.2163169384002686, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7219812273979187, + "num_tokens": 180229742.0, + "step": 7211 + }, + { + "epoch": 0.7920052712497254, + "grad_norm": 2.2371346950531006, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7191662788391113, + "num_tokens": 180253063.0, + "step": 7212 + }, + { + "epoch": 0.7921150889523391, + "grad_norm": 2.031332492828369, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7191537618637085, + "num_tokens": 180280201.0, + "step": 7213 + }, + { + "epoch": 0.7922249066549528, + "grad_norm": 2.2416625022888184, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.699349582195282, + "num_tokens": 180303349.0, + "step": 7214 + }, + { + "epoch": 0.7923347243575665, + "grad_norm": 2.4019081592559814, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7138676643371582, + "num_tokens": 180324103.0, + "step": 7215 + }, + { + "epoch": 0.7924445420601801, + "grad_norm": 
2.37088680267334, + "learning_rate": 1e-06, + "loss": 1.0122, + "mean_token_accuracy": 0.6889464259147644, + "num_tokens": 180345697.0, + "step": 7216 + }, + { + "epoch": 0.7925543597627938, + "grad_norm": 1.942834734916687, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7174266576766968, + "num_tokens": 180374132.0, + "step": 7217 + }, + { + "epoch": 0.7926641774654074, + "grad_norm": 2.2034661769866943, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6968846917152405, + "num_tokens": 180398365.0, + "step": 7218 + }, + { + "epoch": 0.792773995168021, + "grad_norm": 1.8779171705245972, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6955151557922363, + "num_tokens": 180432088.0, + "step": 7219 + }, + { + "epoch": 0.7928838128706347, + "grad_norm": 2.0697169303894043, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.694339394569397, + "num_tokens": 180459948.0, + "step": 7220 + }, + { + "epoch": 0.7929936305732485, + "grad_norm": 1.974358081817627, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7107574939727783, + "num_tokens": 180489545.0, + "step": 7221 + }, + { + "epoch": 0.7931034482758621, + "grad_norm": 2.0382115840911865, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.7042555809020996, + "num_tokens": 180520208.0, + "step": 7222 + }, + { + "epoch": 0.7932132659784757, + "grad_norm": 2.617828845977783, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7156013250350952, + "num_tokens": 180540141.0, + "step": 7223 + }, + { + "epoch": 0.7933230836810894, + "grad_norm": 2.2060346603393555, + "learning_rate": 1e-06, + "loss": 1.0336, + "mean_token_accuracy": 0.6842939853668213, + "num_tokens": 180570050.0, + "step": 7224 + }, + { + "epoch": 0.793432901383703, + "grad_norm": 2.29663348197937, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7157211899757385, + "num_tokens": 180592325.0, 
+ "step": 7225 + }, + { + "epoch": 0.7935427190863167, + "grad_norm": 2.528191089630127, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7154706120491028, + "num_tokens": 180611993.0, + "step": 7226 + }, + { + "epoch": 0.7936525367889303, + "grad_norm": 2.283975839614868, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6996816396713257, + "num_tokens": 180636719.0, + "step": 7227 + }, + { + "epoch": 0.7937623544915441, + "grad_norm": 2.326167583465576, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7037155032157898, + "num_tokens": 180661911.0, + "step": 7228 + }, + { + "epoch": 0.7938721721941577, + "grad_norm": 2.233025550842285, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7083172798156738, + "num_tokens": 180685512.0, + "step": 7229 + }, + { + "epoch": 0.7939819898967714, + "grad_norm": 2.2333996295928955, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6880174875259399, + "num_tokens": 180710414.0, + "step": 7230 + }, + { + "epoch": 0.794091807599385, + "grad_norm": 2.7464914321899414, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7314482927322388, + "num_tokens": 180727427.0, + "step": 7231 + }, + { + "epoch": 0.7942016253019987, + "grad_norm": 2.0782926082611084, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.7065709233283997, + "num_tokens": 180756645.0, + "step": 7232 + }, + { + "epoch": 0.7943114430046123, + "grad_norm": 2.3078997135162354, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7271267175674438, + "num_tokens": 180779635.0, + "step": 7233 + }, + { + "epoch": 0.794421260707226, + "grad_norm": 2.2359254360198975, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7042871713638306, + "num_tokens": 180802816.0, + "step": 7234 + }, + { + "epoch": 0.7945310784098397, + "grad_norm": 2.1477582454681396, + "learning_rate": 1e-06, + "loss": 0.999, 
+ "mean_token_accuracy": 0.6979128122329712, + "num_tokens": 180829112.0, + "step": 7235 + }, + { + "epoch": 0.7946408961124534, + "grad_norm": 2.149033546447754, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7229626774787903, + "num_tokens": 180857456.0, + "step": 7236 + }, + { + "epoch": 0.794750713815067, + "grad_norm": 2.446821451187134, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.716733455657959, + "num_tokens": 180880665.0, + "step": 7237 + }, + { + "epoch": 0.7948605315176807, + "grad_norm": 1.9503533840179443, + "learning_rate": 1e-06, + "loss": 0.973, + "mean_token_accuracy": 0.7035298943519592, + "num_tokens": 180911091.0, + "step": 7238 + }, + { + "epoch": 0.7949703492202943, + "grad_norm": 1.9967174530029297, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7175337672233582, + "num_tokens": 180940748.0, + "step": 7239 + }, + { + "epoch": 0.795080166922908, + "grad_norm": 2.1221625804901123, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7093155980110168, + "num_tokens": 180968230.0, + "step": 7240 + }, + { + "epoch": 0.7951899846255216, + "grad_norm": 2.5987935066223145, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7355163097381592, + "num_tokens": 180985805.0, + "step": 7241 + }, + { + "epoch": 0.7952998023281352, + "grad_norm": 2.256770372390747, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7239187955856323, + "num_tokens": 181007671.0, + "step": 7242 + }, + { + "epoch": 0.795409620030749, + "grad_norm": 2.3913514614105225, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.699608564376831, + "num_tokens": 181030016.0, + "step": 7243 + }, + { + "epoch": 0.7955194377333626, + "grad_norm": 2.5998640060424805, + "learning_rate": 1e-06, + "loss": 1.0574, + "mean_token_accuracy": 0.681756854057312, + "num_tokens": 181049655.0, + "step": 7244 + }, + { + "epoch": 0.7956292554359763, + 
"grad_norm": 2.251034736633301, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.715589165687561, + "num_tokens": 181075253.0, + "step": 7245 + }, + { + "epoch": 0.7957390731385899, + "grad_norm": 2.151412010192871, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7114177942276001, + "num_tokens": 181100071.0, + "step": 7246 + }, + { + "epoch": 0.7958488908412036, + "grad_norm": 2.02638840675354, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6986680030822754, + "num_tokens": 181128824.0, + "step": 7247 + }, + { + "epoch": 0.7959587085438172, + "grad_norm": 2.143198013305664, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7280461192131042, + "num_tokens": 181153156.0, + "step": 7248 + }, + { + "epoch": 0.7960685262464309, + "grad_norm": 2.4811909198760986, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.726570188999176, + "num_tokens": 181171403.0, + "step": 7249 + }, + { + "epoch": 0.7961783439490446, + "grad_norm": 2.2356104850769043, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.703558623790741, + "num_tokens": 181194961.0, + "step": 7250 + }, + { + "epoch": 0.7962881616516583, + "grad_norm": 2.0085129737854004, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6969249844551086, + "num_tokens": 181225377.0, + "step": 7251 + }, + { + "epoch": 0.7963979793542719, + "grad_norm": 2.1143038272857666, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6998957395553589, + "num_tokens": 181252556.0, + "step": 7252 + }, + { + "epoch": 0.7965077970568856, + "grad_norm": 1.957545518875122, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.701518177986145, + "num_tokens": 181282519.0, + "step": 7253 + }, + { + "epoch": 0.7966176147594992, + "grad_norm": 2.2264506816864014, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6864956617355347, + "num_tokens": 
181307518.0, + "step": 7254 + }, + { + "epoch": 0.7967274324621129, + "grad_norm": 2.577108860015869, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7070077657699585, + "num_tokens": 181326760.0, + "step": 7255 + }, + { + "epoch": 0.7968372501647265, + "grad_norm": 2.1117780208587646, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7019547820091248, + "num_tokens": 181353001.0, + "step": 7256 + }, + { + "epoch": 0.7969470678673403, + "grad_norm": 2.149716854095459, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7417992353439331, + "num_tokens": 181376597.0, + "step": 7257 + }, + { + "epoch": 0.7970568855699539, + "grad_norm": 1.8018864393234253, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.69180828332901, + "num_tokens": 181412185.0, + "step": 7258 + }, + { + "epoch": 0.7971667032725676, + "grad_norm": 2.085041046142578, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6950961947441101, + "num_tokens": 181439918.0, + "step": 7259 + }, + { + "epoch": 0.7972765209751812, + "grad_norm": 2.1883363723754883, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7061498165130615, + "num_tokens": 181465722.0, + "step": 7260 + }, + { + "epoch": 0.7973863386777948, + "grad_norm": 2.2042598724365234, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7301977276802063, + "num_tokens": 181490969.0, + "step": 7261 + }, + { + "epoch": 0.7974961563804085, + "grad_norm": 2.1240365505218506, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.6984332203865051, + "num_tokens": 181517092.0, + "step": 7262 + }, + { + "epoch": 0.7976059740830221, + "grad_norm": 2.2806143760681152, + "learning_rate": 1e-06, + "loss": 1.0567, + "mean_token_accuracy": 0.6841264367103577, + "num_tokens": 181541434.0, + "step": 7263 + }, + { + "epoch": 0.7977157917856359, + "grad_norm": 1.9369959831237793, + "learning_rate": 1e-06, + 
"loss": 0.9574, + "mean_token_accuracy": 0.7088419795036316, + "num_tokens": 181571837.0, + "step": 7264 + }, + { + "epoch": 0.7978256094882495, + "grad_norm": 1.948113203048706, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7149370312690735, + "num_tokens": 181601177.0, + "step": 7265 + }, + { + "epoch": 0.7979354271908632, + "grad_norm": 2.41286563873291, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7311416864395142, + "num_tokens": 181621737.0, + "step": 7266 + }, + { + "epoch": 0.7980452448934768, + "grad_norm": 2.1410834789276123, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6954216957092285, + "num_tokens": 181647211.0, + "step": 7267 + }, + { + "epoch": 0.7981550625960905, + "grad_norm": 2.2447919845581055, + "learning_rate": 1e-06, + "loss": 1.1047, + "mean_token_accuracy": 0.6634958386421204, + "num_tokens": 181674158.0, + "step": 7268 + }, + { + "epoch": 0.7982648802987041, + "grad_norm": 2.7019782066345215, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7153352499008179, + "num_tokens": 181692057.0, + "step": 7269 + }, + { + "epoch": 0.7983746980013178, + "grad_norm": 2.117541551589966, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.697973370552063, + "num_tokens": 181717978.0, + "step": 7270 + }, + { + "epoch": 0.7984845157039314, + "grad_norm": 2.5684831142425537, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7232298851013184, + "num_tokens": 181735776.0, + "step": 7271 + }, + { + "epoch": 0.7985943334065452, + "grad_norm": 2.5019779205322266, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7361629009246826, + "num_tokens": 181755004.0, + "step": 7272 + }, + { + "epoch": 0.7987041511091588, + "grad_norm": 2.160557270050049, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7123256325721741, + "num_tokens": 181780769.0, + "step": 7273 + }, + { + "epoch": 
0.7988139688117725, + "grad_norm": 2.3375978469848633, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7264934182167053, + "num_tokens": 181801435.0, + "step": 7274 + }, + { + "epoch": 0.7989237865143861, + "grad_norm": 2.424591302871704, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.727358877658844, + "num_tokens": 181821917.0, + "step": 7275 + }, + { + "epoch": 0.7990336042169998, + "grad_norm": 2.104647159576416, + "learning_rate": 1e-06, + "loss": 1.047, + "mean_token_accuracy": 0.6777029037475586, + "num_tokens": 181850285.0, + "step": 7276 + }, + { + "epoch": 0.7991434219196134, + "grad_norm": 2.03631591796875, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7112782597541809, + "num_tokens": 181876600.0, + "step": 7277 + }, + { + "epoch": 0.799253239622227, + "grad_norm": 2.2438137531280518, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7129064798355103, + "num_tokens": 181899572.0, + "step": 7278 + }, + { + "epoch": 0.7993630573248408, + "grad_norm": 2.1783101558685303, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6925187110900879, + "num_tokens": 181926187.0, + "step": 7279 + }, + { + "epoch": 0.7994728750274545, + "grad_norm": 2.272136926651001, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.700792133808136, + "num_tokens": 181948869.0, + "step": 7280 + }, + { + "epoch": 0.7995826927300681, + "grad_norm": 2.2441232204437256, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7239127159118652, + "num_tokens": 181970898.0, + "step": 7281 + }, + { + "epoch": 0.7996925104326817, + "grad_norm": 2.1322736740112305, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7315738797187805, + "num_tokens": 181997602.0, + "step": 7282 + }, + { + "epoch": 0.7998023281352954, + "grad_norm": 2.3508496284484863, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 
0.7099703550338745, + "num_tokens": 182019865.0, + "step": 7283 + }, + { + "epoch": 0.799912145837909, + "grad_norm": 2.1155617237091064, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.7063124179840088, + "num_tokens": 182046517.0, + "step": 7284 + }, + { + "epoch": 0.8000219635405227, + "grad_norm": 1.9847909212112427, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6954643130302429, + "num_tokens": 182075936.0, + "step": 7285 + }, + { + "epoch": 0.8001317812431364, + "grad_norm": 2.262086868286133, + "learning_rate": 1e-06, + "loss": 1.0092, + "mean_token_accuracy": 0.6986343860626221, + "num_tokens": 182099785.0, + "step": 7286 + }, + { + "epoch": 0.8002415989457501, + "grad_norm": 2.1751723289489746, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.7005705237388611, + "num_tokens": 182124810.0, + "step": 7287 + }, + { + "epoch": 0.8003514166483637, + "grad_norm": 2.652611017227173, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7070051431655884, + "num_tokens": 182142536.0, + "step": 7288 + }, + { + "epoch": 0.8004612343509774, + "grad_norm": 2.2116174697875977, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7309406995773315, + "num_tokens": 182165795.0, + "step": 7289 + }, + { + "epoch": 0.800571052053591, + "grad_norm": 2.2580666542053223, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6967673897743225, + "num_tokens": 182188941.0, + "step": 7290 + }, + { + "epoch": 0.8006808697562047, + "grad_norm": 2.116859197616577, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7168900966644287, + "num_tokens": 182215962.0, + "step": 7291 + }, + { + "epoch": 0.8007906874588183, + "grad_norm": 1.912820816040039, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6915779113769531, + "num_tokens": 182248217.0, + "step": 7292 + }, + { + "epoch": 0.8009005051614321, + "grad_norm": 
2.2288641929626465, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7164393663406372, + "num_tokens": 182271167.0, + "step": 7293 + }, + { + "epoch": 0.8010103228640457, + "grad_norm": 1.9799526929855347, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.7042694091796875, + "num_tokens": 182303803.0, + "step": 7294 + }, + { + "epoch": 0.8011201405666594, + "grad_norm": 1.8885523080825806, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6966689825057983, + "num_tokens": 182336838.0, + "step": 7295 + }, + { + "epoch": 0.801229958269273, + "grad_norm": 2.0311708450317383, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.736188530921936, + "num_tokens": 182363718.0, + "step": 7296 + }, + { + "epoch": 0.8013397759718867, + "grad_norm": 2.13264536857605, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6826108694076538, + "num_tokens": 182391883.0, + "step": 7297 + }, + { + "epoch": 0.8014495936745003, + "grad_norm": 2.4107141494750977, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7212441563606262, + "num_tokens": 182412459.0, + "step": 7298 + }, + { + "epoch": 0.801559411377114, + "grad_norm": 2.2085204124450684, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7022720575332642, + "num_tokens": 182435294.0, + "step": 7299 + }, + { + "epoch": 0.8016692290797276, + "grad_norm": 1.915421962738037, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6874233484268188, + "num_tokens": 182467366.0, + "step": 7300 + }, + { + "epoch": 0.8017790467823414, + "grad_norm": 2.117739677429199, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7029894590377808, + "num_tokens": 182494445.0, + "step": 7301 + }, + { + "epoch": 0.801888864484955, + "grad_norm": 2.1686911582946777, + "learning_rate": 1e-06, + "loss": 1.048, + "mean_token_accuracy": 0.6801660656929016, + "num_tokens": 182521649.0, 
+ "step": 7302 + }, + { + "epoch": 0.8019986821875686, + "grad_norm": 2.3375556468963623, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7050098180770874, + "num_tokens": 182545462.0, + "step": 7303 + }, + { + "epoch": 0.8021084998901823, + "grad_norm": 2.53121280670166, + "learning_rate": 1e-06, + "loss": 1.0374, + "mean_token_accuracy": 0.6920837163925171, + "num_tokens": 182565023.0, + "step": 7304 + }, + { + "epoch": 0.8022183175927959, + "grad_norm": 2.589742422103882, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7296144962310791, + "num_tokens": 182582957.0, + "step": 7305 + }, + { + "epoch": 0.8023281352954096, + "grad_norm": 2.302107334136963, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6928296089172363, + "num_tokens": 182606150.0, + "step": 7306 + }, + { + "epoch": 0.8024379529980232, + "grad_norm": 2.324943780899048, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7304904460906982, + "num_tokens": 182625784.0, + "step": 7307 + }, + { + "epoch": 0.802547770700637, + "grad_norm": 2.176629066467285, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.710784912109375, + "num_tokens": 182650053.0, + "step": 7308 + }, + { + "epoch": 0.8026575884032506, + "grad_norm": 2.5637471675872803, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7091875672340393, + "num_tokens": 182669551.0, + "step": 7309 + }, + { + "epoch": 0.8027674061058643, + "grad_norm": 2.4310433864593506, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7196964025497437, + "num_tokens": 182689206.0, + "step": 7310 + }, + { + "epoch": 0.8028772238084779, + "grad_norm": 2.2924680709838867, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7176744937896729, + "num_tokens": 182712475.0, + "step": 7311 + }, + { + "epoch": 0.8029870415110916, + "grad_norm": 2.0668540000915527, + "learning_rate": 1e-06, + "loss": 0.89, + 
"mean_token_accuracy": 0.7166657447814941, + "num_tokens": 182740138.0, + "step": 7312 + }, + { + "epoch": 0.8030968592137052, + "grad_norm": 2.2606441974639893, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7109251618385315, + "num_tokens": 182762886.0, + "step": 7313 + }, + { + "epoch": 0.8032066769163189, + "grad_norm": 2.3875343799591064, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7124134302139282, + "num_tokens": 182783854.0, + "step": 7314 + }, + { + "epoch": 0.8033164946189326, + "grad_norm": 2.066838026046753, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7345980405807495, + "num_tokens": 182809531.0, + "step": 7315 + }, + { + "epoch": 0.8034263123215463, + "grad_norm": 2.396756410598755, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7158439755439758, + "num_tokens": 182829274.0, + "step": 7316 + }, + { + "epoch": 0.8035361300241599, + "grad_norm": 2.150775909423828, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6882346868515015, + "num_tokens": 182854520.0, + "step": 7317 + }, + { + "epoch": 0.8036459477267736, + "grad_norm": 2.2025434970855713, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7198019623756409, + "num_tokens": 182878383.0, + "step": 7318 + }, + { + "epoch": 0.8037557654293872, + "grad_norm": 2.180074453353882, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7174209356307983, + "num_tokens": 182902850.0, + "step": 7319 + }, + { + "epoch": 0.8038655831320008, + "grad_norm": 2.4371204376220703, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7009851336479187, + "num_tokens": 182923929.0, + "step": 7320 + }, + { + "epoch": 0.8039754008346145, + "grad_norm": 2.4510629177093506, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7306942939758301, + "num_tokens": 182942938.0, + "step": 7321 + }, + { + "epoch": 0.8040852185372283, + 
"grad_norm": 1.9926306009292603, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7032374739646912, + "num_tokens": 182972599.0, + "step": 7322 + }, + { + "epoch": 0.8041950362398419, + "grad_norm": 2.1807754039764404, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7300415635108948, + "num_tokens": 182996775.0, + "step": 7323 + }, + { + "epoch": 0.8043048539424555, + "grad_norm": 2.228182792663574, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6970962882041931, + "num_tokens": 183021730.0, + "step": 7324 + }, + { + "epoch": 0.8044146716450692, + "grad_norm": 1.8529791831970215, + "learning_rate": 1e-06, + "loss": 1.0166, + "mean_token_accuracy": 0.693123459815979, + "num_tokens": 183056486.0, + "step": 7325 + }, + { + "epoch": 0.8045244893476828, + "grad_norm": 2.0431385040283203, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7231894731521606, + "num_tokens": 183084546.0, + "step": 7326 + }, + { + "epoch": 0.8046343070502965, + "grad_norm": 1.9931106567382812, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7073532938957214, + "num_tokens": 183112171.0, + "step": 7327 + }, + { + "epoch": 0.8047441247529101, + "grad_norm": 2.1103804111480713, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.7028743028640747, + "num_tokens": 183138113.0, + "step": 7328 + }, + { + "epoch": 0.8048539424555239, + "grad_norm": 2.119002342224121, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7006407976150513, + "num_tokens": 183164793.0, + "step": 7329 + }, + { + "epoch": 0.8049637601581375, + "grad_norm": 2.479691743850708, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7207218408584595, + "num_tokens": 183184873.0, + "step": 7330 + }, + { + "epoch": 0.8050735778607512, + "grad_norm": 2.1292004585266113, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.707170844078064, + 
"num_tokens": 183212091.0, + "step": 7331 + }, + { + "epoch": 0.8051833955633648, + "grad_norm": 2.2219040393829346, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7120373249053955, + "num_tokens": 183237900.0, + "step": 7332 + }, + { + "epoch": 0.8052932132659785, + "grad_norm": 1.923984408378601, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7057531476020813, + "num_tokens": 183268995.0, + "step": 7333 + }, + { + "epoch": 0.8054030309685921, + "grad_norm": 2.1973636150360107, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7059119939804077, + "num_tokens": 183295686.0, + "step": 7334 + }, + { + "epoch": 0.8055128486712058, + "grad_norm": 2.0084478855133057, + "learning_rate": 1e-06, + "loss": 1.0346, + "mean_token_accuracy": 0.6828880310058594, + "num_tokens": 183326841.0, + "step": 7335 + }, + { + "epoch": 0.8056226663738194, + "grad_norm": 2.009711503982544, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.6997725367546082, + "num_tokens": 183354537.0, + "step": 7336 + }, + { + "epoch": 0.8057324840764332, + "grad_norm": 2.5553736686706543, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7359292507171631, + "num_tokens": 183372673.0, + "step": 7337 + }, + { + "epoch": 0.8058423017790468, + "grad_norm": 1.8981602191925049, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6995447874069214, + "num_tokens": 183403183.0, + "step": 7338 + }, + { + "epoch": 0.8059521194816605, + "grad_norm": 2.209409236907959, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.716806173324585, + "num_tokens": 183427139.0, + "step": 7339 + }, + { + "epoch": 0.8060619371842741, + "grad_norm": 2.251598596572876, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.7053689956665039, + "num_tokens": 183450301.0, + "step": 7340 + }, + { + "epoch": 0.8061717548868877, + "grad_norm": 2.245610475540161, + "learning_rate": 
1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.7065317034721375, + "num_tokens": 183475503.0, + "step": 7341 + }, + { + "epoch": 0.8062815725895014, + "grad_norm": 2.190845489501953, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7124910354614258, + "num_tokens": 183500581.0, + "step": 7342 + }, + { + "epoch": 0.806391390292115, + "grad_norm": 2.196716070175171, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7067664861679077, + "num_tokens": 183524260.0, + "step": 7343 + }, + { + "epoch": 0.8065012079947288, + "grad_norm": 2.286190986633301, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7170904874801636, + "num_tokens": 183547010.0, + "step": 7344 + }, + { + "epoch": 0.8066110256973424, + "grad_norm": 2.420405626296997, + "learning_rate": 1e-06, + "loss": 1.0378, + "mean_token_accuracy": 0.6862791180610657, + "num_tokens": 183572794.0, + "step": 7345 + }, + { + "epoch": 0.8067208433999561, + "grad_norm": 2.135173797607422, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7169214487075806, + "num_tokens": 183598667.0, + "step": 7346 + }, + { + "epoch": 0.8068306611025697, + "grad_norm": 2.2076497077941895, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6843308210372925, + "num_tokens": 183624024.0, + "step": 7347 + }, + { + "epoch": 0.8069404788051834, + "grad_norm": 1.8648364543914795, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6880854368209839, + "num_tokens": 183657856.0, + "step": 7348 + }, + { + "epoch": 0.807050296507797, + "grad_norm": 2.284370183944702, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.702499270439148, + "num_tokens": 183681300.0, + "step": 7349 + }, + { + "epoch": 0.8071601142104107, + "grad_norm": 1.9745731353759766, + "learning_rate": 1e-06, + "loss": 1.0059, + "mean_token_accuracy": 0.7023687362670898, + "num_tokens": 183708834.0, + "step": 7350 + }, + { + "epoch": 
0.8072699319130244, + "grad_norm": 2.2718214988708496, + "learning_rate": 1e-06, + "loss": 1.0353, + "mean_token_accuracy": 0.6876209378242493, + "num_tokens": 183734190.0, + "step": 7351 + }, + { + "epoch": 0.8073797496156381, + "grad_norm": 2.3483738899230957, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7003294825553894, + "num_tokens": 183757416.0, + "step": 7352 + }, + { + "epoch": 0.8074895673182517, + "grad_norm": 2.1294188499450684, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7174026966094971, + "num_tokens": 183783047.0, + "step": 7353 + }, + { + "epoch": 0.8075993850208654, + "grad_norm": 2.2714900970458984, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6892334222793579, + "num_tokens": 183806921.0, + "step": 7354 + }, + { + "epoch": 0.807709202723479, + "grad_norm": 2.3609678745269775, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7292559146881104, + "num_tokens": 183828510.0, + "step": 7355 + }, + { + "epoch": 0.8078190204260927, + "grad_norm": 2.0788726806640625, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6928550004959106, + "num_tokens": 183856653.0, + "step": 7356 + }, + { + "epoch": 0.8079288381287063, + "grad_norm": 2.0942986011505127, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7320584058761597, + "num_tokens": 183881352.0, + "step": 7357 + }, + { + "epoch": 0.8080386558313201, + "grad_norm": 2.2155351638793945, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.705990195274353, + "num_tokens": 183904426.0, + "step": 7358 + }, + { + "epoch": 0.8081484735339337, + "grad_norm": 2.434882879257202, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7132338285446167, + "num_tokens": 183926041.0, + "step": 7359 + }, + { + "epoch": 0.8082582912365474, + "grad_norm": 2.2103729248046875, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 
0.7286425828933716, + "num_tokens": 183948880.0, + "step": 7360 + }, + { + "epoch": 0.808368108939161, + "grad_norm": 1.9404335021972656, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7088809013366699, + "num_tokens": 183979852.0, + "step": 7361 + }, + { + "epoch": 0.8084779266417746, + "grad_norm": 2.3284201622009277, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.7068347930908203, + "num_tokens": 184000825.0, + "step": 7362 + }, + { + "epoch": 0.8085877443443883, + "grad_norm": 2.193727970123291, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7079116702079773, + "num_tokens": 184025875.0, + "step": 7363 + }, + { + "epoch": 0.8086975620470019, + "grad_norm": 2.299614906311035, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7068268060684204, + "num_tokens": 184049639.0, + "step": 7364 + }, + { + "epoch": 0.8088073797496156, + "grad_norm": 2.2172839641571045, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7204230427742004, + "num_tokens": 184073429.0, + "step": 7365 + }, + { + "epoch": 0.8089171974522293, + "grad_norm": 2.010279655456543, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.68984454870224, + "num_tokens": 184104727.0, + "step": 7366 + }, + { + "epoch": 0.809027015154843, + "grad_norm": 2.361599922180176, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7233335971832275, + "num_tokens": 184126418.0, + "step": 7367 + }, + { + "epoch": 0.8091368328574566, + "grad_norm": 2.495861053466797, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7042011618614197, + "num_tokens": 184147642.0, + "step": 7368 + }, + { + "epoch": 0.8092466505600703, + "grad_norm": 2.0054051876068115, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6996078491210938, + "num_tokens": 184177624.0, + "step": 7369 + }, + { + "epoch": 0.8093564682626839, + "grad_norm": 2.346085786819458, 
+ "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7112634778022766, + "num_tokens": 184202686.0, + "step": 7370 + }, + { + "epoch": 0.8094662859652976, + "grad_norm": 2.3501672744750977, + "learning_rate": 1e-06, + "loss": 1.0682, + "mean_token_accuracy": 0.6852979063987732, + "num_tokens": 184225551.0, + "step": 7371 + }, + { + "epoch": 0.8095761036679112, + "grad_norm": 2.2352590560913086, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7241281270980835, + "num_tokens": 184250880.0, + "step": 7372 + }, + { + "epoch": 0.809685921370525, + "grad_norm": 2.167356491088867, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6941549181938171, + "num_tokens": 184277939.0, + "step": 7373 + }, + { + "epoch": 0.8097957390731386, + "grad_norm": 2.385326385498047, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7082317471504211, + "num_tokens": 184300123.0, + "step": 7374 + }, + { + "epoch": 0.8099055567757523, + "grad_norm": 2.039562702178955, + "learning_rate": 1e-06, + "loss": 1.0446, + "mean_token_accuracy": 0.6958087682723999, + "num_tokens": 184328803.0, + "step": 7375 + }, + { + "epoch": 0.8100153744783659, + "grad_norm": 2.1669387817382812, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7116104364395142, + "num_tokens": 184353724.0, + "step": 7376 + }, + { + "epoch": 0.8101251921809796, + "grad_norm": 2.051950216293335, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.7084553241729736, + "num_tokens": 184380333.0, + "step": 7377 + }, + { + "epoch": 0.8102350098835932, + "grad_norm": 2.254485845565796, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7095968723297119, + "num_tokens": 184404019.0, + "step": 7378 + }, + { + "epoch": 0.8103448275862069, + "grad_norm": 2.231491804122925, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6930761337280273, + "num_tokens": 184428703.0, + "step": 7379 + 
}, + { + "epoch": 0.8104546452888206, + "grad_norm": 2.236534357070923, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.697096586227417, + "num_tokens": 184452303.0, + "step": 7380 + }, + { + "epoch": 0.8105644629914343, + "grad_norm": 2.100658416748047, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6929246783256531, + "num_tokens": 184478392.0, + "step": 7381 + }, + { + "epoch": 0.8106742806940479, + "grad_norm": 2.2294423580169678, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6950520873069763, + "num_tokens": 184502370.0, + "step": 7382 + }, + { + "epoch": 0.8107840983966615, + "grad_norm": 2.493178606033325, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7184328436851501, + "num_tokens": 184522067.0, + "step": 7383 + }, + { + "epoch": 0.8108939160992752, + "grad_norm": 2.1381444931030273, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7084589004516602, + "num_tokens": 184547781.0, + "step": 7384 + }, + { + "epoch": 0.8110037338018888, + "grad_norm": 2.332082748413086, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7457269430160522, + "num_tokens": 184571157.0, + "step": 7385 + }, + { + "epoch": 0.8111135515045025, + "grad_norm": 2.205566167831421, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7141470909118652, + "num_tokens": 184595200.0, + "step": 7386 + }, + { + "epoch": 0.8112233692071162, + "grad_norm": 2.222243070602417, + "learning_rate": 1e-06, + "loss": 1.0608, + "mean_token_accuracy": 0.6885193586349487, + "num_tokens": 184621482.0, + "step": 7387 + }, + { + "epoch": 0.8113331869097299, + "grad_norm": 2.0838987827301025, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.712884783744812, + "num_tokens": 184647883.0, + "step": 7388 + }, + { + "epoch": 0.8114430046123435, + "grad_norm": 2.102278470993042, + "learning_rate": 1e-06, + "loss": 0.9617, + 
"mean_token_accuracy": 0.7057451009750366, + "num_tokens": 184676894.0, + "step": 7389 + }, + { + "epoch": 0.8115528223149572, + "grad_norm": 2.3193581104278564, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6917601823806763, + "num_tokens": 184700437.0, + "step": 7390 + }, + { + "epoch": 0.8116626400175708, + "grad_norm": 2.316518545150757, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7175902128219604, + "num_tokens": 184722298.0, + "step": 7391 + }, + { + "epoch": 0.8117724577201845, + "grad_norm": 2.0697054862976074, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.7040508389472961, + "num_tokens": 184749687.0, + "step": 7392 + }, + { + "epoch": 0.8118822754227981, + "grad_norm": 2.1870362758636475, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7134479284286499, + "num_tokens": 184773631.0, + "step": 7393 + }, + { + "epoch": 0.8119920931254118, + "grad_norm": 2.56781005859375, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7163968682289124, + "num_tokens": 184792092.0, + "step": 7394 + }, + { + "epoch": 0.8121019108280255, + "grad_norm": 2.072551965713501, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7082602977752686, + "num_tokens": 184819393.0, + "step": 7395 + }, + { + "epoch": 0.8122117285306392, + "grad_norm": 2.236971855163574, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7112287282943726, + "num_tokens": 184841919.0, + "step": 7396 + }, + { + "epoch": 0.8123215462332528, + "grad_norm": 2.2435171604156494, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7144474387168884, + "num_tokens": 184865435.0, + "step": 7397 + }, + { + "epoch": 0.8124313639358665, + "grad_norm": 2.0738964080810547, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7022882699966431, + "num_tokens": 184892178.0, + "step": 7398 + }, + { + "epoch": 0.8125411816384801, + 
"grad_norm": 2.4501163959503174, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7307008504867554, + "num_tokens": 184912706.0, + "step": 7399 + }, + { + "epoch": 0.8126509993410937, + "grad_norm": 2.4812629222869873, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6880373954772949, + "num_tokens": 184933922.0, + "step": 7400 + }, + { + "epoch": 0.8127608170437074, + "grad_norm": 2.5027613639831543, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7303539514541626, + "num_tokens": 184953020.0, + "step": 7401 + }, + { + "epoch": 0.8128706347463212, + "grad_norm": 2.2700631618499756, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7131716012954712, + "num_tokens": 184977197.0, + "step": 7402 + }, + { + "epoch": 0.8129804524489348, + "grad_norm": 2.124114990234375, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7121114134788513, + "num_tokens": 185004031.0, + "step": 7403 + }, + { + "epoch": 0.8130902701515484, + "grad_norm": 2.257627010345459, + "learning_rate": 1e-06, + "loss": 1.0923, + "mean_token_accuracy": 0.6849573254585266, + "num_tokens": 185028956.0, + "step": 7404 + }, + { + "epoch": 0.8132000878541621, + "grad_norm": 1.9642688035964966, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.710770845413208, + "num_tokens": 185059352.0, + "step": 7405 + }, + { + "epoch": 0.8133099055567757, + "grad_norm": 2.0478477478027344, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7090228796005249, + "num_tokens": 185086491.0, + "step": 7406 + }, + { + "epoch": 0.8134197232593894, + "grad_norm": 1.9642798900604248, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7034851312637329, + "num_tokens": 185118518.0, + "step": 7407 + }, + { + "epoch": 0.813529540962003, + "grad_norm": 2.277381420135498, + "learning_rate": 1e-06, + "loss": 1.052, + "mean_token_accuracy": 0.6830316781997681, + 
"num_tokens": 185143676.0, + "step": 7408 + }, + { + "epoch": 0.8136393586646168, + "grad_norm": 2.196476697921753, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.705055832862854, + "num_tokens": 185169075.0, + "step": 7409 + }, + { + "epoch": 0.8137491763672304, + "grad_norm": 2.1440374851226807, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7212328910827637, + "num_tokens": 185195008.0, + "step": 7410 + }, + { + "epoch": 0.8138589940698441, + "grad_norm": 2.475949764251709, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7130157947540283, + "num_tokens": 185215424.0, + "step": 7411 + }, + { + "epoch": 0.8139688117724577, + "grad_norm": 2.393683433532715, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7356674075126648, + "num_tokens": 185236951.0, + "step": 7412 + }, + { + "epoch": 0.8140786294750714, + "grad_norm": 2.4320290088653564, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7070483565330505, + "num_tokens": 185259542.0, + "step": 7413 + }, + { + "epoch": 0.814188447177685, + "grad_norm": 2.151306390762329, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7249282598495483, + "num_tokens": 185284992.0, + "step": 7414 + }, + { + "epoch": 0.8142982648802987, + "grad_norm": 2.4245195388793945, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6996602416038513, + "num_tokens": 185306540.0, + "step": 7415 + }, + { + "epoch": 0.8144080825829124, + "grad_norm": 2.2418675422668457, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7152502536773682, + "num_tokens": 185329698.0, + "step": 7416 + }, + { + "epoch": 0.8145179002855261, + "grad_norm": 2.8206682205200195, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7090771198272705, + "num_tokens": 185346835.0, + "step": 7417 + }, + { + "epoch": 0.8146277179881397, + "grad_norm": 2.6178557872772217, + "learning_rate": 
1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7064856886863708, + "num_tokens": 185367210.0, + "step": 7418 + }, + { + "epoch": 0.8147375356907534, + "grad_norm": 2.192286252975464, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.739684522151947, + "num_tokens": 185390665.0, + "step": 7419 + }, + { + "epoch": 0.814847353393367, + "grad_norm": 1.9783084392547607, + "learning_rate": 1e-06, + "loss": 1.0089, + "mean_token_accuracy": 0.6872968077659607, + "num_tokens": 185421829.0, + "step": 7420 + }, + { + "epoch": 0.8149571710959806, + "grad_norm": 2.358238935470581, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7356748580932617, + "num_tokens": 185443593.0, + "step": 7421 + }, + { + "epoch": 0.8150669887985943, + "grad_norm": 2.135819435119629, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6877470016479492, + "num_tokens": 185471820.0, + "step": 7422 + }, + { + "epoch": 0.8151768065012079, + "grad_norm": 2.0435760021209717, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6782622933387756, + "num_tokens": 185501737.0, + "step": 7423 + }, + { + "epoch": 0.8152866242038217, + "grad_norm": 2.6308350563049316, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7185415625572205, + "num_tokens": 185518804.0, + "step": 7424 + }, + { + "epoch": 0.8153964419064353, + "grad_norm": 1.9201058149337769, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.688348650932312, + "num_tokens": 185550098.0, + "step": 7425 + }, + { + "epoch": 0.815506259609049, + "grad_norm": 2.543288469314575, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.728689432144165, + "num_tokens": 185569203.0, + "step": 7426 + }, + { + "epoch": 0.8156160773116626, + "grad_norm": 2.3220269680023193, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7242864370346069, + "num_tokens": 185593108.0, + "step": 7427 + }, + { + "epoch": 
0.8157258950142763, + "grad_norm": 2.0205390453338623, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6969538331031799, + "num_tokens": 185625314.0, + "step": 7428 + }, + { + "epoch": 0.8158357127168899, + "grad_norm": 2.55230712890625, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7168766260147095, + "num_tokens": 185644037.0, + "step": 7429 + }, + { + "epoch": 0.8159455304195036, + "grad_norm": 2.196326732635498, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.693098783493042, + "num_tokens": 185668979.0, + "step": 7430 + }, + { + "epoch": 0.8160553481221173, + "grad_norm": 2.1082820892333984, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7112576961517334, + "num_tokens": 185695773.0, + "step": 7431 + }, + { + "epoch": 0.816165165824731, + "grad_norm": 2.1936581134796143, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7093446850776672, + "num_tokens": 185721099.0, + "step": 7432 + }, + { + "epoch": 0.8162749835273446, + "grad_norm": 2.3897757530212402, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.712432861328125, + "num_tokens": 185742888.0, + "step": 7433 + }, + { + "epoch": 0.8163848012299583, + "grad_norm": 2.0787887573242188, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6836887001991272, + "num_tokens": 185770520.0, + "step": 7434 + }, + { + "epoch": 0.8164946189325719, + "grad_norm": 2.084372043609619, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.714453935623169, + "num_tokens": 185799407.0, + "step": 7435 + }, + { + "epoch": 0.8166044366351856, + "grad_norm": 1.951274037361145, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6965190172195435, + "num_tokens": 185831791.0, + "step": 7436 + }, + { + "epoch": 0.8167142543377992, + "grad_norm": 2.120652437210083, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 
0.7069060206413269, + "num_tokens": 185856046.0, + "step": 7437 + }, + { + "epoch": 0.816824072040413, + "grad_norm": 1.9990509748458862, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.6921310424804688, + "num_tokens": 185885992.0, + "step": 7438 + }, + { + "epoch": 0.8169338897430266, + "grad_norm": 2.4059762954711914, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7230552434921265, + "num_tokens": 185906722.0, + "step": 7439 + }, + { + "epoch": 0.8170437074456403, + "grad_norm": 2.211820602416992, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7104814052581787, + "num_tokens": 185932620.0, + "step": 7440 + }, + { + "epoch": 0.8171535251482539, + "grad_norm": 1.9644184112548828, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6970468163490295, + "num_tokens": 185963415.0, + "step": 7441 + }, + { + "epoch": 0.8172633428508675, + "grad_norm": 2.136204957962036, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.727341890335083, + "num_tokens": 185989322.0, + "step": 7442 + }, + { + "epoch": 0.8173731605534812, + "grad_norm": 2.420910120010376, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.706415057182312, + "num_tokens": 186009768.0, + "step": 7443 + }, + { + "epoch": 0.8174829782560948, + "grad_norm": 2.548739433288574, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7317344546318054, + "num_tokens": 186028610.0, + "step": 7444 + }, + { + "epoch": 0.8175927959587086, + "grad_norm": 2.060292959213257, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6978808045387268, + "num_tokens": 186057760.0, + "step": 7445 + }, + { + "epoch": 0.8177026136613222, + "grad_norm": 2.274625539779663, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7090240120887756, + "num_tokens": 186081877.0, + "step": 7446 + }, + { + "epoch": 0.8178124313639359, + "grad_norm": 2.244920253753662, + 
"learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.734573245048523, + "num_tokens": 186105985.0, + "step": 7447 + }, + { + "epoch": 0.8179222490665495, + "grad_norm": 2.217172622680664, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6818366050720215, + "num_tokens": 186131034.0, + "step": 7448 + }, + { + "epoch": 0.8180320667691632, + "grad_norm": 2.126624345779419, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7059127688407898, + "num_tokens": 186160422.0, + "step": 7449 + }, + { + "epoch": 0.8181418844717768, + "grad_norm": 2.4150991439819336, + "learning_rate": 1e-06, + "loss": 1.0221, + "mean_token_accuracy": 0.6905484199523926, + "num_tokens": 186184281.0, + "step": 7450 + }, + { + "epoch": 0.8182517021743905, + "grad_norm": 2.1765151023864746, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7271286249160767, + "num_tokens": 186208715.0, + "step": 7451 + }, + { + "epoch": 0.8183615198770041, + "grad_norm": 1.9016858339309692, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.69584059715271, + "num_tokens": 186240105.0, + "step": 7452 + }, + { + "epoch": 0.8184713375796179, + "grad_norm": 2.2449867725372314, + "learning_rate": 1e-06, + "loss": 1.0149, + "mean_token_accuracy": 0.6977229118347168, + "num_tokens": 186266748.0, + "step": 7453 + }, + { + "epoch": 0.8185811552822315, + "grad_norm": 2.244818925857544, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7343457937240601, + "num_tokens": 186290335.0, + "step": 7454 + }, + { + "epoch": 0.8186909729848452, + "grad_norm": 2.3410542011260986, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.700977087020874, + "num_tokens": 186312382.0, + "step": 7455 + }, + { + "epoch": 0.8188007906874588, + "grad_norm": 2.007554769515991, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7073087096214294, + "num_tokens": 186340068.0, + "step": 7456 + }, + 
{ + "epoch": 0.8189106083900725, + "grad_norm": 2.053441047668457, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.682630717754364, + "num_tokens": 186367447.0, + "step": 7457 + }, + { + "epoch": 0.8190204260926861, + "grad_norm": 2.5024914741516113, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.707520604133606, + "num_tokens": 186394094.0, + "step": 7458 + }, + { + "epoch": 0.8191302437952998, + "grad_norm": 2.0933175086975098, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7263582944869995, + "num_tokens": 186419761.0, + "step": 7459 + }, + { + "epoch": 0.8192400614979135, + "grad_norm": 2.332932472229004, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7269542217254639, + "num_tokens": 186439668.0, + "step": 7460 + }, + { + "epoch": 0.8193498792005272, + "grad_norm": 2.2073895931243896, + "learning_rate": 1e-06, + "loss": 1.0535, + "mean_token_accuracy": 0.687562108039856, + "num_tokens": 186463974.0, + "step": 7461 + }, + { + "epoch": 0.8194596969031408, + "grad_norm": 2.26615047454834, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6883900165557861, + "num_tokens": 186489174.0, + "step": 7462 + }, + { + "epoch": 0.8195695146057544, + "grad_norm": 2.0227229595184326, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6992462873458862, + "num_tokens": 186517340.0, + "step": 7463 + }, + { + "epoch": 0.8196793323083681, + "grad_norm": 2.2251503467559814, + "learning_rate": 1e-06, + "loss": 1.0709, + "mean_token_accuracy": 0.6742532253265381, + "num_tokens": 186543481.0, + "step": 7464 + }, + { + "epoch": 0.8197891500109817, + "grad_norm": 2.1641178131103516, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7121129035949707, + "num_tokens": 186566268.0, + "step": 7465 + }, + { + "epoch": 0.8198989677135954, + "grad_norm": 2.1906447410583496, + "learning_rate": 1e-06, + "loss": 0.9404, + 
"mean_token_accuracy": 0.7075595855712891, + "num_tokens": 186589512.0, + "step": 7466 + }, + { + "epoch": 0.8200087854162091, + "grad_norm": 2.3823225498199463, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7095726728439331, + "num_tokens": 186611829.0, + "step": 7467 + }, + { + "epoch": 0.8201186031188228, + "grad_norm": 2.2406647205352783, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7178556323051453, + "num_tokens": 186635968.0, + "step": 7468 + }, + { + "epoch": 0.8202284208214364, + "grad_norm": 2.0619781017303467, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6939926147460938, + "num_tokens": 186665150.0, + "step": 7469 + }, + { + "epoch": 0.8203382385240501, + "grad_norm": 2.075428009033203, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6949988007545471, + "num_tokens": 186693637.0, + "step": 7470 + }, + { + "epoch": 0.8204480562266637, + "grad_norm": 2.4393322467803955, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7127288579940796, + "num_tokens": 186715446.0, + "step": 7471 + }, + { + "epoch": 0.8205578739292774, + "grad_norm": 2.151723623275757, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7202920317649841, + "num_tokens": 186740965.0, + "step": 7472 + }, + { + "epoch": 0.820667691631891, + "grad_norm": 2.27340030670166, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7200707197189331, + "num_tokens": 186764105.0, + "step": 7473 + }, + { + "epoch": 0.8207775093345048, + "grad_norm": 2.389310598373413, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.7035659551620483, + "num_tokens": 186785478.0, + "step": 7474 + }, + { + "epoch": 0.8208873270371184, + "grad_norm": 2.475438117980957, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7374455332756042, + "num_tokens": 186805188.0, + "step": 7475 + }, + { + "epoch": 0.8209971447397321, + 
"grad_norm": 2.141514778137207, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7299060821533203, + "num_tokens": 186830172.0, + "step": 7476 + }, + { + "epoch": 0.8211069624423457, + "grad_norm": 2.265496253967285, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7028669118881226, + "num_tokens": 186855029.0, + "step": 7477 + }, + { + "epoch": 0.8212167801449594, + "grad_norm": 2.3767049312591553, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6991947293281555, + "num_tokens": 186878170.0, + "step": 7478 + }, + { + "epoch": 0.821326597847573, + "grad_norm": 2.08661150932312, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7019803524017334, + "num_tokens": 186906154.0, + "step": 7479 + }, + { + "epoch": 0.8214364155501866, + "grad_norm": 2.071481227874756, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6830580234527588, + "num_tokens": 186935873.0, + "step": 7480 + }, + { + "epoch": 0.8215462332528004, + "grad_norm": 1.984959602355957, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.6993264555931091, + "num_tokens": 186965235.0, + "step": 7481 + }, + { + "epoch": 0.821656050955414, + "grad_norm": 2.0647642612457275, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7098385691642761, + "num_tokens": 186992645.0, + "step": 7482 + }, + { + "epoch": 0.8217658686580277, + "grad_norm": 2.106290817260742, + "learning_rate": 1e-06, + "loss": 1.0533, + "mean_token_accuracy": 0.6801983118057251, + "num_tokens": 187022255.0, + "step": 7483 + }, + { + "epoch": 0.8218756863606413, + "grad_norm": 2.148014783859253, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.7011785507202148, + "num_tokens": 187047925.0, + "step": 7484 + }, + { + "epoch": 0.821985504063255, + "grad_norm": 2.1567223072052, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7054083943367004, + "num_tokens": 
187073991.0, + "step": 7485 + }, + { + "epoch": 0.8220953217658686, + "grad_norm": 2.1023597717285156, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6933449506759644, + "num_tokens": 187099607.0, + "step": 7486 + }, + { + "epoch": 0.8222051394684823, + "grad_norm": 2.55776047706604, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7180556654930115, + "num_tokens": 187119208.0, + "step": 7487 + }, + { + "epoch": 0.8223149571710959, + "grad_norm": 2.29907488822937, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7344563007354736, + "num_tokens": 187142407.0, + "step": 7488 + }, + { + "epoch": 0.8224247748737097, + "grad_norm": 2.284219741821289, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6997919678688049, + "num_tokens": 187167441.0, + "step": 7489 + }, + { + "epoch": 0.8225345925763233, + "grad_norm": 1.9682886600494385, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6913012862205505, + "num_tokens": 187198228.0, + "step": 7490 + }, + { + "epoch": 0.822644410278937, + "grad_norm": 2.040980577468872, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.723170280456543, + "num_tokens": 187226572.0, + "step": 7491 + }, + { + "epoch": 0.8227542279815506, + "grad_norm": 2.260592222213745, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.7001270651817322, + "num_tokens": 187250094.0, + "step": 7492 + }, + { + "epoch": 0.8228640456841643, + "grad_norm": 2.127758264541626, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6980105638504028, + "num_tokens": 187278065.0, + "step": 7493 + }, + { + "epoch": 0.8229738633867779, + "grad_norm": 2.2162094116210938, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7052998542785645, + "num_tokens": 187301288.0, + "step": 7494 + }, + { + "epoch": 0.8230836810893916, + "grad_norm": 2.262805461883545, + "learning_rate": 1e-06, + "loss": 
1.0513, + "mean_token_accuracy": 0.6880476474761963, + "num_tokens": 187326738.0, + "step": 7495 + }, + { + "epoch": 0.8231934987920053, + "grad_norm": 2.1602137088775635, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6991393566131592, + "num_tokens": 187352124.0, + "step": 7496 + }, + { + "epoch": 0.823303316494619, + "grad_norm": 2.2822062969207764, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.708660364151001, + "num_tokens": 187373983.0, + "step": 7497 + }, + { + "epoch": 0.8234131341972326, + "grad_norm": 2.3353912830352783, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.725843071937561, + "num_tokens": 187396661.0, + "step": 7498 + }, + { + "epoch": 0.8235229518998463, + "grad_norm": 2.251866102218628, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7246930599212646, + "num_tokens": 187417734.0, + "step": 7499 + }, + { + "epoch": 0.8236327696024599, + "grad_norm": 2.078413486480713, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7101209163665771, + "num_tokens": 187444555.0, + "step": 7500 + }, + { + "epoch": 0.8237425873050735, + "grad_norm": 2.2656941413879395, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7228517532348633, + "num_tokens": 187466703.0, + "step": 7501 + }, + { + "epoch": 0.8238524050076872, + "grad_norm": 2.46738862991333, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7383794784545898, + "num_tokens": 187486053.0, + "step": 7502 + }, + { + "epoch": 0.823962222710301, + "grad_norm": 2.1825356483459473, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7044551372528076, + "num_tokens": 187510967.0, + "step": 7503 + }, + { + "epoch": 0.8240720404129146, + "grad_norm": 2.4758219718933105, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7425872683525085, + "num_tokens": 187530001.0, + "step": 7504 + }, + { + "epoch": 0.8241818581155282, 
+ "grad_norm": 2.2573981285095215, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7038384675979614, + "num_tokens": 187552699.0, + "step": 7505 + }, + { + "epoch": 0.8242916758181419, + "grad_norm": 2.127467393875122, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7269448041915894, + "num_tokens": 187576690.0, + "step": 7506 + }, + { + "epoch": 0.8244014935207555, + "grad_norm": 2.4755749702453613, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7156954407691956, + "num_tokens": 187596911.0, + "step": 7507 + }, + { + "epoch": 0.8245113112233692, + "grad_norm": 2.075775146484375, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7309596538543701, + "num_tokens": 187623942.0, + "step": 7508 + }, + { + "epoch": 0.8246211289259828, + "grad_norm": 2.2985775470733643, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7054067254066467, + "num_tokens": 187648507.0, + "step": 7509 + }, + { + "epoch": 0.8247309466285966, + "grad_norm": 2.328125476837158, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7043970227241516, + "num_tokens": 187671181.0, + "step": 7510 + }, + { + "epoch": 0.8248407643312102, + "grad_norm": 2.109463691711426, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.700198769569397, + "num_tokens": 187698785.0, + "step": 7511 + }, + { + "epoch": 0.8249505820338239, + "grad_norm": 2.2405765056610107, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6938263177871704, + "num_tokens": 187723380.0, + "step": 7512 + }, + { + "epoch": 0.8250603997364375, + "grad_norm": 2.491332530975342, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7236168384552002, + "num_tokens": 187742765.0, + "step": 7513 + }, + { + "epoch": 0.8251702174390512, + "grad_norm": 2.145020008087158, + "learning_rate": 1e-06, + "loss": 1.0779, + "mean_token_accuracy": 0.6723482012748718, + 
"num_tokens": 187770723.0, + "step": 7514 + }, + { + "epoch": 0.8252800351416648, + "grad_norm": 2.5023064613342285, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7075732350349426, + "num_tokens": 187791149.0, + "step": 7515 + }, + { + "epoch": 0.8253898528442785, + "grad_norm": 2.1822495460510254, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7107645869255066, + "num_tokens": 187819963.0, + "step": 7516 + }, + { + "epoch": 0.8254996705468921, + "grad_norm": 2.014846086502075, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7060865759849548, + "num_tokens": 187847694.0, + "step": 7517 + }, + { + "epoch": 0.8256094882495059, + "grad_norm": 2.173726797103882, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6963921189308167, + "num_tokens": 187872642.0, + "step": 7518 + }, + { + "epoch": 0.8257193059521195, + "grad_norm": 2.1762282848358154, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.6923741698265076, + "num_tokens": 187898998.0, + "step": 7519 + }, + { + "epoch": 0.8258291236547332, + "grad_norm": 2.2318952083587646, + "learning_rate": 1e-06, + "loss": 1.0356, + "mean_token_accuracy": 0.6937486529350281, + "num_tokens": 187922934.0, + "step": 7520 + }, + { + "epoch": 0.8259389413573468, + "grad_norm": 2.1295249462127686, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7040338516235352, + "num_tokens": 187949269.0, + "step": 7521 + }, + { + "epoch": 0.8260487590599604, + "grad_norm": 2.299964189529419, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6930433511734009, + "num_tokens": 187973319.0, + "step": 7522 + }, + { + "epoch": 0.8261585767625741, + "grad_norm": 2.2519943714141846, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7079527378082275, + "num_tokens": 187996551.0, + "step": 7523 + }, + { + "epoch": 0.8262683944651877, + "grad_norm": 2.2607979774475098, + 
"learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7303519248962402, + "num_tokens": 188019342.0, + "step": 7524 + }, + { + "epoch": 0.8263782121678015, + "grad_norm": 2.415987968444824, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7234809398651123, + "num_tokens": 188039835.0, + "step": 7525 + }, + { + "epoch": 0.8264880298704151, + "grad_norm": 2.191873550415039, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7085400819778442, + "num_tokens": 188065582.0, + "step": 7526 + }, + { + "epoch": 0.8265978475730288, + "grad_norm": 2.1432337760925293, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7039844989776611, + "num_tokens": 188092787.0, + "step": 7527 + }, + { + "epoch": 0.8267076652756424, + "grad_norm": 2.385462760925293, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7316262722015381, + "num_tokens": 188116849.0, + "step": 7528 + }, + { + "epoch": 0.8268174829782561, + "grad_norm": 1.968031406402588, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.7024804949760437, + "num_tokens": 188149127.0, + "step": 7529 + }, + { + "epoch": 0.8269273006808697, + "grad_norm": 2.2892818450927734, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6955171823501587, + "num_tokens": 188171694.0, + "step": 7530 + }, + { + "epoch": 0.8270371183834834, + "grad_norm": 2.0661299228668213, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7160731554031372, + "num_tokens": 188199166.0, + "step": 7531 + }, + { + "epoch": 0.8271469360860971, + "grad_norm": 2.0424234867095947, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7163577079772949, + "num_tokens": 188225669.0, + "step": 7532 + }, + { + "epoch": 0.8272567537887108, + "grad_norm": 2.1677849292755127, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6935151815414429, + "num_tokens": 188252164.0, + "step": 7533 + 
}, + { + "epoch": 0.8273665714913244, + "grad_norm": 2.1840062141418457, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6979870796203613, + "num_tokens": 188277510.0, + "step": 7534 + }, + { + "epoch": 0.8274763891939381, + "grad_norm": 2.5887610912323, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7205950617790222, + "num_tokens": 188295656.0, + "step": 7535 + }, + { + "epoch": 0.8275862068965517, + "grad_norm": 2.109372854232788, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7161555886268616, + "num_tokens": 188320857.0, + "step": 7536 + }, + { + "epoch": 0.8276960245991654, + "grad_norm": 2.68275785446167, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.72995525598526, + "num_tokens": 188338210.0, + "step": 7537 + }, + { + "epoch": 0.827805842301779, + "grad_norm": 2.0118460655212402, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7133435010910034, + "num_tokens": 188369338.0, + "step": 7538 + }, + { + "epoch": 0.8279156600043928, + "grad_norm": 2.0195822715759277, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7185359597206116, + "num_tokens": 188396106.0, + "step": 7539 + }, + { + "epoch": 0.8280254777070064, + "grad_norm": 2.0877106189727783, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7039933204650879, + "num_tokens": 188422518.0, + "step": 7540 + }, + { + "epoch": 0.82813529540962, + "grad_norm": 2.566894292831421, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7195543646812439, + "num_tokens": 188441708.0, + "step": 7541 + }, + { + "epoch": 0.8282451131122337, + "grad_norm": 2.074239730834961, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7221022844314575, + "num_tokens": 188466690.0, + "step": 7542 + }, + { + "epoch": 0.8283549308148473, + "grad_norm": 1.9140690565109253, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 
0.7175673246383667, + "num_tokens": 188498129.0, + "step": 7543 + }, + { + "epoch": 0.828464748517461, + "grad_norm": 2.261047601699829, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7060903906822205, + "num_tokens": 188522658.0, + "step": 7544 + }, + { + "epoch": 0.8285745662200746, + "grad_norm": 2.4602346420288086, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7171739339828491, + "num_tokens": 188543261.0, + "step": 7545 + }, + { + "epoch": 0.8286843839226883, + "grad_norm": 2.2043704986572266, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6843932867050171, + "num_tokens": 188569337.0, + "step": 7546 + }, + { + "epoch": 0.828794201625302, + "grad_norm": 2.0092203617095947, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7251744866371155, + "num_tokens": 188595984.0, + "step": 7547 + }, + { + "epoch": 0.8289040193279157, + "grad_norm": 2.2549593448638916, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.6958468556404114, + "num_tokens": 188620740.0, + "step": 7548 + }, + { + "epoch": 0.8290138370305293, + "grad_norm": 1.9872721433639526, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6987965703010559, + "num_tokens": 188651020.0, + "step": 7549 + }, + { + "epoch": 0.829123654733143, + "grad_norm": 2.3435723781585693, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.7014216184616089, + "num_tokens": 188673234.0, + "step": 7550 + }, + { + "epoch": 0.8292334724357566, + "grad_norm": 2.303447723388672, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7152557373046875, + "num_tokens": 188697365.0, + "step": 7551 + }, + { + "epoch": 0.8293432901383703, + "grad_norm": 2.2609446048736572, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7061601877212524, + "num_tokens": 188721585.0, + "step": 7552 + }, + { + "epoch": 0.8294531078409839, + "grad_norm": 
2.005556344985962, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7363972067832947, + "num_tokens": 188749346.0, + "step": 7553 + }, + { + "epoch": 0.8295629255435977, + "grad_norm": 2.039687156677246, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7183157205581665, + "num_tokens": 188778620.0, + "step": 7554 + }, + { + "epoch": 0.8296727432462113, + "grad_norm": 1.995750904083252, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.700282096862793, + "num_tokens": 188809073.0, + "step": 7555 + }, + { + "epoch": 0.829782560948825, + "grad_norm": 2.3850302696228027, + "learning_rate": 1e-06, + "loss": 0.8006, + "mean_token_accuracy": 0.7491605281829834, + "num_tokens": 188829636.0, + "step": 7556 + }, + { + "epoch": 0.8298923786514386, + "grad_norm": 1.9325107336044312, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6952210664749146, + "num_tokens": 188862265.0, + "step": 7557 + }, + { + "epoch": 0.8300021963540523, + "grad_norm": 2.499528169631958, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7262071967124939, + "num_tokens": 188881275.0, + "step": 7558 + }, + { + "epoch": 0.8301120140566659, + "grad_norm": 2.08697772026062, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7007777690887451, + "num_tokens": 188909039.0, + "step": 7559 + }, + { + "epoch": 0.8302218317592795, + "grad_norm": 2.0580153465270996, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7104340195655823, + "num_tokens": 188938942.0, + "step": 7560 + }, + { + "epoch": 0.8303316494618933, + "grad_norm": 2.349318265914917, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7141810655593872, + "num_tokens": 188962048.0, + "step": 7561 + }, + { + "epoch": 0.830441467164507, + "grad_norm": 2.3781018257141113, + "learning_rate": 1e-06, + "loss": 1.0725, + "mean_token_accuracy": 0.6776679158210754, + "num_tokens": 188985914.0, 
+ "step": 7562 + }, + { + "epoch": 0.8305512848671206, + "grad_norm": 2.1015963554382324, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6888253092765808, + "num_tokens": 189014339.0, + "step": 7563 + }, + { + "epoch": 0.8306611025697342, + "grad_norm": 2.222585916519165, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7087401747703552, + "num_tokens": 189037188.0, + "step": 7564 + }, + { + "epoch": 0.8307709202723479, + "grad_norm": 2.1518607139587402, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7183116674423218, + "num_tokens": 189062752.0, + "step": 7565 + }, + { + "epoch": 0.8308807379749615, + "grad_norm": 2.246755599975586, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7082891464233398, + "num_tokens": 189086523.0, + "step": 7566 + }, + { + "epoch": 0.8309905556775752, + "grad_norm": 1.9185539484024048, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6898036003112793, + "num_tokens": 189119431.0, + "step": 7567 + }, + { + "epoch": 0.8311003733801889, + "grad_norm": 2.3022284507751465, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7466008067131042, + "num_tokens": 189141530.0, + "step": 7568 + }, + { + "epoch": 0.8312101910828026, + "grad_norm": 2.2480058670043945, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6952199935913086, + "num_tokens": 189166035.0, + "step": 7569 + }, + { + "epoch": 0.8313200087854162, + "grad_norm": 2.7480783462524414, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7250511646270752, + "num_tokens": 189181309.0, + "step": 7570 + }, + { + "epoch": 0.8314298264880299, + "grad_norm": 2.0783846378326416, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6894069910049438, + "num_tokens": 189210823.0, + "step": 7571 + }, + { + "epoch": 0.8315396441906435, + "grad_norm": 2.1739354133605957, + "learning_rate": 1e-06, + "loss": 
1.0234, + "mean_token_accuracy": 0.6918157339096069, + "num_tokens": 189235741.0, + "step": 7572 + }, + { + "epoch": 0.8316494618932572, + "grad_norm": 2.3405375480651855, + "learning_rate": 1e-06, + "loss": 1.057, + "mean_token_accuracy": 0.6920940279960632, + "num_tokens": 189260304.0, + "step": 7573 + }, + { + "epoch": 0.8317592795958708, + "grad_norm": 2.2201991081237793, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7177743911743164, + "num_tokens": 189285143.0, + "step": 7574 + }, + { + "epoch": 0.8318690972984845, + "grad_norm": 2.1641969680786133, + "learning_rate": 1e-06, + "loss": 1.0269, + "mean_token_accuracy": 0.6842811107635498, + "num_tokens": 189312493.0, + "step": 7575 + }, + { + "epoch": 0.8319789150010982, + "grad_norm": 2.2374720573425293, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7158840894699097, + "num_tokens": 189339038.0, + "step": 7576 + }, + { + "epoch": 0.8320887327037119, + "grad_norm": 2.3261990547180176, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6939082145690918, + "num_tokens": 189361901.0, + "step": 7577 + }, + { + "epoch": 0.8321985504063255, + "grad_norm": 2.2544634342193604, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.7010495662689209, + "num_tokens": 189386785.0, + "step": 7578 + }, + { + "epoch": 0.8323083681089392, + "grad_norm": 2.1988115310668945, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7089139223098755, + "num_tokens": 189412023.0, + "step": 7579 + }, + { + "epoch": 0.8324181858115528, + "grad_norm": 2.109245777130127, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7154849171638489, + "num_tokens": 189436644.0, + "step": 7580 + }, + { + "epoch": 0.8325280035141664, + "grad_norm": 2.493009090423584, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7096842527389526, + "num_tokens": 189457385.0, + "step": 7581 + }, + { + "epoch": 
0.8326378212167801, + "grad_norm": 2.204087734222412, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.7021254301071167, + "num_tokens": 189482924.0, + "step": 7582 + }, + { + "epoch": 0.8327476389193939, + "grad_norm": 2.4015567302703857, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7274457812309265, + "num_tokens": 189504805.0, + "step": 7583 + }, + { + "epoch": 0.8328574566220075, + "grad_norm": 2.0015838146209717, + "learning_rate": 1e-06, + "loss": 1.1051, + "mean_token_accuracy": 0.6671063303947449, + "num_tokens": 189536286.0, + "step": 7584 + }, + { + "epoch": 0.8329672743246211, + "grad_norm": 2.5602989196777344, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7261120080947876, + "num_tokens": 189555178.0, + "step": 7585 + }, + { + "epoch": 0.8330770920272348, + "grad_norm": 2.2104101181030273, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6880892515182495, + "num_tokens": 189579062.0, + "step": 7586 + }, + { + "epoch": 0.8331869097298484, + "grad_norm": 2.1416726112365723, + "learning_rate": 1e-06, + "loss": 1.0022, + "mean_token_accuracy": 0.6900714635848999, + "num_tokens": 189605309.0, + "step": 7587 + }, + { + "epoch": 0.8332967274324621, + "grad_norm": 2.005628824234009, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6946032047271729, + "num_tokens": 189633788.0, + "step": 7588 + }, + { + "epoch": 0.8334065451350757, + "grad_norm": 2.051908493041992, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6987249851226807, + "num_tokens": 189662126.0, + "step": 7589 + }, + { + "epoch": 0.8335163628376895, + "grad_norm": 2.016221284866333, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7043861746788025, + "num_tokens": 189690564.0, + "step": 7590 + }, + { + "epoch": 0.8336261805403031, + "grad_norm": 2.1916356086730957, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 
0.6926825642585754, + "num_tokens": 189715677.0, + "step": 7591 + }, + { + "epoch": 0.8337359982429168, + "grad_norm": 2.128065586090088, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.7073864936828613, + "num_tokens": 189742691.0, + "step": 7592 + }, + { + "epoch": 0.8338458159455304, + "grad_norm": 2.218860387802124, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7084702849388123, + "num_tokens": 189767964.0, + "step": 7593 + }, + { + "epoch": 0.8339556336481441, + "grad_norm": 2.2137272357940674, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.721913754940033, + "num_tokens": 189790990.0, + "step": 7594 + }, + { + "epoch": 0.8340654513507577, + "grad_norm": 2.19047212600708, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7106393575668335, + "num_tokens": 189815028.0, + "step": 7595 + }, + { + "epoch": 0.8341752690533714, + "grad_norm": 2.033936023712158, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7078580856323242, + "num_tokens": 189842205.0, + "step": 7596 + }, + { + "epoch": 0.8342850867559851, + "grad_norm": 1.8923412561416626, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.708141565322876, + "num_tokens": 189872100.0, + "step": 7597 + }, + { + "epoch": 0.8343949044585988, + "grad_norm": 2.0647807121276855, + "learning_rate": 1e-06, + "loss": 1.0156, + "mean_token_accuracy": 0.6915227174758911, + "num_tokens": 189899880.0, + "step": 7598 + }, + { + "epoch": 0.8345047221612124, + "grad_norm": 2.494706392288208, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7314732074737549, + "num_tokens": 189919845.0, + "step": 7599 + }, + { + "epoch": 0.834614539863826, + "grad_norm": 2.1079185009002686, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7053232789039612, + "num_tokens": 189946913.0, + "step": 7600 + }, + { + "epoch": 0.8347243575664397, + "grad_norm": 
2.2423601150512695, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6882532238960266, + "num_tokens": 189972139.0, + "step": 7601 + }, + { + "epoch": 0.8348341752690533, + "grad_norm": 2.292478084564209, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7208831310272217, + "num_tokens": 189994903.0, + "step": 7602 + }, + { + "epoch": 0.834943992971667, + "grad_norm": 2.359766960144043, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7265918850898743, + "num_tokens": 190016204.0, + "step": 7603 + }, + { + "epoch": 0.8350538106742806, + "grad_norm": 2.0260400772094727, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6984517574310303, + "num_tokens": 190045188.0, + "step": 7604 + }, + { + "epoch": 0.8351636283768944, + "grad_norm": 2.209651231765747, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7087573409080505, + "num_tokens": 190069219.0, + "step": 7605 + }, + { + "epoch": 0.835273446079508, + "grad_norm": 2.3405394554138184, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7164866924285889, + "num_tokens": 190091078.0, + "step": 7606 + }, + { + "epoch": 0.8353832637821217, + "grad_norm": 2.147855043411255, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6870934963226318, + "num_tokens": 190119405.0, + "step": 7607 + }, + { + "epoch": 0.8354930814847353, + "grad_norm": 2.2543880939483643, + "learning_rate": 1e-06, + "loss": 1.0005, + "mean_token_accuracy": 0.7025539875030518, + "num_tokens": 190144540.0, + "step": 7608 + }, + { + "epoch": 0.835602899187349, + "grad_norm": 2.25553035736084, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.6899194717407227, + "num_tokens": 190168982.0, + "step": 7609 + }, + { + "epoch": 0.8357127168899626, + "grad_norm": 2.1415016651153564, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7094148993492126, + "num_tokens": 190195691.0, 
+ "step": 7610 + }, + { + "epoch": 0.8358225345925763, + "grad_norm": 2.0408987998962402, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7130348682403564, + "num_tokens": 190222321.0, + "step": 7611 + }, + { + "epoch": 0.83593235229519, + "grad_norm": 2.5069708824157715, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7007827758789062, + "num_tokens": 190244910.0, + "step": 7612 + }, + { + "epoch": 0.8360421699978037, + "grad_norm": 2.2914035320281982, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6915931701660156, + "num_tokens": 190269090.0, + "step": 7613 + }, + { + "epoch": 0.8361519877004173, + "grad_norm": 2.5067522525787354, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7113010883331299, + "num_tokens": 190289977.0, + "step": 7614 + }, + { + "epoch": 0.836261805403031, + "grad_norm": 2.119471549987793, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7086849808692932, + "num_tokens": 190317550.0, + "step": 7615 + }, + { + "epoch": 0.8363716231056446, + "grad_norm": 2.0249788761138916, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7175416350364685, + "num_tokens": 190347105.0, + "step": 7616 + }, + { + "epoch": 0.8364814408082583, + "grad_norm": 2.271472454071045, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7067781090736389, + "num_tokens": 190371985.0, + "step": 7617 + }, + { + "epoch": 0.8365912585108719, + "grad_norm": 2.3766894340515137, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7549498677253723, + "num_tokens": 190392446.0, + "step": 7618 + }, + { + "epoch": 0.8367010762134857, + "grad_norm": 2.2871434688568115, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7186290621757507, + "num_tokens": 190414887.0, + "step": 7619 + }, + { + "epoch": 0.8368108939160993, + "grad_norm": 2.159132242202759, + "learning_rate": 1e-06, + "loss": 0.9353, + 
"mean_token_accuracy": 0.7150467038154602, + "num_tokens": 190440487.0, + "step": 7620 + }, + { + "epoch": 0.836920711618713, + "grad_norm": 2.4264724254608154, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.7008013725280762, + "num_tokens": 190463305.0, + "step": 7621 + }, + { + "epoch": 0.8370305293213266, + "grad_norm": 2.148803234100342, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7162051200866699, + "num_tokens": 190488863.0, + "step": 7622 + }, + { + "epoch": 0.8371403470239402, + "grad_norm": 2.365933418273926, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6972912549972534, + "num_tokens": 190511999.0, + "step": 7623 + }, + { + "epoch": 0.8372501647265539, + "grad_norm": 2.538780689239502, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7205865383148193, + "num_tokens": 190532207.0, + "step": 7624 + }, + { + "epoch": 0.8373599824291675, + "grad_norm": 2.512446641921997, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7135632634162903, + "num_tokens": 190552672.0, + "step": 7625 + }, + { + "epoch": 0.8374698001317813, + "grad_norm": 2.297222137451172, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7300969958305359, + "num_tokens": 190575448.0, + "step": 7626 + }, + { + "epoch": 0.8375796178343949, + "grad_norm": 2.425507068634033, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7317153811454773, + "num_tokens": 190594380.0, + "step": 7627 + }, + { + "epoch": 0.8376894355370086, + "grad_norm": 2.2852861881256104, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.693481981754303, + "num_tokens": 190617524.0, + "step": 7628 + }, + { + "epoch": 0.8377992532396222, + "grad_norm": 2.0108487606048584, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7373509407043457, + "num_tokens": 190645942.0, + "step": 7629 + }, + { + "epoch": 0.8379090709422359, + 
"grad_norm": 2.4712960720062256, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7203315496444702, + "num_tokens": 190665321.0, + "step": 7630 + }, + { + "epoch": 0.8380188886448495, + "grad_norm": 2.179304361343384, + "learning_rate": 1e-06, + "loss": 1.088, + "mean_token_accuracy": 0.6698089838027954, + "num_tokens": 190691765.0, + "step": 7631 + }, + { + "epoch": 0.8381287063474632, + "grad_norm": 2.2168562412261963, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7247085571289062, + "num_tokens": 190715606.0, + "step": 7632 + }, + { + "epoch": 0.8382385240500769, + "grad_norm": 2.1047110557556152, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7189114093780518, + "num_tokens": 190743067.0, + "step": 7633 + }, + { + "epoch": 0.8383483417526906, + "grad_norm": 2.3522396087646484, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.7084065675735474, + "num_tokens": 190766714.0, + "step": 7634 + }, + { + "epoch": 0.8384581594553042, + "grad_norm": 2.2344248294830322, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7110030651092529, + "num_tokens": 190793557.0, + "step": 7635 + }, + { + "epoch": 0.8385679771579179, + "grad_norm": 1.9695883989334106, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.7077716588973999, + "num_tokens": 190824528.0, + "step": 7636 + }, + { + "epoch": 0.8386777948605315, + "grad_norm": 1.9883747100830078, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7061295509338379, + "num_tokens": 190854341.0, + "step": 7637 + }, + { + "epoch": 0.8387876125631452, + "grad_norm": 1.9981642961502075, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7158212661743164, + "num_tokens": 190883837.0, + "step": 7638 + }, + { + "epoch": 0.8388974302657588, + "grad_norm": 2.166848659515381, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7199515700340271, + 
"num_tokens": 190909050.0, + "step": 7639 + }, + { + "epoch": 0.8390072479683724, + "grad_norm": 2.393827199935913, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.695158839225769, + "num_tokens": 190930519.0, + "step": 7640 + }, + { + "epoch": 0.8391170656709862, + "grad_norm": 2.189438581466675, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7352455854415894, + "num_tokens": 190956451.0, + "step": 7641 + }, + { + "epoch": 0.8392268833735999, + "grad_norm": 2.191373109817505, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7164191007614136, + "num_tokens": 190984370.0, + "step": 7642 + }, + { + "epoch": 0.8393367010762135, + "grad_norm": 2.228620767593384, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7067664861679077, + "num_tokens": 191007978.0, + "step": 7643 + }, + { + "epoch": 0.8394465187788271, + "grad_norm": 2.088609457015991, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6988615989685059, + "num_tokens": 191034654.0, + "step": 7644 + }, + { + "epoch": 0.8395563364814408, + "grad_norm": 2.05336594581604, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7108052968978882, + "num_tokens": 191063423.0, + "step": 7645 + }, + { + "epoch": 0.8396661541840544, + "grad_norm": 2.349475860595703, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.7087517380714417, + "num_tokens": 191086311.0, + "step": 7646 + }, + { + "epoch": 0.8397759718866681, + "grad_norm": 2.072232961654663, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6974352598190308, + "num_tokens": 191114782.0, + "step": 7647 + }, + { + "epoch": 0.8398857895892818, + "grad_norm": 2.231403350830078, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7069822549819946, + "num_tokens": 191140700.0, + "step": 7648 + }, + { + "epoch": 0.8399956072918955, + "grad_norm": 1.9925906658172607, + "learning_rate": 
1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6957495808601379, + "num_tokens": 191170730.0, + "step": 7649 + }, + { + "epoch": 0.8401054249945091, + "grad_norm": 2.3665997982025146, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7114614248275757, + "num_tokens": 191193366.0, + "step": 7650 + }, + { + "epoch": 0.8402152426971228, + "grad_norm": 2.2050979137420654, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7156834602355957, + "num_tokens": 191219447.0, + "step": 7651 + }, + { + "epoch": 0.8403250603997364, + "grad_norm": 2.155055046081543, + "learning_rate": 1e-06, + "loss": 1.0382, + "mean_token_accuracy": 0.6927748322486877, + "num_tokens": 191246671.0, + "step": 7652 + }, + { + "epoch": 0.8404348781023501, + "grad_norm": 2.400853395462036, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7223314046859741, + "num_tokens": 191267000.0, + "step": 7653 + }, + { + "epoch": 0.8405446958049637, + "grad_norm": 2.024714469909668, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6930103898048401, + "num_tokens": 191294624.0, + "step": 7654 + }, + { + "epoch": 0.8406545135075775, + "grad_norm": 2.3122239112854004, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7482884526252747, + "num_tokens": 191316971.0, + "step": 7655 + }, + { + "epoch": 0.8407643312101911, + "grad_norm": 2.512369394302368, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6973158121109009, + "num_tokens": 191337743.0, + "step": 7656 + }, + { + "epoch": 0.8408741489128048, + "grad_norm": 2.2630529403686523, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6858603954315186, + "num_tokens": 191361607.0, + "step": 7657 + }, + { + "epoch": 0.8409839666154184, + "grad_norm": 2.2363524436950684, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7212498188018799, + "num_tokens": 191383782.0, + "step": 7658 + }, + { + 
"epoch": 0.8410937843180321, + "grad_norm": 2.328340768814087, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6901006698608398, + "num_tokens": 191407642.0, + "step": 7659 + }, + { + "epoch": 0.8412036020206457, + "grad_norm": 2.1823537349700928, + "learning_rate": 1e-06, + "loss": 1.0256, + "mean_token_accuracy": 0.6896588206291199, + "num_tokens": 191434606.0, + "step": 7660 + }, + { + "epoch": 0.8413134197232593, + "grad_norm": 2.162735939025879, + "learning_rate": 1e-06, + "loss": 1.0599, + "mean_token_accuracy": 0.6814197301864624, + "num_tokens": 191460749.0, + "step": 7661 + }, + { + "epoch": 0.8414232374258731, + "grad_norm": 2.6567060947418213, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.706167459487915, + "num_tokens": 191478213.0, + "step": 7662 + }, + { + "epoch": 0.8415330551284868, + "grad_norm": 2.141155958175659, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7202291488647461, + "num_tokens": 191506407.0, + "step": 7663 + }, + { + "epoch": 0.8416428728311004, + "grad_norm": 2.365830898284912, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7141746878623962, + "num_tokens": 191527384.0, + "step": 7664 + }, + { + "epoch": 0.841752690533714, + "grad_norm": 2.1007726192474365, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7075150609016418, + "num_tokens": 191554074.0, + "step": 7665 + }, + { + "epoch": 0.8418625082363277, + "grad_norm": 2.150810718536377, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6955240368843079, + "num_tokens": 191579561.0, + "step": 7666 + }, + { + "epoch": 0.8419723259389413, + "grad_norm": 2.221280574798584, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7186559438705444, + "num_tokens": 191603877.0, + "step": 7667 + }, + { + "epoch": 0.842082143641555, + "grad_norm": 2.4663186073303223, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 
0.7016281485557556, + "num_tokens": 191626427.0, + "step": 7668 + }, + { + "epoch": 0.8421919613441686, + "grad_norm": 2.0208773612976074, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.701346755027771, + "num_tokens": 191655047.0, + "step": 7669 + }, + { + "epoch": 0.8423017790467824, + "grad_norm": 2.436790943145752, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7031883597373962, + "num_tokens": 191677526.0, + "step": 7670 + }, + { + "epoch": 0.842411596749396, + "grad_norm": 2.101530075073242, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7194745540618896, + "num_tokens": 191703978.0, + "step": 7671 + }, + { + "epoch": 0.8425214144520097, + "grad_norm": 2.3196628093719482, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.6965221166610718, + "num_tokens": 191728668.0, + "step": 7672 + }, + { + "epoch": 0.8426312321546233, + "grad_norm": 2.174257278442383, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6942891478538513, + "num_tokens": 191756288.0, + "step": 7673 + }, + { + "epoch": 0.842741049857237, + "grad_norm": 2.456554412841797, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.719292402267456, + "num_tokens": 191779098.0, + "step": 7674 + }, + { + "epoch": 0.8428508675598506, + "grad_norm": 2.1284537315368652, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7398646473884583, + "num_tokens": 191803789.0, + "step": 7675 + }, + { + "epoch": 0.8429606852624643, + "grad_norm": 2.257486581802368, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6942185759544373, + "num_tokens": 191829405.0, + "step": 7676 + }, + { + "epoch": 0.843070502965078, + "grad_norm": 2.4603426456451416, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.688452959060669, + "num_tokens": 191850804.0, + "step": 7677 + }, + { + "epoch": 0.8431803206676917, + "grad_norm": 2.143416166305542, + 
"learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7179614305496216, + "num_tokens": 191875054.0, + "step": 7678 + }, + { + "epoch": 0.8432901383703053, + "grad_norm": 2.329742670059204, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7219600677490234, + "num_tokens": 191896248.0, + "step": 7679 + }, + { + "epoch": 0.843399956072919, + "grad_norm": 2.08778715133667, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7249118685722351, + "num_tokens": 191921511.0, + "step": 7680 + }, + { + "epoch": 0.8435097737755326, + "grad_norm": 2.3842580318450928, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7073768973350525, + "num_tokens": 191941683.0, + "step": 7681 + }, + { + "epoch": 0.8436195914781462, + "grad_norm": 2.6446781158447266, + "learning_rate": 1e-06, + "loss": 0.7904, + "mean_token_accuracy": 0.7511351108551025, + "num_tokens": 191957545.0, + "step": 7682 + }, + { + "epoch": 0.8437294091807599, + "grad_norm": 2.3391263484954834, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7191827893257141, + "num_tokens": 191979010.0, + "step": 7683 + }, + { + "epoch": 0.8438392268833736, + "grad_norm": 1.9532307386398315, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7083986401557922, + "num_tokens": 192009474.0, + "step": 7684 + }, + { + "epoch": 0.8439490445859873, + "grad_norm": 2.3000106811523438, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6956340670585632, + "num_tokens": 192033655.0, + "step": 7685 + }, + { + "epoch": 0.8440588622886009, + "grad_norm": 2.477876663208008, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7054114937782288, + "num_tokens": 192054112.0, + "step": 7686 + }, + { + "epoch": 0.8441686799912146, + "grad_norm": 2.1101462841033936, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6940198540687561, + "num_tokens": 192080817.0, + "step": 7687 + 
}, + { + "epoch": 0.8442784976938282, + "grad_norm": 2.3073530197143555, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.708245038986206, + "num_tokens": 192104650.0, + "step": 7688 + }, + { + "epoch": 0.8443883153964419, + "grad_norm": 2.1852939128875732, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.731037974357605, + "num_tokens": 192130168.0, + "step": 7689 + }, + { + "epoch": 0.8444981330990555, + "grad_norm": 2.402878761291504, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7119202613830566, + "num_tokens": 192150462.0, + "step": 7690 + }, + { + "epoch": 0.8446079508016693, + "grad_norm": 2.335404396057129, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6807749271392822, + "num_tokens": 192174377.0, + "step": 7691 + }, + { + "epoch": 0.8447177685042829, + "grad_norm": 2.183612585067749, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.709601879119873, + "num_tokens": 192199973.0, + "step": 7692 + }, + { + "epoch": 0.8448275862068966, + "grad_norm": 1.9880598783493042, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6977893114089966, + "num_tokens": 192230008.0, + "step": 7693 + }, + { + "epoch": 0.8449374039095102, + "grad_norm": 2.105597496032715, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7224812507629395, + "num_tokens": 192256795.0, + "step": 7694 + }, + { + "epoch": 0.8450472216121239, + "grad_norm": 2.7176620960235596, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7259202003479004, + "num_tokens": 192272758.0, + "step": 7695 + }, + { + "epoch": 0.8451570393147375, + "grad_norm": 2.1320502758026123, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7306833267211914, + "num_tokens": 192297562.0, + "step": 7696 + }, + { + "epoch": 0.8452668570173512, + "grad_norm": 2.3737919330596924, + "learning_rate": 1e-06, + "loss": 0.9666, + 
"mean_token_accuracy": 0.7179326415061951, + "num_tokens": 192320189.0, + "step": 7697 + }, + { + "epoch": 0.8453766747199648, + "grad_norm": 2.1687207221984863, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7162936329841614, + "num_tokens": 192345596.0, + "step": 7698 + }, + { + "epoch": 0.8454864924225786, + "grad_norm": 2.0295543670654297, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6855664253234863, + "num_tokens": 192376002.0, + "step": 7699 + }, + { + "epoch": 0.8455963101251922, + "grad_norm": 2.163590908050537, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7062709331512451, + "num_tokens": 192400875.0, + "step": 7700 + }, + { + "epoch": 0.8457061278278059, + "grad_norm": 2.0805718898773193, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7008955478668213, + "num_tokens": 192428438.0, + "step": 7701 + }, + { + "epoch": 0.8458159455304195, + "grad_norm": 2.2910959720611572, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7273577451705933, + "num_tokens": 192449894.0, + "step": 7702 + }, + { + "epoch": 0.8459257632330331, + "grad_norm": 2.141533374786377, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7216976881027222, + "num_tokens": 192475444.0, + "step": 7703 + }, + { + "epoch": 0.8460355809356468, + "grad_norm": 2.336057424545288, + "learning_rate": 1e-06, + "loss": 1.009, + "mean_token_accuracy": 0.6985803842544556, + "num_tokens": 192498605.0, + "step": 7704 + }, + { + "epoch": 0.8461453986382604, + "grad_norm": 1.9485387802124023, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7022009491920471, + "num_tokens": 192529980.0, + "step": 7705 + }, + { + "epoch": 0.8462552163408742, + "grad_norm": 1.766783356666565, + "learning_rate": 1e-06, + "loss": 1.0054, + "mean_token_accuracy": 0.6907684206962585, + "num_tokens": 192567579.0, + "step": 7706 + }, + { + "epoch": 0.8463650340434878, + 
"grad_norm": 2.1855571269989014, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7061401605606079, + "num_tokens": 192590745.0, + "step": 7707 + }, + { + "epoch": 0.8464748517461015, + "grad_norm": 1.964437484741211, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7088022232055664, + "num_tokens": 192619690.0, + "step": 7708 + }, + { + "epoch": 0.8465846694487151, + "grad_norm": 2.2818474769592285, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7174509763717651, + "num_tokens": 192642369.0, + "step": 7709 + }, + { + "epoch": 0.8466944871513288, + "grad_norm": 2.2997775077819824, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7115343809127808, + "num_tokens": 192666225.0, + "step": 7710 + }, + { + "epoch": 0.8468043048539424, + "grad_norm": 2.216386556625366, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7158335447311401, + "num_tokens": 192689576.0, + "step": 7711 + }, + { + "epoch": 0.8469141225565561, + "grad_norm": 1.9636459350585938, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7000261545181274, + "num_tokens": 192721052.0, + "step": 7712 + }, + { + "epoch": 0.8470239402591698, + "grad_norm": 2.3174736499786377, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.699192464351654, + "num_tokens": 192744627.0, + "step": 7713 + }, + { + "epoch": 0.8471337579617835, + "grad_norm": 2.1658823490142822, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7120065689086914, + "num_tokens": 192768996.0, + "step": 7714 + }, + { + "epoch": 0.8472435756643971, + "grad_norm": 2.3073489665985107, + "learning_rate": 1e-06, + "loss": 1.0044, + "mean_token_accuracy": 0.6951918601989746, + "num_tokens": 192793015.0, + "step": 7715 + }, + { + "epoch": 0.8473533933670108, + "grad_norm": 2.0228545665740967, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7070103883743286, + 
"num_tokens": 192822663.0, + "step": 7716 + }, + { + "epoch": 0.8474632110696244, + "grad_norm": 2.3551218509674072, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6965847611427307, + "num_tokens": 192847904.0, + "step": 7717 + }, + { + "epoch": 0.8475730287722381, + "grad_norm": 2.105140209197998, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6881940364837646, + "num_tokens": 192874608.0, + "step": 7718 + }, + { + "epoch": 0.8476828464748517, + "grad_norm": 2.057321786880493, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7233007550239563, + "num_tokens": 192902825.0, + "step": 7719 + }, + { + "epoch": 0.8477926641774655, + "grad_norm": 2.3156039714813232, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7196944355964661, + "num_tokens": 192924792.0, + "step": 7720 + }, + { + "epoch": 0.8479024818800791, + "grad_norm": 2.0767710208892822, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6912552714347839, + "num_tokens": 192952688.0, + "step": 7721 + }, + { + "epoch": 0.8480122995826928, + "grad_norm": 2.3315300941467285, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7207162976264954, + "num_tokens": 192973430.0, + "step": 7722 + }, + { + "epoch": 0.8481221172853064, + "grad_norm": 2.201150417327881, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6922610998153687, + "num_tokens": 193000401.0, + "step": 7723 + }, + { + "epoch": 0.84823193498792, + "grad_norm": 2.0879156589508057, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.7022500038146973, + "num_tokens": 193026083.0, + "step": 7724 + }, + { + "epoch": 0.8483417526905337, + "grad_norm": 2.1462719440460205, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.702653706073761, + "num_tokens": 193052472.0, + "step": 7725 + }, + { + "epoch": 0.8484515703931473, + "grad_norm": 2.3484435081481934, + 
"learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7022222280502319, + "num_tokens": 193075543.0, + "step": 7726 + }, + { + "epoch": 0.848561388095761, + "grad_norm": 2.1464366912841797, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6798372268676758, + "num_tokens": 193103783.0, + "step": 7727 + }, + { + "epoch": 0.8486712057983747, + "grad_norm": 2.0540578365325928, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7022332549095154, + "num_tokens": 193133474.0, + "step": 7728 + }, + { + "epoch": 0.8487810235009884, + "grad_norm": 2.1364760398864746, + "learning_rate": 1e-06, + "loss": 1.0178, + "mean_token_accuracy": 0.7005982995033264, + "num_tokens": 193160099.0, + "step": 7729 + }, + { + "epoch": 0.848890841203602, + "grad_norm": 2.018582820892334, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7099029421806335, + "num_tokens": 193188601.0, + "step": 7730 + }, + { + "epoch": 0.8490006589062157, + "grad_norm": 2.1064722537994385, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6858198046684265, + "num_tokens": 193216449.0, + "step": 7731 + }, + { + "epoch": 0.8491104766088293, + "grad_norm": 2.2507383823394775, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7201424241065979, + "num_tokens": 193239136.0, + "step": 7732 + }, + { + "epoch": 0.849220294311443, + "grad_norm": 2.558990955352783, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.703200101852417, + "num_tokens": 193258825.0, + "step": 7733 + }, + { + "epoch": 0.8493301120140566, + "grad_norm": 2.735905647277832, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7148094773292542, + "num_tokens": 193274721.0, + "step": 7734 + }, + { + "epoch": 0.8494399297166704, + "grad_norm": 2.1544899940490723, + "learning_rate": 1e-06, + "loss": 1.0767, + "mean_token_accuracy": 0.6799909472465515, + "num_tokens": 193302209.0, + "step": 7735 + }, 
+ { + "epoch": 0.849549747419284, + "grad_norm": 2.0997555255889893, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6954048871994019, + "num_tokens": 193329164.0, + "step": 7736 + }, + { + "epoch": 0.8496595651218977, + "grad_norm": 2.3429744243621826, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.6986249685287476, + "num_tokens": 193351737.0, + "step": 7737 + }, + { + "epoch": 0.8497693828245113, + "grad_norm": 2.4341695308685303, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7327314019203186, + "num_tokens": 193371165.0, + "step": 7738 + }, + { + "epoch": 0.849879200527125, + "grad_norm": 2.2652180194854736, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7351779341697693, + "num_tokens": 193392871.0, + "step": 7739 + }, + { + "epoch": 0.8499890182297386, + "grad_norm": 1.9988211393356323, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6945419907569885, + "num_tokens": 193422327.0, + "step": 7740 + }, + { + "epoch": 0.8500988359323522, + "grad_norm": 2.320256233215332, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7054545879364014, + "num_tokens": 193445493.0, + "step": 7741 + }, + { + "epoch": 0.850208653634966, + "grad_norm": 2.23512864112854, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7305680513381958, + "num_tokens": 193467017.0, + "step": 7742 + }, + { + "epoch": 0.8503184713375797, + "grad_norm": 1.9984036684036255, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7185561656951904, + "num_tokens": 193496128.0, + "step": 7743 + }, + { + "epoch": 0.8504282890401933, + "grad_norm": 2.0420217514038086, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7250487804412842, + "num_tokens": 193526023.0, + "step": 7744 + }, + { + "epoch": 0.8505381067428069, + "grad_norm": 2.3164215087890625, + "learning_rate": 1e-06, + "loss": 1.0613, + 
"mean_token_accuracy": 0.6860816478729248, + "num_tokens": 193549977.0, + "step": 7745 + }, + { + "epoch": 0.8506479244454206, + "grad_norm": 1.8972718715667725, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.690430223941803, + "num_tokens": 193581325.0, + "step": 7746 + }, + { + "epoch": 0.8507577421480342, + "grad_norm": 2.393329620361328, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7059956789016724, + "num_tokens": 193602202.0, + "step": 7747 + }, + { + "epoch": 0.8508675598506479, + "grad_norm": 2.2177577018737793, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7272986173629761, + "num_tokens": 193624506.0, + "step": 7748 + }, + { + "epoch": 0.8509773775532616, + "grad_norm": 2.596007823944092, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7100053429603577, + "num_tokens": 193642597.0, + "step": 7749 + }, + { + "epoch": 0.8510871952558753, + "grad_norm": 1.9608606100082397, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7118844389915466, + "num_tokens": 193672304.0, + "step": 7750 + }, + { + "epoch": 0.8511970129584889, + "grad_norm": 2.2021944522857666, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6936383247375488, + "num_tokens": 193699903.0, + "step": 7751 + }, + { + "epoch": 0.8513068306611026, + "grad_norm": 2.2608776092529297, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7000095844268799, + "num_tokens": 193723949.0, + "step": 7752 + }, + { + "epoch": 0.8514166483637162, + "grad_norm": 1.9191761016845703, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6814450025558472, + "num_tokens": 193757313.0, + "step": 7753 + }, + { + "epoch": 0.8515264660663299, + "grad_norm": 2.2444398403167725, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6995431780815125, + "num_tokens": 193782132.0, + "step": 7754 + }, + { + "epoch": 0.8516362837689435, + 
"grad_norm": 2.5956594944000244, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.72862708568573, + "num_tokens": 193800778.0, + "step": 7755 + }, + { + "epoch": 0.8517461014715572, + "grad_norm": 2.4292185306549072, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7218244075775146, + "num_tokens": 193820620.0, + "step": 7756 + }, + { + "epoch": 0.8518559191741709, + "grad_norm": 2.157961845397949, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7240434288978577, + "num_tokens": 193845431.0, + "step": 7757 + }, + { + "epoch": 0.8519657368767846, + "grad_norm": 2.089506149291992, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7026284337043762, + "num_tokens": 193873630.0, + "step": 7758 + }, + { + "epoch": 0.8520755545793982, + "grad_norm": 2.5503134727478027, + "learning_rate": 1e-06, + "loss": 1.0368, + "mean_token_accuracy": 0.6969887614250183, + "num_tokens": 193895401.0, + "step": 7759 + }, + { + "epoch": 0.8521853722820119, + "grad_norm": 2.1571507453918457, + "learning_rate": 1e-06, + "loss": 0.9772, + "mean_token_accuracy": 0.7026820182800293, + "num_tokens": 193924558.0, + "step": 7760 + }, + { + "epoch": 0.8522951899846255, + "grad_norm": 2.5796947479248047, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6831033825874329, + "num_tokens": 193944788.0, + "step": 7761 + }, + { + "epoch": 0.8524050076872391, + "grad_norm": 2.295559883117676, + "learning_rate": 1e-06, + "loss": 1.0607, + "mean_token_accuracy": 0.6736240386962891, + "num_tokens": 193969783.0, + "step": 7762 + }, + { + "epoch": 0.8525148253898528, + "grad_norm": 2.155069351196289, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7063456773757935, + "num_tokens": 193994341.0, + "step": 7763 + }, + { + "epoch": 0.8526246430924665, + "grad_norm": 2.215224266052246, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.7014868855476379, + 
"num_tokens": 194018924.0, + "step": 7764 + }, + { + "epoch": 0.8527344607950802, + "grad_norm": 2.164079427719116, + "learning_rate": 1e-06, + "loss": 0.9768, + "mean_token_accuracy": 0.7048346996307373, + "num_tokens": 194043754.0, + "step": 7765 + }, + { + "epoch": 0.8528442784976938, + "grad_norm": 2.3212032318115234, + "learning_rate": 1e-06, + "loss": 1.0722, + "mean_token_accuracy": 0.6789811253547668, + "num_tokens": 194068670.0, + "step": 7766 + }, + { + "epoch": 0.8529540962003075, + "grad_norm": 2.576097011566162, + "learning_rate": 1e-06, + "loss": 1.0071, + "mean_token_accuracy": 0.6922377943992615, + "num_tokens": 194089253.0, + "step": 7767 + }, + { + "epoch": 0.8530639139029211, + "grad_norm": 2.2278361320495605, + "learning_rate": 1e-06, + "loss": 1.0539, + "mean_token_accuracy": 0.6765486001968384, + "num_tokens": 194117607.0, + "step": 7768 + }, + { + "epoch": 0.8531737316055348, + "grad_norm": 2.4895641803741455, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7187123894691467, + "num_tokens": 194139269.0, + "step": 7769 + }, + { + "epoch": 0.8532835493081484, + "grad_norm": 2.1704556941986084, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7196134328842163, + "num_tokens": 194163421.0, + "step": 7770 + }, + { + "epoch": 0.8533933670107622, + "grad_norm": 2.2716310024261475, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7202642560005188, + "num_tokens": 194186468.0, + "step": 7771 + }, + { + "epoch": 0.8535031847133758, + "grad_norm": 2.0508782863616943, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7101420164108276, + "num_tokens": 194216496.0, + "step": 7772 + }, + { + "epoch": 0.8536130024159895, + "grad_norm": 2.0746302604675293, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7411255836486816, + "num_tokens": 194242916.0, + "step": 7773 + }, + { + "epoch": 0.8537228201186031, + "grad_norm": 2.290402889251709, + 
"learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7109477519989014, + "num_tokens": 194267031.0, + "step": 7774 + }, + { + "epoch": 0.8538326378212168, + "grad_norm": 2.0436289310455322, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7218397259712219, + "num_tokens": 194296200.0, + "step": 7775 + }, + { + "epoch": 0.8539424555238304, + "grad_norm": 2.4396138191223145, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6994488835334778, + "num_tokens": 194316483.0, + "step": 7776 + }, + { + "epoch": 0.8540522732264441, + "grad_norm": 2.1571426391601562, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7145116329193115, + "num_tokens": 194341922.0, + "step": 7777 + }, + { + "epoch": 0.8541620909290578, + "grad_norm": 2.348310708999634, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7417348623275757, + "num_tokens": 194361554.0, + "step": 7778 + }, + { + "epoch": 0.8542719086316715, + "grad_norm": 2.2815024852752686, + "learning_rate": 1e-06, + "loss": 1.0196, + "mean_token_accuracy": 0.6848927140235901, + "num_tokens": 194385300.0, + "step": 7779 + }, + { + "epoch": 0.8543817263342851, + "grad_norm": 1.9507420063018799, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7039876580238342, + "num_tokens": 194417134.0, + "step": 7780 + }, + { + "epoch": 0.8544915440368988, + "grad_norm": 2.100501775741577, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6983948945999146, + "num_tokens": 194444913.0, + "step": 7781 + }, + { + "epoch": 0.8546013617395124, + "grad_norm": 1.9815455675125122, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.6996146440505981, + "num_tokens": 194472123.0, + "step": 7782 + }, + { + "epoch": 0.854711179442126, + "grad_norm": 2.0013856887817383, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7142004370689392, + "num_tokens": 194499395.0, + "step": 7783 
+ }, + { + "epoch": 0.8548209971447397, + "grad_norm": 2.05646014213562, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6911028027534485, + "num_tokens": 194527290.0, + "step": 7784 + }, + { + "epoch": 0.8549308148473534, + "grad_norm": 2.3414466381073, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7214164733886719, + "num_tokens": 194547970.0, + "step": 7785 + }, + { + "epoch": 0.8550406325499671, + "grad_norm": 2.5073843002319336, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7008184194564819, + "num_tokens": 194569375.0, + "step": 7786 + }, + { + "epoch": 0.8551504502525807, + "grad_norm": 2.1105666160583496, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.705314040184021, + "num_tokens": 194595530.0, + "step": 7787 + }, + { + "epoch": 0.8552602679551944, + "grad_norm": 2.065682888031006, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7064494490623474, + "num_tokens": 194624150.0, + "step": 7788 + }, + { + "epoch": 0.855370085657808, + "grad_norm": 2.3639848232269287, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7261247634887695, + "num_tokens": 194644968.0, + "step": 7789 + }, + { + "epoch": 0.8554799033604217, + "grad_norm": 2.114677906036377, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7084485292434692, + "num_tokens": 194670556.0, + "step": 7790 + }, + { + "epoch": 0.8555897210630353, + "grad_norm": 2.1489834785461426, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7166265249252319, + "num_tokens": 194694245.0, + "step": 7791 + }, + { + "epoch": 0.855699538765649, + "grad_norm": 2.366539478302002, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7206835746765137, + "num_tokens": 194716272.0, + "step": 7792 + }, + { + "epoch": 0.8558093564682627, + "grad_norm": 2.1121671199798584, + "learning_rate": 1e-06, + "loss": 0.9824, + 
"mean_token_accuracy": 0.7081884145736694, + "num_tokens": 194742933.0, + "step": 7793 + }, + { + "epoch": 0.8559191741708764, + "grad_norm": 2.197709321975708, + "learning_rate": 1e-06, + "loss": 1.0343, + "mean_token_accuracy": 0.6907504796981812, + "num_tokens": 194766607.0, + "step": 7794 + }, + { + "epoch": 0.85602899187349, + "grad_norm": 1.9467966556549072, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7058659791946411, + "num_tokens": 194796455.0, + "step": 7795 + }, + { + "epoch": 0.8561388095761037, + "grad_norm": 2.3917088508605957, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7001852989196777, + "num_tokens": 194819790.0, + "step": 7796 + }, + { + "epoch": 0.8562486272787173, + "grad_norm": 2.8334712982177734, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.6993404030799866, + "num_tokens": 194836864.0, + "step": 7797 + }, + { + "epoch": 0.856358444981331, + "grad_norm": 2.5099527835845947, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7214587926864624, + "num_tokens": 194856241.0, + "step": 7798 + }, + { + "epoch": 0.8564682626839446, + "grad_norm": 2.610755205154419, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7211331129074097, + "num_tokens": 194874577.0, + "step": 7799 + }, + { + "epoch": 0.8565780803865584, + "grad_norm": 2.552781343460083, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7295964956283569, + "num_tokens": 194892014.0, + "step": 7800 + }, + { + "epoch": 0.856687898089172, + "grad_norm": 2.3360443115234375, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7236145734786987, + "num_tokens": 194914396.0, + "step": 7801 + }, + { + "epoch": 0.8567977157917857, + "grad_norm": 2.0626935958862305, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6815947890281677, + "num_tokens": 194942842.0, + "step": 7802 + }, + { + "epoch": 0.8569075334943993, + 
"grad_norm": 2.3844122886657715, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7095613479614258, + "num_tokens": 194965143.0, + "step": 7803 + }, + { + "epoch": 0.8570173511970129, + "grad_norm": 2.1616716384887695, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7160882949829102, + "num_tokens": 194991809.0, + "step": 7804 + }, + { + "epoch": 0.8571271688996266, + "grad_norm": 2.2138280868530273, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6910374164581299, + "num_tokens": 195016144.0, + "step": 7805 + }, + { + "epoch": 0.8572369866022402, + "grad_norm": 1.9926499128341675, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.703407883644104, + "num_tokens": 195044767.0, + "step": 7806 + }, + { + "epoch": 0.857346804304854, + "grad_norm": 2.1346633434295654, + "learning_rate": 1e-06, + "loss": 0.9745, + "mean_token_accuracy": 0.7014570832252502, + "num_tokens": 195070683.0, + "step": 7807 + }, + { + "epoch": 0.8574566220074676, + "grad_norm": 2.1791443824768066, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7034786939620972, + "num_tokens": 195098120.0, + "step": 7808 + }, + { + "epoch": 0.8575664397100813, + "grad_norm": 1.9841690063476562, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6938426494598389, + "num_tokens": 195127929.0, + "step": 7809 + }, + { + "epoch": 0.8576762574126949, + "grad_norm": 2.0877630710601807, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7242581844329834, + "num_tokens": 195154384.0, + "step": 7810 + }, + { + "epoch": 0.8577860751153086, + "grad_norm": 2.3476343154907227, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7153464555740356, + "num_tokens": 195175963.0, + "step": 7811 + }, + { + "epoch": 0.8578958928179222, + "grad_norm": 2.2736387252807617, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7112187743186951, + 
"num_tokens": 195199858.0, + "step": 7812 + }, + { + "epoch": 0.8580057105205359, + "grad_norm": 2.2644190788269043, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7223962545394897, + "num_tokens": 195222689.0, + "step": 7813 + }, + { + "epoch": 0.8581155282231496, + "grad_norm": 2.269320011138916, + "learning_rate": 1e-06, + "loss": 1.0452, + "mean_token_accuracy": 0.6866145133972168, + "num_tokens": 195250972.0, + "step": 7814 + }, + { + "epoch": 0.8582253459257633, + "grad_norm": 2.248037815093994, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6968792080879211, + "num_tokens": 195274243.0, + "step": 7815 + }, + { + "epoch": 0.8583351636283769, + "grad_norm": 2.0334959030151367, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6974529027938843, + "num_tokens": 195305165.0, + "step": 7816 + }, + { + "epoch": 0.8584449813309906, + "grad_norm": 2.1326630115509033, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.6996709108352661, + "num_tokens": 195330309.0, + "step": 7817 + }, + { + "epoch": 0.8585547990336042, + "grad_norm": 2.1105191707611084, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.746174693107605, + "num_tokens": 195355178.0, + "step": 7818 + }, + { + "epoch": 0.8586646167362179, + "grad_norm": 2.2344202995300293, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7140035033226013, + "num_tokens": 195381275.0, + "step": 7819 + }, + { + "epoch": 0.8587744344388315, + "grad_norm": 2.1834945678710938, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7151849865913391, + "num_tokens": 195407044.0, + "step": 7820 + }, + { + "epoch": 0.8588842521414451, + "grad_norm": 2.329779863357544, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7015128135681152, + "num_tokens": 195429462.0, + "step": 7821 + }, + { + "epoch": 0.8589940698440589, + "grad_norm": 2.129377603530884, + 
"learning_rate": 1e-06, + "loss": 1.0177, + "mean_token_accuracy": 0.685693085193634, + "num_tokens": 195456329.0, + "step": 7822 + }, + { + "epoch": 0.8591038875466726, + "grad_norm": 2.1862895488739014, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7181411981582642, + "num_tokens": 195480386.0, + "step": 7823 + }, + { + "epoch": 0.8592137052492862, + "grad_norm": 2.266273260116577, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7008931636810303, + "num_tokens": 195503220.0, + "step": 7824 + }, + { + "epoch": 0.8593235229518998, + "grad_norm": 2.129225254058838, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.705356240272522, + "num_tokens": 195528588.0, + "step": 7825 + }, + { + "epoch": 0.8594333406545135, + "grad_norm": 2.335270404815674, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.7006635665893555, + "num_tokens": 195551857.0, + "step": 7826 + }, + { + "epoch": 0.8595431583571271, + "grad_norm": 2.1179635524749756, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7155447006225586, + "num_tokens": 195575307.0, + "step": 7827 + }, + { + "epoch": 0.8596529760597408, + "grad_norm": 2.056056499481201, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7099632024765015, + "num_tokens": 195601375.0, + "step": 7828 + }, + { + "epoch": 0.8597627937623545, + "grad_norm": 2.354935884475708, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7469730973243713, + "num_tokens": 195621743.0, + "step": 7829 + }, + { + "epoch": 0.8598726114649682, + "grad_norm": 2.255380153656006, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7034231424331665, + "num_tokens": 195645860.0, + "step": 7830 + }, + { + "epoch": 0.8599824291675818, + "grad_norm": 2.161468029022217, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7236955165863037, + "num_tokens": 195671243.0, + "step": 7831 + }, + 
{ + "epoch": 0.8600922468701955, + "grad_norm": 2.1115081310272217, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.6924408674240112, + "num_tokens": 195700292.0, + "step": 7832 + }, + { + "epoch": 0.8602020645728091, + "grad_norm": 2.1113204956054688, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.691407322883606, + "num_tokens": 195727340.0, + "step": 7833 + }, + { + "epoch": 0.8603118822754228, + "grad_norm": 2.285445213317871, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7078602910041809, + "num_tokens": 195750792.0, + "step": 7834 + }, + { + "epoch": 0.8604216999780364, + "grad_norm": 2.216144323348999, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.700926661491394, + "num_tokens": 195774422.0, + "step": 7835 + }, + { + "epoch": 0.8605315176806502, + "grad_norm": 2.128558874130249, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7075482606887817, + "num_tokens": 195800498.0, + "step": 7836 + }, + { + "epoch": 0.8606413353832638, + "grad_norm": 2.234233856201172, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7026866674423218, + "num_tokens": 195824776.0, + "step": 7837 + }, + { + "epoch": 0.8607511530858775, + "grad_norm": 2.222673177719116, + "learning_rate": 1e-06, + "loss": 1.0736, + "mean_token_accuracy": 0.6748802065849304, + "num_tokens": 195853086.0, + "step": 7838 + }, + { + "epoch": 0.8608609707884911, + "grad_norm": 1.9501030445098877, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7031474113464355, + "num_tokens": 195883583.0, + "step": 7839 + }, + { + "epoch": 0.8609707884911048, + "grad_norm": 1.9846782684326172, + "learning_rate": 1e-06, + "loss": 1.0662, + "mean_token_accuracy": 0.6776683330535889, + "num_tokens": 195914557.0, + "step": 7840 + }, + { + "epoch": 0.8610806061937184, + "grad_norm": 2.0627217292785645, + "learning_rate": 1e-06, + "loss": 1.0004, + 
"mean_token_accuracy": 0.6949142813682556, + "num_tokens": 195943180.0, + "step": 7841 + }, + { + "epoch": 0.861190423896332, + "grad_norm": 2.1575565338134766, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7105677127838135, + "num_tokens": 195970677.0, + "step": 7842 + }, + { + "epoch": 0.8613002415989458, + "grad_norm": 2.351390838623047, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7398667335510254, + "num_tokens": 195991498.0, + "step": 7843 + }, + { + "epoch": 0.8614100593015594, + "grad_norm": 2.006016731262207, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7014665603637695, + "num_tokens": 196019712.0, + "step": 7844 + }, + { + "epoch": 0.8615198770041731, + "grad_norm": 2.2979736328125, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7098997235298157, + "num_tokens": 196041505.0, + "step": 7845 + }, + { + "epoch": 0.8616296947067867, + "grad_norm": 2.405895948410034, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7198034524917603, + "num_tokens": 196062832.0, + "step": 7846 + }, + { + "epoch": 0.8617395124094004, + "grad_norm": 2.6440093517303467, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.715503454208374, + "num_tokens": 196081440.0, + "step": 7847 + }, + { + "epoch": 0.861849330112014, + "grad_norm": 2.1851329803466797, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6966345310211182, + "num_tokens": 196108652.0, + "step": 7848 + }, + { + "epoch": 0.8619591478146277, + "grad_norm": 2.4157419204711914, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7032420635223389, + "num_tokens": 196129989.0, + "step": 7849 + }, + { + "epoch": 0.8620689655172413, + "grad_norm": 1.9592429399490356, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7001116871833801, + "num_tokens": 196161262.0, + "step": 7850 + }, + { + "epoch": 0.8621787832198551, + 
"grad_norm": 1.9156142473220825, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.6976516246795654, + "num_tokens": 196192175.0, + "step": 7851 + }, + { + "epoch": 0.8622886009224687, + "grad_norm": 2.3737096786499023, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7181811928749084, + "num_tokens": 196214048.0, + "step": 7852 + }, + { + "epoch": 0.8623984186250824, + "grad_norm": 2.374058485031128, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7151980996131897, + "num_tokens": 196235897.0, + "step": 7853 + }, + { + "epoch": 0.862508236327696, + "grad_norm": 2.2183613777160645, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7062559127807617, + "num_tokens": 196260975.0, + "step": 7854 + }, + { + "epoch": 0.8626180540303097, + "grad_norm": 2.6746675968170166, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7196341753005981, + "num_tokens": 196278565.0, + "step": 7855 + }, + { + "epoch": 0.8627278717329233, + "grad_norm": 2.302311897277832, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7140759229660034, + "num_tokens": 196301067.0, + "step": 7856 + }, + { + "epoch": 0.862837689435537, + "grad_norm": 2.18880295753479, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7097814679145813, + "num_tokens": 196327130.0, + "step": 7857 + }, + { + "epoch": 0.8629475071381507, + "grad_norm": 2.1806390285491943, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7100523710250854, + "num_tokens": 196350886.0, + "step": 7858 + }, + { + "epoch": 0.8630573248407644, + "grad_norm": 2.5354762077331543, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7103446125984192, + "num_tokens": 196372457.0, + "step": 7859 + }, + { + "epoch": 0.863167142543378, + "grad_norm": 1.9631761312484741, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7151892781257629, + 
"num_tokens": 196399930.0, + "step": 7860 + }, + { + "epoch": 0.8632769602459917, + "grad_norm": 2.4565389156341553, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7288686037063599, + "num_tokens": 196420428.0, + "step": 7861 + }, + { + "epoch": 0.8633867779486053, + "grad_norm": 2.5689280033111572, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6947753429412842, + "num_tokens": 196440236.0, + "step": 7862 + }, + { + "epoch": 0.863496595651219, + "grad_norm": 2.3915414810180664, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7259663939476013, + "num_tokens": 196462519.0, + "step": 7863 + }, + { + "epoch": 0.8636064133538326, + "grad_norm": 2.307400703430176, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.727837085723877, + "num_tokens": 196484352.0, + "step": 7864 + }, + { + "epoch": 0.8637162310564463, + "grad_norm": 2.2496440410614014, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6952599287033081, + "num_tokens": 196508962.0, + "step": 7865 + }, + { + "epoch": 0.86382604875906, + "grad_norm": 2.2576851844787598, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7043495178222656, + "num_tokens": 196531907.0, + "step": 7866 + }, + { + "epoch": 0.8639358664616736, + "grad_norm": 2.2481300830841064, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7153283357620239, + "num_tokens": 196555739.0, + "step": 7867 + }, + { + "epoch": 0.8640456841642873, + "grad_norm": 2.777777671813965, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6876261234283447, + "num_tokens": 196572406.0, + "step": 7868 + }, + { + "epoch": 0.8641555018669009, + "grad_norm": 2.2447731494903564, + "learning_rate": 1e-06, + "loss": 1.0069, + "mean_token_accuracy": 0.6914116144180298, + "num_tokens": 196598937.0, + "step": 7869 + }, + { + "epoch": 0.8642653195695146, + "grad_norm": 2.186372995376587, + "learning_rate": 
1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6903179883956909, + "num_tokens": 196625547.0, + "step": 7870 + }, + { + "epoch": 0.8643751372721282, + "grad_norm": 2.19368314743042, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7464137077331543, + "num_tokens": 196648307.0, + "step": 7871 + }, + { + "epoch": 0.864484954974742, + "grad_norm": 2.1854088306427, + "learning_rate": 1e-06, + "loss": 0.7707, + "mean_token_accuracy": 0.7479130029678345, + "num_tokens": 196671060.0, + "step": 7872 + }, + { + "epoch": 0.8645947726773556, + "grad_norm": 2.0978405475616455, + "learning_rate": 1e-06, + "loss": 1.0372, + "mean_token_accuracy": 0.6910778880119324, + "num_tokens": 196697938.0, + "step": 7873 + }, + { + "epoch": 0.8647045903799693, + "grad_norm": 2.0324020385742188, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7206015586853027, + "num_tokens": 196723662.0, + "step": 7874 + }, + { + "epoch": 0.8648144080825829, + "grad_norm": 2.484738826751709, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6952825784683228, + "num_tokens": 196742429.0, + "step": 7875 + }, + { + "epoch": 0.8649242257851966, + "grad_norm": 2.263765573501587, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7123364210128784, + "num_tokens": 196765663.0, + "step": 7876 + }, + { + "epoch": 0.8650340434878102, + "grad_norm": 2.1494171619415283, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7249254584312439, + "num_tokens": 196790573.0, + "step": 7877 + }, + { + "epoch": 0.8651438611904239, + "grad_norm": 2.4366865158081055, + "learning_rate": 1e-06, + "loss": 0.8531, + "mean_token_accuracy": 0.7311195135116577, + "num_tokens": 196810469.0, + "step": 7878 + }, + { + "epoch": 0.8652536788930375, + "grad_norm": 2.239567756652832, + "learning_rate": 1e-06, + "loss": 1.0396, + "mean_token_accuracy": 0.6824600696563721, + "num_tokens": 196836482.0, + "step": 7879 + }, + { + "epoch": 
0.8653634965956513, + "grad_norm": 2.267465353012085, + "learning_rate": 1e-06, + "loss": 1.0445, + "mean_token_accuracy": 0.6773815155029297, + "num_tokens": 196862873.0, + "step": 7880 + }, + { + "epoch": 0.8654733142982649, + "grad_norm": 2.2981154918670654, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7037258744239807, + "num_tokens": 196886460.0, + "step": 7881 + }, + { + "epoch": 0.8655831320008786, + "grad_norm": 2.110877752304077, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.701898455619812, + "num_tokens": 196914116.0, + "step": 7882 + }, + { + "epoch": 0.8656929497034922, + "grad_norm": 2.149160146713257, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6992429494857788, + "num_tokens": 196940471.0, + "step": 7883 + }, + { + "epoch": 0.8658027674061058, + "grad_norm": 2.5163817405700684, + "learning_rate": 1e-06, + "loss": 1.0532, + "mean_token_accuracy": 0.6794257164001465, + "num_tokens": 196961003.0, + "step": 7884 + }, + { + "epoch": 0.8659125851087195, + "grad_norm": 1.8721462488174438, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7063314914703369, + "num_tokens": 196992632.0, + "step": 7885 + }, + { + "epoch": 0.8660224028113331, + "grad_norm": 2.045747995376587, + "learning_rate": 1e-06, + "loss": 0.9916, + "mean_token_accuracy": 0.6965170502662659, + "num_tokens": 197021459.0, + "step": 7886 + }, + { + "epoch": 0.8661322205139469, + "grad_norm": 2.1063318252563477, + "learning_rate": 1e-06, + "loss": 1.0594, + "mean_token_accuracy": 0.6789557933807373, + "num_tokens": 197050944.0, + "step": 7887 + }, + { + "epoch": 0.8662420382165605, + "grad_norm": 1.9851993322372437, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6983741521835327, + "num_tokens": 197083158.0, + "step": 7888 + }, + { + "epoch": 0.8663518559191742, + "grad_norm": 1.9641163349151611, + "learning_rate": 1e-06, + "loss": 1.0872, + "mean_token_accuracy": 
0.6776391267776489, + "num_tokens": 197113645.0, + "step": 7889 + }, + { + "epoch": 0.8664616736217878, + "grad_norm": 2.269256830215454, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7275769710540771, + "num_tokens": 197136484.0, + "step": 7890 + }, + { + "epoch": 0.8665714913244015, + "grad_norm": 2.280634880065918, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6949353218078613, + "num_tokens": 197161472.0, + "step": 7891 + }, + { + "epoch": 0.8666813090270151, + "grad_norm": 1.9722349643707275, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7289987802505493, + "num_tokens": 197189427.0, + "step": 7892 + }, + { + "epoch": 0.8667911267296288, + "grad_norm": 2.201007127761841, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7114719152450562, + "num_tokens": 197212457.0, + "step": 7893 + }, + { + "epoch": 0.8669009444322425, + "grad_norm": 2.049985647201538, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.7023776769638062, + "num_tokens": 197242527.0, + "step": 7894 + }, + { + "epoch": 0.8670107621348562, + "grad_norm": 2.257427453994751, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7239634394645691, + "num_tokens": 197266737.0, + "step": 7895 + }, + { + "epoch": 0.8671205798374698, + "grad_norm": 2.2609593868255615, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6929299831390381, + "num_tokens": 197291366.0, + "step": 7896 + }, + { + "epoch": 0.8672303975400835, + "grad_norm": 1.9545108079910278, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6931780576705933, + "num_tokens": 197321821.0, + "step": 7897 + }, + { + "epoch": 0.8673402152426971, + "grad_norm": 1.9692590236663818, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.695556640625, + "num_tokens": 197355999.0, + "step": 7898 + }, + { + "epoch": 0.8674500329453108, + "grad_norm": 2.273662805557251, 
+ "learning_rate": 1e-06, + "loss": 1.0367, + "mean_token_accuracy": 0.6815828084945679, + "num_tokens": 197380847.0, + "step": 7899 + }, + { + "epoch": 0.8675598506479244, + "grad_norm": 2.2301268577575684, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7201677560806274, + "num_tokens": 197404150.0, + "step": 7900 + }, + { + "epoch": 0.8676696683505382, + "grad_norm": 2.272735834121704, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7217626571655273, + "num_tokens": 197428474.0, + "step": 7901 + }, + { + "epoch": 0.8677794860531518, + "grad_norm": 2.376469135284424, + "learning_rate": 1e-06, + "loss": 1.0354, + "mean_token_accuracy": 0.6992919445037842, + "num_tokens": 197451332.0, + "step": 7902 + }, + { + "epoch": 0.8678893037557655, + "grad_norm": 2.2849764823913574, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.722805380821228, + "num_tokens": 197474766.0, + "step": 7903 + }, + { + "epoch": 0.8679991214583791, + "grad_norm": 1.932782530784607, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6951402425765991, + "num_tokens": 197506302.0, + "step": 7904 + }, + { + "epoch": 0.8681089391609927, + "grad_norm": 2.343496799468994, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6943231821060181, + "num_tokens": 197530134.0, + "step": 7905 + }, + { + "epoch": 0.8682187568636064, + "grad_norm": 2.3771016597747803, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6990734338760376, + "num_tokens": 197552582.0, + "step": 7906 + }, + { + "epoch": 0.86832857456622, + "grad_norm": 2.1120481491088867, + "learning_rate": 1e-06, + "loss": 1.0009, + "mean_token_accuracy": 0.7033147811889648, + "num_tokens": 197579703.0, + "step": 7907 + }, + { + "epoch": 0.8684383922688337, + "grad_norm": 1.8608455657958984, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7112634181976318, + "num_tokens": 197611796.0, + "step": 7908 + 
}, + { + "epoch": 0.8685482099714474, + "grad_norm": 2.1626880168914795, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6880096793174744, + "num_tokens": 197637254.0, + "step": 7909 + }, + { + "epoch": 0.8686580276740611, + "grad_norm": 2.675189733505249, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7052615880966187, + "num_tokens": 197657801.0, + "step": 7910 + }, + { + "epoch": 0.8687678453766747, + "grad_norm": 2.222647190093994, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.714893639087677, + "num_tokens": 197681781.0, + "step": 7911 + }, + { + "epoch": 0.8688776630792884, + "grad_norm": 2.4714419841766357, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7162889838218689, + "num_tokens": 197701916.0, + "step": 7912 + }, + { + "epoch": 0.868987480781902, + "grad_norm": 2.064950942993164, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7046294212341309, + "num_tokens": 197729646.0, + "step": 7913 + }, + { + "epoch": 0.8690972984845157, + "grad_norm": 2.280442953109741, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7067857384681702, + "num_tokens": 197754432.0, + "step": 7914 + }, + { + "epoch": 0.8692071161871293, + "grad_norm": 2.3031160831451416, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7221008539199829, + "num_tokens": 197778926.0, + "step": 7915 + }, + { + "epoch": 0.8693169338897431, + "grad_norm": 1.9822931289672852, + "learning_rate": 1e-06, + "loss": 1.0756, + "mean_token_accuracy": 0.6745519638061523, + "num_tokens": 197809802.0, + "step": 7916 + }, + { + "epoch": 0.8694267515923567, + "grad_norm": 2.392819404602051, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.7014858722686768, + "num_tokens": 197834772.0, + "step": 7917 + }, + { + "epoch": 0.8695365692949704, + "grad_norm": 2.0141854286193848, + "learning_rate": 1e-06, + "loss": 0.9803, + 
"mean_token_accuracy": 0.6971328258514404, + "num_tokens": 197864869.0, + "step": 7918 + }, + { + "epoch": 0.869646386997584, + "grad_norm": 2.1118695735931396, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7042672634124756, + "num_tokens": 197892699.0, + "step": 7919 + }, + { + "epoch": 0.8697562047001977, + "grad_norm": 2.159118175506592, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.6939799189567566, + "num_tokens": 197919265.0, + "step": 7920 + }, + { + "epoch": 0.8698660224028113, + "grad_norm": 2.066444158554077, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.727384090423584, + "num_tokens": 197945739.0, + "step": 7921 + }, + { + "epoch": 0.869975840105425, + "grad_norm": 2.162151336669922, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.7068650126457214, + "num_tokens": 197970813.0, + "step": 7922 + }, + { + "epoch": 0.8700856578080387, + "grad_norm": 2.2464938163757324, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7109646201133728, + "num_tokens": 197994038.0, + "step": 7923 + }, + { + "epoch": 0.8701954755106523, + "grad_norm": 2.5473103523254395, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7412823438644409, + "num_tokens": 198012874.0, + "step": 7924 + }, + { + "epoch": 0.870305293213266, + "grad_norm": 2.1016180515289307, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6855834722518921, + "num_tokens": 198039622.0, + "step": 7925 + }, + { + "epoch": 0.8704151109158796, + "grad_norm": 2.3493540287017822, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7264926433563232, + "num_tokens": 198063209.0, + "step": 7926 + }, + { + "epoch": 0.8705249286184933, + "grad_norm": 2.3759024143218994, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.7027801275253296, + "num_tokens": 198087329.0, + "step": 7927 + }, + { + "epoch": 0.8706347463211069, + 
"grad_norm": 2.0165116786956787, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6967247128486633, + "num_tokens": 198116795.0, + "step": 7928 + }, + { + "epoch": 0.8707445640237206, + "grad_norm": 2.323312759399414, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7063003778457642, + "num_tokens": 198138238.0, + "step": 7929 + }, + { + "epoch": 0.8708543817263343, + "grad_norm": 2.342597484588623, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7195771932601929, + "num_tokens": 198161129.0, + "step": 7930 + }, + { + "epoch": 0.870964199428948, + "grad_norm": 2.123063087463379, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.708760678768158, + "num_tokens": 198187457.0, + "step": 7931 + }, + { + "epoch": 0.8710740171315616, + "grad_norm": 2.729128122329712, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.72618567943573, + "num_tokens": 198203888.0, + "step": 7932 + }, + { + "epoch": 0.8711838348341753, + "grad_norm": 1.9817395210266113, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7101734280586243, + "num_tokens": 198236708.0, + "step": 7933 + }, + { + "epoch": 0.8712936525367889, + "grad_norm": 2.4044349193573, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7266443371772766, + "num_tokens": 198257310.0, + "step": 7934 + }, + { + "epoch": 0.8714034702394026, + "grad_norm": 2.1423046588897705, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7201480865478516, + "num_tokens": 198283833.0, + "step": 7935 + }, + { + "epoch": 0.8715132879420162, + "grad_norm": 2.0535197257995605, + "learning_rate": 1e-06, + "loss": 1.0106, + "mean_token_accuracy": 0.6969103813171387, + "num_tokens": 198313944.0, + "step": 7936 + }, + { + "epoch": 0.87162310564463, + "grad_norm": 2.3524770736694336, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7148852348327637, + "num_tokens": 
198336064.0, + "step": 7937 + }, + { + "epoch": 0.8717329233472436, + "grad_norm": 2.618241310119629, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7118457555770874, + "num_tokens": 198354543.0, + "step": 7938 + }, + { + "epoch": 0.8718427410498573, + "grad_norm": 2.0671870708465576, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7147904634475708, + "num_tokens": 198382675.0, + "step": 7939 + }, + { + "epoch": 0.8719525587524709, + "grad_norm": 2.1240503787994385, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7231242656707764, + "num_tokens": 198408054.0, + "step": 7940 + }, + { + "epoch": 0.8720623764550846, + "grad_norm": 2.2242166996002197, + "learning_rate": 1e-06, + "loss": 1.0261, + "mean_token_accuracy": 0.6944600343704224, + "num_tokens": 198434037.0, + "step": 7941 + }, + { + "epoch": 0.8721721941576982, + "grad_norm": 2.287715435028076, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6952166557312012, + "num_tokens": 198459316.0, + "step": 7942 + }, + { + "epoch": 0.8722820118603118, + "grad_norm": 2.339186668395996, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7087258100509644, + "num_tokens": 198480346.0, + "step": 7943 + }, + { + "epoch": 0.8723918295629255, + "grad_norm": 2.11881685256958, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7189762592315674, + "num_tokens": 198504643.0, + "step": 7944 + }, + { + "epoch": 0.8725016472655392, + "grad_norm": 2.08252215385437, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6972046494483948, + "num_tokens": 198533493.0, + "step": 7945 + }, + { + "epoch": 0.8726114649681529, + "grad_norm": 2.0319762229919434, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.6973802447319031, + "num_tokens": 198563189.0, + "step": 7946 + }, + { + "epoch": 0.8727212826707665, + "grad_norm": 2.1057307720184326, + "learning_rate": 1e-06, + 
"loss": 1.022, + "mean_token_accuracy": 0.6861908435821533, + "num_tokens": 198591334.0, + "step": 7947 + }, + { + "epoch": 0.8728311003733802, + "grad_norm": 2.307638168334961, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7156679630279541, + "num_tokens": 198613343.0, + "step": 7948 + }, + { + "epoch": 0.8729409180759938, + "grad_norm": 2.1526408195495605, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.713824987411499, + "num_tokens": 198639700.0, + "step": 7949 + }, + { + "epoch": 0.8730507357786075, + "grad_norm": 2.185542345046997, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7008888125419617, + "num_tokens": 198667370.0, + "step": 7950 + }, + { + "epoch": 0.8731605534812211, + "grad_norm": 2.425168752670288, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7137888669967651, + "num_tokens": 198688505.0, + "step": 7951 + }, + { + "epoch": 0.8732703711838349, + "grad_norm": 2.0613057613372803, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7013940811157227, + "num_tokens": 198715421.0, + "step": 7952 + }, + { + "epoch": 0.8733801888864485, + "grad_norm": 2.1296353340148926, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.702052891254425, + "num_tokens": 198742208.0, + "step": 7953 + }, + { + "epoch": 0.8734900065890622, + "grad_norm": 2.319779634475708, + "learning_rate": 1e-06, + "loss": 1.0909, + "mean_token_accuracy": 0.6921063661575317, + "num_tokens": 198765658.0, + "step": 7954 + }, + { + "epoch": 0.8735998242916758, + "grad_norm": 2.241109848022461, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7011420130729675, + "num_tokens": 198790553.0, + "step": 7955 + }, + { + "epoch": 0.8737096419942895, + "grad_norm": 2.0327816009521484, + "learning_rate": 1e-06, + "loss": 1.0511, + "mean_token_accuracy": 0.6838374733924866, + "num_tokens": 198821283.0, + "step": 7956 + }, + { + "epoch": 
0.8738194596969031, + "grad_norm": 2.3534514904022217, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6906647682189941, + "num_tokens": 198845469.0, + "step": 7957 + }, + { + "epoch": 0.8739292773995168, + "grad_norm": 1.9955946207046509, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7139307260513306, + "num_tokens": 198873397.0, + "step": 7958 + }, + { + "epoch": 0.8740390951021305, + "grad_norm": 1.9298250675201416, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7073017954826355, + "num_tokens": 198903422.0, + "step": 7959 + }, + { + "epoch": 0.8741489128047442, + "grad_norm": 2.118905544281006, + "learning_rate": 1e-06, + "loss": 1.1011, + "mean_token_accuracy": 0.6730422973632812, + "num_tokens": 198932438.0, + "step": 7960 + }, + { + "epoch": 0.8742587305073578, + "grad_norm": 2.428323745727539, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7157900929450989, + "num_tokens": 198953947.0, + "step": 7961 + }, + { + "epoch": 0.8743685482099715, + "grad_norm": 2.0711138248443604, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7090091109275818, + "num_tokens": 198982072.0, + "step": 7962 + }, + { + "epoch": 0.8744783659125851, + "grad_norm": 2.0381524562835693, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7027968168258667, + "num_tokens": 199008936.0, + "step": 7963 + }, + { + "epoch": 0.8745881836151987, + "grad_norm": 2.6723177433013916, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7209425568580627, + "num_tokens": 199029475.0, + "step": 7964 + }, + { + "epoch": 0.8746980013178124, + "grad_norm": 2.1610825061798096, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7243375182151794, + "num_tokens": 199053609.0, + "step": 7965 + }, + { + "epoch": 0.8748078190204261, + "grad_norm": 2.0827624797821045, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 
0.711784839630127, + "num_tokens": 199080965.0, + "step": 7966 + }, + { + "epoch": 0.8749176367230398, + "grad_norm": 2.1514875888824463, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7021543979644775, + "num_tokens": 199105906.0, + "step": 7967 + }, + { + "epoch": 0.8750274544256534, + "grad_norm": 2.0412862300872803, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7141920328140259, + "num_tokens": 199136425.0, + "step": 7968 + }, + { + "epoch": 0.8751372721282671, + "grad_norm": 2.3372559547424316, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7209910154342651, + "num_tokens": 199156991.0, + "step": 7969 + }, + { + "epoch": 0.8752470898308807, + "grad_norm": 2.4657235145568848, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7227374315261841, + "num_tokens": 199176946.0, + "step": 7970 + }, + { + "epoch": 0.8753569075334944, + "grad_norm": 2.167804718017578, + "learning_rate": 1e-06, + "loss": 0.909, + "mean_token_accuracy": 0.717167854309082, + "num_tokens": 199203942.0, + "step": 7971 + }, + { + "epoch": 0.875466725236108, + "grad_norm": 2.2104458808898926, + "learning_rate": 1e-06, + "loss": 1.0524, + "mean_token_accuracy": 0.696948230266571, + "num_tokens": 199231440.0, + "step": 7972 + }, + { + "epoch": 0.8755765429387217, + "grad_norm": 1.981291651725769, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7063592076301575, + "num_tokens": 199261838.0, + "step": 7973 + }, + { + "epoch": 0.8756863606413354, + "grad_norm": 2.169933557510376, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.7036410570144653, + "num_tokens": 199289213.0, + "step": 7974 + }, + { + "epoch": 0.8757961783439491, + "grad_norm": 2.3048555850982666, + "learning_rate": 1e-06, + "loss": 1.0345, + "mean_token_accuracy": 0.6858713626861572, + "num_tokens": 199312836.0, + "step": 7975 + }, + { + "epoch": 0.8759059960465627, + "grad_norm": 
2.029034376144409, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7216734886169434, + "num_tokens": 199340845.0, + "step": 7976 + }, + { + "epoch": 0.8760158137491764, + "grad_norm": 2.3159451484680176, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7094439268112183, + "num_tokens": 199364963.0, + "step": 7977 + }, + { + "epoch": 0.87612563145179, + "grad_norm": 2.6148831844329834, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7090569138526917, + "num_tokens": 199382613.0, + "step": 7978 + }, + { + "epoch": 0.8762354491544037, + "grad_norm": 2.401949882507324, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6970018148422241, + "num_tokens": 199403616.0, + "step": 7979 + }, + { + "epoch": 0.8763452668570173, + "grad_norm": 2.3432226181030273, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7169086337089539, + "num_tokens": 199429263.0, + "step": 7980 + }, + { + "epoch": 0.8764550845596311, + "grad_norm": 2.0605289936065674, + "learning_rate": 1e-06, + "loss": 1.0493, + "mean_token_accuracy": 0.6869194507598877, + "num_tokens": 199457264.0, + "step": 7981 + }, + { + "epoch": 0.8765649022622447, + "grad_norm": 2.1822426319122314, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7237162590026855, + "num_tokens": 199480987.0, + "step": 7982 + }, + { + "epoch": 0.8766747199648584, + "grad_norm": 2.0655081272125244, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.711020290851593, + "num_tokens": 199509963.0, + "step": 7983 + }, + { + "epoch": 0.876784537667472, + "grad_norm": 2.3026134967803955, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7001299262046814, + "num_tokens": 199532674.0, + "step": 7984 + }, + { + "epoch": 0.8768943553700856, + "grad_norm": 1.9876375198364258, + "learning_rate": 1e-06, + "loss": 1.0291, + "mean_token_accuracy": 0.6936173439025879, + "num_tokens": 
199563159.0, + "step": 7985 + }, + { + "epoch": 0.8770041730726993, + "grad_norm": 2.073847532272339, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7065067887306213, + "num_tokens": 199588681.0, + "step": 7986 + }, + { + "epoch": 0.8771139907753129, + "grad_norm": 2.3724188804626465, + "learning_rate": 1e-06, + "loss": 1.0685, + "mean_token_accuracy": 0.69130539894104, + "num_tokens": 199612711.0, + "step": 7987 + }, + { + "epoch": 0.8772238084779267, + "grad_norm": 2.001429796218872, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6944836378097534, + "num_tokens": 199642758.0, + "step": 7988 + }, + { + "epoch": 0.8773336261805403, + "grad_norm": 2.056609630584717, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6877099275588989, + "num_tokens": 199670959.0, + "step": 7989 + }, + { + "epoch": 0.877443443883154, + "grad_norm": 2.3195831775665283, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7163450121879578, + "num_tokens": 199692988.0, + "step": 7990 + }, + { + "epoch": 0.8775532615857676, + "grad_norm": 2.2777843475341797, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.6984274387359619, + "num_tokens": 199715678.0, + "step": 7991 + }, + { + "epoch": 0.8776630792883813, + "grad_norm": 2.2459681034088135, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.711765468120575, + "num_tokens": 199741076.0, + "step": 7992 + }, + { + "epoch": 0.8777728969909949, + "grad_norm": 2.1105291843414307, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7261916995048523, + "num_tokens": 199765286.0, + "step": 7993 + }, + { + "epoch": 0.8778827146936086, + "grad_norm": 1.945061445236206, + "learning_rate": 1e-06, + "loss": 1.0436, + "mean_token_accuracy": 0.6802400350570679, + "num_tokens": 199800208.0, + "step": 7994 + }, + { + "epoch": 0.8779925323962223, + "grad_norm": 2.250548839569092, + "learning_rate": 1e-06, + 
"loss": 0.9928, + "mean_token_accuracy": 0.695406436920166, + "num_tokens": 199825271.0, + "step": 7995 + }, + { + "epoch": 0.878102350098836, + "grad_norm": 1.9354056119918823, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6897470951080322, + "num_tokens": 199859090.0, + "step": 7996 + }, + { + "epoch": 0.8782121678014496, + "grad_norm": 2.2377102375030518, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7167484760284424, + "num_tokens": 199883291.0, + "step": 7997 + }, + { + "epoch": 0.8783219855040633, + "grad_norm": 2.423825263977051, + "learning_rate": 1e-06, + "loss": 1.0414, + "mean_token_accuracy": 0.6828354001045227, + "num_tokens": 199904580.0, + "step": 7998 + }, + { + "epoch": 0.8784318032066769, + "grad_norm": 2.089193105697632, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6899316310882568, + "num_tokens": 199933760.0, + "step": 7999 + }, + { + "epoch": 0.8785416209092906, + "grad_norm": 2.6345713138580322, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7321118116378784, + "num_tokens": 199950838.0, + "step": 8000 + }, + { + "epoch": 0.8786514386119042, + "grad_norm": 2.0124874114990234, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.6980844140052795, + "num_tokens": 199980615.0, + "step": 8001 + }, + { + "epoch": 0.8787612563145178, + "grad_norm": 2.342600107192993, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7154794931411743, + "num_tokens": 200002489.0, + "step": 8002 + }, + { + "epoch": 0.8788710740171316, + "grad_norm": 2.650470018386841, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7108609676361084, + "num_tokens": 200020739.0, + "step": 8003 + }, + { + "epoch": 0.8789808917197452, + "grad_norm": 2.0674920082092285, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6845114827156067, + "num_tokens": 200050090.0, + "step": 8004 + }, + { + "epoch": 
0.8790907094223589, + "grad_norm": 2.150009870529175, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7217920422554016, + "num_tokens": 200074899.0, + "step": 8005 + }, + { + "epoch": 0.8792005271249725, + "grad_norm": 2.611966848373413, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7370350360870361, + "num_tokens": 200092711.0, + "step": 8006 + }, + { + "epoch": 0.8793103448275862, + "grad_norm": 2.3215019702911377, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.6969321966171265, + "num_tokens": 200116240.0, + "step": 8007 + }, + { + "epoch": 0.8794201625301998, + "grad_norm": 2.1295626163482666, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7121515274047852, + "num_tokens": 200142413.0, + "step": 8008 + }, + { + "epoch": 0.8795299802328135, + "grad_norm": 2.2091124057769775, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7409870624542236, + "num_tokens": 200165215.0, + "step": 8009 + }, + { + "epoch": 0.8796397979354272, + "grad_norm": 2.074662923812866, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7109291553497314, + "num_tokens": 200191055.0, + "step": 8010 + }, + { + "epoch": 0.8797496156380409, + "grad_norm": 2.297090768814087, + "learning_rate": 1e-06, + "loss": 0.858, + "mean_token_accuracy": 0.7336541414260864, + "num_tokens": 200214014.0, + "step": 8011 + }, + { + "epoch": 0.8798594333406545, + "grad_norm": 2.171619415283203, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7237806916236877, + "num_tokens": 200240832.0, + "step": 8012 + }, + { + "epoch": 0.8799692510432682, + "grad_norm": 2.2313477993011475, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7099418044090271, + "num_tokens": 200264943.0, + "step": 8013 + }, + { + "epoch": 0.8800790687458818, + "grad_norm": 2.1393253803253174, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 
0.7098854780197144, + "num_tokens": 200293190.0, + "step": 8014 + }, + { + "epoch": 0.8801888864484955, + "grad_norm": 2.205601692199707, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7019002437591553, + "num_tokens": 200317855.0, + "step": 8015 + }, + { + "epoch": 0.8802987041511091, + "grad_norm": 2.0154500007629395, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7034956812858582, + "num_tokens": 200346514.0, + "step": 8016 + }, + { + "epoch": 0.8804085218537229, + "grad_norm": 2.5664491653442383, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.715752363204956, + "num_tokens": 200367524.0, + "step": 8017 + }, + { + "epoch": 0.8805183395563365, + "grad_norm": 2.1708452701568604, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.6929791569709778, + "num_tokens": 200392425.0, + "step": 8018 + }, + { + "epoch": 0.8806281572589502, + "grad_norm": 2.2079641819000244, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7056919932365417, + "num_tokens": 200417807.0, + "step": 8019 + }, + { + "epoch": 0.8807379749615638, + "grad_norm": 2.0656206607818604, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7087677717208862, + "num_tokens": 200446384.0, + "step": 8020 + }, + { + "epoch": 0.8808477926641775, + "grad_norm": 2.189260482788086, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7097259163856506, + "num_tokens": 200469700.0, + "step": 8021 + }, + { + "epoch": 0.8809576103667911, + "grad_norm": 2.4617767333984375, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7291015386581421, + "num_tokens": 200489767.0, + "step": 8022 + }, + { + "epoch": 0.8810674280694047, + "grad_norm": 2.130897283554077, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7076296806335449, + "num_tokens": 200517289.0, + "step": 8023 + }, + { + "epoch": 0.8811772457720185, + "grad_norm": 
2.0170984268188477, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7011878490447998, + "num_tokens": 200546915.0, + "step": 8024 + }, + { + "epoch": 0.8812870634746321, + "grad_norm": 2.1771156787872314, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7244508862495422, + "num_tokens": 200571148.0, + "step": 8025 + }, + { + "epoch": 0.8813968811772458, + "grad_norm": 2.257094383239746, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7126875519752502, + "num_tokens": 200594849.0, + "step": 8026 + }, + { + "epoch": 0.8815066988798594, + "grad_norm": 1.9015251398086548, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6979702115058899, + "num_tokens": 200624869.0, + "step": 8027 + }, + { + "epoch": 0.8816165165824731, + "grad_norm": 2.090456008911133, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7166028022766113, + "num_tokens": 200650328.0, + "step": 8028 + }, + { + "epoch": 0.8817263342850867, + "grad_norm": 2.2351019382476807, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7109436988830566, + "num_tokens": 200674666.0, + "step": 8029 + }, + { + "epoch": 0.8818361519877004, + "grad_norm": 2.074894666671753, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7195602655410767, + "num_tokens": 200700398.0, + "step": 8030 + }, + { + "epoch": 0.881945969690314, + "grad_norm": 2.287311315536499, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7200970649719238, + "num_tokens": 200722358.0, + "step": 8031 + }, + { + "epoch": 0.8820557873929278, + "grad_norm": 2.6931629180908203, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7191494107246399, + "num_tokens": 200740821.0, + "step": 8032 + }, + { + "epoch": 0.8821656050955414, + "grad_norm": 2.137571096420288, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.718245804309845, + "num_tokens": 
200767292.0, + "step": 8033 + }, + { + "epoch": 0.8822754227981551, + "grad_norm": 2.2888736724853516, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7211260795593262, + "num_tokens": 200790474.0, + "step": 8034 + }, + { + "epoch": 0.8823852405007687, + "grad_norm": 2.2065012454986572, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7189068794250488, + "num_tokens": 200814026.0, + "step": 8035 + }, + { + "epoch": 0.8824950582033824, + "grad_norm": 2.602570056915283, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.7007184028625488, + "num_tokens": 200832713.0, + "step": 8036 + }, + { + "epoch": 0.882604875905996, + "grad_norm": 1.8990790843963623, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.706357479095459, + "num_tokens": 200864118.0, + "step": 8037 + }, + { + "epoch": 0.8827146936086097, + "grad_norm": 2.282900810241699, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7053226828575134, + "num_tokens": 200886594.0, + "step": 8038 + }, + { + "epoch": 0.8828245113112234, + "grad_norm": 2.051769733428955, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7024507522583008, + "num_tokens": 200916282.0, + "step": 8039 + }, + { + "epoch": 0.8829343290138371, + "grad_norm": 2.0960073471069336, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7120600938796997, + "num_tokens": 200943708.0, + "step": 8040 + }, + { + "epoch": 0.8830441467164507, + "grad_norm": 2.2150650024414062, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7129947543144226, + "num_tokens": 200966422.0, + "step": 8041 + }, + { + "epoch": 0.8831539644190644, + "grad_norm": 2.3062236309051514, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6913113594055176, + "num_tokens": 200991325.0, + "step": 8042 + }, + { + "epoch": 0.883263782121678, + "grad_norm": 2.4394500255584717, + "learning_rate": 1e-06, + 
"loss": 0.8496, + "mean_token_accuracy": 0.7276372909545898, + "num_tokens": 201011531.0, + "step": 8043 + }, + { + "epoch": 0.8833735998242916, + "grad_norm": 2.103261947631836, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7225743532180786, + "num_tokens": 201037020.0, + "step": 8044 + }, + { + "epoch": 0.8834834175269053, + "grad_norm": 2.3428432941436768, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.6939640045166016, + "num_tokens": 201060025.0, + "step": 8045 + }, + { + "epoch": 0.883593235229519, + "grad_norm": 2.42020320892334, + "learning_rate": 1e-06, + "loss": 1.0021, + "mean_token_accuracy": 0.7082850933074951, + "num_tokens": 201083827.0, + "step": 8046 + }, + { + "epoch": 0.8837030529321327, + "grad_norm": 2.1398277282714844, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7029469013214111, + "num_tokens": 201108396.0, + "step": 8047 + }, + { + "epoch": 0.8838128706347463, + "grad_norm": 1.9553147554397583, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7241625785827637, + "num_tokens": 201138356.0, + "step": 8048 + }, + { + "epoch": 0.88392268833736, + "grad_norm": 2.3406171798706055, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7014545202255249, + "num_tokens": 201162539.0, + "step": 8049 + }, + { + "epoch": 0.8840325060399736, + "grad_norm": 2.0733509063720703, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.709014356136322, + "num_tokens": 201188857.0, + "step": 8050 + }, + { + "epoch": 0.8841423237425873, + "grad_norm": 2.4221885204315186, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.741747260093689, + "num_tokens": 201208277.0, + "step": 8051 + }, + { + "epoch": 0.8842521414452009, + "grad_norm": 2.0680060386657715, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7197672128677368, + "num_tokens": 201234678.0, + "step": 8052 + }, + { + "epoch": 
0.8843619591478147, + "grad_norm": 2.306649923324585, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.703191876411438, + "num_tokens": 201257072.0, + "step": 8053 + }, + { + "epoch": 0.8844717768504283, + "grad_norm": 2.3373939990997314, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7108224034309387, + "num_tokens": 201280994.0, + "step": 8054 + }, + { + "epoch": 0.884581594553042, + "grad_norm": 2.3021652698516846, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7041144967079163, + "num_tokens": 201304118.0, + "step": 8055 + }, + { + "epoch": 0.8846914122556556, + "grad_norm": 2.373037099838257, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7230590581893921, + "num_tokens": 201325057.0, + "step": 8056 + }, + { + "epoch": 0.8848012299582693, + "grad_norm": 2.190960645675659, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.7026486396789551, + "num_tokens": 201350778.0, + "step": 8057 + }, + { + "epoch": 0.8849110476608829, + "grad_norm": 1.7615784406661987, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.698426365852356, + "num_tokens": 201386965.0, + "step": 8058 + }, + { + "epoch": 0.8850208653634966, + "grad_norm": 2.417592763900757, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7051057815551758, + "num_tokens": 201408162.0, + "step": 8059 + }, + { + "epoch": 0.8851306830661102, + "grad_norm": 2.0932281017303467, + "learning_rate": 1e-06, + "loss": 1.0185, + "mean_token_accuracy": 0.6902071833610535, + "num_tokens": 201435220.0, + "step": 8060 + }, + { + "epoch": 0.885240500768724, + "grad_norm": 2.3600268363952637, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6910645365715027, + "num_tokens": 201459387.0, + "step": 8061 + }, + { + "epoch": 0.8853503184713376, + "grad_norm": 2.119457721710205, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 
0.7068605422973633, + "num_tokens": 201486186.0, + "step": 8062 + }, + { + "epoch": 0.8854601361739513, + "grad_norm": 1.9381866455078125, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7145407199859619, + "num_tokens": 201515058.0, + "step": 8063 + }, + { + "epoch": 0.8855699538765649, + "grad_norm": 2.2473676204681396, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7163596153259277, + "num_tokens": 201540001.0, + "step": 8064 + }, + { + "epoch": 0.8856797715791785, + "grad_norm": 2.183436393737793, + "learning_rate": 1e-06, + "loss": 1.0272, + "mean_token_accuracy": 0.6832555532455444, + "num_tokens": 201567012.0, + "step": 8065 + }, + { + "epoch": 0.8857895892817922, + "grad_norm": 2.1276602745056152, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7116238474845886, + "num_tokens": 201593557.0, + "step": 8066 + }, + { + "epoch": 0.8858994069844058, + "grad_norm": 2.1645727157592773, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7033256888389587, + "num_tokens": 201619579.0, + "step": 8067 + }, + { + "epoch": 0.8860092246870196, + "grad_norm": 2.249795913696289, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7348897457122803, + "num_tokens": 201643307.0, + "step": 8068 + }, + { + "epoch": 0.8861190423896332, + "grad_norm": 2.923698663711548, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.6912262439727783, + "num_tokens": 201661219.0, + "step": 8069 + }, + { + "epoch": 0.8862288600922469, + "grad_norm": 2.4759109020233154, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7373161315917969, + "num_tokens": 201680315.0, + "step": 8070 + }, + { + "epoch": 0.8863386777948605, + "grad_norm": 2.1703553199768066, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.736555814743042, + "num_tokens": 201706875.0, + "step": 8071 + }, + { + "epoch": 0.8864484954974742, + "grad_norm": 
2.485902786254883, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7113727927207947, + "num_tokens": 201727485.0, + "step": 8072 + }, + { + "epoch": 0.8865583132000878, + "grad_norm": 2.090811014175415, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7204126119613647, + "num_tokens": 201754959.0, + "step": 8073 + }, + { + "epoch": 0.8866681309027015, + "grad_norm": 2.158237934112549, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7327627539634705, + "num_tokens": 201781812.0, + "step": 8074 + }, + { + "epoch": 0.8867779486053152, + "grad_norm": 2.1204006671905518, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7030179500579834, + "num_tokens": 201808818.0, + "step": 8075 + }, + { + "epoch": 0.8868877663079289, + "grad_norm": 2.075369119644165, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7284713983535767, + "num_tokens": 201833989.0, + "step": 8076 + }, + { + "epoch": 0.8869975840105425, + "grad_norm": 2.433774709701538, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7039876580238342, + "num_tokens": 201855543.0, + "step": 8077 + }, + { + "epoch": 0.8871074017131562, + "grad_norm": 1.9767227172851562, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.695173442363739, + "num_tokens": 201886318.0, + "step": 8078 + }, + { + "epoch": 0.8872172194157698, + "grad_norm": 2.4377827644348145, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7078860402107239, + "num_tokens": 201907583.0, + "step": 8079 + }, + { + "epoch": 0.8873270371183835, + "grad_norm": 2.381941795349121, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7001062631607056, + "num_tokens": 201928032.0, + "step": 8080 + }, + { + "epoch": 0.8874368548209971, + "grad_norm": 2.3381361961364746, + "learning_rate": 1e-06, + "loss": 1.0656, + "mean_token_accuracy": 0.6781928539276123, + "num_tokens": 
201950281.0, + "step": 8081 + }, + { + "epoch": 0.8875466725236109, + "grad_norm": 2.543397903442383, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7108404636383057, + "num_tokens": 201970556.0, + "step": 8082 + }, + { + "epoch": 0.8876564902262245, + "grad_norm": 2.489097833633423, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7160865068435669, + "num_tokens": 201990608.0, + "step": 8083 + }, + { + "epoch": 0.8877663079288381, + "grad_norm": 2.1312687397003174, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7054926156997681, + "num_tokens": 202017367.0, + "step": 8084 + }, + { + "epoch": 0.8878761256314518, + "grad_norm": 2.252018451690674, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7073385119438171, + "num_tokens": 202040770.0, + "step": 8085 + }, + { + "epoch": 0.8879859433340654, + "grad_norm": 2.399303674697876, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7060450315475464, + "num_tokens": 202063372.0, + "step": 8086 + }, + { + "epoch": 0.8880957610366791, + "grad_norm": 2.100738048553467, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.6923925876617432, + "num_tokens": 202091832.0, + "step": 8087 + }, + { + "epoch": 0.8882055787392927, + "grad_norm": 2.5313825607299805, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7142891883850098, + "num_tokens": 202112501.0, + "step": 8088 + }, + { + "epoch": 0.8883153964419065, + "grad_norm": 2.473886013031006, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7161538004875183, + "num_tokens": 202133412.0, + "step": 8089 + }, + { + "epoch": 0.8884252141445201, + "grad_norm": 2.012807846069336, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6980867385864258, + "num_tokens": 202164969.0, + "step": 8090 + }, + { + "epoch": 0.8885350318471338, + "grad_norm": 1.8582552671432495, + "learning_rate": 1e-06, + 
"loss": 1.0384, + "mean_token_accuracy": 0.6858565211296082, + "num_tokens": 202198496.0, + "step": 8091 + }, + { + "epoch": 0.8886448495497474, + "grad_norm": 2.127455711364746, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7038789987564087, + "num_tokens": 202224895.0, + "step": 8092 + }, + { + "epoch": 0.8887546672523611, + "grad_norm": 2.138505697250366, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7010799646377563, + "num_tokens": 202252821.0, + "step": 8093 + }, + { + "epoch": 0.8888644849549747, + "grad_norm": 1.972182273864746, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.713671863079071, + "num_tokens": 202282123.0, + "step": 8094 + }, + { + "epoch": 0.8889743026575884, + "grad_norm": 2.0379726886749268, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7038560509681702, + "num_tokens": 202311136.0, + "step": 8095 + }, + { + "epoch": 0.889084120360202, + "grad_norm": 2.3732352256774902, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7140207886695862, + "num_tokens": 202334414.0, + "step": 8096 + }, + { + "epoch": 0.8891939380628158, + "grad_norm": 2.143558979034424, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7170758247375488, + "num_tokens": 202360644.0, + "step": 8097 + }, + { + "epoch": 0.8893037557654294, + "grad_norm": 2.1935231685638428, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6922194361686707, + "num_tokens": 202387053.0, + "step": 8098 + }, + { + "epoch": 0.8894135734680431, + "grad_norm": 2.3291783332824707, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7510693073272705, + "num_tokens": 202407041.0, + "step": 8099 + }, + { + "epoch": 0.8895233911706567, + "grad_norm": 1.9984012842178345, + "learning_rate": 1e-06, + "loss": 1.0213, + "mean_token_accuracy": 0.6836000680923462, + "num_tokens": 202441392.0, + "step": 8100 + }, + { + "epoch": 
0.8896332088732704, + "grad_norm": 2.0668580532073975, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6999657154083252, + "num_tokens": 202468468.0, + "step": 8101 + }, + { + "epoch": 0.889743026575884, + "grad_norm": 2.4252147674560547, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.6963946223258972, + "num_tokens": 202491314.0, + "step": 8102 + }, + { + "epoch": 0.8898528442784976, + "grad_norm": 1.9726347923278809, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6959272623062134, + "num_tokens": 202522215.0, + "step": 8103 + }, + { + "epoch": 0.8899626619811114, + "grad_norm": 2.246332883834839, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.702180802822113, + "num_tokens": 202546691.0, + "step": 8104 + }, + { + "epoch": 0.890072479683725, + "grad_norm": 2.1195578575134277, + "learning_rate": 1e-06, + "loss": 1.0464, + "mean_token_accuracy": 0.6826450228691101, + "num_tokens": 202574509.0, + "step": 8105 + }, + { + "epoch": 0.8901822973863387, + "grad_norm": 1.9459829330444336, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6980903148651123, + "num_tokens": 202602751.0, + "step": 8106 + }, + { + "epoch": 0.8902921150889523, + "grad_norm": 2.1919949054718018, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7129070162773132, + "num_tokens": 202628063.0, + "step": 8107 + }, + { + "epoch": 0.890401932791566, + "grad_norm": 2.687242031097412, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7254869341850281, + "num_tokens": 202645667.0, + "step": 8108 + }, + { + "epoch": 0.8905117504941796, + "grad_norm": 1.9674317836761475, + "learning_rate": 1e-06, + "loss": 1.0483, + "mean_token_accuracy": 0.6905305981636047, + "num_tokens": 202676224.0, + "step": 8109 + }, + { + "epoch": 0.8906215681967933, + "grad_norm": 2.1863744258880615, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 
0.7076596021652222, + "num_tokens": 202701244.0, + "step": 8110 + }, + { + "epoch": 0.890731385899407, + "grad_norm": 2.3555185794830322, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7034828066825867, + "num_tokens": 202726425.0, + "step": 8111 + }, + { + "epoch": 0.8908412036020207, + "grad_norm": 2.074462890625, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7269645929336548, + "num_tokens": 202752816.0, + "step": 8112 + }, + { + "epoch": 0.8909510213046343, + "grad_norm": 2.5597245693206787, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7253704071044922, + "num_tokens": 202771443.0, + "step": 8113 + }, + { + "epoch": 0.891060839007248, + "grad_norm": 2.6281840801239014, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7175204157829285, + "num_tokens": 202790419.0, + "step": 8114 + }, + { + "epoch": 0.8911706567098616, + "grad_norm": 1.8641051054000854, + "learning_rate": 1e-06, + "loss": 0.9858, + "mean_token_accuracy": 0.6967698335647583, + "num_tokens": 202822537.0, + "step": 8115 + }, + { + "epoch": 0.8912804744124753, + "grad_norm": 1.9679396152496338, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7329960465431213, + "num_tokens": 202850213.0, + "step": 8116 + }, + { + "epoch": 0.8913902921150889, + "grad_norm": 2.297703981399536, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7309035062789917, + "num_tokens": 202872000.0, + "step": 8117 + }, + { + "epoch": 0.8915001098177027, + "grad_norm": 2.2334470748901367, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.6960140466690063, + "num_tokens": 202895809.0, + "step": 8118 + }, + { + "epoch": 0.8916099275203163, + "grad_norm": 2.1847012042999268, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7157976627349854, + "num_tokens": 202922124.0, + "step": 8119 + }, + { + "epoch": 0.89171974522293, + "grad_norm": 
2.1699345111846924, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7105997204780579, + "num_tokens": 202948227.0, + "step": 8120 + }, + { + "epoch": 0.8918295629255436, + "grad_norm": 2.0333309173583984, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7008607387542725, + "num_tokens": 202977628.0, + "step": 8121 + }, + { + "epoch": 0.8919393806281573, + "grad_norm": 2.0684123039245605, + "learning_rate": 1e-06, + "loss": 1.0137, + "mean_token_accuracy": 0.7041149139404297, + "num_tokens": 203005105.0, + "step": 8122 + }, + { + "epoch": 0.8920491983307709, + "grad_norm": 2.216257095336914, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6998336315155029, + "num_tokens": 203029056.0, + "step": 8123 + }, + { + "epoch": 0.8921590160333845, + "grad_norm": 2.5407497882843018, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7127230167388916, + "num_tokens": 203047205.0, + "step": 8124 + }, + { + "epoch": 0.8922688337359982, + "grad_norm": 2.1160693168640137, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7268902659416199, + "num_tokens": 203073202.0, + "step": 8125 + }, + { + "epoch": 0.892378651438612, + "grad_norm": 2.282625675201416, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7129696011543274, + "num_tokens": 203095244.0, + "step": 8126 + }, + { + "epoch": 0.8924884691412256, + "grad_norm": 2.2209413051605225, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7203460931777954, + "num_tokens": 203117556.0, + "step": 8127 + }, + { + "epoch": 0.8925982868438392, + "grad_norm": 2.4190163612365723, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7164849638938904, + "num_tokens": 203138917.0, + "step": 8128 + }, + { + "epoch": 0.8927081045464529, + "grad_norm": 2.5810084342956543, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7425888180732727, + "num_tokens": 
203156902.0, + "step": 8129 + }, + { + "epoch": 0.8928179222490665, + "grad_norm": 2.1239235401153564, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.712185263633728, + "num_tokens": 203182236.0, + "step": 8130 + }, + { + "epoch": 0.8929277399516802, + "grad_norm": 2.723832368850708, + "learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.6996729373931885, + "num_tokens": 203201090.0, + "step": 8131 + }, + { + "epoch": 0.8930375576542938, + "grad_norm": 1.7917472124099731, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7025494575500488, + "num_tokens": 203237675.0, + "step": 8132 + }, + { + "epoch": 0.8931473753569076, + "grad_norm": 2.417823314666748, + "learning_rate": 1e-06, + "loss": 1.0976, + "mean_token_accuracy": 0.6742698550224304, + "num_tokens": 203264043.0, + "step": 8133 + }, + { + "epoch": 0.8932571930595212, + "grad_norm": 2.061249256134033, + "learning_rate": 1e-06, + "loss": 1.0461, + "mean_token_accuracy": 0.6833555698394775, + "num_tokens": 203293273.0, + "step": 8134 + }, + { + "epoch": 0.8933670107621349, + "grad_norm": 2.0734400749206543, + "learning_rate": 1e-06, + "loss": 1.0422, + "mean_token_accuracy": 0.6835085153579712, + "num_tokens": 203321150.0, + "step": 8135 + }, + { + "epoch": 0.8934768284647485, + "grad_norm": 2.248378276824951, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.718397319316864, + "num_tokens": 203346456.0, + "step": 8136 + }, + { + "epoch": 0.8935866461673622, + "grad_norm": 2.0671629905700684, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7101569771766663, + "num_tokens": 203374789.0, + "step": 8137 + }, + { + "epoch": 0.8936964638699758, + "grad_norm": 1.963961124420166, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7194569110870361, + "num_tokens": 203404459.0, + "step": 8138 + }, + { + "epoch": 0.8938062815725895, + "grad_norm": 2.0817341804504395, + "learning_rate": 1e-06, + 
"loss": 0.9779, + "mean_token_accuracy": 0.7026792764663696, + "num_tokens": 203432428.0, + "step": 8139 + }, + { + "epoch": 0.8939160992752032, + "grad_norm": 2.400557518005371, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7102162837982178, + "num_tokens": 203455481.0, + "step": 8140 + }, + { + "epoch": 0.8940259169778169, + "grad_norm": 2.2735087871551514, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.6988739967346191, + "num_tokens": 203478999.0, + "step": 8141 + }, + { + "epoch": 0.8941357346804305, + "grad_norm": 2.4414243698120117, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7083029747009277, + "num_tokens": 203499114.0, + "step": 8142 + }, + { + "epoch": 0.8942455523830442, + "grad_norm": 2.402329921722412, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.7081432342529297, + "num_tokens": 203521669.0, + "step": 8143 + }, + { + "epoch": 0.8943553700856578, + "grad_norm": 1.9272716045379639, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.6939722299575806, + "num_tokens": 203554381.0, + "step": 8144 + }, + { + "epoch": 0.8944651877882714, + "grad_norm": 1.9202861785888672, + "learning_rate": 1e-06, + "loss": 1.0046, + "mean_token_accuracy": 0.6958740949630737, + "num_tokens": 203586813.0, + "step": 8145 + }, + { + "epoch": 0.8945750054908851, + "grad_norm": 1.956701636314392, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.715275228023529, + "num_tokens": 203615549.0, + "step": 8146 + }, + { + "epoch": 0.8946848231934988, + "grad_norm": 2.2908849716186523, + "learning_rate": 1e-06, + "loss": 1.0668, + "mean_token_accuracy": 0.68389892578125, + "num_tokens": 203644466.0, + "step": 8147 + }, + { + "epoch": 0.8947946408961125, + "grad_norm": 2.4462814331054688, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.714059054851532, + "num_tokens": 203665465.0, + "step": 8148 + }, + { + "epoch": 
0.8949044585987261, + "grad_norm": 2.6576144695281982, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.7007013559341431, + "num_tokens": 203683725.0, + "step": 8149 + }, + { + "epoch": 0.8950142763013398, + "grad_norm": 2.2632675170898438, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.710176944732666, + "num_tokens": 203707822.0, + "step": 8150 + }, + { + "epoch": 0.8951240940039534, + "grad_norm": 2.0741331577301025, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.7058864831924438, + "num_tokens": 203735398.0, + "step": 8151 + }, + { + "epoch": 0.8952339117065671, + "grad_norm": 2.1638870239257812, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7084174156188965, + "num_tokens": 203762638.0, + "step": 8152 + }, + { + "epoch": 0.8953437294091807, + "grad_norm": 2.2088146209716797, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7123682498931885, + "num_tokens": 203787621.0, + "step": 8153 + }, + { + "epoch": 0.8954535471117944, + "grad_norm": 2.074855089187622, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7116725444793701, + "num_tokens": 203814284.0, + "step": 8154 + }, + { + "epoch": 0.8955633648144081, + "grad_norm": 2.100771903991699, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7105529308319092, + "num_tokens": 203839286.0, + "step": 8155 + }, + { + "epoch": 0.8956731825170218, + "grad_norm": 2.109816312789917, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7240090370178223, + "num_tokens": 203863594.0, + "step": 8156 + }, + { + "epoch": 0.8957830002196354, + "grad_norm": 2.184400796890259, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7190017700195312, + "num_tokens": 203887882.0, + "step": 8157 + }, + { + "epoch": 0.8958928179222491, + "grad_norm": 2.177372455596924, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 
0.7218009233474731, + "num_tokens": 203910720.0, + "step": 8158 + }, + { + "epoch": 0.8960026356248627, + "grad_norm": 2.4436087608337402, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7163159847259521, + "num_tokens": 203934550.0, + "step": 8159 + }, + { + "epoch": 0.8961124533274764, + "grad_norm": 2.1252951622009277, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6979901790618896, + "num_tokens": 203960999.0, + "step": 8160 + }, + { + "epoch": 0.89622227103009, + "grad_norm": 2.3567895889282227, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7037959098815918, + "num_tokens": 203982145.0, + "step": 8161 + }, + { + "epoch": 0.8963320887327038, + "grad_norm": 2.0913829803466797, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7051133513450623, + "num_tokens": 204009181.0, + "step": 8162 + }, + { + "epoch": 0.8964419064353174, + "grad_norm": 2.5845437049865723, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.721110463142395, + "num_tokens": 204027977.0, + "step": 8163 + }, + { + "epoch": 0.896551724137931, + "grad_norm": 2.2952308654785156, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7102265357971191, + "num_tokens": 204049880.0, + "step": 8164 + }, + { + "epoch": 0.8966615418405447, + "grad_norm": 2.0864579677581787, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.694787859916687, + "num_tokens": 204076902.0, + "step": 8165 + }, + { + "epoch": 0.8967713595431583, + "grad_norm": 2.246335029602051, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7294032573699951, + "num_tokens": 204101252.0, + "step": 8166 + }, + { + "epoch": 0.896881177245772, + "grad_norm": 2.0087034702301025, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7116954326629639, + "num_tokens": 204129509.0, + "step": 8167 + }, + { + "epoch": 0.8969909949483856, + "grad_norm": 
1.9334534406661987, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.7023296356201172, + "num_tokens": 204161399.0, + "step": 8168 + }, + { + "epoch": 0.8971008126509994, + "grad_norm": 2.2170777320861816, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7124789953231812, + "num_tokens": 204184828.0, + "step": 8169 + }, + { + "epoch": 0.897210630353613, + "grad_norm": 2.0866141319274902, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7128981351852417, + "num_tokens": 204209679.0, + "step": 8170 + }, + { + "epoch": 0.8973204480562267, + "grad_norm": 2.1427786350250244, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6916632652282715, + "num_tokens": 204238310.0, + "step": 8171 + }, + { + "epoch": 0.8974302657588403, + "grad_norm": 2.2324116230010986, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7311773300170898, + "num_tokens": 204261329.0, + "step": 8172 + }, + { + "epoch": 0.897540083461454, + "grad_norm": 2.4431865215301514, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6997097134590149, + "num_tokens": 204282175.0, + "step": 8173 + }, + { + "epoch": 0.8976499011640676, + "grad_norm": 2.2555418014526367, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7150503993034363, + "num_tokens": 204306206.0, + "step": 8174 + }, + { + "epoch": 0.8977597188666813, + "grad_norm": 2.1996617317199707, + "learning_rate": 1e-06, + "loss": 1.0289, + "mean_token_accuracy": 0.6881949305534363, + "num_tokens": 204332571.0, + "step": 8175 + }, + { + "epoch": 0.897869536569295, + "grad_norm": 2.3212203979492188, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6986395120620728, + "num_tokens": 204355649.0, + "step": 8176 + }, + { + "epoch": 0.8979793542719087, + "grad_norm": 2.090763807296753, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.7074209451675415, + "num_tokens": 
204382043.0, + "step": 8177 + }, + { + "epoch": 0.8980891719745223, + "grad_norm": 2.2528750896453857, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6884346008300781, + "num_tokens": 204407690.0, + "step": 8178 + }, + { + "epoch": 0.898198989677136, + "grad_norm": 2.543891668319702, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7068216800689697, + "num_tokens": 204426502.0, + "step": 8179 + }, + { + "epoch": 0.8983088073797496, + "grad_norm": 2.1516175270080566, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.703118622303009, + "num_tokens": 204452424.0, + "step": 8180 + }, + { + "epoch": 0.8984186250823633, + "grad_norm": 2.350893974304199, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7356038689613342, + "num_tokens": 204474929.0, + "step": 8181 + }, + { + "epoch": 0.8985284427849769, + "grad_norm": 2.211695909500122, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6987075805664062, + "num_tokens": 204500323.0, + "step": 8182 + }, + { + "epoch": 0.8986382604875905, + "grad_norm": 2.0664522647857666, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6919568777084351, + "num_tokens": 204529396.0, + "step": 8183 + }, + { + "epoch": 0.8987480781902043, + "grad_norm": 1.9960813522338867, + "learning_rate": 1e-06, + "loss": 1.0571, + "mean_token_accuracy": 0.6914745569229126, + "num_tokens": 204560115.0, + "step": 8184 + }, + { + "epoch": 0.898857895892818, + "grad_norm": 2.3956527709960938, + "learning_rate": 1e-06, + "loss": 0.9908, + "mean_token_accuracy": 0.7045782208442688, + "num_tokens": 204583154.0, + "step": 8185 + }, + { + "epoch": 0.8989677135954316, + "grad_norm": 2.045879364013672, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.6851810812950134, + "num_tokens": 204613116.0, + "step": 8186 + }, + { + "epoch": 0.8990775312980452, + "grad_norm": 2.538844108581543, + "learning_rate": 1e-06, + 
"loss": 0.9511, + "mean_token_accuracy": 0.7036021947860718, + "num_tokens": 204633686.0, + "step": 8187 + }, + { + "epoch": 0.8991873490006589, + "grad_norm": 2.027909517288208, + "learning_rate": 1e-06, + "loss": 1.0132, + "mean_token_accuracy": 0.699146568775177, + "num_tokens": 204663802.0, + "step": 8188 + }, + { + "epoch": 0.8992971667032725, + "grad_norm": 2.2168662548065186, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.7013715505599976, + "num_tokens": 204689830.0, + "step": 8189 + }, + { + "epoch": 0.8994069844058862, + "grad_norm": 2.17293643951416, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6977695226669312, + "num_tokens": 204717639.0, + "step": 8190 + }, + { + "epoch": 0.8995168021084999, + "grad_norm": 2.24143123626709, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7290921807289124, + "num_tokens": 204740988.0, + "step": 8191 + }, + { + "epoch": 0.8996266198111136, + "grad_norm": 2.2454352378845215, + "learning_rate": 1e-06, + "loss": 1.0275, + "mean_token_accuracy": 0.6891527771949768, + "num_tokens": 204765870.0, + "step": 8192 + }, + { + "epoch": 0.8997364375137272, + "grad_norm": 2.261641502380371, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7481746077537537, + "num_tokens": 204788309.0, + "step": 8193 + }, + { + "epoch": 0.8998462552163409, + "grad_norm": 2.1669960021972656, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6988390684127808, + "num_tokens": 204814571.0, + "step": 8194 + }, + { + "epoch": 0.8999560729189545, + "grad_norm": 2.099750518798828, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7087057828903198, + "num_tokens": 204841443.0, + "step": 8195 + }, + { + "epoch": 0.9000658906215682, + "grad_norm": 2.336068868637085, + "learning_rate": 1e-06, + "loss": 1.0045, + "mean_token_accuracy": 0.6976503133773804, + "num_tokens": 204867190.0, + "step": 8196 + }, + { + "epoch": 
0.9001757083241818, + "grad_norm": 2.10530161857605, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.7149767875671387, + "num_tokens": 204894115.0, + "step": 8197 + }, + { + "epoch": 0.9002855260267956, + "grad_norm": 2.3787455558776855, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.6953751444816589, + "num_tokens": 204916588.0, + "step": 8198 + }, + { + "epoch": 0.9003953437294092, + "grad_norm": 1.9381468296051025, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.709047794342041, + "num_tokens": 204946376.0, + "step": 8199 + }, + { + "epoch": 0.9005051614320229, + "grad_norm": 2.121852159500122, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.696694016456604, + "num_tokens": 204973827.0, + "step": 8200 + }, + { + "epoch": 0.9006149791346365, + "grad_norm": 2.34883975982666, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.7008204460144043, + "num_tokens": 204998078.0, + "step": 8201 + }, + { + "epoch": 0.9007247968372502, + "grad_norm": 2.2981061935424805, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7093220353126526, + "num_tokens": 205022221.0, + "step": 8202 + }, + { + "epoch": 0.9008346145398638, + "grad_norm": 2.0994725227355957, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.694124698638916, + "num_tokens": 205050334.0, + "step": 8203 + }, + { + "epoch": 0.9009444322424774, + "grad_norm": 2.0823512077331543, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7149606347084045, + "num_tokens": 205078345.0, + "step": 8204 + }, + { + "epoch": 0.9010542499450912, + "grad_norm": 2.4628958702087402, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7323930859565735, + "num_tokens": 205097804.0, + "step": 8205 + }, + { + "epoch": 0.9011640676477048, + "grad_norm": 2.1440484523773193, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 
0.7040480971336365, + "num_tokens": 205122471.0, + "step": 8206 + }, + { + "epoch": 0.9012738853503185, + "grad_norm": 2.067748785018921, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7267249822616577, + "num_tokens": 205149967.0, + "step": 8207 + }, + { + "epoch": 0.9013837030529321, + "grad_norm": 2.4065332412719727, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7191829085350037, + "num_tokens": 205171477.0, + "step": 8208 + }, + { + "epoch": 0.9014935207555458, + "grad_norm": 2.0179662704467773, + "learning_rate": 1e-06, + "loss": 1.0675, + "mean_token_accuracy": 0.695684552192688, + "num_tokens": 205202244.0, + "step": 8209 + }, + { + "epoch": 0.9016033384581594, + "grad_norm": 2.0686254501342773, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.7071040868759155, + "num_tokens": 205230016.0, + "step": 8210 + }, + { + "epoch": 0.9017131561607731, + "grad_norm": 2.0005228519439697, + "learning_rate": 1e-06, + "loss": 1.041, + "mean_token_accuracy": 0.6904451847076416, + "num_tokens": 205259044.0, + "step": 8211 + }, + { + "epoch": 0.9018229738633867, + "grad_norm": 2.3953890800476074, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7217540740966797, + "num_tokens": 205279331.0, + "step": 8212 + }, + { + "epoch": 0.9019327915660005, + "grad_norm": 3.4282374382019043, + "learning_rate": 1e-06, + "loss": 0.7855, + "mean_token_accuracy": 0.7396388649940491, + "num_tokens": 205290615.0, + "step": 8213 + }, + { + "epoch": 0.9020426092686141, + "grad_norm": 2.4795141220092773, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6878116726875305, + "num_tokens": 205312987.0, + "step": 8214 + }, + { + "epoch": 0.9021524269712278, + "grad_norm": 2.4925549030303955, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7245386838912964, + "num_tokens": 205332156.0, + "step": 8215 + }, + { + "epoch": 0.9022622446738414, + "grad_norm": 
2.37363338470459, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7124044895172119, + "num_tokens": 205354137.0, + "step": 8216 + }, + { + "epoch": 0.9023720623764551, + "grad_norm": 2.1520190238952637, + "learning_rate": 1e-06, + "loss": 1.0283, + "mean_token_accuracy": 0.6873151659965515, + "num_tokens": 205379667.0, + "step": 8217 + }, + { + "epoch": 0.9024818800790687, + "grad_norm": 2.067885160446167, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6920934915542603, + "num_tokens": 205407241.0, + "step": 8218 + }, + { + "epoch": 0.9025916977816824, + "grad_norm": 2.3989477157592773, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7259412407875061, + "num_tokens": 205427533.0, + "step": 8219 + }, + { + "epoch": 0.9027015154842961, + "grad_norm": 2.251995325088501, + "learning_rate": 1e-06, + "loss": 1.059, + "mean_token_accuracy": 0.682624101638794, + "num_tokens": 205451806.0, + "step": 8220 + }, + { + "epoch": 0.9028113331869098, + "grad_norm": 2.3087332248687744, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7316769957542419, + "num_tokens": 205476780.0, + "step": 8221 + }, + { + "epoch": 0.9029211508895234, + "grad_norm": 2.4546358585357666, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.726888120174408, + "num_tokens": 205496691.0, + "step": 8222 + }, + { + "epoch": 0.903030968592137, + "grad_norm": 1.8970718383789062, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6993957161903381, + "num_tokens": 205525351.0, + "step": 8223 + }, + { + "epoch": 0.9031407862947507, + "grad_norm": 2.201894760131836, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7263000011444092, + "num_tokens": 205549254.0, + "step": 8224 + }, + { + "epoch": 0.9032506039973643, + "grad_norm": 2.3799731731414795, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7162308692932129, + "num_tokens": 
205570606.0, + "step": 8225 + }, + { + "epoch": 0.903360421699978, + "grad_norm": 2.52595853805542, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7029165029525757, + "num_tokens": 205590884.0, + "step": 8226 + }, + { + "epoch": 0.9034702394025917, + "grad_norm": 2.0761091709136963, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6926331520080566, + "num_tokens": 205620189.0, + "step": 8227 + }, + { + "epoch": 0.9035800571052054, + "grad_norm": 2.1926429271698, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7057650089263916, + "num_tokens": 205646600.0, + "step": 8228 + }, + { + "epoch": 0.903689874807819, + "grad_norm": 2.379817247390747, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.709749162197113, + "num_tokens": 205668568.0, + "step": 8229 + }, + { + "epoch": 0.9037996925104327, + "grad_norm": 2.195739269256592, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7058367729187012, + "num_tokens": 205695168.0, + "step": 8230 + }, + { + "epoch": 0.9039095102130463, + "grad_norm": 2.5454623699188232, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7037274837493896, + "num_tokens": 205714869.0, + "step": 8231 + }, + { + "epoch": 0.90401932791566, + "grad_norm": 2.039827346801758, + "learning_rate": 1e-06, + "loss": 1.0632, + "mean_token_accuracy": 0.6806201934814453, + "num_tokens": 205743862.0, + "step": 8232 + }, + { + "epoch": 0.9041291456182736, + "grad_norm": 2.085447311401367, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7094361782073975, + "num_tokens": 205770030.0, + "step": 8233 + }, + { + "epoch": 0.9042389633208874, + "grad_norm": 2.302671432495117, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7345027923583984, + "num_tokens": 205791577.0, + "step": 8234 + }, + { + "epoch": 0.904348781023501, + "grad_norm": 2.130556583404541, + "learning_rate": 1e-06, + "loss": 
0.9877, + "mean_token_accuracy": 0.6962610483169556, + "num_tokens": 205816366.0, + "step": 8235 + }, + { + "epoch": 0.9044585987261147, + "grad_norm": 2.2650554180145264, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7158297300338745, + "num_tokens": 205839795.0, + "step": 8236 + }, + { + "epoch": 0.9045684164287283, + "grad_norm": 2.382535457611084, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7104764580726624, + "num_tokens": 205863080.0, + "step": 8237 + }, + { + "epoch": 0.904678234131342, + "grad_norm": 2.066969394683838, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6974027156829834, + "num_tokens": 205890942.0, + "step": 8238 + }, + { + "epoch": 0.9047880518339556, + "grad_norm": 2.34787654876709, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7105867862701416, + "num_tokens": 205913459.0, + "step": 8239 + }, + { + "epoch": 0.9048978695365693, + "grad_norm": 1.9424606561660767, + "learning_rate": 1e-06, + "loss": 0.9782, + "mean_token_accuracy": 0.7009707689285278, + "num_tokens": 205947098.0, + "step": 8240 + }, + { + "epoch": 0.9050076872391829, + "grad_norm": 2.135549545288086, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7027803659439087, + "num_tokens": 205975517.0, + "step": 8241 + }, + { + "epoch": 0.9051175049417967, + "grad_norm": 2.2939043045043945, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6859327554702759, + "num_tokens": 206001958.0, + "step": 8242 + }, + { + "epoch": 0.9052273226444103, + "grad_norm": 2.334543466567993, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7183384895324707, + "num_tokens": 206026470.0, + "step": 8243 + }, + { + "epoch": 0.905337140347024, + "grad_norm": 2.5513510704040527, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.717168927192688, + "num_tokens": 206046128.0, + "step": 8244 + }, + { + "epoch": 0.9054469580496376, + 
"grad_norm": 2.540703535079956, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7000962495803833, + "num_tokens": 206067210.0, + "step": 8245 + }, + { + "epoch": 0.9055567757522512, + "grad_norm": 2.2884159088134766, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6887550950050354, + "num_tokens": 206090494.0, + "step": 8246 + }, + { + "epoch": 0.9056665934548649, + "grad_norm": 2.1342694759368896, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7416670322418213, + "num_tokens": 206116576.0, + "step": 8247 + }, + { + "epoch": 0.9057764111574785, + "grad_norm": 2.2015981674194336, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7151263952255249, + "num_tokens": 206141707.0, + "step": 8248 + }, + { + "epoch": 0.9058862288600923, + "grad_norm": 2.382239818572998, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7216935157775879, + "num_tokens": 206162303.0, + "step": 8249 + }, + { + "epoch": 0.9059960465627059, + "grad_norm": 2.328129291534424, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7020331621170044, + "num_tokens": 206184520.0, + "step": 8250 + }, + { + "epoch": 0.9061058642653196, + "grad_norm": 2.0920162200927734, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.715407133102417, + "num_tokens": 206211796.0, + "step": 8251 + }, + { + "epoch": 0.9062156819679332, + "grad_norm": 2.232004404067993, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7104187607765198, + "num_tokens": 206235333.0, + "step": 8252 + }, + { + "epoch": 0.9063254996705469, + "grad_norm": 2.0540764331817627, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7010097503662109, + "num_tokens": 206264961.0, + "step": 8253 + }, + { + "epoch": 0.9064353173731605, + "grad_norm": 2.224076747894287, + "learning_rate": 1e-06, + "loss": 1.0097, + "mean_token_accuracy": 0.6892949342727661, + 
"num_tokens": 206290524.0, + "step": 8254 + }, + { + "epoch": 0.9065451350757742, + "grad_norm": 1.9925817251205444, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6860349178314209, + "num_tokens": 206320161.0, + "step": 8255 + }, + { + "epoch": 0.9066549527783879, + "grad_norm": 2.057886838912964, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7102625370025635, + "num_tokens": 206345086.0, + "step": 8256 + }, + { + "epoch": 0.9067647704810016, + "grad_norm": 2.462556838989258, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7211453318595886, + "num_tokens": 206365887.0, + "step": 8257 + }, + { + "epoch": 0.9068745881836152, + "grad_norm": 2.0087265968322754, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7021973133087158, + "num_tokens": 206393427.0, + "step": 8258 + }, + { + "epoch": 0.9069844058862289, + "grad_norm": 2.5981547832489014, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7110640406608582, + "num_tokens": 206412733.0, + "step": 8259 + }, + { + "epoch": 0.9070942235888425, + "grad_norm": 2.5483744144439697, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7303173542022705, + "num_tokens": 206432424.0, + "step": 8260 + }, + { + "epoch": 0.9072040412914562, + "grad_norm": 2.283358097076416, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7209954261779785, + "num_tokens": 206453217.0, + "step": 8261 + }, + { + "epoch": 0.9073138589940698, + "grad_norm": 2.3548808097839355, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7326059341430664, + "num_tokens": 206474744.0, + "step": 8262 + }, + { + "epoch": 0.9074236766966836, + "grad_norm": 2.087057590484619, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7097642421722412, + "num_tokens": 206500669.0, + "step": 8263 + }, + { + "epoch": 0.9075334943992972, + "grad_norm": 2.270935535430908, + 
"learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7492079734802246, + "num_tokens": 206522573.0, + "step": 8264 + }, + { + "epoch": 0.9076433121019108, + "grad_norm": 2.007589101791382, + "learning_rate": 1e-06, + "loss": 1.1576, + "mean_token_accuracy": 0.6658787727355957, + "num_tokens": 206552246.0, + "step": 8265 + }, + { + "epoch": 0.9077531298045245, + "grad_norm": 2.387632131576538, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7020767331123352, + "num_tokens": 206574021.0, + "step": 8266 + }, + { + "epoch": 0.9078629475071381, + "grad_norm": 1.8884345293045044, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6941500306129456, + "num_tokens": 206608206.0, + "step": 8267 + }, + { + "epoch": 0.9079727652097518, + "grad_norm": 2.3443801403045654, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7046058177947998, + "num_tokens": 206630375.0, + "step": 8268 + }, + { + "epoch": 0.9080825829123654, + "grad_norm": 2.3271331787109375, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7293793559074402, + "num_tokens": 206653428.0, + "step": 8269 + }, + { + "epoch": 0.9081924006149792, + "grad_norm": 2.4271459579467773, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7123744487762451, + "num_tokens": 206676017.0, + "step": 8270 + }, + { + "epoch": 0.9083022183175928, + "grad_norm": 2.081395149230957, + "learning_rate": 1e-06, + "loss": 1.0169, + "mean_token_accuracy": 0.700613260269165, + "num_tokens": 206704257.0, + "step": 8271 + }, + { + "epoch": 0.9084120360202065, + "grad_norm": 2.1434030532836914, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6963131427764893, + "num_tokens": 206731066.0, + "step": 8272 + }, + { + "epoch": 0.9085218537228201, + "grad_norm": 2.10469651222229, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7044107913970947, + "num_tokens": 206756886.0, + "step": 8273 + 
}, + { + "epoch": 0.9086316714254338, + "grad_norm": 2.2466559410095215, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7206532955169678, + "num_tokens": 206779519.0, + "step": 8274 + }, + { + "epoch": 0.9087414891280474, + "grad_norm": 2.0620012283325195, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7120418548583984, + "num_tokens": 206808470.0, + "step": 8275 + }, + { + "epoch": 0.9088513068306611, + "grad_norm": 2.029240846633911, + "learning_rate": 1e-06, + "loss": 0.9978, + "mean_token_accuracy": 0.6966222524642944, + "num_tokens": 206839940.0, + "step": 8276 + }, + { + "epoch": 0.9089611245332747, + "grad_norm": 2.3374106884002686, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7205789685249329, + "num_tokens": 206862950.0, + "step": 8277 + }, + { + "epoch": 0.9090709422358885, + "grad_norm": 2.075287103652954, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7040042877197266, + "num_tokens": 206890590.0, + "step": 8278 + }, + { + "epoch": 0.9091807599385021, + "grad_norm": 2.714618682861328, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.708077073097229, + "num_tokens": 206910088.0, + "step": 8279 + }, + { + "epoch": 0.9092905776411158, + "grad_norm": 2.3368141651153564, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7061293125152588, + "num_tokens": 206933258.0, + "step": 8280 + }, + { + "epoch": 0.9094003953437294, + "grad_norm": 2.1095688343048096, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7045272588729858, + "num_tokens": 206960985.0, + "step": 8281 + }, + { + "epoch": 0.909510213046343, + "grad_norm": 1.9480198621749878, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7039836645126343, + "num_tokens": 206989686.0, + "step": 8282 + }, + { + "epoch": 0.9096200307489567, + "grad_norm": 2.6559865474700928, + "learning_rate": 1e-06, + "loss": 0.9788, + 
"mean_token_accuracy": 0.7020663022994995, + "num_tokens": 207007551.0, + "step": 8283 + }, + { + "epoch": 0.9097298484515703, + "grad_norm": 2.176985502243042, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6954028606414795, + "num_tokens": 207033750.0, + "step": 8284 + }, + { + "epoch": 0.9098396661541841, + "grad_norm": 2.4722630977630615, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7231689691543579, + "num_tokens": 207054289.0, + "step": 8285 + }, + { + "epoch": 0.9099494838567977, + "grad_norm": 2.5905065536499023, + "learning_rate": 1e-06, + "loss": 0.7691, + "mean_token_accuracy": 0.7524322271347046, + "num_tokens": 207071977.0, + "step": 8286 + }, + { + "epoch": 0.9100593015594114, + "grad_norm": 1.9773114919662476, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.7007445693016052, + "num_tokens": 207101593.0, + "step": 8287 + }, + { + "epoch": 0.910169119262025, + "grad_norm": 2.083007335662842, + "learning_rate": 1e-06, + "loss": 0.9888, + "mean_token_accuracy": 0.6938973665237427, + "num_tokens": 207128715.0, + "step": 8288 + }, + { + "epoch": 0.9102789369646387, + "grad_norm": 2.2832906246185303, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7228153944015503, + "num_tokens": 207151743.0, + "step": 8289 + }, + { + "epoch": 0.9103887546672523, + "grad_norm": 2.075824737548828, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.71907639503479, + "num_tokens": 207178936.0, + "step": 8290 + }, + { + "epoch": 0.910498572369866, + "grad_norm": 2.3633310794830322, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7145323753356934, + "num_tokens": 207201029.0, + "step": 8291 + }, + { + "epoch": 0.9106083900724797, + "grad_norm": 2.0612282752990723, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.715366005897522, + "num_tokens": 207227803.0, + "step": 8292 + }, + { + "epoch": 0.9107182077750934, + 
"grad_norm": 2.243472099304199, + "learning_rate": 1e-06, + "loss": 1.0716, + "mean_token_accuracy": 0.6732251644134521, + "num_tokens": 207254813.0, + "step": 8293 + }, + { + "epoch": 0.910828025477707, + "grad_norm": 2.313133478164673, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7120916247367859, + "num_tokens": 207279247.0, + "step": 8294 + }, + { + "epoch": 0.9109378431803207, + "grad_norm": 2.0343997478485107, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6961984634399414, + "num_tokens": 207306313.0, + "step": 8295 + }, + { + "epoch": 0.9110476608829343, + "grad_norm": 2.100876569747925, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.7062346935272217, + "num_tokens": 207331936.0, + "step": 8296 + }, + { + "epoch": 0.911157478585548, + "grad_norm": 2.45056414604187, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7207959890365601, + "num_tokens": 207352056.0, + "step": 8297 + }, + { + "epoch": 0.9112672962881616, + "grad_norm": 3.1210973262786865, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.708858847618103, + "num_tokens": 207366551.0, + "step": 8298 + }, + { + "epoch": 0.9113771139907754, + "grad_norm": 2.235306978225708, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7206834554672241, + "num_tokens": 207389110.0, + "step": 8299 + }, + { + "epoch": 0.911486931693389, + "grad_norm": 2.725388526916504, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7091578245162964, + "num_tokens": 207408662.0, + "step": 8300 + }, + { + "epoch": 0.9115967493960027, + "grad_norm": 2.2212953567504883, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7103122472763062, + "num_tokens": 207434853.0, + "step": 8301 + }, + { + "epoch": 0.9117065670986163, + "grad_norm": 2.1985018253326416, + "learning_rate": 1e-06, + "loss": 1.008, + "mean_token_accuracy": 0.6964097619056702, + "num_tokens": 
207460068.0, + "step": 8302 + }, + { + "epoch": 0.91181638480123, + "grad_norm": 2.2929797172546387, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7148010730743408, + "num_tokens": 207481859.0, + "step": 8303 + }, + { + "epoch": 0.9119262025038436, + "grad_norm": 2.017667055130005, + "learning_rate": 1e-06, + "loss": 1.0384, + "mean_token_accuracy": 0.6819263696670532, + "num_tokens": 207512784.0, + "step": 8304 + }, + { + "epoch": 0.9120360202064572, + "grad_norm": 2.142249822616577, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7039450407028198, + "num_tokens": 207540948.0, + "step": 8305 + }, + { + "epoch": 0.9121458379090709, + "grad_norm": 2.1703574657440186, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7211030721664429, + "num_tokens": 207565870.0, + "step": 8306 + }, + { + "epoch": 0.9122556556116846, + "grad_norm": 2.1534321308135986, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7238578796386719, + "num_tokens": 207589498.0, + "step": 8307 + }, + { + "epoch": 0.9123654733142983, + "grad_norm": 2.075282573699951, + "learning_rate": 1e-06, + "loss": 1.0179, + "mean_token_accuracy": 0.6956940293312073, + "num_tokens": 207617966.0, + "step": 8308 + }, + { + "epoch": 0.9124752910169119, + "grad_norm": 1.9356075525283813, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7148585319519043, + "num_tokens": 207648810.0, + "step": 8309 + }, + { + "epoch": 0.9125851087195256, + "grad_norm": 2.591343879699707, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7093648910522461, + "num_tokens": 207668588.0, + "step": 8310 + }, + { + "epoch": 0.9126949264221392, + "grad_norm": 2.0761194229125977, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.6968783736228943, + "num_tokens": 207697657.0, + "step": 8311 + }, + { + "epoch": 0.9128047441247529, + "grad_norm": 2.135679006576538, + "learning_rate": 1e-06, + 
"loss": 1.0344, + "mean_token_accuracy": 0.6890920400619507, + "num_tokens": 207727068.0, + "step": 8312 + }, + { + "epoch": 0.9129145618273665, + "grad_norm": 2.3152976036071777, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7258175611495972, + "num_tokens": 207749160.0, + "step": 8313 + }, + { + "epoch": 0.9130243795299803, + "grad_norm": 2.16251802444458, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7095590829849243, + "num_tokens": 207774464.0, + "step": 8314 + }, + { + "epoch": 0.9131341972325939, + "grad_norm": 2.170210599899292, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7072694301605225, + "num_tokens": 207801209.0, + "step": 8315 + }, + { + "epoch": 0.9132440149352076, + "grad_norm": 2.2401962280273438, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.683219313621521, + "num_tokens": 207828083.0, + "step": 8316 + }, + { + "epoch": 0.9133538326378212, + "grad_norm": 2.2482190132141113, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6853216886520386, + "num_tokens": 207853590.0, + "step": 8317 + }, + { + "epoch": 0.9134636503404349, + "grad_norm": 2.056554079055786, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7115771174430847, + "num_tokens": 207881291.0, + "step": 8318 + }, + { + "epoch": 0.9135734680430485, + "grad_norm": 2.056366443634033, + "learning_rate": 1e-06, + "loss": 1.0463, + "mean_token_accuracy": 0.680604100227356, + "num_tokens": 207909751.0, + "step": 8319 + }, + { + "epoch": 0.9136832857456622, + "grad_norm": 2.3333356380462646, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7091812491416931, + "num_tokens": 207933001.0, + "step": 8320 + }, + { + "epoch": 0.9137931034482759, + "grad_norm": 2.2431628704071045, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7070576548576355, + "num_tokens": 207956809.0, + "step": 8321 + }, + { + "epoch": 
0.9139029211508896, + "grad_norm": 2.0132501125335693, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7185595035552979, + "num_tokens": 207985111.0, + "step": 8322 + }, + { + "epoch": 0.9140127388535032, + "grad_norm": 2.2538790702819824, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7294082641601562, + "num_tokens": 208009144.0, + "step": 8323 + }, + { + "epoch": 0.9141225565561168, + "grad_norm": 2.2672383785247803, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7342934608459473, + "num_tokens": 208031611.0, + "step": 8324 + }, + { + "epoch": 0.9142323742587305, + "grad_norm": 2.086167097091675, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7198879718780518, + "num_tokens": 208059782.0, + "step": 8325 + }, + { + "epoch": 0.9143421919613441, + "grad_norm": 2.1599907875061035, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6862605214118958, + "num_tokens": 208086992.0, + "step": 8326 + }, + { + "epoch": 0.9144520096639578, + "grad_norm": 2.090822219848633, + "learning_rate": 1e-06, + "loss": 1.0558, + "mean_token_accuracy": 0.6821601390838623, + "num_tokens": 208114489.0, + "step": 8327 + }, + { + "epoch": 0.9145618273665715, + "grad_norm": 2.727768659591675, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7175434827804565, + "num_tokens": 208132798.0, + "step": 8328 + }, + { + "epoch": 0.9146716450691852, + "grad_norm": 2.028402090072632, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6904119253158569, + "num_tokens": 208162246.0, + "step": 8329 + }, + { + "epoch": 0.9147814627717988, + "grad_norm": 2.0986742973327637, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7043193578720093, + "num_tokens": 208189343.0, + "step": 8330 + }, + { + "epoch": 0.9148912804744125, + "grad_norm": 2.2363083362579346, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 
0.7023917436599731, + "num_tokens": 208215429.0, + "step": 8331 + }, + { + "epoch": 0.9150010981770261, + "grad_norm": 1.9824939966201782, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6966831684112549, + "num_tokens": 208245118.0, + "step": 8332 + }, + { + "epoch": 0.9151109158796398, + "grad_norm": 1.929360032081604, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7133198976516724, + "num_tokens": 208274118.0, + "step": 8333 + }, + { + "epoch": 0.9152207335822534, + "grad_norm": 2.265995979309082, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7248425483703613, + "num_tokens": 208296716.0, + "step": 8334 + }, + { + "epoch": 0.9153305512848671, + "grad_norm": 1.909627914428711, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6848862171173096, + "num_tokens": 208329150.0, + "step": 8335 + }, + { + "epoch": 0.9154403689874808, + "grad_norm": 2.279940605163574, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.6978372931480408, + "num_tokens": 208352586.0, + "step": 8336 + }, + { + "epoch": 0.9155501866900945, + "grad_norm": 2.551265001296997, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7049777507781982, + "num_tokens": 208372358.0, + "step": 8337 + }, + { + "epoch": 0.9156600043927081, + "grad_norm": 1.9077492952346802, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7025972604751587, + "num_tokens": 208404016.0, + "step": 8338 + }, + { + "epoch": 0.9157698220953218, + "grad_norm": 2.4193193912506104, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.6980729103088379, + "num_tokens": 208425839.0, + "step": 8339 + }, + { + "epoch": 0.9158796397979354, + "grad_norm": 1.817529320716858, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6982032060623169, + "num_tokens": 208461648.0, + "step": 8340 + }, + { + "epoch": 0.915989457500549, + "grad_norm": 
2.019256591796875, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7056020498275757, + "num_tokens": 208491465.0, + "step": 8341 + }, + { + "epoch": 0.9160992752031627, + "grad_norm": 1.999680995941162, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.6949687004089355, + "num_tokens": 208519322.0, + "step": 8342 + }, + { + "epoch": 0.9162090929057765, + "grad_norm": 2.1222805976867676, + "learning_rate": 1e-06, + "loss": 1.0421, + "mean_token_accuracy": 0.6865564584732056, + "num_tokens": 208547456.0, + "step": 8343 + }, + { + "epoch": 0.9163189106083901, + "grad_norm": 2.033475160598755, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7043764591217041, + "num_tokens": 208576642.0, + "step": 8344 + }, + { + "epoch": 0.9164287283110037, + "grad_norm": 2.2088732719421387, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7007910013198853, + "num_tokens": 208600544.0, + "step": 8345 + }, + { + "epoch": 0.9165385460136174, + "grad_norm": 2.3076331615448, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7214940786361694, + "num_tokens": 208622522.0, + "step": 8346 + }, + { + "epoch": 0.916648363716231, + "grad_norm": 2.320565938949585, + "learning_rate": 1e-06, + "loss": 1.0028, + "mean_token_accuracy": 0.6904252171516418, + "num_tokens": 208647952.0, + "step": 8347 + }, + { + "epoch": 0.9167581814188447, + "grad_norm": 2.112576961517334, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7134154438972473, + "num_tokens": 208673416.0, + "step": 8348 + }, + { + "epoch": 0.9168679991214583, + "grad_norm": 2.140223503112793, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7137771844863892, + "num_tokens": 208700784.0, + "step": 8349 + }, + { + "epoch": 0.9169778168240721, + "grad_norm": 2.3587446212768555, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.707119345664978, + "num_tokens": 208722765.0, + 
"step": 8350 + }, + { + "epoch": 0.9170876345266857, + "grad_norm": 2.032855987548828, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7116607427597046, + "num_tokens": 208754143.0, + "step": 8351 + }, + { + "epoch": 0.9171974522292994, + "grad_norm": 2.4645066261291504, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6934683322906494, + "num_tokens": 208775005.0, + "step": 8352 + }, + { + "epoch": 0.917307269931913, + "grad_norm": 2.525430679321289, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7189610004425049, + "num_tokens": 208795462.0, + "step": 8353 + }, + { + "epoch": 0.9174170876345267, + "grad_norm": 2.405843734741211, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6945827007293701, + "num_tokens": 208817230.0, + "step": 8354 + }, + { + "epoch": 0.9175269053371403, + "grad_norm": 2.4693820476531982, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.74237060546875, + "num_tokens": 208836158.0, + "step": 8355 + }, + { + "epoch": 0.917636723039754, + "grad_norm": 2.1947402954101562, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7187240123748779, + "num_tokens": 208861605.0, + "step": 8356 + }, + { + "epoch": 0.9177465407423677, + "grad_norm": 1.9365962743759155, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.712755560874939, + "num_tokens": 208892054.0, + "step": 8357 + }, + { + "epoch": 0.9178563584449814, + "grad_norm": 2.351121425628662, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7195166349411011, + "num_tokens": 208913905.0, + "step": 8358 + }, + { + "epoch": 0.917966176147595, + "grad_norm": 1.9579275846481323, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6984047293663025, + "num_tokens": 208942868.0, + "step": 8359 + }, + { + "epoch": 0.9180759938502087, + "grad_norm": 2.5719969272613525, + "learning_rate": 1e-06, + "loss": 0.9369, + 
"mean_token_accuracy": 0.7183554172515869, + "num_tokens": 208963665.0, + "step": 8360 + }, + { + "epoch": 0.9181858115528223, + "grad_norm": 2.2250072956085205, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7070852518081665, + "num_tokens": 208989389.0, + "step": 8361 + }, + { + "epoch": 0.918295629255436, + "grad_norm": 2.3380024433135986, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7154537439346313, + "num_tokens": 209010530.0, + "step": 8362 + }, + { + "epoch": 0.9184054469580496, + "grad_norm": 2.4436004161834717, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7314329743385315, + "num_tokens": 209030035.0, + "step": 8363 + }, + { + "epoch": 0.9185152646606632, + "grad_norm": 2.4537839889526367, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7130974531173706, + "num_tokens": 209050184.0, + "step": 8364 + }, + { + "epoch": 0.918625082363277, + "grad_norm": 2.2673046588897705, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7087761163711548, + "num_tokens": 209071784.0, + "step": 8365 + }, + { + "epoch": 0.9187349000658906, + "grad_norm": 2.2360589504241943, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7118483781814575, + "num_tokens": 209095221.0, + "step": 8366 + }, + { + "epoch": 0.9188447177685043, + "grad_norm": 2.881566286087036, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7123211622238159, + "num_tokens": 209111548.0, + "step": 8367 + }, + { + "epoch": 0.9189545354711179, + "grad_norm": 2.0077638626098633, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.6986165046691895, + "num_tokens": 209139843.0, + "step": 8368 + }, + { + "epoch": 0.9190643531737316, + "grad_norm": 2.1840763092041016, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.732806921005249, + "num_tokens": 209163898.0, + "step": 8369 + }, + { + "epoch": 0.9191741708763452, + 
"grad_norm": 2.00777006149292, + "learning_rate": 1e-06, + "loss": 1.035, + "mean_token_accuracy": 0.6867341995239258, + "num_tokens": 209192149.0, + "step": 8370 + }, + { + "epoch": 0.9192839885789589, + "grad_norm": 2.3264126777648926, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7462676763534546, + "num_tokens": 209212576.0, + "step": 8371 + }, + { + "epoch": 0.9193938062815726, + "grad_norm": 2.1598470211029053, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7075449824333191, + "num_tokens": 209237245.0, + "step": 8372 + }, + { + "epoch": 0.9195036239841863, + "grad_norm": 2.1357333660125732, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7352085113525391, + "num_tokens": 209263490.0, + "step": 8373 + }, + { + "epoch": 0.9196134416867999, + "grad_norm": 2.482266426086426, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7099184393882751, + "num_tokens": 209283269.0, + "step": 8374 + }, + { + "epoch": 0.9197232593894136, + "grad_norm": 2.360236883163452, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7254325151443481, + "num_tokens": 209304825.0, + "step": 8375 + }, + { + "epoch": 0.9198330770920272, + "grad_norm": 1.8938392400741577, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6923636794090271, + "num_tokens": 209337424.0, + "step": 8376 + }, + { + "epoch": 0.9199428947946409, + "grad_norm": 2.4335525035858154, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6980624198913574, + "num_tokens": 209359653.0, + "step": 8377 + }, + { + "epoch": 0.9200527124972545, + "grad_norm": 1.9609535932540894, + "learning_rate": 1e-06, + "loss": 1.0392, + "mean_token_accuracy": 0.6892289519309998, + "num_tokens": 209392511.0, + "step": 8378 + }, + { + "epoch": 0.9201625301998683, + "grad_norm": 2.523480176925659, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7287585735321045, + 
"num_tokens": 209411991.0, + "step": 8379 + }, + { + "epoch": 0.9202723479024819, + "grad_norm": 2.104053497314453, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.694689154624939, + "num_tokens": 209437179.0, + "step": 8380 + }, + { + "epoch": 0.9203821656050956, + "grad_norm": 2.090386390686035, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7228279113769531, + "num_tokens": 209462595.0, + "step": 8381 + }, + { + "epoch": 0.9204919833077092, + "grad_norm": 2.3601911067962646, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7230029702186584, + "num_tokens": 209485904.0, + "step": 8382 + }, + { + "epoch": 0.9206018010103229, + "grad_norm": 2.4204065799713135, + "learning_rate": 1e-06, + "loss": 1.0053, + "mean_token_accuracy": 0.7039059996604919, + "num_tokens": 209508295.0, + "step": 8383 + }, + { + "epoch": 0.9207116187129365, + "grad_norm": 2.1525232791900635, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7222793102264404, + "num_tokens": 209532811.0, + "step": 8384 + }, + { + "epoch": 0.9208214364155501, + "grad_norm": 2.0788509845733643, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7309439778327942, + "num_tokens": 209559862.0, + "step": 8385 + }, + { + "epoch": 0.9209312541181639, + "grad_norm": 2.232649803161621, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7067742943763733, + "num_tokens": 209584376.0, + "step": 8386 + }, + { + "epoch": 0.9210410718207775, + "grad_norm": 2.1178417205810547, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7176403999328613, + "num_tokens": 209612071.0, + "step": 8387 + }, + { + "epoch": 0.9211508895233912, + "grad_norm": 2.458056688308716, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.6994388699531555, + "num_tokens": 209633312.0, + "step": 8388 + }, + { + "epoch": 0.9212607072260048, + "grad_norm": 1.9571229219436646, + 
"learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7371522188186646, + "num_tokens": 209661475.0, + "step": 8389 + }, + { + "epoch": 0.9213705249286185, + "grad_norm": 2.2527921199798584, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.726689338684082, + "num_tokens": 209685590.0, + "step": 8390 + }, + { + "epoch": 0.9214803426312321, + "grad_norm": 2.1951236724853516, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7050225734710693, + "num_tokens": 209710524.0, + "step": 8391 + }, + { + "epoch": 0.9215901603338458, + "grad_norm": 2.315370559692383, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7255086898803711, + "num_tokens": 209732242.0, + "step": 8392 + }, + { + "epoch": 0.9216999780364594, + "grad_norm": 2.1711227893829346, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7280197143554688, + "num_tokens": 209756491.0, + "step": 8393 + }, + { + "epoch": 0.9218097957390732, + "grad_norm": 2.1296801567077637, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7098695039749146, + "num_tokens": 209784742.0, + "step": 8394 + }, + { + "epoch": 0.9219196134416868, + "grad_norm": 2.270909309387207, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7236931324005127, + "num_tokens": 209806825.0, + "step": 8395 + }, + { + "epoch": 0.9220294311443005, + "grad_norm": 2.1607701778411865, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7098261117935181, + "num_tokens": 209834476.0, + "step": 8396 + }, + { + "epoch": 0.9221392488469141, + "grad_norm": 2.070133924484253, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7068845629692078, + "num_tokens": 209862704.0, + "step": 8397 + }, + { + "epoch": 0.9222490665495278, + "grad_norm": 2.3037731647491455, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7265948057174683, + "num_tokens": 209884275.0, + "step": 8398 
+ }, + { + "epoch": 0.9223588842521414, + "grad_norm": 2.2723124027252197, + "learning_rate": 1e-06, + "loss": 0.9999, + "mean_token_accuracy": 0.6951228380203247, + "num_tokens": 209909366.0, + "step": 8399 + }, + { + "epoch": 0.922468701954755, + "grad_norm": 1.8872613906860352, + "learning_rate": 1e-06, + "loss": 1.0175, + "mean_token_accuracy": 0.6939915418624878, + "num_tokens": 209941900.0, + "step": 8400 + }, + { + "epoch": 0.9225785196573688, + "grad_norm": 2.0378024578094482, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.6877199411392212, + "num_tokens": 209969745.0, + "step": 8401 + }, + { + "epoch": 0.9226883373599825, + "grad_norm": 2.069585084915161, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6997731328010559, + "num_tokens": 209999924.0, + "step": 8402 + }, + { + "epoch": 0.9227981550625961, + "grad_norm": 2.144656181335449, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7167800664901733, + "num_tokens": 210024153.0, + "step": 8403 + }, + { + "epoch": 0.9229079727652097, + "grad_norm": 2.097663640975952, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7026537656784058, + "num_tokens": 210051950.0, + "step": 8404 + }, + { + "epoch": 0.9230177904678234, + "grad_norm": 2.2776031494140625, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7032339572906494, + "num_tokens": 210076975.0, + "step": 8405 + }, + { + "epoch": 0.923127608170437, + "grad_norm": 2.103253126144409, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7193841934204102, + "num_tokens": 210103041.0, + "step": 8406 + }, + { + "epoch": 0.9232374258730507, + "grad_norm": 2.0677030086517334, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7034341096878052, + "num_tokens": 210128678.0, + "step": 8407 + }, + { + "epoch": 0.9233472435756644, + "grad_norm": 2.2745766639709473, + "learning_rate": 1e-06, + "loss": 0.9137, + 
"mean_token_accuracy": 0.7167828679084778, + "num_tokens": 210152574.0, + "step": 8408 + }, + { + "epoch": 0.9234570612782781, + "grad_norm": 2.3984339237213135, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.7352153062820435, + "num_tokens": 210173428.0, + "step": 8409 + }, + { + "epoch": 0.9235668789808917, + "grad_norm": 2.0429279804229736, + "learning_rate": 1e-06, + "loss": 0.9852, + "mean_token_accuracy": 0.6992712020874023, + "num_tokens": 210201664.0, + "step": 8410 + }, + { + "epoch": 0.9236766966835054, + "grad_norm": 2.098039388656616, + "learning_rate": 1e-06, + "loss": 1.0253, + "mean_token_accuracy": 0.6924817562103271, + "num_tokens": 210229840.0, + "step": 8411 + }, + { + "epoch": 0.923786514386119, + "grad_norm": 2.120570659637451, + "learning_rate": 1e-06, + "loss": 1.0495, + "mean_token_accuracy": 0.6828442811965942, + "num_tokens": 210260510.0, + "step": 8412 + }, + { + "epoch": 0.9238963320887327, + "grad_norm": 2.3092520236968994, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7250213623046875, + "num_tokens": 210283034.0, + "step": 8413 + }, + { + "epoch": 0.9240061497913463, + "grad_norm": 2.022315263748169, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7148732542991638, + "num_tokens": 210311616.0, + "step": 8414 + }, + { + "epoch": 0.9241159674939601, + "grad_norm": 1.9222965240478516, + "learning_rate": 1e-06, + "loss": 1.0597, + "mean_token_accuracy": 0.6789133548736572, + "num_tokens": 210344108.0, + "step": 8415 + }, + { + "epoch": 0.9242257851965737, + "grad_norm": 1.9219417572021484, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7062028646469116, + "num_tokens": 210375780.0, + "step": 8416 + }, + { + "epoch": 0.9243356028991874, + "grad_norm": 2.40297269821167, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.723560094833374, + "num_tokens": 210397112.0, + "step": 8417 + }, + { + "epoch": 0.924445420601801, + 
"grad_norm": 2.2084457874298096, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6856486201286316, + "num_tokens": 210423018.0, + "step": 8418 + }, + { + "epoch": 0.9245552383044147, + "grad_norm": 2.3637022972106934, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7137746214866638, + "num_tokens": 210444739.0, + "step": 8419 + }, + { + "epoch": 0.9246650560070283, + "grad_norm": 2.370502471923828, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.704484224319458, + "num_tokens": 210466677.0, + "step": 8420 + }, + { + "epoch": 0.924774873709642, + "grad_norm": 2.7513604164123535, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7181878089904785, + "num_tokens": 210484538.0, + "step": 8421 + }, + { + "epoch": 0.9248846914122557, + "grad_norm": 2.208204984664917, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7064231038093567, + "num_tokens": 210509675.0, + "step": 8422 + }, + { + "epoch": 0.9249945091148694, + "grad_norm": 2.128906011581421, + "learning_rate": 1e-06, + "loss": 1.1097, + "mean_token_accuracy": 0.6840339303016663, + "num_tokens": 210538925.0, + "step": 8423 + }, + { + "epoch": 0.925104326817483, + "grad_norm": 2.073322057723999, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6909443140029907, + "num_tokens": 210570239.0, + "step": 8424 + }, + { + "epoch": 0.9252141445200966, + "grad_norm": 2.38179874420166, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7046414017677307, + "num_tokens": 210592233.0, + "step": 8425 + }, + { + "epoch": 0.9253239622227103, + "grad_norm": 2.2491376399993896, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7032374143600464, + "num_tokens": 210615714.0, + "step": 8426 + }, + { + "epoch": 0.9254337799253239, + "grad_norm": 2.1315739154815674, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7032934427261353, + "num_tokens": 
210640705.0, + "step": 8427 + }, + { + "epoch": 0.9255435976279376, + "grad_norm": 1.984837293624878, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7095662355422974, + "num_tokens": 210669786.0, + "step": 8428 + }, + { + "epoch": 0.9256534153305512, + "grad_norm": 2.280111312866211, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.7048472762107849, + "num_tokens": 210693203.0, + "step": 8429 + }, + { + "epoch": 0.925763233033165, + "grad_norm": 2.05313777923584, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7070924043655396, + "num_tokens": 210721858.0, + "step": 8430 + }, + { + "epoch": 0.9258730507357786, + "grad_norm": 2.12947416305542, + "learning_rate": 1e-06, + "loss": 0.9963, + "mean_token_accuracy": 0.6995210647583008, + "num_tokens": 210749014.0, + "step": 8431 + }, + { + "epoch": 0.9259828684383923, + "grad_norm": 2.0053672790527344, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6945285797119141, + "num_tokens": 210781998.0, + "step": 8432 + }, + { + "epoch": 0.9260926861410059, + "grad_norm": 2.2687294483184814, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.722038745880127, + "num_tokens": 210804881.0, + "step": 8433 + }, + { + "epoch": 0.9262025038436196, + "grad_norm": 2.0535802841186523, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7238192558288574, + "num_tokens": 210831797.0, + "step": 8434 + }, + { + "epoch": 0.9263123215462332, + "grad_norm": 2.0684847831726074, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7036806344985962, + "num_tokens": 210859822.0, + "step": 8435 + }, + { + "epoch": 0.9264221392488469, + "grad_norm": 2.1429860591888428, + "learning_rate": 1e-06, + "loss": 1.0024, + "mean_token_accuracy": 0.6910321712493896, + "num_tokens": 210885107.0, + "step": 8436 + }, + { + "epoch": 0.9265319569514606, + "grad_norm": 2.1873931884765625, + "learning_rate": 1e-06, + 
"loss": 0.958, + "mean_token_accuracy": 0.7103453278541565, + "num_tokens": 210911193.0, + "step": 8437 + }, + { + "epoch": 0.9266417746540743, + "grad_norm": 1.8188459873199463, + "learning_rate": 1e-06, + "loss": 0.9823, + "mean_token_accuracy": 0.6933633089065552, + "num_tokens": 210947248.0, + "step": 8438 + }, + { + "epoch": 0.9267515923566879, + "grad_norm": 2.2143237590789795, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.6960105895996094, + "num_tokens": 210971526.0, + "step": 8439 + }, + { + "epoch": 0.9268614100593016, + "grad_norm": 2.3969759941101074, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7424360513687134, + "num_tokens": 210992572.0, + "step": 8440 + }, + { + "epoch": 0.9269712277619152, + "grad_norm": 2.0570528507232666, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.6888048648834229, + "num_tokens": 211021221.0, + "step": 8441 + }, + { + "epoch": 0.9270810454645289, + "grad_norm": 2.411928653717041, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7070356011390686, + "num_tokens": 211042717.0, + "step": 8442 + }, + { + "epoch": 0.9271908631671425, + "grad_norm": 2.0909347534179688, + "learning_rate": 1e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.729049563407898, + "num_tokens": 211065912.0, + "step": 8443 + }, + { + "epoch": 0.9273006808697563, + "grad_norm": 2.572965145111084, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6905364394187927, + "num_tokens": 211094665.0, + "step": 8444 + }, + { + "epoch": 0.9274104985723699, + "grad_norm": 1.8944810628890991, + "learning_rate": 1e-06, + "loss": 0.9961, + "mean_token_accuracy": 0.6887056827545166, + "num_tokens": 211126109.0, + "step": 8445 + }, + { + "epoch": 0.9275203162749835, + "grad_norm": 2.688110113143921, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7130945920944214, + "num_tokens": 211143413.0, + "step": 8446 + }, + { + "epoch": 
0.9276301339775972, + "grad_norm": 2.214418411254883, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7006993293762207, + "num_tokens": 211171193.0, + "step": 8447 + }, + { + "epoch": 0.9277399516802108, + "grad_norm": 2.6290781497955322, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7285785675048828, + "num_tokens": 211188325.0, + "step": 8448 + }, + { + "epoch": 0.9278497693828245, + "grad_norm": 2.01212477684021, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6891940832138062, + "num_tokens": 211217377.0, + "step": 8449 + }, + { + "epoch": 0.9279595870854381, + "grad_norm": 2.239137887954712, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7269380688667297, + "num_tokens": 211241280.0, + "step": 8450 + }, + { + "epoch": 0.9280694047880519, + "grad_norm": 2.2527921199798584, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6870148777961731, + "num_tokens": 211265831.0, + "step": 8451 + }, + { + "epoch": 0.9281792224906655, + "grad_norm": 2.057948112487793, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7027592658996582, + "num_tokens": 211294121.0, + "step": 8452 + }, + { + "epoch": 0.9282890401932792, + "grad_norm": 2.1251087188720703, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7076330184936523, + "num_tokens": 211319108.0, + "step": 8453 + }, + { + "epoch": 0.9283988578958928, + "grad_norm": 2.350571870803833, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7066881656646729, + "num_tokens": 211341448.0, + "step": 8454 + }, + { + "epoch": 0.9285086755985065, + "grad_norm": 2.072036027908325, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.7003263235092163, + "num_tokens": 211368152.0, + "step": 8455 + }, + { + "epoch": 0.9286184933011201, + "grad_norm": 1.7408952713012695, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 
0.691609263420105, + "num_tokens": 211406087.0, + "step": 8456 + }, + { + "epoch": 0.9287283110037338, + "grad_norm": 2.0866713523864746, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7099402546882629, + "num_tokens": 211433674.0, + "step": 8457 + }, + { + "epoch": 0.9288381287063474, + "grad_norm": 2.4470596313476562, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7155517935752869, + "num_tokens": 211455353.0, + "step": 8458 + }, + { + "epoch": 0.9289479464089612, + "grad_norm": 1.9558590650558472, + "learning_rate": 1e-06, + "loss": 1.0605, + "mean_token_accuracy": 0.6777836084365845, + "num_tokens": 211486024.0, + "step": 8459 + }, + { + "epoch": 0.9290577641115748, + "grad_norm": 2.2320010662078857, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7194218635559082, + "num_tokens": 211510206.0, + "step": 8460 + }, + { + "epoch": 0.9291675818141885, + "grad_norm": 1.9888626337051392, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7214577794075012, + "num_tokens": 211538568.0, + "step": 8461 + }, + { + "epoch": 0.9292773995168021, + "grad_norm": 2.084171772003174, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7213276624679565, + "num_tokens": 211567321.0, + "step": 8462 + }, + { + "epoch": 0.9293872172194158, + "grad_norm": 2.432547092437744, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7234950065612793, + "num_tokens": 211588686.0, + "step": 8463 + }, + { + "epoch": 0.9294970349220294, + "grad_norm": 2.572664260864258, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7019127011299133, + "num_tokens": 211608438.0, + "step": 8464 + }, + { + "epoch": 0.929606852624643, + "grad_norm": 1.8679990768432617, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7261826992034912, + "num_tokens": 211642540.0, + "step": 8465 + }, + { + "epoch": 0.9297166703272568, + "grad_norm": 
2.044800043106079, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7082185745239258, + "num_tokens": 211672496.0, + "step": 8466 + }, + { + "epoch": 0.9298264880298704, + "grad_norm": 2.1598141193389893, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7151979207992554, + "num_tokens": 211696832.0, + "step": 8467 + }, + { + "epoch": 0.9299363057324841, + "grad_norm": 2.2951252460479736, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7011955380439758, + "num_tokens": 211719106.0, + "step": 8468 + }, + { + "epoch": 0.9300461234350977, + "grad_norm": 2.2332146167755127, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7202109098434448, + "num_tokens": 211742435.0, + "step": 8469 + }, + { + "epoch": 0.9301559411377114, + "grad_norm": 2.1927740573883057, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7036603689193726, + "num_tokens": 211768621.0, + "step": 8470 + }, + { + "epoch": 0.930265758840325, + "grad_norm": 2.072279214859009, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7141261100769043, + "num_tokens": 211795528.0, + "step": 8471 + }, + { + "epoch": 0.9303755765429387, + "grad_norm": 2.2886219024658203, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.704472541809082, + "num_tokens": 211820008.0, + "step": 8472 + }, + { + "epoch": 0.9304853942455524, + "grad_norm": 2.111419677734375, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.6968562006950378, + "num_tokens": 211847161.0, + "step": 8473 + }, + { + "epoch": 0.9305952119481661, + "grad_norm": 2.5345139503479004, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7271019816398621, + "num_tokens": 211865297.0, + "step": 8474 + }, + { + "epoch": 0.9307050296507797, + "grad_norm": 2.1346004009246826, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7105329632759094, + "num_tokens": 
211890574.0, + "step": 8475 + }, + { + "epoch": 0.9308148473533934, + "grad_norm": 2.579524278640747, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7216969728469849, + "num_tokens": 211908426.0, + "step": 8476 + }, + { + "epoch": 0.930924665056007, + "grad_norm": 2.6052112579345703, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7076618671417236, + "num_tokens": 211927089.0, + "step": 8477 + }, + { + "epoch": 0.9310344827586207, + "grad_norm": 2.6510438919067383, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.717652440071106, + "num_tokens": 211944944.0, + "step": 8478 + }, + { + "epoch": 0.9311443004612343, + "grad_norm": 2.306420087814331, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7033466696739197, + "num_tokens": 211969304.0, + "step": 8479 + }, + { + "epoch": 0.9312541181638481, + "grad_norm": 2.6763875484466553, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7180903553962708, + "num_tokens": 211988254.0, + "step": 8480 + }, + { + "epoch": 0.9313639358664617, + "grad_norm": 2.205078125, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6954056024551392, + "num_tokens": 212014937.0, + "step": 8481 + }, + { + "epoch": 0.9314737535690754, + "grad_norm": 2.329686403274536, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7023403644561768, + "num_tokens": 212041829.0, + "step": 8482 + }, + { + "epoch": 0.931583571271689, + "grad_norm": 2.400012969970703, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6974093914031982, + "num_tokens": 212063509.0, + "step": 8483 + }, + { + "epoch": 0.9316933889743026, + "grad_norm": 2.002856969833374, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7123022079467773, + "num_tokens": 212091259.0, + "step": 8484 + }, + { + "epoch": 0.9318032066769163, + "grad_norm": 2.063488721847534, + "learning_rate": 1e-06, + "loss": 
0.9448, + "mean_token_accuracy": 0.7130162715911865, + "num_tokens": 212118269.0, + "step": 8485 + }, + { + "epoch": 0.9319130243795299, + "grad_norm": 2.167971134185791, + "learning_rate": 1e-06, + "loss": 1.0139, + "mean_token_accuracy": 0.6887103319168091, + "num_tokens": 212144073.0, + "step": 8486 + }, + { + "epoch": 0.9320228420821436, + "grad_norm": 2.117704153060913, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6994365453720093, + "num_tokens": 212171195.0, + "step": 8487 + }, + { + "epoch": 0.9321326597847573, + "grad_norm": 2.136674165725708, + "learning_rate": 1e-06, + "loss": 1.0061, + "mean_token_accuracy": 0.6987112760543823, + "num_tokens": 212196835.0, + "step": 8488 + }, + { + "epoch": 0.932242477487371, + "grad_norm": 2.2726619243621826, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7032094597816467, + "num_tokens": 212219739.0, + "step": 8489 + }, + { + "epoch": 0.9323522951899846, + "grad_norm": 2.552377700805664, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.701702892780304, + "num_tokens": 212239272.0, + "step": 8490 + }, + { + "epoch": 0.9324621128925983, + "grad_norm": 2.097031831741333, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7091734409332275, + "num_tokens": 212263563.0, + "step": 8491 + }, + { + "epoch": 0.9325719305952119, + "grad_norm": 2.3834354877471924, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7082603573799133, + "num_tokens": 212285596.0, + "step": 8492 + }, + { + "epoch": 0.9326817482978256, + "grad_norm": 2.0455687046051025, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7119777202606201, + "num_tokens": 212313082.0, + "step": 8493 + }, + { + "epoch": 0.9327915660004392, + "grad_norm": 2.115861654281616, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.6972016096115112, + "num_tokens": 212340153.0, + "step": 8494 + }, + { + "epoch": 0.932901383703053, 
+ "grad_norm": 2.025778293609619, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7317994832992554, + "num_tokens": 212368030.0, + "step": 8495 + }, + { + "epoch": 0.9330112014056666, + "grad_norm": 2.390225887298584, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6991947889328003, + "num_tokens": 212390736.0, + "step": 8496 + }, + { + "epoch": 0.9331210191082803, + "grad_norm": 2.1574084758758545, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7194184064865112, + "num_tokens": 212415793.0, + "step": 8497 + }, + { + "epoch": 0.9332308368108939, + "grad_norm": 2.3394341468811035, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7283061742782593, + "num_tokens": 212438147.0, + "step": 8498 + }, + { + "epoch": 0.9333406545135076, + "grad_norm": 2.3084638118743896, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7234302759170532, + "num_tokens": 212460692.0, + "step": 8499 + }, + { + "epoch": 0.9334504722161212, + "grad_norm": 2.2495901584625244, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7044781446456909, + "num_tokens": 212486102.0, + "step": 8500 + }, + { + "epoch": 0.9335602899187349, + "grad_norm": 1.933275818824768, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6969097852706909, + "num_tokens": 212516504.0, + "step": 8501 + }, + { + "epoch": 0.9336701076213486, + "grad_norm": 2.2386908531188965, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.7013221979141235, + "num_tokens": 212539662.0, + "step": 8502 + }, + { + "epoch": 0.9337799253239623, + "grad_norm": 2.086406707763672, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.71857750415802, + "num_tokens": 212566195.0, + "step": 8503 + }, + { + "epoch": 0.9338897430265759, + "grad_norm": 2.220087766647339, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.7046952247619629, + 
"num_tokens": 212592401.0, + "step": 8504 + }, + { + "epoch": 0.9339995607291895, + "grad_norm": 2.2362475395202637, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7063690423965454, + "num_tokens": 212616077.0, + "step": 8505 + }, + { + "epoch": 0.9341093784318032, + "grad_norm": 2.36187744140625, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.7066726088523865, + "num_tokens": 212638518.0, + "step": 8506 + }, + { + "epoch": 0.9342191961344168, + "grad_norm": 2.122551679611206, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7122403383255005, + "num_tokens": 212664853.0, + "step": 8507 + }, + { + "epoch": 0.9343290138370305, + "grad_norm": 2.5880179405212402, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7083425521850586, + "num_tokens": 212684128.0, + "step": 8508 + }, + { + "epoch": 0.9344388315396442, + "grad_norm": 2.381831407546997, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7251383066177368, + "num_tokens": 212706584.0, + "step": 8509 + }, + { + "epoch": 0.9345486492422579, + "grad_norm": 2.47108793258667, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7136480808258057, + "num_tokens": 212726555.0, + "step": 8510 + }, + { + "epoch": 0.9346584669448715, + "grad_norm": 2.3479156494140625, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7146333456039429, + "num_tokens": 212747450.0, + "step": 8511 + }, + { + "epoch": 0.9347682846474852, + "grad_norm": 2.250115394592285, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7089263200759888, + "num_tokens": 212771109.0, + "step": 8512 + }, + { + "epoch": 0.9348781023500988, + "grad_norm": 2.1925535202026367, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.6969326138496399, + "num_tokens": 212794517.0, + "step": 8513 + }, + { + "epoch": 0.9349879200527125, + "grad_norm": 2.080667734146118, + "learning_rate": 
1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7037646174430847, + "num_tokens": 212823105.0, + "step": 8514 + }, + { + "epoch": 0.9350977377553261, + "grad_norm": 2.2384393215179443, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7255017161369324, + "num_tokens": 212843600.0, + "step": 8515 + }, + { + "epoch": 0.9352075554579398, + "grad_norm": 1.9209667444229126, + "learning_rate": 1e-06, + "loss": 1.0364, + "mean_token_accuracy": 0.6886464357376099, + "num_tokens": 212875699.0, + "step": 8516 + }, + { + "epoch": 0.9353173731605535, + "grad_norm": 2.1368188858032227, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.704391360282898, + "num_tokens": 212901464.0, + "step": 8517 + }, + { + "epoch": 0.9354271908631672, + "grad_norm": 2.306896209716797, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7114435434341431, + "num_tokens": 212924988.0, + "step": 8518 + }, + { + "epoch": 0.9355370085657808, + "grad_norm": 2.3373372554779053, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7129299640655518, + "num_tokens": 212948745.0, + "step": 8519 + }, + { + "epoch": 0.9356468262683945, + "grad_norm": 2.19494366645813, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7059195637702942, + "num_tokens": 212973601.0, + "step": 8520 + }, + { + "epoch": 0.9357566439710081, + "grad_norm": 2.41465425491333, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7151426076889038, + "num_tokens": 212993924.0, + "step": 8521 + }, + { + "epoch": 0.9358664616736218, + "grad_norm": 2.230593204498291, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7123111486434937, + "num_tokens": 213018518.0, + "step": 8522 + }, + { + "epoch": 0.9359762793762354, + "grad_norm": 2.057727813720703, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7024310827255249, + "num_tokens": 213046477.0, + "step": 8523 + }, + { + "epoch": 
0.9360860970788492, + "grad_norm": 2.555561065673828, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7236088514328003, + "num_tokens": 213063607.0, + "step": 8524 + }, + { + "epoch": 0.9361959147814628, + "grad_norm": 1.9050047397613525, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7212294936180115, + "num_tokens": 213093968.0, + "step": 8525 + }, + { + "epoch": 0.9363057324840764, + "grad_norm": 2.0836668014526367, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7084099650382996, + "num_tokens": 213120331.0, + "step": 8526 + }, + { + "epoch": 0.9364155501866901, + "grad_norm": 2.1462366580963135, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7261017560958862, + "num_tokens": 213142602.0, + "step": 8527 + }, + { + "epoch": 0.9365253678893037, + "grad_norm": 2.2380049228668213, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7370308637619019, + "num_tokens": 213165188.0, + "step": 8528 + }, + { + "epoch": 0.9366351855919174, + "grad_norm": 2.339790105819702, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7391327619552612, + "num_tokens": 213185035.0, + "step": 8529 + }, + { + "epoch": 0.936745003294531, + "grad_norm": 2.481199264526367, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7185724377632141, + "num_tokens": 213205431.0, + "step": 8530 + }, + { + "epoch": 0.9368548209971448, + "grad_norm": 2.1632766723632812, + "learning_rate": 1e-06, + "loss": 1.0229, + "mean_token_accuracy": 0.6900948882102966, + "num_tokens": 213234018.0, + "step": 8531 + }, + { + "epoch": 0.9369646386997584, + "grad_norm": 2.079923152923584, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7010687589645386, + "num_tokens": 213262067.0, + "step": 8532 + }, + { + "epoch": 0.9370744564023721, + "grad_norm": 2.5761406421661377, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 
0.7101219296455383, + "num_tokens": 213280732.0, + "step": 8533 + }, + { + "epoch": 0.9371842741049857, + "grad_norm": 2.303133249282837, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.7078315019607544, + "num_tokens": 213303575.0, + "step": 8534 + }, + { + "epoch": 0.9372940918075994, + "grad_norm": 2.3432517051696777, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7064220905303955, + "num_tokens": 213326651.0, + "step": 8535 + }, + { + "epoch": 0.937403909510213, + "grad_norm": 2.032884120941162, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7192251682281494, + "num_tokens": 213353823.0, + "step": 8536 + }, + { + "epoch": 0.9375137272128267, + "grad_norm": 1.9944647550582886, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6943718194961548, + "num_tokens": 213382575.0, + "step": 8537 + }, + { + "epoch": 0.9376235449154404, + "grad_norm": 2.0468358993530273, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7032701969146729, + "num_tokens": 213411211.0, + "step": 8538 + }, + { + "epoch": 0.9377333626180541, + "grad_norm": 2.204331398010254, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7035402059555054, + "num_tokens": 213436379.0, + "step": 8539 + }, + { + "epoch": 0.9378431803206677, + "grad_norm": 2.0659711360931396, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7166223526000977, + "num_tokens": 213462468.0, + "step": 8540 + }, + { + "epoch": 0.9379529980232814, + "grad_norm": 2.228386640548706, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7137556076049805, + "num_tokens": 213486398.0, + "step": 8541 + }, + { + "epoch": 0.938062815725895, + "grad_norm": 2.3944616317749023, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7153677940368652, + "num_tokens": 213509337.0, + "step": 8542 + }, + { + "epoch": 0.9381726334285087, + "grad_norm": 
2.4882776737213135, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7219939827919006, + "num_tokens": 213529766.0, + "step": 8543 + }, + { + "epoch": 0.9382824511311223, + "grad_norm": 2.0444483757019043, + "learning_rate": 1e-06, + "loss": 1.0138, + "mean_token_accuracy": 0.7013417482376099, + "num_tokens": 213559041.0, + "step": 8544 + }, + { + "epoch": 0.9383922688337359, + "grad_norm": 2.094630718231201, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7108920812606812, + "num_tokens": 213584733.0, + "step": 8545 + }, + { + "epoch": 0.9385020865363497, + "grad_norm": 2.176424980163574, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7146356105804443, + "num_tokens": 213609683.0, + "step": 8546 + }, + { + "epoch": 0.9386119042389633, + "grad_norm": 1.9868981838226318, + "learning_rate": 1e-06, + "loss": 1.001, + "mean_token_accuracy": 0.6953681707382202, + "num_tokens": 213638574.0, + "step": 8547 + }, + { + "epoch": 0.938721721941577, + "grad_norm": 2.2781317234039307, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7037389278411865, + "num_tokens": 213663481.0, + "step": 8548 + }, + { + "epoch": 0.9388315396441906, + "grad_norm": 2.3249881267547607, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7118293046951294, + "num_tokens": 213686438.0, + "step": 8549 + }, + { + "epoch": 0.9389413573468043, + "grad_norm": 2.1863715648651123, + "learning_rate": 1e-06, + "loss": 1.0238, + "mean_token_accuracy": 0.6917338967323303, + "num_tokens": 213712629.0, + "step": 8550 + }, + { + "epoch": 0.9390511750494179, + "grad_norm": 2.113849401473999, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.733701765537262, + "num_tokens": 213738682.0, + "step": 8551 + }, + { + "epoch": 0.9391609927520316, + "grad_norm": 3.1332738399505615, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7319427132606506, + "num_tokens": 
213759453.0, + "step": 8552 + }, + { + "epoch": 0.9392708104546453, + "grad_norm": 2.6221234798431396, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.700515627861023, + "num_tokens": 213779910.0, + "step": 8553 + }, + { + "epoch": 0.939380628157259, + "grad_norm": 2.7290592193603516, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6983827352523804, + "num_tokens": 213797903.0, + "step": 8554 + }, + { + "epoch": 0.9394904458598726, + "grad_norm": 2.2023212909698486, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7074591517448425, + "num_tokens": 213823687.0, + "step": 8555 + }, + { + "epoch": 0.9396002635624863, + "grad_norm": 2.294912576675415, + "learning_rate": 1e-06, + "loss": 1.0424, + "mean_token_accuracy": 0.691020667552948, + "num_tokens": 213849256.0, + "step": 8556 + }, + { + "epoch": 0.9397100812650999, + "grad_norm": 2.0950586795806885, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.709263265132904, + "num_tokens": 213877845.0, + "step": 8557 + }, + { + "epoch": 0.9398198989677136, + "grad_norm": 2.1505324840545654, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.690289318561554, + "num_tokens": 213903238.0, + "step": 8558 + }, + { + "epoch": 0.9399297166703272, + "grad_norm": 2.6872611045837402, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7259680032730103, + "num_tokens": 213922378.0, + "step": 8559 + }, + { + "epoch": 0.940039534372941, + "grad_norm": 2.249008893966675, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.7022358775138855, + "num_tokens": 213946364.0, + "step": 8560 + }, + { + "epoch": 0.9401493520755546, + "grad_norm": 2.139813184738159, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7176370620727539, + "num_tokens": 213971997.0, + "step": 8561 + }, + { + "epoch": 0.9402591697781683, + "grad_norm": 2.0461668968200684, + "learning_rate": 1e-06, + 
"loss": 0.988, + "mean_token_accuracy": 0.6969504356384277, + "num_tokens": 214001868.0, + "step": 8562 + }, + { + "epoch": 0.9403689874807819, + "grad_norm": 2.3661513328552246, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7248110771179199, + "num_tokens": 214022985.0, + "step": 8563 + }, + { + "epoch": 0.9404788051833955, + "grad_norm": 2.029001235961914, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7086790204048157, + "num_tokens": 214050564.0, + "step": 8564 + }, + { + "epoch": 0.9405886228860092, + "grad_norm": 2.0136213302612305, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7106190919876099, + "num_tokens": 214080601.0, + "step": 8565 + }, + { + "epoch": 0.9406984405886228, + "grad_norm": 2.3125264644622803, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7249820828437805, + "num_tokens": 214102551.0, + "step": 8566 + }, + { + "epoch": 0.9408082582912366, + "grad_norm": 2.2334351539611816, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.723143458366394, + "num_tokens": 214125789.0, + "step": 8567 + }, + { + "epoch": 0.9409180759938502, + "grad_norm": 1.9972481727600098, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7062743306159973, + "num_tokens": 214154775.0, + "step": 8568 + }, + { + "epoch": 0.9410278936964639, + "grad_norm": 2.136455535888672, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.6988186836242676, + "num_tokens": 214182135.0, + "step": 8569 + }, + { + "epoch": 0.9411377113990775, + "grad_norm": 2.3900768756866455, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7227323055267334, + "num_tokens": 214203288.0, + "step": 8570 + }, + { + "epoch": 0.9412475291016912, + "grad_norm": 2.085585117340088, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7254148721694946, + "num_tokens": 214230072.0, + "step": 8571 + }, + { + "epoch": 
0.9413573468043048, + "grad_norm": 2.3276965618133545, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6962308883666992, + "num_tokens": 214254067.0, + "step": 8572 + }, + { + "epoch": 0.9414671645069185, + "grad_norm": 2.239314556121826, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.73167484998703, + "num_tokens": 214275658.0, + "step": 8573 + }, + { + "epoch": 0.9415769822095322, + "grad_norm": 2.294278860092163, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7030305862426758, + "num_tokens": 214300562.0, + "step": 8574 + }, + { + "epoch": 0.9416867999121459, + "grad_norm": 2.068324565887451, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.6993822455406189, + "num_tokens": 214328460.0, + "step": 8575 + }, + { + "epoch": 0.9417966176147595, + "grad_norm": 2.08248233795166, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7082159519195557, + "num_tokens": 214357485.0, + "step": 8576 + }, + { + "epoch": 0.9419064353173732, + "grad_norm": 2.1064226627349854, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7042796015739441, + "num_tokens": 214384426.0, + "step": 8577 + }, + { + "epoch": 0.9420162530199868, + "grad_norm": 2.479484796524048, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.719135046005249, + "num_tokens": 214404037.0, + "step": 8578 + }, + { + "epoch": 0.9421260707226005, + "grad_norm": 2.1667399406433105, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7244440317153931, + "num_tokens": 214430667.0, + "step": 8579 + }, + { + "epoch": 0.9422358884252141, + "grad_norm": 2.3884947299957275, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7131128311157227, + "num_tokens": 214453074.0, + "step": 8580 + }, + { + "epoch": 0.9423457061278278, + "grad_norm": 2.175132989883423, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 
0.7121391892433167, + "num_tokens": 214478440.0, + "step": 8581 + }, + { + "epoch": 0.9424555238304415, + "grad_norm": 2.1767711639404297, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6838060021400452, + "num_tokens": 214504195.0, + "step": 8582 + }, + { + "epoch": 0.9425653415330552, + "grad_norm": 2.657764434814453, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7153980731964111, + "num_tokens": 214521657.0, + "step": 8583 + }, + { + "epoch": 0.9426751592356688, + "grad_norm": 2.0370163917541504, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6906406283378601, + "num_tokens": 214550423.0, + "step": 8584 + }, + { + "epoch": 0.9427849769382824, + "grad_norm": 2.3359131813049316, + "learning_rate": 1e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7508699893951416, + "num_tokens": 214571122.0, + "step": 8585 + }, + { + "epoch": 0.9428947946408961, + "grad_norm": 1.930804967880249, + "learning_rate": 1e-06, + "loss": 1.0013, + "mean_token_accuracy": 0.6994220614433289, + "num_tokens": 214603633.0, + "step": 8586 + }, + { + "epoch": 0.9430046123435097, + "grad_norm": 2.2153632640838623, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7047916650772095, + "num_tokens": 214626343.0, + "step": 8587 + }, + { + "epoch": 0.9431144300461234, + "grad_norm": 2.380789279937744, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7200771570205688, + "num_tokens": 214648808.0, + "step": 8588 + }, + { + "epoch": 0.9432242477487371, + "grad_norm": 2.4861743450164795, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7387024760246277, + "num_tokens": 214667552.0, + "step": 8589 + }, + { + "epoch": 0.9433340654513508, + "grad_norm": 2.3937978744506836, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7188941240310669, + "num_tokens": 214689562.0, + "step": 8590 + }, + { + "epoch": 0.9434438831539644, + "grad_norm": 
2.219482183456421, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7017011642456055, + "num_tokens": 214713272.0, + "step": 8591 + }, + { + "epoch": 0.9435537008565781, + "grad_norm": 2.224540948867798, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7054059505462646, + "num_tokens": 214736672.0, + "step": 8592 + }, + { + "epoch": 0.9436635185591917, + "grad_norm": 2.1736013889312744, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7304165363311768, + "num_tokens": 214760134.0, + "step": 8593 + }, + { + "epoch": 0.9437733362618054, + "grad_norm": 1.869133710861206, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7027591466903687, + "num_tokens": 214792068.0, + "step": 8594 + }, + { + "epoch": 0.943883153964419, + "grad_norm": 2.0453033447265625, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.6909075379371643, + "num_tokens": 214819356.0, + "step": 8595 + }, + { + "epoch": 0.9439929716670328, + "grad_norm": 2.223172903060913, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7148270606994629, + "num_tokens": 214843904.0, + "step": 8596 + }, + { + "epoch": 0.9441027893696464, + "grad_norm": 1.797802209854126, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7232335805892944, + "num_tokens": 214875227.0, + "step": 8597 + }, + { + "epoch": 0.9442126070722601, + "grad_norm": 2.221459150314331, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7310232520103455, + "num_tokens": 214901068.0, + "step": 8598 + }, + { + "epoch": 0.9443224247748737, + "grad_norm": 2.2540273666381836, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7012054920196533, + "num_tokens": 214924114.0, + "step": 8599 + }, + { + "epoch": 0.9444322424774874, + "grad_norm": 2.199666976928711, + "learning_rate": 1e-06, + "loss": 1.0118, + "mean_token_accuracy": 0.6964791417121887, + "num_tokens": 
214951056.0, + "step": 8600 + }, + { + "epoch": 0.944542060180101, + "grad_norm": 2.2957377433776855, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6999596357345581, + "num_tokens": 214973856.0, + "step": 8601 + }, + { + "epoch": 0.9446518778827147, + "grad_norm": 2.122814178466797, + "learning_rate": 1e-06, + "loss": 0.99, + "mean_token_accuracy": 0.7099567651748657, + "num_tokens": 215000807.0, + "step": 8602 + }, + { + "epoch": 0.9447616955853284, + "grad_norm": 2.3047945499420166, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7211560010910034, + "num_tokens": 215024153.0, + "step": 8603 + }, + { + "epoch": 0.944871513287942, + "grad_norm": 2.2101409435272217, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.722541868686676, + "num_tokens": 215048143.0, + "step": 8604 + }, + { + "epoch": 0.9449813309905557, + "grad_norm": 2.0617589950561523, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.698367714881897, + "num_tokens": 215074746.0, + "step": 8605 + }, + { + "epoch": 0.9450911486931693, + "grad_norm": 2.1308958530426025, + "learning_rate": 1e-06, + "loss": 1.0274, + "mean_token_accuracy": 0.6918892860412598, + "num_tokens": 215102969.0, + "step": 8606 + }, + { + "epoch": 0.945200966395783, + "grad_norm": 2.1995153427124023, + "learning_rate": 1e-06, + "loss": 1.0505, + "mean_token_accuracy": 0.6825507283210754, + "num_tokens": 215130793.0, + "step": 8607 + }, + { + "epoch": 0.9453107840983966, + "grad_norm": 1.7818653583526611, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7013217806816101, + "num_tokens": 215164120.0, + "step": 8608 + }, + { + "epoch": 0.9454206018010103, + "grad_norm": 2.2997195720672607, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.69350266456604, + "num_tokens": 215187941.0, + "step": 8609 + }, + { + "epoch": 0.9455304195036239, + "grad_norm": 2.151534080505371, + "learning_rate": 1e-06, + "loss": 
1.0349, + "mean_token_accuracy": 0.6841838955879211, + "num_tokens": 215214067.0, + "step": 8610 + }, + { + "epoch": 0.9456402372062377, + "grad_norm": 1.8642008304595947, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7112101912498474, + "num_tokens": 215244623.0, + "step": 8611 + }, + { + "epoch": 0.9457500549088513, + "grad_norm": 2.090977668762207, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7282332181930542, + "num_tokens": 215268295.0, + "step": 8612 + }, + { + "epoch": 0.945859872611465, + "grad_norm": 2.252950429916382, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7038528919219971, + "num_tokens": 215291813.0, + "step": 8613 + }, + { + "epoch": 0.9459696903140786, + "grad_norm": 2.8303518295288086, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7132402658462524, + "num_tokens": 215308158.0, + "step": 8614 + }, + { + "epoch": 0.9460795080166923, + "grad_norm": 2.453561782836914, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.6911600828170776, + "num_tokens": 215327011.0, + "step": 8615 + }, + { + "epoch": 0.9461893257193059, + "grad_norm": 2.236100435256958, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6940551996231079, + "num_tokens": 215351973.0, + "step": 8616 + }, + { + "epoch": 0.9462991434219196, + "grad_norm": 2.3735878467559814, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7206581830978394, + "num_tokens": 215372034.0, + "step": 8617 + }, + { + "epoch": 0.9464089611245333, + "grad_norm": 2.07979679107666, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6978355646133423, + "num_tokens": 215398737.0, + "step": 8618 + }, + { + "epoch": 0.946518778827147, + "grad_norm": 2.1969404220581055, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7227994203567505, + "num_tokens": 215422833.0, + "step": 8619 + }, + { + "epoch": 
0.9466285965297606, + "grad_norm": 2.041386365890503, + "learning_rate": 1e-06, + "loss": 1.0322, + "mean_token_accuracy": 0.6812489032745361, + "num_tokens": 215451286.0, + "step": 8620 + }, + { + "epoch": 0.9467384142323743, + "grad_norm": 1.8497542142868042, + "learning_rate": 1e-06, + "loss": 1.0586, + "mean_token_accuracy": 0.6851316690444946, + "num_tokens": 215488776.0, + "step": 8621 + }, + { + "epoch": 0.9468482319349879, + "grad_norm": 2.5533506870269775, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7027232646942139, + "num_tokens": 215509792.0, + "step": 8622 + }, + { + "epoch": 0.9469580496376016, + "grad_norm": 2.169849157333374, + "learning_rate": 1e-06, + "loss": 1.034, + "mean_token_accuracy": 0.6878888607025146, + "num_tokens": 215537826.0, + "step": 8623 + }, + { + "epoch": 0.9470678673402152, + "grad_norm": 2.242079973220825, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7172747850418091, + "num_tokens": 215562501.0, + "step": 8624 + }, + { + "epoch": 0.947177685042829, + "grad_norm": 2.1732828617095947, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7185693979263306, + "num_tokens": 215588210.0, + "step": 8625 + }, + { + "epoch": 0.9472875027454426, + "grad_norm": 2.1801538467407227, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7038683295249939, + "num_tokens": 215614934.0, + "step": 8626 + }, + { + "epoch": 0.9473973204480562, + "grad_norm": 2.3254196643829346, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7298633456230164, + "num_tokens": 215636695.0, + "step": 8627 + }, + { + "epoch": 0.9475071381506699, + "grad_norm": 2.2480900287628174, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7196768522262573, + "num_tokens": 215660776.0, + "step": 8628 + }, + { + "epoch": 0.9476169558532835, + "grad_norm": 2.1974196434020996, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 
0.7144339084625244, + "num_tokens": 215687101.0, + "step": 8629 + }, + { + "epoch": 0.9477267735558972, + "grad_norm": 2.184791088104248, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7074406147003174, + "num_tokens": 215713362.0, + "step": 8630 + }, + { + "epoch": 0.9478365912585108, + "grad_norm": 2.2085835933685303, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.6984492540359497, + "num_tokens": 215738697.0, + "step": 8631 + }, + { + "epoch": 0.9479464089611246, + "grad_norm": 2.609945297241211, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7122726440429688, + "num_tokens": 215758172.0, + "step": 8632 + }, + { + "epoch": 0.9480562266637382, + "grad_norm": 2.69213604927063, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7234358787536621, + "num_tokens": 215776999.0, + "step": 8633 + }, + { + "epoch": 0.9481660443663519, + "grad_norm": 2.107235908508301, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7206292152404785, + "num_tokens": 215801628.0, + "step": 8634 + }, + { + "epoch": 0.9482758620689655, + "grad_norm": 2.4469785690307617, + "learning_rate": 1e-06, + "loss": 0.9736, + "mean_token_accuracy": 0.7099639177322388, + "num_tokens": 215822560.0, + "step": 8635 + }, + { + "epoch": 0.9483856797715792, + "grad_norm": 2.324831485748291, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7209455966949463, + "num_tokens": 215845012.0, + "step": 8636 + }, + { + "epoch": 0.9484954974741928, + "grad_norm": 2.313523054122925, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.736100971698761, + "num_tokens": 215866450.0, + "step": 8637 + }, + { + "epoch": 0.9486053151768065, + "grad_norm": 2.247026205062866, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7037162184715271, + "num_tokens": 215891477.0, + "step": 8638 + }, + { + "epoch": 0.9487151328794201, + "grad_norm": 2.12910532951355, 
+ "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7272553443908691, + "num_tokens": 215915262.0, + "step": 8639 + }, + { + "epoch": 0.9488249505820339, + "grad_norm": 2.6478230953216553, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7433084845542908, + "num_tokens": 215933315.0, + "step": 8640 + }, + { + "epoch": 0.9489347682846475, + "grad_norm": 2.544557809829712, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6977745890617371, + "num_tokens": 215954259.0, + "step": 8641 + }, + { + "epoch": 0.9490445859872612, + "grad_norm": 2.3650763034820557, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7079452276229858, + "num_tokens": 215975148.0, + "step": 8642 + }, + { + "epoch": 0.9491544036898748, + "grad_norm": 2.113523006439209, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.722591757774353, + "num_tokens": 215999863.0, + "step": 8643 + }, + { + "epoch": 0.9492642213924884, + "grad_norm": 2.0850558280944824, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7057305574417114, + "num_tokens": 216029563.0, + "step": 8644 + }, + { + "epoch": 0.9493740390951021, + "grad_norm": 2.1195695400238037, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6977993249893188, + "num_tokens": 216056545.0, + "step": 8645 + }, + { + "epoch": 0.9494838567977157, + "grad_norm": 2.22182559967041, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7184308767318726, + "num_tokens": 216079436.0, + "step": 8646 + }, + { + "epoch": 0.9495936745003295, + "grad_norm": 2.170649766921997, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7080352306365967, + "num_tokens": 216104633.0, + "step": 8647 + }, + { + "epoch": 0.9497034922029431, + "grad_norm": 2.3667588233947754, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7060617804527283, + "num_tokens": 216126380.0, + "step": 8648 + 
}, + { + "epoch": 0.9498133099055568, + "grad_norm": 2.0003209114074707, + "learning_rate": 1e-06, + "loss": 1.0271, + "mean_token_accuracy": 0.6872621774673462, + "num_tokens": 216154393.0, + "step": 8649 + }, + { + "epoch": 0.9499231276081704, + "grad_norm": 1.9936935901641846, + "learning_rate": 1e-06, + "loss": 1.0141, + "mean_token_accuracy": 0.7028295397758484, + "num_tokens": 216183971.0, + "step": 8650 + }, + { + "epoch": 0.9500329453107841, + "grad_norm": 2.305814504623413, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6977152824401855, + "num_tokens": 216208719.0, + "step": 8651 + }, + { + "epoch": 0.9501427630133977, + "grad_norm": 2.3236117362976074, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.715986967086792, + "num_tokens": 216229095.0, + "step": 8652 + }, + { + "epoch": 0.9502525807160114, + "grad_norm": 2.2485225200653076, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.713384211063385, + "num_tokens": 216252063.0, + "step": 8653 + }, + { + "epoch": 0.9503623984186251, + "grad_norm": 1.8604493141174316, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.6993272304534912, + "num_tokens": 216284624.0, + "step": 8654 + }, + { + "epoch": 0.9504722161212388, + "grad_norm": 2.1427717208862305, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7068196535110474, + "num_tokens": 216311197.0, + "step": 8655 + }, + { + "epoch": 0.9505820338238524, + "grad_norm": 2.213594913482666, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.6977101564407349, + "num_tokens": 216336169.0, + "step": 8656 + }, + { + "epoch": 0.9506918515264661, + "grad_norm": 1.9694572687149048, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.6973679065704346, + "num_tokens": 216367190.0, + "step": 8657 + }, + { + "epoch": 0.9508016692290797, + "grad_norm": 2.120845079421997, + "learning_rate": 1e-06, + "loss": 0.9813, + 
"mean_token_accuracy": 0.6992921829223633, + "num_tokens": 216393070.0, + "step": 8658 + }, + { + "epoch": 0.9509114869316934, + "grad_norm": 2.296947479248047, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7302918434143066, + "num_tokens": 216415134.0, + "step": 8659 + }, + { + "epoch": 0.951021304634307, + "grad_norm": 2.3571057319641113, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7253310680389404, + "num_tokens": 216435938.0, + "step": 8660 + }, + { + "epoch": 0.9511311223369208, + "grad_norm": 2.101268768310547, + "learning_rate": 1e-06, + "loss": 1.0243, + "mean_token_accuracy": 0.6951658129692078, + "num_tokens": 216461764.0, + "step": 8661 + }, + { + "epoch": 0.9512409400395344, + "grad_norm": 2.1842997074127197, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7097535133361816, + "num_tokens": 216484145.0, + "step": 8662 + }, + { + "epoch": 0.9513507577421481, + "grad_norm": 2.183574914932251, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7052799463272095, + "num_tokens": 216509036.0, + "step": 8663 + }, + { + "epoch": 0.9514605754447617, + "grad_norm": 2.563298225402832, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7308617830276489, + "num_tokens": 216526992.0, + "step": 8664 + }, + { + "epoch": 0.9515703931473753, + "grad_norm": 2.240349054336548, + "learning_rate": 1e-06, + "loss": 1.0349, + "mean_token_accuracy": 0.6879400014877319, + "num_tokens": 216553187.0, + "step": 8665 + }, + { + "epoch": 0.951680210849989, + "grad_norm": 2.1202290058135986, + "learning_rate": 1e-06, + "loss": 1.0785, + "mean_token_accuracy": 0.6762081384658813, + "num_tokens": 216580402.0, + "step": 8666 + }, + { + "epoch": 0.9517900285526026, + "grad_norm": 2.382983684539795, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.706081748008728, + "num_tokens": 216604928.0, + "step": 8667 + }, + { + "epoch": 0.9518998462552163, + 
"grad_norm": 2.1950948238372803, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7100925445556641, + "num_tokens": 216629961.0, + "step": 8668 + }, + { + "epoch": 0.95200966395783, + "grad_norm": 2.3065876960754395, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7097792625427246, + "num_tokens": 216652933.0, + "step": 8669 + }, + { + "epoch": 0.9521194816604437, + "grad_norm": 2.114022731781006, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.7073200941085815, + "num_tokens": 216680767.0, + "step": 8670 + }, + { + "epoch": 0.9522292993630573, + "grad_norm": 2.0642173290252686, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7144348621368408, + "num_tokens": 216709777.0, + "step": 8671 + }, + { + "epoch": 0.952339117065671, + "grad_norm": 2.274017572402954, + "learning_rate": 1e-06, + "loss": 1.0016, + "mean_token_accuracy": 0.705741286277771, + "num_tokens": 216734481.0, + "step": 8672 + }, + { + "epoch": 0.9524489347682846, + "grad_norm": 2.1785778999328613, + "learning_rate": 1e-06, + "loss": 1.0171, + "mean_token_accuracy": 0.6992812752723694, + "num_tokens": 216761922.0, + "step": 8673 + }, + { + "epoch": 0.9525587524708983, + "grad_norm": 2.2546560764312744, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7059810161590576, + "num_tokens": 216783229.0, + "step": 8674 + }, + { + "epoch": 0.9526685701735119, + "grad_norm": 2.068995475769043, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7287026643753052, + "num_tokens": 216807135.0, + "step": 8675 + }, + { + "epoch": 0.9527783878761257, + "grad_norm": 1.9020226001739502, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6964704990386963, + "num_tokens": 216839784.0, + "step": 8676 + }, + { + "epoch": 0.9528882055787393, + "grad_norm": 2.413510322570801, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7258427143096924, + 
"num_tokens": 216861013.0, + "step": 8677 + }, + { + "epoch": 0.952998023281353, + "grad_norm": 2.434680223464966, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7266378998756409, + "num_tokens": 216881691.0, + "step": 8678 + }, + { + "epoch": 0.9531078409839666, + "grad_norm": 1.8491907119750977, + "learning_rate": 1e-06, + "loss": 1.0319, + "mean_token_accuracy": 0.6908806562423706, + "num_tokens": 216915001.0, + "step": 8679 + }, + { + "epoch": 0.9532176586865803, + "grad_norm": 2.161777973175049, + "learning_rate": 1e-06, + "loss": 1.0837, + "mean_token_accuracy": 0.6698993444442749, + "num_tokens": 216944067.0, + "step": 8680 + }, + { + "epoch": 0.9533274763891939, + "grad_norm": 1.910689115524292, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7059392929077148, + "num_tokens": 216975759.0, + "step": 8681 + }, + { + "epoch": 0.9534372940918076, + "grad_norm": 2.2368762493133545, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7389812469482422, + "num_tokens": 217001058.0, + "step": 8682 + }, + { + "epoch": 0.9535471117944213, + "grad_norm": 2.0294289588928223, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6900657415390015, + "num_tokens": 217029031.0, + "step": 8683 + }, + { + "epoch": 0.953656929497035, + "grad_norm": 2.222358226776123, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.702765703201294, + "num_tokens": 217054133.0, + "step": 8684 + }, + { + "epoch": 0.9537667471996486, + "grad_norm": 2.04144024848938, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.7041933536529541, + "num_tokens": 217081421.0, + "step": 8685 + }, + { + "epoch": 0.9538765649022622, + "grad_norm": 2.0975067615509033, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6964056491851807, + "num_tokens": 217109944.0, + "step": 8686 + }, + { + "epoch": 0.9539863826048759, + "grad_norm": 2.146033763885498, + "learning_rate": 
1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.710944652557373, + "num_tokens": 217136204.0, + "step": 8687 + }, + { + "epoch": 0.9540962003074895, + "grad_norm": 2.17305064201355, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7010165452957153, + "num_tokens": 217161444.0, + "step": 8688 + }, + { + "epoch": 0.9542060180101032, + "grad_norm": 2.0624771118164062, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.6942184567451477, + "num_tokens": 217189447.0, + "step": 8689 + }, + { + "epoch": 0.9543158357127169, + "grad_norm": 2.6660776138305664, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7101826667785645, + "num_tokens": 217207910.0, + "step": 8690 + }, + { + "epoch": 0.9544256534153306, + "grad_norm": 2.231900691986084, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.697535514831543, + "num_tokens": 217231639.0, + "step": 8691 + }, + { + "epoch": 0.9545354711179442, + "grad_norm": 2.6760504245758057, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7210249900817871, + "num_tokens": 217249618.0, + "step": 8692 + }, + { + "epoch": 0.9546452888205579, + "grad_norm": 2.500619649887085, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7386447191238403, + "num_tokens": 217268070.0, + "step": 8693 + }, + { + "epoch": 0.9547551065231715, + "grad_norm": 2.5504555702209473, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7104302644729614, + "num_tokens": 217288633.0, + "step": 8694 + }, + { + "epoch": 0.9548649242257852, + "grad_norm": 2.2704524993896484, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7117772698402405, + "num_tokens": 217313346.0, + "step": 8695 + }, + { + "epoch": 0.9549747419283988, + "grad_norm": 2.2939202785491943, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.717781662940979, + "num_tokens": 217335058.0, + "step": 8696 + }, + { + "epoch": 
0.9550845596310125, + "grad_norm": 2.6644647121429443, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.71608567237854, + "num_tokens": 217353988.0, + "step": 8697 + }, + { + "epoch": 0.9551943773336262, + "grad_norm": 2.3688743114471436, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7267635464668274, + "num_tokens": 217376418.0, + "step": 8698 + }, + { + "epoch": 0.9553041950362399, + "grad_norm": 2.1627535820007324, + "learning_rate": 1e-06, + "loss": 1.0081, + "mean_token_accuracy": 0.6910881400108337, + "num_tokens": 217403786.0, + "step": 8699 + }, + { + "epoch": 0.9554140127388535, + "grad_norm": 2.330970048904419, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7309728860855103, + "num_tokens": 217426891.0, + "step": 8700 + }, + { + "epoch": 0.9555238304414672, + "grad_norm": 2.2523791790008545, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7097971439361572, + "num_tokens": 217448190.0, + "step": 8701 + }, + { + "epoch": 0.9556336481440808, + "grad_norm": 2.383007764816284, + "learning_rate": 1e-06, + "loss": 1.0706, + "mean_token_accuracy": 0.6908921003341675, + "num_tokens": 217471081.0, + "step": 8702 + }, + { + "epoch": 0.9557434658466945, + "grad_norm": 2.0724306106567383, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.684410035610199, + "num_tokens": 217498260.0, + "step": 8703 + }, + { + "epoch": 0.9558532835493081, + "grad_norm": 2.155806541442871, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.688368558883667, + "num_tokens": 217525993.0, + "step": 8704 + }, + { + "epoch": 0.9559631012519219, + "grad_norm": 2.1054017543792725, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7183322310447693, + "num_tokens": 217553641.0, + "step": 8705 + }, + { + "epoch": 0.9560729189545355, + "grad_norm": 2.0504841804504395, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 
0.7175806760787964, + "num_tokens": 217580192.0, + "step": 8706 + }, + { + "epoch": 0.9561827366571491, + "grad_norm": 1.9932851791381836, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7236058712005615, + "num_tokens": 217608551.0, + "step": 8707 + }, + { + "epoch": 0.9562925543597628, + "grad_norm": 2.2198684215545654, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7188721895217896, + "num_tokens": 217632120.0, + "step": 8708 + }, + { + "epoch": 0.9564023720623764, + "grad_norm": 1.9552857875823975, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7142330408096313, + "num_tokens": 217662127.0, + "step": 8709 + }, + { + "epoch": 0.9565121897649901, + "grad_norm": 2.16581392288208, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7170789241790771, + "num_tokens": 217686444.0, + "step": 8710 + }, + { + "epoch": 0.9566220074676037, + "grad_norm": 1.9007662534713745, + "learning_rate": 1e-06, + "loss": 1.0932, + "mean_token_accuracy": 0.6717923283576965, + "num_tokens": 217720993.0, + "step": 8711 + }, + { + "epoch": 0.9567318251702175, + "grad_norm": 2.192018747329712, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7466081380844116, + "num_tokens": 217743024.0, + "step": 8712 + }, + { + "epoch": 0.9568416428728311, + "grad_norm": 2.40468168258667, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7139524221420288, + "num_tokens": 217764991.0, + "step": 8713 + }, + { + "epoch": 0.9569514605754448, + "grad_norm": 2.169696092605591, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6970086693763733, + "num_tokens": 217791196.0, + "step": 8714 + }, + { + "epoch": 0.9570612782780584, + "grad_norm": 2.2842869758605957, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7255781888961792, + "num_tokens": 217814255.0, + "step": 8715 + }, + { + "epoch": 0.9571710959806721, + "grad_norm": 
2.009769916534424, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7417033910751343, + "num_tokens": 217841579.0, + "step": 8716 + }, + { + "epoch": 0.9572809136832857, + "grad_norm": 2.175619125366211, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7208703756332397, + "num_tokens": 217866635.0, + "step": 8717 + }, + { + "epoch": 0.9573907313858994, + "grad_norm": 2.4695417881011963, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7099124193191528, + "num_tokens": 217889135.0, + "step": 8718 + }, + { + "epoch": 0.9575005490885131, + "grad_norm": 2.2488365173339844, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6933421492576599, + "num_tokens": 217913556.0, + "step": 8719 + }, + { + "epoch": 0.9576103667911268, + "grad_norm": 1.9864575862884521, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7094895839691162, + "num_tokens": 217940989.0, + "step": 8720 + }, + { + "epoch": 0.9577201844937404, + "grad_norm": 1.970551609992981, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7106479406356812, + "num_tokens": 217968273.0, + "step": 8721 + }, + { + "epoch": 0.9578300021963541, + "grad_norm": 2.1408979892730713, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7247236371040344, + "num_tokens": 217991231.0, + "step": 8722 + }, + { + "epoch": 0.9579398198989677, + "grad_norm": 2.2159345149993896, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.7019811868667603, + "num_tokens": 218017723.0, + "step": 8723 + }, + { + "epoch": 0.9580496376015813, + "grad_norm": 1.914003849029541, + "learning_rate": 1e-06, + "loss": 1.049, + "mean_token_accuracy": 0.6820868253707886, + "num_tokens": 218051427.0, + "step": 8724 + }, + { + "epoch": 0.958159455304195, + "grad_norm": 2.399632692337036, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7247273921966553, + "num_tokens": 
218072003.0, + "step": 8725 + }, + { + "epoch": 0.9582692730068088, + "grad_norm": 2.223717451095581, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.6900771260261536, + "num_tokens": 218099578.0, + "step": 8726 + }, + { + "epoch": 0.9583790907094224, + "grad_norm": 2.0807621479034424, + "learning_rate": 1e-06, + "loss": 1.0315, + "mean_token_accuracy": 0.6849679946899414, + "num_tokens": 218127727.0, + "step": 8727 + }, + { + "epoch": 0.958488908412036, + "grad_norm": 2.1227774620056152, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.723706066608429, + "num_tokens": 218151864.0, + "step": 8728 + }, + { + "epoch": 0.9585987261146497, + "grad_norm": 2.0926403999328613, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7209184169769287, + "num_tokens": 218178825.0, + "step": 8729 + }, + { + "epoch": 0.9587085438172633, + "grad_norm": 2.1574645042419434, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7053554058074951, + "num_tokens": 218206834.0, + "step": 8730 + }, + { + "epoch": 0.958818361519877, + "grad_norm": 2.12032151222229, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.7033409476280212, + "num_tokens": 218232957.0, + "step": 8731 + }, + { + "epoch": 0.9589281792224906, + "grad_norm": 2.3066632747650146, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7063058614730835, + "num_tokens": 218257772.0, + "step": 8732 + }, + { + "epoch": 0.9590379969251043, + "grad_norm": 2.106908082962036, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7129865884780884, + "num_tokens": 218285194.0, + "step": 8733 + }, + { + "epoch": 0.959147814627718, + "grad_norm": 2.300405979156494, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7004836201667786, + "num_tokens": 218307122.0, + "step": 8734 + }, + { + "epoch": 0.9592576323303317, + "grad_norm": 2.3494787216186523, + "learning_rate": 1e-06, + "loss": 
1.0297, + "mean_token_accuracy": 0.6992820501327515, + "num_tokens": 218329350.0, + "step": 8735 + }, + { + "epoch": 0.9593674500329453, + "grad_norm": 1.873362421989441, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6946265697479248, + "num_tokens": 218364791.0, + "step": 8736 + }, + { + "epoch": 0.959477267735559, + "grad_norm": 1.9313429594039917, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.6985933780670166, + "num_tokens": 218396678.0, + "step": 8737 + }, + { + "epoch": 0.9595870854381726, + "grad_norm": 1.9234439134597778, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7285841107368469, + "num_tokens": 218426447.0, + "step": 8738 + }, + { + "epoch": 0.9596969031407863, + "grad_norm": 1.882628321647644, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6848757266998291, + "num_tokens": 218458094.0, + "step": 8739 + }, + { + "epoch": 0.9598067208433999, + "grad_norm": 2.3527824878692627, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7178134918212891, + "num_tokens": 218481829.0, + "step": 8740 + }, + { + "epoch": 0.9599165385460137, + "grad_norm": 2.4542973041534424, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7204447984695435, + "num_tokens": 218502645.0, + "step": 8741 + }, + { + "epoch": 0.9600263562486273, + "grad_norm": 2.261449098587036, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.7036346197128296, + "num_tokens": 218526003.0, + "step": 8742 + }, + { + "epoch": 0.960136173951241, + "grad_norm": 2.2602763175964355, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.699254035949707, + "num_tokens": 218552576.0, + "step": 8743 + }, + { + "epoch": 0.9602459916538546, + "grad_norm": 2.2671313285827637, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7271143198013306, + "num_tokens": 218574445.0, + "step": 8744 + }, + { + "epoch": 
0.9603558093564682, + "grad_norm": 2.200899124145508, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7311198711395264, + "num_tokens": 218597834.0, + "step": 8745 + }, + { + "epoch": 0.9604656270590819, + "grad_norm": 2.4648399353027344, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7270022630691528, + "num_tokens": 218616600.0, + "step": 8746 + }, + { + "epoch": 0.9605754447616955, + "grad_norm": 2.2862329483032227, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7165776491165161, + "num_tokens": 218639783.0, + "step": 8747 + }, + { + "epoch": 0.9606852624643093, + "grad_norm": 2.361409902572632, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7087373733520508, + "num_tokens": 218662896.0, + "step": 8748 + }, + { + "epoch": 0.9607950801669229, + "grad_norm": 2.193578004837036, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.70320725440979, + "num_tokens": 218689114.0, + "step": 8749 + }, + { + "epoch": 0.9609048978695366, + "grad_norm": 2.3467252254486084, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.707025408744812, + "num_tokens": 218710613.0, + "step": 8750 + }, + { + "epoch": 0.9610147155721502, + "grad_norm": 2.602628707885742, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7204095125198364, + "num_tokens": 218727858.0, + "step": 8751 + }, + { + "epoch": 0.9611245332747639, + "grad_norm": 2.322681427001953, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6973645091056824, + "num_tokens": 218751135.0, + "step": 8752 + }, + { + "epoch": 0.9612343509773775, + "grad_norm": 2.35491943359375, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7253227233886719, + "num_tokens": 218771469.0, + "step": 8753 + }, + { + "epoch": 0.9613441686799912, + "grad_norm": 2.3936638832092285, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 
0.7143908143043518, + "num_tokens": 218792991.0, + "step": 8754 + }, + { + "epoch": 0.9614539863826049, + "grad_norm": 2.370251417160034, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7129436731338501, + "num_tokens": 218813857.0, + "step": 8755 + }, + { + "epoch": 0.9615638040852186, + "grad_norm": 1.9814388751983643, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7496234178543091, + "num_tokens": 218839736.0, + "step": 8756 + }, + { + "epoch": 0.9616736217878322, + "grad_norm": 2.1209754943847656, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7038155198097229, + "num_tokens": 218866747.0, + "step": 8757 + }, + { + "epoch": 0.9617834394904459, + "grad_norm": 2.1025054454803467, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6965726613998413, + "num_tokens": 218891184.0, + "step": 8758 + }, + { + "epoch": 0.9618932571930595, + "grad_norm": 2.1850979328155518, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6893409490585327, + "num_tokens": 218917124.0, + "step": 8759 + }, + { + "epoch": 0.9620030748956732, + "grad_norm": 1.9064414501190186, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7236268520355225, + "num_tokens": 218945113.0, + "step": 8760 + }, + { + "epoch": 0.9621128925982868, + "grad_norm": 2.3979244232177734, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.716750979423523, + "num_tokens": 218965516.0, + "step": 8761 + }, + { + "epoch": 0.9622227103009005, + "grad_norm": 1.9115064144134521, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7051345109939575, + "num_tokens": 218998215.0, + "step": 8762 + }, + { + "epoch": 0.9623325280035142, + "grad_norm": 2.142850637435913, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6903601884841919, + "num_tokens": 219023971.0, + "step": 8763 + }, + { + "epoch": 0.9624423457061279, + "grad_norm": 
2.23996639251709, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7056787014007568, + "num_tokens": 219048191.0, + "step": 8764 + }, + { + "epoch": 0.9625521634087415, + "grad_norm": 2.1569974422454834, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7305001020431519, + "num_tokens": 219072309.0, + "step": 8765 + }, + { + "epoch": 0.9626619811113551, + "grad_norm": 2.3688411712646484, + "learning_rate": 1e-06, + "loss": 1.01, + "mean_token_accuracy": 0.6965121030807495, + "num_tokens": 219095310.0, + "step": 8766 + }, + { + "epoch": 0.9627717988139688, + "grad_norm": 2.3712570667266846, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7228871583938599, + "num_tokens": 219116524.0, + "step": 8767 + }, + { + "epoch": 0.9628816165165824, + "grad_norm": 2.130657434463501, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.703287661075592, + "num_tokens": 219143102.0, + "step": 8768 + }, + { + "epoch": 0.9629914342191961, + "grad_norm": 1.918497085571289, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7087644338607788, + "num_tokens": 219175367.0, + "step": 8769 + }, + { + "epoch": 0.9631012519218098, + "grad_norm": 2.087289810180664, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7163000106811523, + "num_tokens": 219202316.0, + "step": 8770 + }, + { + "epoch": 0.9632110696244235, + "grad_norm": 2.251887083053589, + "learning_rate": 1e-06, + "loss": 1.0642, + "mean_token_accuracy": 0.6844042539596558, + "num_tokens": 219228139.0, + "step": 8771 + }, + { + "epoch": 0.9633208873270371, + "grad_norm": 2.0649256706237793, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7094998359680176, + "num_tokens": 219256623.0, + "step": 8772 + }, + { + "epoch": 0.9634307050296508, + "grad_norm": 2.290848731994629, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7118740081787109, + "num_tokens": 219279261.0, 
+ "step": 8773 + }, + { + "epoch": 0.9635405227322644, + "grad_norm": 2.05660343170166, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7049113512039185, + "num_tokens": 219306545.0, + "step": 8774 + }, + { + "epoch": 0.9636503404348781, + "grad_norm": 2.064639091491699, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7023277282714844, + "num_tokens": 219334011.0, + "step": 8775 + }, + { + "epoch": 0.9637601581374917, + "grad_norm": 2.1984379291534424, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7158940434455872, + "num_tokens": 219356814.0, + "step": 8776 + }, + { + "epoch": 0.9638699758401055, + "grad_norm": 2.477506637573242, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7065731287002563, + "num_tokens": 219377298.0, + "step": 8777 + }, + { + "epoch": 0.9639797935427191, + "grad_norm": 2.4012985229492188, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7052744030952454, + "num_tokens": 219401115.0, + "step": 8778 + }, + { + "epoch": 0.9640896112453328, + "grad_norm": 2.4380075931549072, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7283324599266052, + "num_tokens": 219421285.0, + "step": 8779 + }, + { + "epoch": 0.9641994289479464, + "grad_norm": 1.923805594444275, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6877386569976807, + "num_tokens": 219453560.0, + "step": 8780 + }, + { + "epoch": 0.9643092466505601, + "grad_norm": 1.8839412927627563, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7109302878379822, + "num_tokens": 219484552.0, + "step": 8781 + }, + { + "epoch": 0.9644190643531737, + "grad_norm": 2.5470268726348877, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7125396728515625, + "num_tokens": 219505129.0, + "step": 8782 + }, + { + "epoch": 0.9645288820557874, + "grad_norm": 2.5930142402648926, + "learning_rate": 1e-06, + "loss": 
0.9839, + "mean_token_accuracy": 0.6996315717697144, + "num_tokens": 219526048.0, + "step": 8783 + }, + { + "epoch": 0.9646386997584011, + "grad_norm": 2.4663405418395996, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.715718150138855, + "num_tokens": 219547049.0, + "step": 8784 + }, + { + "epoch": 0.9647485174610148, + "grad_norm": 2.227470874786377, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.701055645942688, + "num_tokens": 219571597.0, + "step": 8785 + }, + { + "epoch": 0.9648583351636284, + "grad_norm": 2.315385341644287, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7148646116256714, + "num_tokens": 219594634.0, + "step": 8786 + }, + { + "epoch": 0.964968152866242, + "grad_norm": 2.406416893005371, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.707400918006897, + "num_tokens": 219617030.0, + "step": 8787 + }, + { + "epoch": 0.9650779705688557, + "grad_norm": 2.286424160003662, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.714008092880249, + "num_tokens": 219638944.0, + "step": 8788 + }, + { + "epoch": 0.9651877882714693, + "grad_norm": 2.168581962585449, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7144863605499268, + "num_tokens": 219663653.0, + "step": 8789 + }, + { + "epoch": 0.965297605974083, + "grad_norm": 2.2659730911254883, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7123105525970459, + "num_tokens": 219687602.0, + "step": 8790 + }, + { + "epoch": 0.9654074236766966, + "grad_norm": 2.4228219985961914, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7086790800094604, + "num_tokens": 219709248.0, + "step": 8791 + }, + { + "epoch": 0.9655172413793104, + "grad_norm": 2.516777753829956, + "learning_rate": 1e-06, + "loss": 0.8455, + "mean_token_accuracy": 0.7493300437927246, + "num_tokens": 219728393.0, + "step": 8792 + }, + { + "epoch": 0.965627059081924, + 
"grad_norm": 2.381267786026001, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7087946534156799, + "num_tokens": 219751208.0, + "step": 8793 + }, + { + "epoch": 0.9657368767845377, + "grad_norm": 2.119743585586548, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7113555669784546, + "num_tokens": 219779016.0, + "step": 8794 + }, + { + "epoch": 0.9658466944871513, + "grad_norm": 2.0677218437194824, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7105552554130554, + "num_tokens": 219806818.0, + "step": 8795 + }, + { + "epoch": 0.965956512189765, + "grad_norm": 2.196296453475952, + "learning_rate": 1e-06, + "loss": 0.979, + "mean_token_accuracy": 0.7020062804222107, + "num_tokens": 219833654.0, + "step": 8796 + }, + { + "epoch": 0.9660663298923786, + "grad_norm": 2.327347993850708, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7338226437568665, + "num_tokens": 219854792.0, + "step": 8797 + }, + { + "epoch": 0.9661761475949923, + "grad_norm": 2.586696147918701, + "learning_rate": 1e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7253686189651489, + "num_tokens": 219872491.0, + "step": 8798 + }, + { + "epoch": 0.966285965297606, + "grad_norm": 2.1457972526550293, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7053326368331909, + "num_tokens": 219898824.0, + "step": 8799 + }, + { + "epoch": 0.9663957830002197, + "grad_norm": 2.071864128112793, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7319923043251038, + "num_tokens": 219923184.0, + "step": 8800 + }, + { + "epoch": 0.9665056007028333, + "grad_norm": 2.451709032058716, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7017673254013062, + "num_tokens": 219944905.0, + "step": 8801 + }, + { + "epoch": 0.966615418405447, + "grad_norm": 2.1829867362976074, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7099312543869019, + "num_tokens": 
219969216.0, + "step": 8802 + }, + { + "epoch": 0.9667252361080606, + "grad_norm": 2.3482120037078857, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7252835035324097, + "num_tokens": 219990695.0, + "step": 8803 + }, + { + "epoch": 0.9668350538106742, + "grad_norm": 2.515960931777954, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7209106683731079, + "num_tokens": 220009934.0, + "step": 8804 + }, + { + "epoch": 0.9669448715132879, + "grad_norm": 2.2798025608062744, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7288118600845337, + "num_tokens": 220031827.0, + "step": 8805 + }, + { + "epoch": 0.9670546892159017, + "grad_norm": 2.1735386848449707, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7335950136184692, + "num_tokens": 220054914.0, + "step": 8806 + }, + { + "epoch": 0.9671645069185153, + "grad_norm": 2.0863037109375, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7067294716835022, + "num_tokens": 220081853.0, + "step": 8807 + }, + { + "epoch": 0.967274324621129, + "grad_norm": 2.0615131855010986, + "learning_rate": 1e-06, + "loss": 1.0191, + "mean_token_accuracy": 0.6895394325256348, + "num_tokens": 220112149.0, + "step": 8808 + }, + { + "epoch": 0.9673841423237426, + "grad_norm": 2.277331829071045, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7106595635414124, + "num_tokens": 220135612.0, + "step": 8809 + }, + { + "epoch": 0.9674939600263562, + "grad_norm": 2.2031469345092773, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7143085598945618, + "num_tokens": 220158203.0, + "step": 8810 + }, + { + "epoch": 0.9676037777289699, + "grad_norm": 2.0304830074310303, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7302172780036926, + "num_tokens": 220185481.0, + "step": 8811 + }, + { + "epoch": 0.9677135954315835, + "grad_norm": 2.5246026515960693, + "learning_rate": 1e-06, + 
"loss": 0.9175, + "mean_token_accuracy": 0.7142020463943481, + "num_tokens": 220205063.0, + "step": 8812 + }, + { + "epoch": 0.9678234131341973, + "grad_norm": 2.144777297973633, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.713799238204956, + "num_tokens": 220232358.0, + "step": 8813 + }, + { + "epoch": 0.9679332308368109, + "grad_norm": 2.1533141136169434, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7287176251411438, + "num_tokens": 220258884.0, + "step": 8814 + }, + { + "epoch": 0.9680430485394246, + "grad_norm": 2.306162118911743, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7104523777961731, + "num_tokens": 220283784.0, + "step": 8815 + }, + { + "epoch": 0.9681528662420382, + "grad_norm": 2.2849555015563965, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7217549681663513, + "num_tokens": 220305294.0, + "step": 8816 + }, + { + "epoch": 0.9682626839446519, + "grad_norm": 2.337778091430664, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7025324106216431, + "num_tokens": 220327387.0, + "step": 8817 + }, + { + "epoch": 0.9683725016472655, + "grad_norm": 2.1431193351745605, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7226270437240601, + "num_tokens": 220353842.0, + "step": 8818 + }, + { + "epoch": 0.9684823193498792, + "grad_norm": 2.3671505451202393, + "learning_rate": 1e-06, + "loss": 1.0664, + "mean_token_accuracy": 0.6865556240081787, + "num_tokens": 220377757.0, + "step": 8819 + }, + { + "epoch": 0.9685921370524928, + "grad_norm": 1.9984713792800903, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7172541618347168, + "num_tokens": 220405184.0, + "step": 8820 + }, + { + "epoch": 0.9687019547551066, + "grad_norm": 2.2965502738952637, + "learning_rate": 1e-06, + "loss": 1.0423, + "mean_token_accuracy": 0.6856668591499329, + "num_tokens": 220428012.0, + "step": 8821 + }, + { + "epoch": 
0.9688117724577202, + "grad_norm": 2.1496970653533936, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7170344591140747, + "num_tokens": 220451905.0, + "step": 8822 + }, + { + "epoch": 0.9689215901603339, + "grad_norm": 2.276418924331665, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.73079913854599, + "num_tokens": 220474671.0, + "step": 8823 + }, + { + "epoch": 0.9690314078629475, + "grad_norm": 2.6576168537139893, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7201855182647705, + "num_tokens": 220492451.0, + "step": 8824 + }, + { + "epoch": 0.9691412255655611, + "grad_norm": 2.222325086593628, + "learning_rate": 1e-06, + "loss": 1.0042, + "mean_token_accuracy": 0.6939500570297241, + "num_tokens": 220517813.0, + "step": 8825 + }, + { + "epoch": 0.9692510432681748, + "grad_norm": 2.1117024421691895, + "learning_rate": 1e-06, + "loss": 1.0292, + "mean_token_accuracy": 0.683818519115448, + "num_tokens": 220545492.0, + "step": 8826 + }, + { + "epoch": 0.9693608609707884, + "grad_norm": 2.2568466663360596, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7111366987228394, + "num_tokens": 220569250.0, + "step": 8827 + }, + { + "epoch": 0.9694706786734022, + "grad_norm": 2.1238605976104736, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7424759268760681, + "num_tokens": 220593795.0, + "step": 8828 + }, + { + "epoch": 0.9695804963760158, + "grad_norm": 2.0206596851348877, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7261625528335571, + "num_tokens": 220622405.0, + "step": 8829 + }, + { + "epoch": 0.9696903140786295, + "grad_norm": 2.1202352046966553, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7107076644897461, + "num_tokens": 220649187.0, + "step": 8830 + }, + { + "epoch": 0.9698001317812431, + "grad_norm": 2.260457754135132, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 
0.6938064098358154, + "num_tokens": 220675043.0, + "step": 8831 + }, + { + "epoch": 0.9699099494838568, + "grad_norm": 2.1030707359313965, + "learning_rate": 1e-06, + "loss": 1.0273, + "mean_token_accuracy": 0.692391574382782, + "num_tokens": 220705873.0, + "step": 8832 + }, + { + "epoch": 0.9700197671864704, + "grad_norm": 2.970223903656006, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7203813791275024, + "num_tokens": 220726315.0, + "step": 8833 + }, + { + "epoch": 0.9701295848890841, + "grad_norm": 2.146150827407837, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.71816086769104, + "num_tokens": 220750710.0, + "step": 8834 + }, + { + "epoch": 0.9702394025916978, + "grad_norm": 2.3072898387908936, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7343196868896484, + "num_tokens": 220773088.0, + "step": 8835 + }, + { + "epoch": 0.9703492202943115, + "grad_norm": 2.0660853385925293, + "learning_rate": 1e-06, + "loss": 1.0027, + "mean_token_accuracy": 0.7031800150871277, + "num_tokens": 220801817.0, + "step": 8836 + }, + { + "epoch": 0.9704590379969251, + "grad_norm": 1.9354499578475952, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.7038809061050415, + "num_tokens": 220832888.0, + "step": 8837 + }, + { + "epoch": 0.9705688556995388, + "grad_norm": 2.0645835399627686, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7254154682159424, + "num_tokens": 220858857.0, + "step": 8838 + }, + { + "epoch": 0.9706786734021524, + "grad_norm": 2.3423590660095215, + "learning_rate": 1e-06, + "loss": 1.0235, + "mean_token_accuracy": 0.691184401512146, + "num_tokens": 220882468.0, + "step": 8839 + }, + { + "epoch": 0.9707884911047661, + "grad_norm": 2.0612831115722656, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7086299061775208, + "num_tokens": 220909776.0, + "step": 8840 + }, + { + "epoch": 0.9708983088073797, + "grad_norm": 
2.2370824813842773, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7063130140304565, + "num_tokens": 220934290.0, + "step": 8841 + }, + { + "epoch": 0.9710081265099935, + "grad_norm": 2.1656620502471924, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7034052014350891, + "num_tokens": 220959263.0, + "step": 8842 + }, + { + "epoch": 0.9711179442126071, + "grad_norm": 1.8843905925750732, + "learning_rate": 1e-06, + "loss": 1.0873, + "mean_token_accuracy": 0.6858714818954468, + "num_tokens": 220993010.0, + "step": 8843 + }, + { + "epoch": 0.9712277619152208, + "grad_norm": 1.8776508569717407, + "learning_rate": 1e-06, + "loss": 1.0406, + "mean_token_accuracy": 0.6919070482254028, + "num_tokens": 221026330.0, + "step": 8844 + }, + { + "epoch": 0.9713375796178344, + "grad_norm": 2.279040575027466, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7111347913742065, + "num_tokens": 221048609.0, + "step": 8845 + }, + { + "epoch": 0.971447397320448, + "grad_norm": 2.4052164554595947, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.709014356136322, + "num_tokens": 221069819.0, + "step": 8846 + }, + { + "epoch": 0.9715572150230617, + "grad_norm": 2.1186366081237793, + "learning_rate": 1e-06, + "loss": 1.0088, + "mean_token_accuracy": 0.7004929780960083, + "num_tokens": 221095484.0, + "step": 8847 + }, + { + "epoch": 0.9716670327256753, + "grad_norm": 2.6190619468688965, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7213712334632874, + "num_tokens": 221113577.0, + "step": 8848 + }, + { + "epoch": 0.971776850428289, + "grad_norm": 1.9986125230789185, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.723251223564148, + "num_tokens": 221140292.0, + "step": 8849 + }, + { + "epoch": 0.9718866681309027, + "grad_norm": 2.50545597076416, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7245820760726929, + "num_tokens": 
221159007.0, + "step": 8850 + }, + { + "epoch": 0.9719964858335164, + "grad_norm": 2.0017826557159424, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7042194604873657, + "num_tokens": 221191347.0, + "step": 8851 + }, + { + "epoch": 0.97210630353613, + "grad_norm": 2.043006181716919, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.6951525211334229, + "num_tokens": 221216903.0, + "step": 8852 + }, + { + "epoch": 0.9722161212387437, + "grad_norm": 1.9276440143585205, + "learning_rate": 1e-06, + "loss": 1.1048, + "mean_token_accuracy": 0.6806776523590088, + "num_tokens": 221248358.0, + "step": 8853 + }, + { + "epoch": 0.9723259389413573, + "grad_norm": 2.3805408477783203, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7161961197853088, + "num_tokens": 221269205.0, + "step": 8854 + }, + { + "epoch": 0.972435756643971, + "grad_norm": 2.312774419784546, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7398858666419983, + "num_tokens": 221292617.0, + "step": 8855 + }, + { + "epoch": 0.9725455743465846, + "grad_norm": 2.154419422149658, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7352761030197144, + "num_tokens": 221316556.0, + "step": 8856 + }, + { + "epoch": 0.9726553920491984, + "grad_norm": 1.8254035711288452, + "learning_rate": 1e-06, + "loss": 1.0546, + "mean_token_accuracy": 0.6889714598655701, + "num_tokens": 221351051.0, + "step": 8857 + }, + { + "epoch": 0.972765209751812, + "grad_norm": 2.1959900856018066, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7063801288604736, + "num_tokens": 221377219.0, + "step": 8858 + }, + { + "epoch": 0.9728750274544257, + "grad_norm": 2.266187906265259, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7009764313697815, + "num_tokens": 221401467.0, + "step": 8859 + }, + { + "epoch": 0.9729848451570393, + "grad_norm": 2.343219757080078, + "learning_rate": 1e-06, + 
"loss": 0.9279, + "mean_token_accuracy": 0.7151338458061218, + "num_tokens": 221423440.0, + "step": 8860 + }, + { + "epoch": 0.973094662859653, + "grad_norm": 2.228832721710205, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7199482917785645, + "num_tokens": 221446616.0, + "step": 8861 + }, + { + "epoch": 0.9732044805622666, + "grad_norm": 2.162557363510132, + "learning_rate": 1e-06, + "loss": 1.0645, + "mean_token_accuracy": 0.6780077815055847, + "num_tokens": 221474774.0, + "step": 8862 + }, + { + "epoch": 0.9733142982648803, + "grad_norm": 2.4446113109588623, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.730111300945282, + "num_tokens": 221494775.0, + "step": 8863 + }, + { + "epoch": 0.973424115967494, + "grad_norm": 2.679255247116089, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7061026692390442, + "num_tokens": 221514298.0, + "step": 8864 + }, + { + "epoch": 0.9735339336701077, + "grad_norm": 2.2328765392303467, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6925428509712219, + "num_tokens": 221538438.0, + "step": 8865 + }, + { + "epoch": 0.9736437513727213, + "grad_norm": 2.1030402183532715, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7043086290359497, + "num_tokens": 221564253.0, + "step": 8866 + }, + { + "epoch": 0.973753569075335, + "grad_norm": 2.1292150020599365, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6958236694335938, + "num_tokens": 221592196.0, + "step": 8867 + }, + { + "epoch": 0.9738633867779486, + "grad_norm": 2.185818910598755, + "learning_rate": 1e-06, + "loss": 1.0176, + "mean_token_accuracy": 0.6844390630722046, + "num_tokens": 221618335.0, + "step": 8868 + }, + { + "epoch": 0.9739732044805622, + "grad_norm": 2.284334659576416, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7132721543312073, + "num_tokens": 221642674.0, + "step": 8869 + }, + { + "epoch": 
0.9740830221831759, + "grad_norm": 2.4383513927459717, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.709536075592041, + "num_tokens": 221662035.0, + "step": 8870 + }, + { + "epoch": 0.9741928398857896, + "grad_norm": 2.296929121017456, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7067558169364929, + "num_tokens": 221685896.0, + "step": 8871 + }, + { + "epoch": 0.9743026575884033, + "grad_norm": 2.3722920417785645, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7334866523742676, + "num_tokens": 221708358.0, + "step": 8872 + }, + { + "epoch": 0.9744124752910169, + "grad_norm": 2.1539430618286133, + "learning_rate": 1e-06, + "loss": 1.0143, + "mean_token_accuracy": 0.6919484734535217, + "num_tokens": 221734676.0, + "step": 8873 + }, + { + "epoch": 0.9745222929936306, + "grad_norm": 2.2730205059051514, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7163563966751099, + "num_tokens": 221757074.0, + "step": 8874 + }, + { + "epoch": 0.9746321106962442, + "grad_norm": 2.439178466796875, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7333140969276428, + "num_tokens": 221777110.0, + "step": 8875 + }, + { + "epoch": 0.9747419283988579, + "grad_norm": 2.2629029750823975, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.6986758708953857, + "num_tokens": 221801219.0, + "step": 8876 + }, + { + "epoch": 0.9748517461014715, + "grad_norm": 2.054098129272461, + "learning_rate": 1e-06, + "loss": 1.0083, + "mean_token_accuracy": 0.6881874799728394, + "num_tokens": 221829167.0, + "step": 8877 + }, + { + "epoch": 0.9749615638040853, + "grad_norm": 2.426239490509033, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6955101490020752, + "num_tokens": 221850787.0, + "step": 8878 + }, + { + "epoch": 0.9750713815066989, + "grad_norm": 2.487835168838501, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 
0.7122279405593872, + "num_tokens": 221870978.0, + "step": 8879 + }, + { + "epoch": 0.9751811992093126, + "grad_norm": 2.3373358249664307, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7144235372543335, + "num_tokens": 221891615.0, + "step": 8880 + }, + { + "epoch": 0.9752910169119262, + "grad_norm": 2.2372822761535645, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7097887992858887, + "num_tokens": 221914892.0, + "step": 8881 + }, + { + "epoch": 0.9754008346145399, + "grad_norm": 2.425203800201416, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7011226415634155, + "num_tokens": 221937512.0, + "step": 8882 + }, + { + "epoch": 0.9755106523171535, + "grad_norm": 2.1251866817474365, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7105164527893066, + "num_tokens": 221964498.0, + "step": 8883 + }, + { + "epoch": 0.9756204700197671, + "grad_norm": 2.540123701095581, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7079401016235352, + "num_tokens": 221983471.0, + "step": 8884 + }, + { + "epoch": 0.9757302877223808, + "grad_norm": 2.313793420791626, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7166019678115845, + "num_tokens": 222007650.0, + "step": 8885 + }, + { + "epoch": 0.9758401054249946, + "grad_norm": 2.2298266887664795, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6912631988525391, + "num_tokens": 222031102.0, + "step": 8886 + }, + { + "epoch": 0.9759499231276082, + "grad_norm": 1.9154058694839478, + "learning_rate": 1e-06, + "loss": 1.0325, + "mean_token_accuracy": 0.6860648393630981, + "num_tokens": 222065029.0, + "step": 8887 + }, + { + "epoch": 0.9760597408302218, + "grad_norm": 2.2192628383636475, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7153416275978088, + "num_tokens": 222090400.0, + "step": 8888 + }, + { + "epoch": 0.9761695585328355, + "grad_norm": 
1.9624894857406616, + "learning_rate": 1e-06, + "loss": 1.0158, + "mean_token_accuracy": 0.6969775557518005, + "num_tokens": 222121269.0, + "step": 8889 + }, + { + "epoch": 0.9762793762354491, + "grad_norm": 2.0516746044158936, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6957353353500366, + "num_tokens": 222151297.0, + "step": 8890 + }, + { + "epoch": 0.9763891939380628, + "grad_norm": 2.2642600536346436, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7275864481925964, + "num_tokens": 222174658.0, + "step": 8891 + }, + { + "epoch": 0.9764990116406764, + "grad_norm": 2.0858490467071533, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7137800455093384, + "num_tokens": 222201636.0, + "step": 8892 + }, + { + "epoch": 0.9766088293432902, + "grad_norm": 2.46869158744812, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.725174069404602, + "num_tokens": 222223699.0, + "step": 8893 + }, + { + "epoch": 0.9767186470459038, + "grad_norm": 2.088024377822876, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7153580188751221, + "num_tokens": 222250044.0, + "step": 8894 + }, + { + "epoch": 0.9768284647485175, + "grad_norm": 2.2705078125, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7298763990402222, + "num_tokens": 222272407.0, + "step": 8895 + }, + { + "epoch": 0.9769382824511311, + "grad_norm": 2.108689546585083, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7179443836212158, + "num_tokens": 222298511.0, + "step": 8896 + }, + { + "epoch": 0.9770481001537448, + "grad_norm": 2.035012722015381, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7251879572868347, + "num_tokens": 222326945.0, + "step": 8897 + }, + { + "epoch": 0.9771579178563584, + "grad_norm": 1.982266902923584, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7015819549560547, + "num_tokens": 222355030.0, + 
"step": 8898 + }, + { + "epoch": 0.9772677355589721, + "grad_norm": 2.1383519172668457, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7007198333740234, + "num_tokens": 222381635.0, + "step": 8899 + }, + { + "epoch": 0.9773775532615858, + "grad_norm": 2.237088441848755, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7117336988449097, + "num_tokens": 222405648.0, + "step": 8900 + }, + { + "epoch": 0.9774873709641995, + "grad_norm": 2.297665596008301, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7087088823318481, + "num_tokens": 222427954.0, + "step": 8901 + }, + { + "epoch": 0.9775971886668131, + "grad_norm": 2.2652440071105957, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7246271371841431, + "num_tokens": 222451537.0, + "step": 8902 + }, + { + "epoch": 0.9777070063694268, + "grad_norm": 2.2307240962982178, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7037715911865234, + "num_tokens": 222475826.0, + "step": 8903 + }, + { + "epoch": 0.9778168240720404, + "grad_norm": 1.9115434885025024, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7196775078773499, + "num_tokens": 222506227.0, + "step": 8904 + }, + { + "epoch": 0.977926641774654, + "grad_norm": 2.395817518234253, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7146324515342712, + "num_tokens": 222528138.0, + "step": 8905 + }, + { + "epoch": 0.9780364594772677, + "grad_norm": 2.3128631114959717, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7060071229934692, + "num_tokens": 222551125.0, + "step": 8906 + }, + { + "epoch": 0.9781462771798815, + "grad_norm": 2.003325939178467, + "learning_rate": 1e-06, + "loss": 1.0579, + "mean_token_accuracy": 0.6769463419914246, + "num_tokens": 222580996.0, + "step": 8907 + }, + { + "epoch": 0.9782560948824951, + "grad_norm": 2.9347734451293945, + "learning_rate": 1e-06, + "loss": 0.9346, 
+ "mean_token_accuracy": 0.7083995342254639, + "num_tokens": 222596154.0, + "step": 8908 + }, + { + "epoch": 0.9783659125851087, + "grad_norm": 2.016310930252075, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7246387004852295, + "num_tokens": 222625144.0, + "step": 8909 + }, + { + "epoch": 0.9784757302877224, + "grad_norm": 2.215137481689453, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7211356163024902, + "num_tokens": 222648749.0, + "step": 8910 + }, + { + "epoch": 0.978585547990336, + "grad_norm": 1.9112153053283691, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6959606409072876, + "num_tokens": 222680527.0, + "step": 8911 + }, + { + "epoch": 0.9786953656929497, + "grad_norm": 2.11195969581604, + "learning_rate": 1e-06, + "loss": 1.0207, + "mean_token_accuracy": 0.6843131184577942, + "num_tokens": 222708463.0, + "step": 8912 + }, + { + "epoch": 0.9788051833955633, + "grad_norm": 2.1501593589782715, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7141141891479492, + "num_tokens": 222734003.0, + "step": 8913 + }, + { + "epoch": 0.978915001098177, + "grad_norm": 2.1794910430908203, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7333189249038696, + "num_tokens": 222757425.0, + "step": 8914 + }, + { + "epoch": 0.9790248188007907, + "grad_norm": 1.9968794584274292, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7045220136642456, + "num_tokens": 222786531.0, + "step": 8915 + }, + { + "epoch": 0.9791346365034044, + "grad_norm": 2.2400848865509033, + "learning_rate": 1e-06, + "loss": 1.0135, + "mean_token_accuracy": 0.6881630420684814, + "num_tokens": 222811090.0, + "step": 8916 + }, + { + "epoch": 0.979244454206018, + "grad_norm": 2.2315595149993896, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7000030279159546, + "num_tokens": 222836663.0, + "step": 8917 + }, + { + "epoch": 0.9793542719086317, + 
"grad_norm": 1.8424510955810547, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7327081561088562, + "num_tokens": 222869183.0, + "step": 8918 + }, + { + "epoch": 0.9794640896112453, + "grad_norm": 1.9266046285629272, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7004565000534058, + "num_tokens": 222901575.0, + "step": 8919 + }, + { + "epoch": 0.979573907313859, + "grad_norm": 2.0734002590179443, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7197917699813843, + "num_tokens": 222927788.0, + "step": 8920 + }, + { + "epoch": 0.9796837250164726, + "grad_norm": 2.2177987098693848, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7018692493438721, + "num_tokens": 222953633.0, + "step": 8921 + }, + { + "epoch": 0.9797935427190864, + "grad_norm": 2.172821283340454, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.7013826966285706, + "num_tokens": 222979467.0, + "step": 8922 + }, + { + "epoch": 0.9799033604217, + "grad_norm": 2.0764334201812744, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7004156112670898, + "num_tokens": 223008265.0, + "step": 8923 + }, + { + "epoch": 0.9800131781243137, + "grad_norm": 2.015450954437256, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.721076488494873, + "num_tokens": 223034975.0, + "step": 8924 + }, + { + "epoch": 0.9801229958269273, + "grad_norm": 2.305777072906494, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7262786626815796, + "num_tokens": 223057106.0, + "step": 8925 + }, + { + "epoch": 0.980232813529541, + "grad_norm": 1.9583520889282227, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7116400599479675, + "num_tokens": 223083558.0, + "step": 8926 + }, + { + "epoch": 0.9803426312321546, + "grad_norm": 2.117048501968384, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7112124562263489, + "num_tokens": 
223107810.0, + "step": 8927 + }, + { + "epoch": 0.9804524489347682, + "grad_norm": 1.9666990041732788, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.721657931804657, + "num_tokens": 223137228.0, + "step": 8928 + }, + { + "epoch": 0.980562266637382, + "grad_norm": 2.1291563510894775, + "learning_rate": 1e-06, + "loss": 1.0389, + "mean_token_accuracy": 0.6883848905563354, + "num_tokens": 223163784.0, + "step": 8929 + }, + { + "epoch": 0.9806720843399956, + "grad_norm": 2.316509962081909, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7005940079689026, + "num_tokens": 223186042.0, + "step": 8930 + }, + { + "epoch": 0.9807819020426093, + "grad_norm": 2.5502703189849854, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7047823667526245, + "num_tokens": 223205673.0, + "step": 8931 + }, + { + "epoch": 0.9808917197452229, + "grad_norm": 2.2157201766967773, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7051228880882263, + "num_tokens": 223228275.0, + "step": 8932 + }, + { + "epoch": 0.9810015374478366, + "grad_norm": 2.232017755508423, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7005033493041992, + "num_tokens": 223253102.0, + "step": 8933 + }, + { + "epoch": 0.9811113551504502, + "grad_norm": 2.3107919692993164, + "learning_rate": 1e-06, + "loss": 1.0154, + "mean_token_accuracy": 0.7029186487197876, + "num_tokens": 223277077.0, + "step": 8934 + }, + { + "epoch": 0.9812211728530639, + "grad_norm": 2.204796075820923, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7086732387542725, + "num_tokens": 223302582.0, + "step": 8935 + }, + { + "epoch": 0.9813309905556776, + "grad_norm": 2.2656071186065674, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6966382265090942, + "num_tokens": 223327233.0, + "step": 8936 + }, + { + "epoch": 0.9814408082582913, + "grad_norm": 2.5186548233032227, + "learning_rate": 1e-06, + 
"loss": 0.9346, + "mean_token_accuracy": 0.7204665541648865, + "num_tokens": 223346597.0, + "step": 8937 + }, + { + "epoch": 0.9815506259609049, + "grad_norm": 2.541297197341919, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7283375859260559, + "num_tokens": 223365988.0, + "step": 8938 + }, + { + "epoch": 0.9816604436635186, + "grad_norm": 2.2455387115478516, + "learning_rate": 1e-06, + "loss": 1.0107, + "mean_token_accuracy": 0.6941417455673218, + "num_tokens": 223390894.0, + "step": 8939 + }, + { + "epoch": 0.9817702613661322, + "grad_norm": 2.541043281555176, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7263351678848267, + "num_tokens": 223409432.0, + "step": 8940 + }, + { + "epoch": 0.9818800790687459, + "grad_norm": 2.12599515914917, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7275489568710327, + "num_tokens": 223433721.0, + "step": 8941 + }, + { + "epoch": 0.9819898967713595, + "grad_norm": 2.269188404083252, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7324112057685852, + "num_tokens": 223456760.0, + "step": 8942 + }, + { + "epoch": 0.9820997144739732, + "grad_norm": 2.025137186050415, + "learning_rate": 1e-06, + "loss": 1.0434, + "mean_token_accuracy": 0.6892165541648865, + "num_tokens": 223485470.0, + "step": 8943 + }, + { + "epoch": 0.9822095321765869, + "grad_norm": 2.7833473682403564, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7088057398796082, + "num_tokens": 223503508.0, + "step": 8944 + }, + { + "epoch": 0.9823193498792006, + "grad_norm": 2.063202142715454, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6993021965026855, + "num_tokens": 223533134.0, + "step": 8945 + }, + { + "epoch": 0.9824291675818142, + "grad_norm": 2.353743314743042, + "learning_rate": 1e-06, + "loss": 1.026, + "mean_token_accuracy": 0.6930193901062012, + "num_tokens": 223556921.0, + "step": 8946 + }, + { + "epoch": 
0.9825389852844278, + "grad_norm": 2.4610114097595215, + "learning_rate": 1e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7406288385391235, + "num_tokens": 223575513.0, + "step": 8947 + }, + { + "epoch": 0.9826488029870415, + "grad_norm": 2.066451072692871, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7243387699127197, + "num_tokens": 223602162.0, + "step": 8948 + }, + { + "epoch": 0.9827586206896551, + "grad_norm": 2.2956948280334473, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.6915170550346375, + "num_tokens": 223626003.0, + "step": 8949 + }, + { + "epoch": 0.9828684383922688, + "grad_norm": 2.390812873840332, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6984437704086304, + "num_tokens": 223647434.0, + "step": 8950 + }, + { + "epoch": 0.9829782560948825, + "grad_norm": 2.0153744220733643, + "learning_rate": 1e-06, + "loss": 0.989, + "mean_token_accuracy": 0.6985652446746826, + "num_tokens": 223675780.0, + "step": 8951 + }, + { + "epoch": 0.9830880737974962, + "grad_norm": 2.2366204261779785, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7206186056137085, + "num_tokens": 223698446.0, + "step": 8952 + }, + { + "epoch": 0.9831978915001098, + "grad_norm": 2.413579225540161, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7082931995391846, + "num_tokens": 223717365.0, + "step": 8953 + }, + { + "epoch": 0.9833077092027235, + "grad_norm": 2.017831325531006, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7165502905845642, + "num_tokens": 223744643.0, + "step": 8954 + }, + { + "epoch": 0.9834175269053371, + "grad_norm": 2.570115089416504, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.7024123072624207, + "num_tokens": 223764753.0, + "step": 8955 + }, + { + "epoch": 0.9835273446079508, + "grad_norm": 2.1929914951324463, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 
0.7143793106079102, + "num_tokens": 223788248.0, + "step": 8956 + }, + { + "epoch": 0.9836371623105644, + "grad_norm": 2.316082239151001, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7044258117675781, + "num_tokens": 223810308.0, + "step": 8957 + }, + { + "epoch": 0.9837469800131782, + "grad_norm": 2.197998285293579, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7135448455810547, + "num_tokens": 223835430.0, + "step": 8958 + }, + { + "epoch": 0.9838567977157918, + "grad_norm": 2.4072952270507812, + "learning_rate": 1e-06, + "loss": 0.994, + "mean_token_accuracy": 0.6983237862586975, + "num_tokens": 223859684.0, + "step": 8959 + }, + { + "epoch": 0.9839666154184055, + "grad_norm": 2.134246349334717, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7338542342185974, + "num_tokens": 223884001.0, + "step": 8960 + }, + { + "epoch": 0.9840764331210191, + "grad_norm": 2.116271495819092, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7146669626235962, + "num_tokens": 223910214.0, + "step": 8961 + }, + { + "epoch": 0.9841862508236328, + "grad_norm": 2.158456802368164, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7340658903121948, + "num_tokens": 223935069.0, + "step": 8962 + }, + { + "epoch": 0.9842960685262464, + "grad_norm": 2.744227170944214, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7109179496765137, + "num_tokens": 223954759.0, + "step": 8963 + }, + { + "epoch": 0.98440588622886, + "grad_norm": 2.01003098487854, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6964944005012512, + "num_tokens": 223985074.0, + "step": 8964 + }, + { + "epoch": 0.9845157039314738, + "grad_norm": 2.4390156269073486, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7052611708641052, + "num_tokens": 224006433.0, + "step": 8965 + }, + { + "epoch": 0.9846255216340875, + "grad_norm": 2.2352800369262695, 
+ "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6957563757896423, + "num_tokens": 224030867.0, + "step": 8966 + }, + { + "epoch": 0.9847353393367011, + "grad_norm": 1.8684951066970825, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7083424925804138, + "num_tokens": 224062658.0, + "step": 8967 + }, + { + "epoch": 0.9848451570393147, + "grad_norm": 2.00087833404541, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7008599042892456, + "num_tokens": 224092399.0, + "step": 8968 + }, + { + "epoch": 0.9849549747419284, + "grad_norm": 1.9580249786376953, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.7099077701568604, + "num_tokens": 224124709.0, + "step": 8969 + }, + { + "epoch": 0.985064792444542, + "grad_norm": 2.3424594402313232, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7110460996627808, + "num_tokens": 224147252.0, + "step": 8970 + }, + { + "epoch": 0.9851746101471557, + "grad_norm": 2.163153648376465, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7283227443695068, + "num_tokens": 224173639.0, + "step": 8971 + }, + { + "epoch": 0.9852844278497693, + "grad_norm": 2.311772346496582, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6944372653961182, + "num_tokens": 224197006.0, + "step": 8972 + }, + { + "epoch": 0.9853942455523831, + "grad_norm": 2.054124593734741, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7171462774276733, + "num_tokens": 224226184.0, + "step": 8973 + }, + { + "epoch": 0.9855040632549967, + "grad_norm": 2.057365894317627, + "learning_rate": 1e-06, + "loss": 1.0598, + "mean_token_accuracy": 0.6782362461090088, + "num_tokens": 224254058.0, + "step": 8974 + }, + { + "epoch": 0.9856138809576104, + "grad_norm": 2.189537763595581, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6934694051742554, + "num_tokens": 224280985.0, + "step": 8975 + }, 
+ { + "epoch": 0.985723698660224, + "grad_norm": 2.237863540649414, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7289657592773438, + "num_tokens": 224304950.0, + "step": 8976 + }, + { + "epoch": 0.9858335163628377, + "grad_norm": 2.52897047996521, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7143180966377258, + "num_tokens": 224324326.0, + "step": 8977 + }, + { + "epoch": 0.9859433340654513, + "grad_norm": 2.5930562019348145, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6917093992233276, + "num_tokens": 224343193.0, + "step": 8978 + }, + { + "epoch": 0.986053151768065, + "grad_norm": 2.011850118637085, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6922091245651245, + "num_tokens": 224374313.0, + "step": 8979 + }, + { + "epoch": 0.9861629694706787, + "grad_norm": 2.6046457290649414, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.715437650680542, + "num_tokens": 224392701.0, + "step": 8980 + }, + { + "epoch": 0.9862727871732924, + "grad_norm": 2.2699944972991943, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.719200611114502, + "num_tokens": 224414661.0, + "step": 8981 + }, + { + "epoch": 0.986382604875906, + "grad_norm": 2.5589585304260254, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7162865400314331, + "num_tokens": 224434531.0, + "step": 8982 + }, + { + "epoch": 0.9864924225785197, + "grad_norm": 1.9766485691070557, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7222186326980591, + "num_tokens": 224464814.0, + "step": 8983 + }, + { + "epoch": 0.9866022402811333, + "grad_norm": 2.2228481769561768, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7081822156906128, + "num_tokens": 224489662.0, + "step": 8984 + }, + { + "epoch": 0.986712057983747, + "grad_norm": 2.4204487800598145, + "learning_rate": 1e-06, + "loss": 0.9154, + 
"mean_token_accuracy": 0.7148301601409912, + "num_tokens": 224511923.0, + "step": 8985 + }, + { + "epoch": 0.9868218756863606, + "grad_norm": 2.3998842239379883, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7084140181541443, + "num_tokens": 224534430.0, + "step": 8986 + }, + { + "epoch": 0.9869316933889744, + "grad_norm": 2.42704701423645, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7086253762245178, + "num_tokens": 224556456.0, + "step": 8987 + }, + { + "epoch": 0.987041511091588, + "grad_norm": 2.2132976055145264, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7268258333206177, + "num_tokens": 224581404.0, + "step": 8988 + }, + { + "epoch": 0.9871513287942016, + "grad_norm": 2.4676873683929443, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6940554976463318, + "num_tokens": 224603954.0, + "step": 8989 + }, + { + "epoch": 0.9872611464968153, + "grad_norm": 2.758279800415039, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7147791385650635, + "num_tokens": 224622411.0, + "step": 8990 + }, + { + "epoch": 0.9873709641994289, + "grad_norm": 2.5670006275177, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7225828766822815, + "num_tokens": 224641660.0, + "step": 8991 + }, + { + "epoch": 0.9874807819020426, + "grad_norm": 2.342568874359131, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7301974296569824, + "num_tokens": 224663217.0, + "step": 8992 + }, + { + "epoch": 0.9875905996046562, + "grad_norm": 2.1182730197906494, + "learning_rate": 1e-06, + "loss": 1.053, + "mean_token_accuracy": 0.6819418668746948, + "num_tokens": 224691753.0, + "step": 8993 + }, + { + "epoch": 0.98770041730727, + "grad_norm": 1.935193419456482, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7184356451034546, + "num_tokens": 224721260.0, + "step": 8994 + }, + { + "epoch": 0.9878102350098836, + 
"grad_norm": 2.3439810276031494, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7479052543640137, + "num_tokens": 224741113.0, + "step": 8995 + }, + { + "epoch": 0.9879200527124973, + "grad_norm": 2.251114845275879, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7092947363853455, + "num_tokens": 224766355.0, + "step": 8996 + }, + { + "epoch": 0.9880298704151109, + "grad_norm": 2.220635414123535, + "learning_rate": 1e-06, + "loss": 1.0641, + "mean_token_accuracy": 0.6798995733261108, + "num_tokens": 224794893.0, + "step": 8997 + }, + { + "epoch": 0.9881396881177246, + "grad_norm": 2.24646258354187, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7045276165008545, + "num_tokens": 224818624.0, + "step": 8998 + }, + { + "epoch": 0.9882495058203382, + "grad_norm": 2.5803637504577637, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.74204421043396, + "num_tokens": 224838287.0, + "step": 8999 + }, + { + "epoch": 0.9883593235229519, + "grad_norm": 2.1629018783569336, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7018775939941406, + "num_tokens": 224865304.0, + "step": 9000 + }, + { + "epoch": 0.9884691412255655, + "grad_norm": 2.0782577991485596, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7128945589065552, + "num_tokens": 224891241.0, + "step": 9001 + }, + { + "epoch": 0.9885789589281793, + "grad_norm": 2.4701974391937256, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.6986193656921387, + "num_tokens": 224912687.0, + "step": 9002 + }, + { + "epoch": 0.9886887766307929, + "grad_norm": 1.931430697441101, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7028464078903198, + "num_tokens": 224943629.0, + "step": 9003 + }, + { + "epoch": 0.9887985943334066, + "grad_norm": 2.2750282287597656, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7359784245491028, + 
"num_tokens": 224965344.0, + "step": 9004 + }, + { + "epoch": 0.9889084120360202, + "grad_norm": 2.006133556365967, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7380199432373047, + "num_tokens": 224993211.0, + "step": 9005 + }, + { + "epoch": 0.9890182297386338, + "grad_norm": 2.335230827331543, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6953580379486084, + "num_tokens": 225014179.0, + "step": 9006 + }, + { + "epoch": 0.9891280474412475, + "grad_norm": 2.3534209728240967, + "learning_rate": 1e-06, + "loss": 0.9788, + "mean_token_accuracy": 0.7005447149276733, + "num_tokens": 225037954.0, + "step": 9007 + }, + { + "epoch": 0.9892378651438611, + "grad_norm": 2.3976361751556396, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6937230825424194, + "num_tokens": 225062237.0, + "step": 9008 + }, + { + "epoch": 0.9893476828464749, + "grad_norm": 2.160418748855591, + "learning_rate": 1e-06, + "loss": 1.0248, + "mean_token_accuracy": 0.6998982429504395, + "num_tokens": 225087551.0, + "step": 9009 + }, + { + "epoch": 0.9894575005490885, + "grad_norm": 2.5554230213165283, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7323815822601318, + "num_tokens": 225105603.0, + "step": 9010 + }, + { + "epoch": 0.9895673182517022, + "grad_norm": 2.1589887142181396, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.7015429139137268, + "num_tokens": 225131964.0, + "step": 9011 + }, + { + "epoch": 0.9896771359543158, + "grad_norm": 2.234426736831665, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7151129245758057, + "num_tokens": 225156812.0, + "step": 9012 + }, + { + "epoch": 0.9897869536569295, + "grad_norm": 2.187588691711426, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7229576110839844, + "num_tokens": 225183338.0, + "step": 9013 + }, + { + "epoch": 0.9898967713595431, + "grad_norm": 2.219205617904663, + "learning_rate": 
1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7050293684005737, + "num_tokens": 225206763.0, + "step": 9014 + }, + { + "epoch": 0.9900065890621568, + "grad_norm": 2.151522159576416, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7503262162208557, + "num_tokens": 225230078.0, + "step": 9015 + }, + { + "epoch": 0.9901164067647705, + "grad_norm": 2.2434210777282715, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.704387366771698, + "num_tokens": 225254394.0, + "step": 9016 + }, + { + "epoch": 0.9902262244673842, + "grad_norm": 1.894771933555603, + "learning_rate": 1e-06, + "loss": 1.0792, + "mean_token_accuracy": 0.6720100045204163, + "num_tokens": 225288621.0, + "step": 9017 + }, + { + "epoch": 0.9903360421699978, + "grad_norm": 2.0995588302612305, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7210878133773804, + "num_tokens": 225315386.0, + "step": 9018 + }, + { + "epoch": 0.9904458598726115, + "grad_norm": 2.557316780090332, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7138296365737915, + "num_tokens": 225334330.0, + "step": 9019 + }, + { + "epoch": 0.9905556775752251, + "grad_norm": 2.1885247230529785, + "learning_rate": 1e-06, + "loss": 1.0394, + "mean_token_accuracy": 0.6827552914619446, + "num_tokens": 225361422.0, + "step": 9020 + }, + { + "epoch": 0.9906654952778388, + "grad_norm": 2.241070508956909, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7134596109390259, + "num_tokens": 225386293.0, + "step": 9021 + }, + { + "epoch": 0.9907753129804524, + "grad_norm": 2.163588047027588, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7241453528404236, + "num_tokens": 225412021.0, + "step": 9022 + }, + { + "epoch": 0.9908851306830662, + "grad_norm": 2.0837152004241943, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7176938056945801, + "num_tokens": 225437295.0, + "step": 9023 + }, + { + "epoch": 
0.9909949483856798, + "grad_norm": 2.3704757690429688, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7078640460968018, + "num_tokens": 225460295.0, + "step": 9024 + }, + { + "epoch": 0.9911047660882935, + "grad_norm": 1.9730150699615479, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7110133767127991, + "num_tokens": 225490490.0, + "step": 9025 + }, + { + "epoch": 0.9912145837909071, + "grad_norm": 2.3460896015167236, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7118551135063171, + "num_tokens": 225511766.0, + "step": 9026 + }, + { + "epoch": 0.9913244014935207, + "grad_norm": 2.5964231491088867, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7179242372512817, + "num_tokens": 225530171.0, + "step": 9027 + }, + { + "epoch": 0.9914342191961344, + "grad_norm": 2.247999429702759, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7054663300514221, + "num_tokens": 225555822.0, + "step": 9028 + }, + { + "epoch": 0.991544036898748, + "grad_norm": 2.110436201095581, + "learning_rate": 1e-06, + "loss": 1.0055, + "mean_token_accuracy": 0.6996238827705383, + "num_tokens": 225582953.0, + "step": 9029 + }, + { + "epoch": 0.9916538546013618, + "grad_norm": 2.003923177719116, + "learning_rate": 1e-06, + "loss": 1.0352, + "mean_token_accuracy": 0.6911423206329346, + "num_tokens": 225614662.0, + "step": 9030 + }, + { + "epoch": 0.9917636723039754, + "grad_norm": 2.570000410079956, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7192502021789551, + "num_tokens": 225634385.0, + "step": 9031 + }, + { + "epoch": 0.9918734900065891, + "grad_norm": 2.2653942108154297, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7323182225227356, + "num_tokens": 225655854.0, + "step": 9032 + }, + { + "epoch": 0.9919833077092027, + "grad_norm": 2.049994468688965, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 
0.7342033386230469, + "num_tokens": 225680495.0, + "step": 9033 + }, + { + "epoch": 0.9920931254118164, + "grad_norm": 2.0602056980133057, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7298566699028015, + "num_tokens": 225709772.0, + "step": 9034 + }, + { + "epoch": 0.99220294311443, + "grad_norm": 2.234344244003296, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7144661545753479, + "num_tokens": 225733820.0, + "step": 9035 + }, + { + "epoch": 0.9923127608170437, + "grad_norm": 2.3348352909088135, + "learning_rate": 1e-06, + "loss": 0.9967, + "mean_token_accuracy": 0.7086254954338074, + "num_tokens": 225756447.0, + "step": 9036 + }, + { + "epoch": 0.9924225785196573, + "grad_norm": 2.284411668777466, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7232035398483276, + "num_tokens": 225780171.0, + "step": 9037 + }, + { + "epoch": 0.9925323962222711, + "grad_norm": 2.185713529586792, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7261716723442078, + "num_tokens": 225804142.0, + "step": 9038 + }, + { + "epoch": 0.9926422139248847, + "grad_norm": 2.2378857135772705, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7250874042510986, + "num_tokens": 225827089.0, + "step": 9039 + }, + { + "epoch": 0.9927520316274984, + "grad_norm": 2.040384292602539, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7163533568382263, + "num_tokens": 225854623.0, + "step": 9040 + }, + { + "epoch": 0.992861849330112, + "grad_norm": 2.1321613788604736, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.7060031294822693, + "num_tokens": 225882334.0, + "step": 9041 + }, + { + "epoch": 0.9929716670327257, + "grad_norm": 2.4191172122955322, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7047232389450073, + "num_tokens": 225903607.0, + "step": 9042 + }, + { + "epoch": 0.9930814847353393, + "grad_norm": 
1.7863638401031494, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6915187239646912, + "num_tokens": 225940054.0, + "step": 9043 + }, + { + "epoch": 0.993191302437953, + "grad_norm": 2.5974161624908447, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7188330888748169, + "num_tokens": 225958963.0, + "step": 9044 + }, + { + "epoch": 0.9933011201405667, + "grad_norm": 2.031721591949463, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.6872211694717407, + "num_tokens": 225986576.0, + "step": 9045 + }, + { + "epoch": 0.9934109378431804, + "grad_norm": 1.999184489250183, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7264649271965027, + "num_tokens": 226013210.0, + "step": 9046 + }, + { + "epoch": 0.993520755545794, + "grad_norm": 2.612236261367798, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.7323110103607178, + "num_tokens": 226032684.0, + "step": 9047 + }, + { + "epoch": 0.9936305732484076, + "grad_norm": 2.251784563064575, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7148009538650513, + "num_tokens": 226058521.0, + "step": 9048 + }, + { + "epoch": 0.9937403909510213, + "grad_norm": 2.2329013347625732, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7220627069473267, + "num_tokens": 226082946.0, + "step": 9049 + }, + { + "epoch": 0.9938502086536349, + "grad_norm": 2.0537943840026855, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.730315089225769, + "num_tokens": 226111796.0, + "step": 9050 + }, + { + "epoch": 0.9939600263562486, + "grad_norm": 2.1232500076293945, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7045273780822754, + "num_tokens": 226136194.0, + "step": 9051 + }, + { + "epoch": 0.9940698440588623, + "grad_norm": 2.2135748863220215, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7268381118774414, + "num_tokens": 
226160035.0, + "step": 9052 + }, + { + "epoch": 0.994179661761476, + "grad_norm": 2.1893434524536133, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7052811980247498, + "num_tokens": 226188834.0, + "step": 9053 + }, + { + "epoch": 0.9942894794640896, + "grad_norm": 2.3972606658935547, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7028918266296387, + "num_tokens": 226210891.0, + "step": 9054 + }, + { + "epoch": 0.9943992971667033, + "grad_norm": 2.3611650466918945, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7123674750328064, + "num_tokens": 226233193.0, + "step": 9055 + }, + { + "epoch": 0.9945091148693169, + "grad_norm": 2.2898998260498047, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7075847387313843, + "num_tokens": 226257038.0, + "step": 9056 + }, + { + "epoch": 0.9946189325719306, + "grad_norm": 2.4659335613250732, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.6967533826828003, + "num_tokens": 226276930.0, + "step": 9057 + }, + { + "epoch": 0.9947287502745442, + "grad_norm": 2.3348748683929443, + "learning_rate": 1e-06, + "loss": 0.9779, + "mean_token_accuracy": 0.6989201307296753, + "num_tokens": 226302774.0, + "step": 9058 + }, + { + "epoch": 0.994838567977158, + "grad_norm": 2.368762493133545, + "learning_rate": 1e-06, + "loss": 0.9802, + "mean_token_accuracy": 0.702711820602417, + "num_tokens": 226324209.0, + "step": 9059 + }, + { + "epoch": 0.9949483856797716, + "grad_norm": 2.246255397796631, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7079145908355713, + "num_tokens": 226349184.0, + "step": 9060 + }, + { + "epoch": 0.9950582033823853, + "grad_norm": 2.1689293384552, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7291750907897949, + "num_tokens": 226372106.0, + "step": 9061 + }, + { + "epoch": 0.9951680210849989, + "grad_norm": 1.8556829690933228, + "learning_rate": 1e-06, + 
"loss": 0.909, + "mean_token_accuracy": 0.7204163074493408, + "num_tokens": 226404144.0, + "step": 9062 + }, + { + "epoch": 0.9952778387876126, + "grad_norm": 2.5929369926452637, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.7001882791519165, + "num_tokens": 226423113.0, + "step": 9063 + }, + { + "epoch": 0.9953876564902262, + "grad_norm": 2.130173683166504, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7099159955978394, + "num_tokens": 226449084.0, + "step": 9064 + }, + { + "epoch": 0.9954974741928398, + "grad_norm": 2.2690351009368896, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7133506536483765, + "num_tokens": 226471713.0, + "step": 9065 + }, + { + "epoch": 0.9956072918954535, + "grad_norm": 2.1833512783050537, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.693474292755127, + "num_tokens": 226496446.0, + "step": 9066 + }, + { + "epoch": 0.9957171095980673, + "grad_norm": 2.4555506706237793, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7320326566696167, + "num_tokens": 226515531.0, + "step": 9067 + }, + { + "epoch": 0.9958269273006809, + "grad_norm": 2.013664722442627, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6933826208114624, + "num_tokens": 226544413.0, + "step": 9068 + }, + { + "epoch": 0.9959367450032945, + "grad_norm": 2.246833562850952, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7257072925567627, + "num_tokens": 226567063.0, + "step": 9069 + }, + { + "epoch": 0.9960465627059082, + "grad_norm": 2.003824472427368, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7161926031112671, + "num_tokens": 226595391.0, + "step": 9070 + }, + { + "epoch": 0.9961563804085218, + "grad_norm": 2.2471210956573486, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7055426836013794, + "num_tokens": 226619771.0, + "step": 9071 + }, + { + "epoch": 
0.9962661981111355, + "grad_norm": 2.260498046875, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7170106172561646, + "num_tokens": 226642476.0, + "step": 9072 + }, + { + "epoch": 0.9963760158137491, + "grad_norm": 2.3704142570495605, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7211676239967346, + "num_tokens": 226664242.0, + "step": 9073 + }, + { + "epoch": 0.9964858335163629, + "grad_norm": 2.3322689533233643, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.699193000793457, + "num_tokens": 226688385.0, + "step": 9074 + }, + { + "epoch": 0.9965956512189765, + "grad_norm": 2.3223419189453125, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.71173095703125, + "num_tokens": 226713335.0, + "step": 9075 + }, + { + "epoch": 0.9967054689215902, + "grad_norm": 2.7539126873016357, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7119015455245972, + "num_tokens": 226731306.0, + "step": 9076 + }, + { + "epoch": 0.9968152866242038, + "grad_norm": 2.3561527729034424, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.715773344039917, + "num_tokens": 226752886.0, + "step": 9077 + }, + { + "epoch": 0.9969251043268175, + "grad_norm": 2.1353816986083984, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.710832417011261, + "num_tokens": 226777168.0, + "step": 9078 + }, + { + "epoch": 0.9970349220294311, + "grad_norm": 2.146141290664673, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6994249820709229, + "num_tokens": 226804891.0, + "step": 9079 + }, + { + "epoch": 0.9971447397320448, + "grad_norm": 2.2966575622558594, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7005470991134644, + "num_tokens": 226828675.0, + "step": 9080 + }, + { + "epoch": 0.9972545574346585, + "grad_norm": 2.2351739406585693, + "learning_rate": 1e-06, + "loss": 0.9645, + "mean_token_accuracy": 
0.7186983823776245, + "num_tokens": 226855451.0, + "step": 9081 + }, + { + "epoch": 0.9973643751372722, + "grad_norm": 2.1011803150177, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7230947017669678, + "num_tokens": 226881767.0, + "step": 9082 + }, + { + "epoch": 0.9974741928398858, + "grad_norm": 2.071268081665039, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7126882672309875, + "num_tokens": 226908082.0, + "step": 9083 + }, + { + "epoch": 0.9975840105424995, + "grad_norm": 2.1902570724487305, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7282952070236206, + "num_tokens": 226931668.0, + "step": 9084 + }, + { + "epoch": 0.9976938282451131, + "grad_norm": 2.049410820007324, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7008982300758362, + "num_tokens": 226959074.0, + "step": 9085 + }, + { + "epoch": 0.9978036459477267, + "grad_norm": 2.142580509185791, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.697475790977478, + "num_tokens": 226983989.0, + "step": 9086 + }, + { + "epoch": 0.9979134636503404, + "grad_norm": 2.3533291816711426, + "learning_rate": 1e-06, + "loss": 0.7813, + "mean_token_accuracy": 0.7500049471855164, + "num_tokens": 227003313.0, + "step": 9087 + }, + { + "epoch": 0.9980232813529542, + "grad_norm": 2.5382280349731445, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7307679653167725, + "num_tokens": 227022193.0, + "step": 9088 + }, + { + "epoch": 0.9981330990555678, + "grad_norm": 2.1942014694213867, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7032891511917114, + "num_tokens": 227046024.0, + "step": 9089 + }, + { + "epoch": 0.9982429167581814, + "grad_norm": 2.279688596725464, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7236195802688599, + "num_tokens": 227067849.0, + "step": 9090 + }, + { + "epoch": 0.9983527344607951, + "grad_norm": 2.1980295181274414, 
+ "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7051586508750916, + "num_tokens": 227092067.0, + "step": 9091 + }, + { + "epoch": 0.9984625521634087, + "grad_norm": 2.4651436805725098, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7097599506378174, + "num_tokens": 227112774.0, + "step": 9092 + }, + { + "epoch": 0.9985723698660224, + "grad_norm": 2.4312777519226074, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7333817481994629, + "num_tokens": 227134708.0, + "step": 9093 + }, + { + "epoch": 0.998682187568636, + "grad_norm": 2.1870646476745605, + "learning_rate": 1e-06, + "loss": 1.0447, + "mean_token_accuracy": 0.6844313740730286, + "num_tokens": 227161249.0, + "step": 9094 + }, + { + "epoch": 0.9987920052712497, + "grad_norm": 2.4317970275878906, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.705588161945343, + "num_tokens": 227182549.0, + "step": 9095 + }, + { + "epoch": 0.9989018229738634, + "grad_norm": 2.1848113536834717, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.707294225692749, + "num_tokens": 227204937.0, + "step": 9096 + }, + { + "epoch": 0.9990116406764771, + "grad_norm": 1.8039462566375732, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.6950277090072632, + "num_tokens": 227239173.0, + "step": 9097 + }, + { + "epoch": 0.9991214583790907, + "grad_norm": 2.217223644256592, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6986913681030273, + "num_tokens": 227265025.0, + "step": 9098 + }, + { + "epoch": 0.9992312760817044, + "grad_norm": 2.1417396068573, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6887744665145874, + "num_tokens": 227291465.0, + "step": 9099 + }, + { + "epoch": 0.999341093784318, + "grad_norm": 2.2375972270965576, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7185179591178894, + "num_tokens": 227317196.0, + "step": 9100 + }, 
+ { + "epoch": 0.9994509114869317, + "grad_norm": 2.0665934085845947, + "learning_rate": 1e-06, + "loss": 1.0202, + "mean_token_accuracy": 0.6828880906105042, + "num_tokens": 227346770.0, + "step": 9101 + }, + { + "epoch": 0.9995607291895453, + "grad_norm": 2.391967296600342, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.704392671585083, + "num_tokens": 227368495.0, + "step": 9102 + }, + { + "epoch": 0.9996705468921591, + "grad_norm": 2.2891385555267334, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7113019227981567, + "num_tokens": 227391951.0, + "step": 9103 + }, + { + "epoch": 0.9997803645947727, + "grad_norm": 1.930294394493103, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7076441049575806, + "num_tokens": 227420238.0, + "step": 9104 + }, + { + "epoch": 0.9998901822973864, + "grad_norm": 2.4697682857513428, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7247247695922852, + "num_tokens": 227439295.0, + "step": 9105 + }, + { + "epoch": 1.0, + "grad_norm": 2.1114115715026855, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7068490386009216, + "num_tokens": 227465573.0, + "step": 9106 + }, + { + "epoch": 1.0001098177026138, + "grad_norm": 2.1854734420776367, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.706066370010376, + "num_tokens": 227489366.0, + "step": 9107 + }, + { + "epoch": 1.0002196354052273, + "grad_norm": 2.25569224357605, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7029613256454468, + "num_tokens": 227512223.0, + "step": 9108 + }, + { + "epoch": 1.000329453107841, + "grad_norm": 2.2519989013671875, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7365105152130127, + "num_tokens": 227536147.0, + "step": 9109 + }, + { + "epoch": 1.0004392708104546, + "grad_norm": 2.1965742111206055, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 
0.7229003310203552, + "num_tokens": 227559609.0, + "step": 9110 + }, + { + "epoch": 1.0005490885130683, + "grad_norm": 2.4316320419311523, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7198702096939087, + "num_tokens": 227579257.0, + "step": 9111 + }, + { + "epoch": 1.0006589062156819, + "grad_norm": 2.1255276203155518, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7132164239883423, + "num_tokens": 227604906.0, + "step": 9112 + }, + { + "epoch": 1.0007687239182956, + "grad_norm": 2.2082414627075195, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7214857339859009, + "num_tokens": 227629347.0, + "step": 9113 + }, + { + "epoch": 1.0008785416209094, + "grad_norm": 2.014946460723877, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7138620615005493, + "num_tokens": 227657790.0, + "step": 9114 + }, + { + "epoch": 1.000988359323523, + "grad_norm": 2.09346079826355, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7260233163833618, + "num_tokens": 227684082.0, + "step": 9115 + }, + { + "epoch": 1.0010981770261367, + "grad_norm": 1.7139010429382324, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7085763216018677, + "num_tokens": 227724903.0, + "step": 9116 + }, + { + "epoch": 1.0012079947287502, + "grad_norm": 2.1886374950408936, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.6995638608932495, + "num_tokens": 227752001.0, + "step": 9117 + }, + { + "epoch": 1.001317812431364, + "grad_norm": 2.1360743045806885, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7311719655990601, + "num_tokens": 227777466.0, + "step": 9118 + }, + { + "epoch": 1.0014276301339775, + "grad_norm": 2.0444860458374023, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7213108539581299, + "num_tokens": 227807614.0, + "step": 9119 + }, + { + "epoch": 1.0015374478365913, + "grad_norm": 
2.2413506507873535, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7288249731063843, + "num_tokens": 227831452.0, + "step": 9120 + }, + { + "epoch": 1.001647265539205, + "grad_norm": 2.046705722808838, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.734725832939148, + "num_tokens": 227860261.0, + "step": 9121 + }, + { + "epoch": 1.0017570832418186, + "grad_norm": 2.283586025238037, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.732724666595459, + "num_tokens": 227883374.0, + "step": 9122 + }, + { + "epoch": 1.0018669009444323, + "grad_norm": 2.404452323913574, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.7013717293739319, + "num_tokens": 227907228.0, + "step": 9123 + }, + { + "epoch": 1.0019767186470458, + "grad_norm": 2.0976011753082275, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.719150960445404, + "num_tokens": 227933843.0, + "step": 9124 + }, + { + "epoch": 1.0020865363496596, + "grad_norm": 1.9652966260910034, + "learning_rate": 1e-06, + "loss": 1.0908, + "mean_token_accuracy": 0.6714528203010559, + "num_tokens": 227964216.0, + "step": 9125 + }, + { + "epoch": 1.0021963540522731, + "grad_norm": 2.5092592239379883, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7308007478713989, + "num_tokens": 227984960.0, + "step": 9126 + }, + { + "epoch": 1.002306171754887, + "grad_norm": 2.2446985244750977, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7184659242630005, + "num_tokens": 228008747.0, + "step": 9127 + }, + { + "epoch": 1.0024159894575007, + "grad_norm": 2.3188087940216064, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6922996044158936, + "num_tokens": 228032590.0, + "step": 9128 + }, + { + "epoch": 1.0025258071601142, + "grad_norm": 2.4593944549560547, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7302283048629761, + "num_tokens": 
228053785.0, + "step": 9129 + }, + { + "epoch": 1.002635624862728, + "grad_norm": 2.2577595710754395, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7394805550575256, + "num_tokens": 228076323.0, + "step": 9130 + }, + { + "epoch": 1.0027454425653415, + "grad_norm": 2.1044979095458984, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7123091816902161, + "num_tokens": 228102974.0, + "step": 9131 + }, + { + "epoch": 1.0028552602679552, + "grad_norm": 2.0901997089385986, + "learning_rate": 1e-06, + "loss": 1.0502, + "mean_token_accuracy": 0.6772881150245667, + "num_tokens": 228133436.0, + "step": 9132 + }, + { + "epoch": 1.0029650779705688, + "grad_norm": 2.199946165084839, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7307144403457642, + "num_tokens": 228158395.0, + "step": 9133 + }, + { + "epoch": 1.0030748956731825, + "grad_norm": 2.115176200866699, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7303850650787354, + "num_tokens": 228183246.0, + "step": 9134 + }, + { + "epoch": 1.0031847133757963, + "grad_norm": 2.25427508354187, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7170155644416809, + "num_tokens": 228209546.0, + "step": 9135 + }, + { + "epoch": 1.0032945310784098, + "grad_norm": 2.207623243331909, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7038935422897339, + "num_tokens": 228235703.0, + "step": 9136 + }, + { + "epoch": 1.0034043487810236, + "grad_norm": 2.2590901851654053, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7383565306663513, + "num_tokens": 228258890.0, + "step": 9137 + }, + { + "epoch": 1.0035141664836371, + "grad_norm": 2.384591817855835, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7255439162254333, + "num_tokens": 228279897.0, + "step": 9138 + }, + { + "epoch": 1.0036239841862509, + "grad_norm": 2.214554786682129, + "learning_rate": 1e-06, + 
"loss": 0.9047, + "mean_token_accuracy": 0.7110685110092163, + "num_tokens": 228305981.0, + "step": 9139 + }, + { + "epoch": 1.0037338018888644, + "grad_norm": 2.4164435863494873, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7202223539352417, + "num_tokens": 228328579.0, + "step": 9140 + }, + { + "epoch": 1.0038436195914782, + "grad_norm": 2.216552257537842, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7077173590660095, + "num_tokens": 228353802.0, + "step": 9141 + }, + { + "epoch": 1.0039534372940917, + "grad_norm": 2.018557548522949, + "learning_rate": 1e-06, + "loss": 1.0068, + "mean_token_accuracy": 0.6895749568939209, + "num_tokens": 228386289.0, + "step": 9142 + }, + { + "epoch": 1.0040632549967055, + "grad_norm": 1.987014889717102, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7097694277763367, + "num_tokens": 228416272.0, + "step": 9143 + }, + { + "epoch": 1.0041730726993192, + "grad_norm": 2.8348538875579834, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.730305552482605, + "num_tokens": 228434248.0, + "step": 9144 + }, + { + "epoch": 1.0042828904019327, + "grad_norm": 2.0210471153259277, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.712266743183136, + "num_tokens": 228463001.0, + "step": 9145 + }, + { + "epoch": 1.0043927081045465, + "grad_norm": 2.2303617000579834, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7137831449508667, + "num_tokens": 228488717.0, + "step": 9146 + }, + { + "epoch": 1.00450252580716, + "grad_norm": 1.8450629711151123, + "learning_rate": 1e-06, + "loss": 1.0157, + "mean_token_accuracy": 0.6894821524620056, + "num_tokens": 228523695.0, + "step": 9147 + }, + { + "epoch": 1.0046123435097738, + "grad_norm": 2.250988245010376, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7287642955780029, + "num_tokens": 228549211.0, + "step": 9148 + }, + { + "epoch": 
1.0047221612123873, + "grad_norm": 2.201632499694824, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7156717777252197, + "num_tokens": 228572999.0, + "step": 9149 + }, + { + "epoch": 1.004831978915001, + "grad_norm": 2.798626661300659, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7353302240371704, + "num_tokens": 228589753.0, + "step": 9150 + }, + { + "epoch": 1.0049417966176148, + "grad_norm": 2.337573528289795, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7029242515563965, + "num_tokens": 228615452.0, + "step": 9151 + }, + { + "epoch": 1.0050516143202284, + "grad_norm": 2.096127986907959, + "learning_rate": 1e-06, + "loss": 1.0031, + "mean_token_accuracy": 0.6982896327972412, + "num_tokens": 228644456.0, + "step": 9152 + }, + { + "epoch": 1.0051614320228421, + "grad_norm": 2.443718194961548, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7298654913902283, + "num_tokens": 228665812.0, + "step": 9153 + }, + { + "epoch": 1.0052712497254557, + "grad_norm": 2.0447463989257812, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7092941999435425, + "num_tokens": 228692874.0, + "step": 9154 + }, + { + "epoch": 1.0053810674280694, + "grad_norm": 2.294590950012207, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7279976606369019, + "num_tokens": 228717893.0, + "step": 9155 + }, + { + "epoch": 1.005490885130683, + "grad_norm": 2.021155595779419, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7258397340774536, + "num_tokens": 228747571.0, + "step": 9156 + }, + { + "epoch": 1.0056007028332967, + "grad_norm": 2.235398292541504, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7316443920135498, + "num_tokens": 228772551.0, + "step": 9157 + }, + { + "epoch": 1.0057105205359105, + "grad_norm": 2.5057129859924316, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 
0.700796365737915, + "num_tokens": 228793593.0, + "step": 9158 + }, + { + "epoch": 1.005820338238524, + "grad_norm": 2.086031198501587, + "learning_rate": 1e-06, + "loss": 1.0065, + "mean_token_accuracy": 0.7000027894973755, + "num_tokens": 228823715.0, + "step": 9159 + }, + { + "epoch": 1.0059301559411378, + "grad_norm": 2.470064878463745, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7277096509933472, + "num_tokens": 228843544.0, + "step": 9160 + }, + { + "epoch": 1.0060399736437513, + "grad_norm": 2.115117311477661, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7205075025558472, + "num_tokens": 228872232.0, + "step": 9161 + }, + { + "epoch": 1.006149791346365, + "grad_norm": 2.7232630252838135, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7093075513839722, + "num_tokens": 228889576.0, + "step": 9162 + }, + { + "epoch": 1.0062596090489786, + "grad_norm": 2.6692466735839844, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7166553735733032, + "num_tokens": 228909180.0, + "step": 9163 + }, + { + "epoch": 1.0063694267515924, + "grad_norm": 2.392838478088379, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7112725973129272, + "num_tokens": 228930696.0, + "step": 9164 + }, + { + "epoch": 1.0064792444542061, + "grad_norm": 2.364124298095703, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.708675742149353, + "num_tokens": 228954901.0, + "step": 9165 + }, + { + "epoch": 1.0065890621568196, + "grad_norm": 2.2338881492614746, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7180089950561523, + "num_tokens": 228980852.0, + "step": 9166 + }, + { + "epoch": 1.0066988798594334, + "grad_norm": 2.5256688594818115, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7282003164291382, + "num_tokens": 229000664.0, + "step": 9167 + }, + { + "epoch": 1.006808697562047, + "grad_norm": 2.346736431121826, + 
"learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7324769496917725, + "num_tokens": 229023063.0, + "step": 9168 + }, + { + "epoch": 1.0069185152646607, + "grad_norm": 2.0670571327209473, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7187994718551636, + "num_tokens": 229051066.0, + "step": 9169 + }, + { + "epoch": 1.0070283329672742, + "grad_norm": 2.348405361175537, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7347207069396973, + "num_tokens": 229071678.0, + "step": 9170 + }, + { + "epoch": 1.007138150669888, + "grad_norm": 2.2479453086853027, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7235983610153198, + "num_tokens": 229097297.0, + "step": 9171 + }, + { + "epoch": 1.0072479683725017, + "grad_norm": 2.3164377212524414, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7139211893081665, + "num_tokens": 229120346.0, + "step": 9172 + }, + { + "epoch": 1.0073577860751153, + "grad_norm": 1.9710184335708618, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7059228420257568, + "num_tokens": 229153915.0, + "step": 9173 + }, + { + "epoch": 1.007467603777729, + "grad_norm": 2.166773557662964, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7337371110916138, + "num_tokens": 229180760.0, + "step": 9174 + }, + { + "epoch": 1.0075774214803426, + "grad_norm": 2.3308796882629395, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7162289619445801, + "num_tokens": 229203613.0, + "step": 9175 + }, + { + "epoch": 1.0076872391829563, + "grad_norm": 2.223527193069458, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7210630774497986, + "num_tokens": 229226723.0, + "step": 9176 + }, + { + "epoch": 1.0077970568855699, + "grad_norm": 2.488936424255371, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.741355299949646, + "num_tokens": 229248240.0, + "step": 9177 + }, 
+ { + "epoch": 1.0079068745881836, + "grad_norm": 2.262510061264038, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7236053347587585, + "num_tokens": 229271273.0, + "step": 9178 + }, + { + "epoch": 1.0080166922907974, + "grad_norm": 1.915977954864502, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.7041239738464355, + "num_tokens": 229302574.0, + "step": 9179 + }, + { + "epoch": 1.008126509993411, + "grad_norm": 2.2032876014709473, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7352613210678101, + "num_tokens": 229327946.0, + "step": 9180 + }, + { + "epoch": 1.0082363276960247, + "grad_norm": 2.1532864570617676, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.738783597946167, + "num_tokens": 229353254.0, + "step": 9181 + }, + { + "epoch": 1.0083461453986382, + "grad_norm": 2.101306200027466, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7352092862129211, + "num_tokens": 229378042.0, + "step": 9182 + }, + { + "epoch": 1.008455963101252, + "grad_norm": 2.4661641120910645, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.7409193515777588, + "num_tokens": 229398945.0, + "step": 9183 + }, + { + "epoch": 1.0085657808038655, + "grad_norm": 2.544196605682373, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7089591026306152, + "num_tokens": 229419295.0, + "step": 9184 + }, + { + "epoch": 1.0086755985064793, + "grad_norm": 2.2919745445251465, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7056294679641724, + "num_tokens": 229445584.0, + "step": 9185 + }, + { + "epoch": 1.008785416209093, + "grad_norm": 2.117708683013916, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7102268934249878, + "num_tokens": 229474044.0, + "step": 9186 + }, + { + "epoch": 1.0088952339117065, + "grad_norm": 2.2806334495544434, + "learning_rate": 1e-06, + "loss": 1.007, + 
"mean_token_accuracy": 0.6989041566848755, + "num_tokens": 229501572.0, + "step": 9187 + }, + { + "epoch": 1.0090050516143203, + "grad_norm": 2.3324601650238037, + "learning_rate": 1e-06, + "loss": 1.0004, + "mean_token_accuracy": 0.6923854351043701, + "num_tokens": 229527384.0, + "step": 9188 + }, + { + "epoch": 1.0091148693169338, + "grad_norm": 2.1918258666992188, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.719917893409729, + "num_tokens": 229552310.0, + "step": 9189 + }, + { + "epoch": 1.0092246870195476, + "grad_norm": 2.7552380561828613, + "learning_rate": 1e-06, + "loss": 0.7532, + "mean_token_accuracy": 0.7548471689224243, + "num_tokens": 229568503.0, + "step": 9190 + }, + { + "epoch": 1.0093345047221611, + "grad_norm": 2.186668872833252, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7221269011497498, + "num_tokens": 229594718.0, + "step": 9191 + }, + { + "epoch": 1.0094443224247749, + "grad_norm": 2.298363208770752, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7088609337806702, + "num_tokens": 229618398.0, + "step": 9192 + }, + { + "epoch": 1.0095541401273886, + "grad_norm": 2.5464589595794678, + "learning_rate": 1e-06, + "loss": 0.7935, + "mean_token_accuracy": 0.7412206530570984, + "num_tokens": 229636265.0, + "step": 9193 + }, + { + "epoch": 1.0096639578300022, + "grad_norm": 2.4369850158691406, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7117118835449219, + "num_tokens": 229656815.0, + "step": 9194 + }, + { + "epoch": 1.009773775532616, + "grad_norm": 1.9400609731674194, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.713667094707489, + "num_tokens": 229689360.0, + "step": 9195 + }, + { + "epoch": 1.0098835932352295, + "grad_norm": 2.4917945861816406, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.728759229183197, + "num_tokens": 229710865.0, + "step": 9196 + }, + { + "epoch": 1.0099934109378432, + 
"grad_norm": 2.3459651470184326, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7110153436660767, + "num_tokens": 229733475.0, + "step": 9197 + }, + { + "epoch": 1.0101032286404568, + "grad_norm": 2.2011280059814453, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7185763120651245, + "num_tokens": 229760768.0, + "step": 9198 + }, + { + "epoch": 1.0102130463430705, + "grad_norm": 2.0224173069000244, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.698849081993103, + "num_tokens": 229790003.0, + "step": 9199 + }, + { + "epoch": 1.010322864045684, + "grad_norm": 2.192734718322754, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7193543314933777, + "num_tokens": 229815706.0, + "step": 9200 + }, + { + "epoch": 1.0104326817482978, + "grad_norm": 2.0988080501556396, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6908288598060608, + "num_tokens": 229843091.0, + "step": 9201 + }, + { + "epoch": 1.0105424994509116, + "grad_norm": 2.238361358642578, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7197877764701843, + "num_tokens": 229867131.0, + "step": 9202 + }, + { + "epoch": 1.010652317153525, + "grad_norm": 1.9651740789413452, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7160316705703735, + "num_tokens": 229900006.0, + "step": 9203 + }, + { + "epoch": 1.0107621348561389, + "grad_norm": 2.213426113128662, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7116422653198242, + "num_tokens": 229926704.0, + "step": 9204 + }, + { + "epoch": 1.0108719525587524, + "grad_norm": 2.077601432800293, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7421382665634155, + "num_tokens": 229953690.0, + "step": 9205 + }, + { + "epoch": 1.0109817702613662, + "grad_norm": 2.2689027786254883, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7244502902030945, + "num_tokens": 
229979007.0, + "step": 9206 + }, + { + "epoch": 1.0110915879639797, + "grad_norm": 2.2314488887786865, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7122562527656555, + "num_tokens": 230006448.0, + "step": 9207 + }, + { + "epoch": 1.0112014056665934, + "grad_norm": 2.3496196269989014, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6978208422660828, + "num_tokens": 230030372.0, + "step": 9208 + }, + { + "epoch": 1.0113112233692072, + "grad_norm": 2.005478620529175, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7275550365447998, + "num_tokens": 230061286.0, + "step": 9209 + }, + { + "epoch": 1.0114210410718207, + "grad_norm": 2.039743423461914, + "learning_rate": 1e-06, + "loss": 1.0167, + "mean_token_accuracy": 0.6898267269134521, + "num_tokens": 230097522.0, + "step": 9210 + }, + { + "epoch": 1.0115308587744345, + "grad_norm": 2.321249485015869, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7332966327667236, + "num_tokens": 230121808.0, + "step": 9211 + }, + { + "epoch": 1.011640676477048, + "grad_norm": 2.3213706016540527, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.7292580604553223, + "num_tokens": 230143799.0, + "step": 9212 + }, + { + "epoch": 1.0117504941796618, + "grad_norm": 2.42510986328125, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7348149418830872, + "num_tokens": 230166048.0, + "step": 9213 + }, + { + "epoch": 1.0118603118822753, + "grad_norm": 2.010986328125, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7180848717689514, + "num_tokens": 230194819.0, + "step": 9214 + }, + { + "epoch": 1.011970129584889, + "grad_norm": 2.322338819503784, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7161442637443542, + "num_tokens": 230217509.0, + "step": 9215 + }, + { + "epoch": 1.0120799472875028, + "grad_norm": 2.2542848587036133, + "learning_rate": 1e-06, + "loss": 
0.9056, + "mean_token_accuracy": 0.7213816046714783, + "num_tokens": 230242831.0, + "step": 9216 + }, + { + "epoch": 1.0121897649901164, + "grad_norm": 2.396549701690674, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7118575572967529, + "num_tokens": 230265540.0, + "step": 9217 + }, + { + "epoch": 1.0122995826927301, + "grad_norm": 2.5421433448791504, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.7284636497497559, + "num_tokens": 230284694.0, + "step": 9218 + }, + { + "epoch": 1.0124094003953437, + "grad_norm": 2.0751793384552, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7174705862998962, + "num_tokens": 230314941.0, + "step": 9219 + }, + { + "epoch": 1.0125192180979574, + "grad_norm": 2.2943739891052246, + "learning_rate": 1e-06, + "loss": 0.7266, + "mean_token_accuracy": 0.7709928154945374, + "num_tokens": 230337326.0, + "step": 9220 + }, + { + "epoch": 1.012629035800571, + "grad_norm": 2.7553088665008545, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7147282361984253, + "num_tokens": 230356483.0, + "step": 9221 + }, + { + "epoch": 1.0127388535031847, + "grad_norm": 2.683056116104126, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7450078725814819, + "num_tokens": 230375103.0, + "step": 9222 + }, + { + "epoch": 1.0128486712057985, + "grad_norm": 2.485949993133545, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.7098760008811951, + "num_tokens": 230397997.0, + "step": 9223 + }, + { + "epoch": 1.012958488908412, + "grad_norm": 2.0523409843444824, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7010841369628906, + "num_tokens": 230430112.0, + "step": 9224 + }, + { + "epoch": 1.0130683066110258, + "grad_norm": 2.493147850036621, + "learning_rate": 1e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7419443130493164, + "num_tokens": 230451500.0, + "step": 9225 + }, + { + "epoch": 1.0131781243136393, 
+ "grad_norm": 2.2315733432769775, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7210149765014648, + "num_tokens": 230478459.0, + "step": 9226 + }, + { + "epoch": 1.013287942016253, + "grad_norm": 2.450160264968872, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7302574515342712, + "num_tokens": 230499258.0, + "step": 9227 + }, + { + "epoch": 1.0133977597188666, + "grad_norm": 2.6651673316955566, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7158624529838562, + "num_tokens": 230518540.0, + "step": 9228 + }, + { + "epoch": 1.0135075774214803, + "grad_norm": 2.3302228450775146, + "learning_rate": 1e-06, + "loss": 0.8112, + "mean_token_accuracy": 0.7477709054946899, + "num_tokens": 230540417.0, + "step": 9229 + }, + { + "epoch": 1.013617395124094, + "grad_norm": 2.102011203765869, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7216963171958923, + "num_tokens": 230568278.0, + "step": 9230 + }, + { + "epoch": 1.0137272128267076, + "grad_norm": 2.3458774089813232, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7320852279663086, + "num_tokens": 230589999.0, + "step": 9231 + }, + { + "epoch": 1.0138370305293214, + "grad_norm": 2.595412492752075, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7291707992553711, + "num_tokens": 230610215.0, + "step": 9232 + }, + { + "epoch": 1.013946848231935, + "grad_norm": 2.0523793697357178, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.718246340751648, + "num_tokens": 230639310.0, + "step": 9233 + }, + { + "epoch": 1.0140566659345487, + "grad_norm": 2.163912296295166, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7043930292129517, + "num_tokens": 230667875.0, + "step": 9234 + }, + { + "epoch": 1.0141664836371622, + "grad_norm": 2.5700457096099854, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7303566932678223, + 
"num_tokens": 230688356.0, + "step": 9235 + }, + { + "epoch": 1.014276301339776, + "grad_norm": 2.407644033432007, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.703721284866333, + "num_tokens": 230709461.0, + "step": 9236 + }, + { + "epoch": 1.0143861190423897, + "grad_norm": 2.3245413303375244, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7011241316795349, + "num_tokens": 230736747.0, + "step": 9237 + }, + { + "epoch": 1.0144959367450033, + "grad_norm": 2.6835927963256836, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7146439552307129, + "num_tokens": 230755567.0, + "step": 9238 + }, + { + "epoch": 1.014605754447617, + "grad_norm": 2.3516271114349365, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7395479679107666, + "num_tokens": 230778591.0, + "step": 9239 + }, + { + "epoch": 1.0147155721502306, + "grad_norm": 2.053516149520874, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7193581461906433, + "num_tokens": 230808508.0, + "step": 9240 + }, + { + "epoch": 1.0148253898528443, + "grad_norm": 2.18973970413208, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7149230241775513, + "num_tokens": 230834694.0, + "step": 9241 + }, + { + "epoch": 1.0149352075554579, + "grad_norm": 2.502626657485962, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7255151271820068, + "num_tokens": 230855817.0, + "step": 9242 + }, + { + "epoch": 1.0150450252580716, + "grad_norm": 2.3019487857818604, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.716425895690918, + "num_tokens": 230879855.0, + "step": 9243 + }, + { + "epoch": 1.0151548429606854, + "grad_norm": 2.4291844367980957, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7347406148910522, + "num_tokens": 230901311.0, + "step": 9244 + }, + { + "epoch": 1.015264660663299, + "grad_norm": 2.0504558086395264, + "learning_rate": 
1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.700391411781311, + "num_tokens": 230928427.0, + "step": 9245 + }, + { + "epoch": 1.0153744783659127, + "grad_norm": 2.0568575859069824, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7175222635269165, + "num_tokens": 230959815.0, + "step": 9246 + }, + { + "epoch": 1.0154842960685262, + "grad_norm": 2.2564451694488525, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7229285836219788, + "num_tokens": 230986124.0, + "step": 9247 + }, + { + "epoch": 1.01559411377114, + "grad_norm": 2.0624024868011475, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7102915048599243, + "num_tokens": 231013644.0, + "step": 9248 + }, + { + "epoch": 1.0157039314737535, + "grad_norm": 2.113464593887329, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7366611957550049, + "num_tokens": 231039379.0, + "step": 9249 + }, + { + "epoch": 1.0158137491763672, + "grad_norm": 2.3530631065368652, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7407397031784058, + "num_tokens": 231060016.0, + "step": 9250 + }, + { + "epoch": 1.015923566878981, + "grad_norm": 2.506035804748535, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.709671139717102, + "num_tokens": 231081515.0, + "step": 9251 + }, + { + "epoch": 1.0160333845815945, + "grad_norm": 2.3214077949523926, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7099902629852295, + "num_tokens": 231105346.0, + "step": 9252 + }, + { + "epoch": 1.0161432022842083, + "grad_norm": 2.2899115085601807, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7461862564086914, + "num_tokens": 231128079.0, + "step": 9253 + }, + { + "epoch": 1.0162530199868218, + "grad_norm": 2.240844964981079, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7330138683319092, + "num_tokens": 231152780.0, + "step": 9254 + }, + { + "epoch": 
1.0163628376894356, + "grad_norm": 2.4406402111053467, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.711583137512207, + "num_tokens": 231174300.0, + "step": 9255 + }, + { + "epoch": 1.0164726553920491, + "grad_norm": 2.3346071243286133, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7107332944869995, + "num_tokens": 231197477.0, + "step": 9256 + }, + { + "epoch": 1.0165824730946629, + "grad_norm": 2.4275052547454834, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7378117442131042, + "num_tokens": 231217710.0, + "step": 9257 + }, + { + "epoch": 1.0166922907972764, + "grad_norm": 2.3098490238189697, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7013891935348511, + "num_tokens": 231240227.0, + "step": 9258 + }, + { + "epoch": 1.0168021084998902, + "grad_norm": 2.420196056365967, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7305699586868286, + "num_tokens": 231263186.0, + "step": 9259 + }, + { + "epoch": 1.016911926202504, + "grad_norm": 1.9414806365966797, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6947304010391235, + "num_tokens": 231293179.0, + "step": 9260 + }, + { + "epoch": 1.0170217439051175, + "grad_norm": 2.312248468399048, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.708504319190979, + "num_tokens": 231317299.0, + "step": 9261 + }, + { + "epoch": 1.0171315616077312, + "grad_norm": 2.1587281227111816, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7040116786956787, + "num_tokens": 231345308.0, + "step": 9262 + }, + { + "epoch": 1.0172413793103448, + "grad_norm": 2.398030996322632, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.732806384563446, + "num_tokens": 231365704.0, + "step": 9263 + }, + { + "epoch": 1.0173511970129585, + "grad_norm": 2.479858875274658, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 
0.7479923963546753, + "num_tokens": 231385433.0, + "step": 9264 + }, + { + "epoch": 1.017461014715572, + "grad_norm": 2.441862106323242, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7146627902984619, + "num_tokens": 231406546.0, + "step": 9265 + }, + { + "epoch": 1.0175708324181858, + "grad_norm": 2.2507989406585693, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7178776264190674, + "num_tokens": 231432255.0, + "step": 9266 + }, + { + "epoch": 1.0176806501207996, + "grad_norm": 2.1341660022735596, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7295169830322266, + "num_tokens": 231460445.0, + "step": 9267 + }, + { + "epoch": 1.017790467823413, + "grad_norm": 2.3393073081970215, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6981698870658875, + "num_tokens": 231485780.0, + "step": 9268 + }, + { + "epoch": 1.0179002855260268, + "grad_norm": 2.646350622177124, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7233612537384033, + "num_tokens": 231503417.0, + "step": 9269 + }, + { + "epoch": 1.0180101032286404, + "grad_norm": 2.2878785133361816, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7099125981330872, + "num_tokens": 231527645.0, + "step": 9270 + }, + { + "epoch": 1.0181199209312541, + "grad_norm": 2.166172504425049, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.713396430015564, + "num_tokens": 231554486.0, + "step": 9271 + }, + { + "epoch": 1.0182297386338677, + "grad_norm": 1.959802508354187, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7167199850082397, + "num_tokens": 231585102.0, + "step": 9272 + }, + { + "epoch": 1.0183395563364814, + "grad_norm": 2.045224189758301, + "learning_rate": 1e-06, + "loss": 0.7826, + "mean_token_accuracy": 0.7502643465995789, + "num_tokens": 231610594.0, + "step": 9273 + }, + { + "epoch": 1.0184493740390952, + "grad_norm": 
2.4716508388519287, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7255645990371704, + "num_tokens": 231631461.0, + "step": 9274 + }, + { + "epoch": 1.0185591917417087, + "grad_norm": 2.1539363861083984, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6927212476730347, + "num_tokens": 231659996.0, + "step": 9275 + }, + { + "epoch": 1.0186690094443225, + "grad_norm": 2.3295955657958984, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7099193334579468, + "num_tokens": 231684706.0, + "step": 9276 + }, + { + "epoch": 1.018778827146936, + "grad_norm": 2.785226583480835, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7318863272666931, + "num_tokens": 231701442.0, + "step": 9277 + }, + { + "epoch": 1.0188886448495498, + "grad_norm": 2.214153289794922, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7066264152526855, + "num_tokens": 231727351.0, + "step": 9278 + }, + { + "epoch": 1.0189984625521633, + "grad_norm": 2.112943172454834, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7110534310340881, + "num_tokens": 231754776.0, + "step": 9279 + }, + { + "epoch": 1.019108280254777, + "grad_norm": 2.3121321201324463, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.719114363193512, + "num_tokens": 231779662.0, + "step": 9280 + }, + { + "epoch": 1.0192180979573908, + "grad_norm": 2.5307023525238037, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.733328104019165, + "num_tokens": 231799284.0, + "step": 9281 + }, + { + "epoch": 1.0193279156600044, + "grad_norm": 2.4820504188537598, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7238785028457642, + "num_tokens": 231820737.0, + "step": 9282 + }, + { + "epoch": 1.0194377333626181, + "grad_norm": 2.5527288913726807, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.723656415939331, + "num_tokens": 231841441.0, 
+ "step": 9283 + }, + { + "epoch": 1.0195475510652316, + "grad_norm": 2.236398220062256, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7313048839569092, + "num_tokens": 231866413.0, + "step": 9284 + }, + { + "epoch": 1.0196573687678454, + "grad_norm": 2.3182411193847656, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7108711004257202, + "num_tokens": 231891146.0, + "step": 9285 + }, + { + "epoch": 1.019767186470459, + "grad_norm": 2.0143351554870605, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7180156111717224, + "num_tokens": 231920471.0, + "step": 9286 + }, + { + "epoch": 1.0198770041730727, + "grad_norm": 2.2249956130981445, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.719340443611145, + "num_tokens": 231945067.0, + "step": 9287 + }, + { + "epoch": 1.0199868218756865, + "grad_norm": 1.8312171697616577, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7010425329208374, + "num_tokens": 231982049.0, + "step": 9288 + }, + { + "epoch": 1.0200966395783, + "grad_norm": 2.462827682495117, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7244264483451843, + "num_tokens": 232003346.0, + "step": 9289 + }, + { + "epoch": 1.0202064572809137, + "grad_norm": 2.2324483394622803, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.732016921043396, + "num_tokens": 232027833.0, + "step": 9290 + }, + { + "epoch": 1.0203162749835273, + "grad_norm": 2.4804537296295166, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.705683171749115, + "num_tokens": 232048295.0, + "step": 9291 + }, + { + "epoch": 1.020426092686141, + "grad_norm": 2.517604351043701, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7375833988189697, + "num_tokens": 232067572.0, + "step": 9292 + }, + { + "epoch": 1.0205359103887546, + "grad_norm": 1.9919370412826538, + "learning_rate": 1e-06, + "loss": 0.8495, + 
"mean_token_accuracy": 0.7513325214385986, + "num_tokens": 232097445.0, + "step": 9293 + }, + { + "epoch": 1.0206457280913683, + "grad_norm": 2.3002829551696777, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7067632675170898, + "num_tokens": 232123924.0, + "step": 9294 + }, + { + "epoch": 1.020755545793982, + "grad_norm": 2.5058162212371826, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.719045877456665, + "num_tokens": 232144671.0, + "step": 9295 + }, + { + "epoch": 1.0208653634965956, + "grad_norm": 2.4507737159729004, + "learning_rate": 1e-06, + "loss": 0.7863, + "mean_token_accuracy": 0.7469885945320129, + "num_tokens": 232164632.0, + "step": 9296 + }, + { + "epoch": 1.0209751811992094, + "grad_norm": 2.303352117538452, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7106457948684692, + "num_tokens": 232190156.0, + "step": 9297 + }, + { + "epoch": 1.021084998901823, + "grad_norm": 2.159740447998047, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.6996732354164124, + "num_tokens": 232219069.0, + "step": 9298 + }, + { + "epoch": 1.0211948166044367, + "grad_norm": 2.1823012828826904, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7218267917633057, + "num_tokens": 232245247.0, + "step": 9299 + }, + { + "epoch": 1.0213046343070502, + "grad_norm": 2.571394205093384, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7333747744560242, + "num_tokens": 232267685.0, + "step": 9300 + }, + { + "epoch": 1.021414452009664, + "grad_norm": 2.582925796508789, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.701793372631073, + "num_tokens": 232288674.0, + "step": 9301 + }, + { + "epoch": 1.0215242697122777, + "grad_norm": 2.087730884552002, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7257657051086426, + "num_tokens": 232315041.0, + "step": 9302 + }, + { + "epoch": 1.0216340874148913, + 
"grad_norm": 2.3424911499023438, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7303372621536255, + "num_tokens": 232337494.0, + "step": 9303 + }, + { + "epoch": 1.021743905117505, + "grad_norm": 2.558152198791504, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7110324501991272, + "num_tokens": 232358813.0, + "step": 9304 + }, + { + "epoch": 1.0218537228201185, + "grad_norm": 1.9902571439743042, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7103937864303589, + "num_tokens": 232392038.0, + "step": 9305 + }, + { + "epoch": 1.0219635405227323, + "grad_norm": 2.2727227210998535, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7284442186355591, + "num_tokens": 232419017.0, + "step": 9306 + }, + { + "epoch": 1.0220733582253458, + "grad_norm": 2.142275094985962, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7021690607070923, + "num_tokens": 232446388.0, + "step": 9307 + }, + { + "epoch": 1.0221831759279596, + "grad_norm": 2.172921895980835, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7180538177490234, + "num_tokens": 232473784.0, + "step": 9308 + }, + { + "epoch": 1.0222929936305734, + "grad_norm": 1.9930357933044434, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7235849499702454, + "num_tokens": 232502014.0, + "step": 9309 + }, + { + "epoch": 1.022402811333187, + "grad_norm": 2.5444202423095703, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7036070823669434, + "num_tokens": 232525058.0, + "step": 9310 + }, + { + "epoch": 1.0225126290358006, + "grad_norm": 2.3621346950531006, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7163814306259155, + "num_tokens": 232550974.0, + "step": 9311 + }, + { + "epoch": 1.0226224467384142, + "grad_norm": 2.1140737533569336, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7056480646133423, + 
"num_tokens": 232580010.0, + "step": 9312 + }, + { + "epoch": 1.022732264441028, + "grad_norm": 2.1028451919555664, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7189695835113525, + "num_tokens": 232605789.0, + "step": 9313 + }, + { + "epoch": 1.0228420821436415, + "grad_norm": 2.1473946571350098, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7296615839004517, + "num_tokens": 232631629.0, + "step": 9314 + }, + { + "epoch": 1.0229518998462552, + "grad_norm": 2.377648115158081, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7215477824211121, + "num_tokens": 232655098.0, + "step": 9315 + }, + { + "epoch": 1.023061717548869, + "grad_norm": 2.4877119064331055, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7342634797096252, + "num_tokens": 232675179.0, + "step": 9316 + }, + { + "epoch": 1.0231715352514825, + "grad_norm": 2.270494222640991, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7130860090255737, + "num_tokens": 232701392.0, + "step": 9317 + }, + { + "epoch": 1.0232813529540963, + "grad_norm": 2.20967435836792, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7193830013275146, + "num_tokens": 232727234.0, + "step": 9318 + }, + { + "epoch": 1.0233911706567098, + "grad_norm": 2.248077630996704, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7167955636978149, + "num_tokens": 232752994.0, + "step": 9319 + }, + { + "epoch": 1.0235009883593236, + "grad_norm": 2.132000207901001, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7084207534790039, + "num_tokens": 232784986.0, + "step": 9320 + }, + { + "epoch": 1.023610806061937, + "grad_norm": 1.9989595413208008, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7128568887710571, + "num_tokens": 232815013.0, + "step": 9321 + }, + { + "epoch": 1.0237206237645509, + "grad_norm": 2.6136765480041504, + "learning_rate": 
1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7317494750022888, + "num_tokens": 232834750.0, + "step": 9322 + }, + { + "epoch": 1.0238304414671644, + "grad_norm": 2.2877840995788574, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7190848588943481, + "num_tokens": 232859234.0, + "step": 9323 + }, + { + "epoch": 1.0239402591697782, + "grad_norm": 2.052922487258911, + "learning_rate": 1e-06, + "loss": 0.9744, + "mean_token_accuracy": 0.7006411552429199, + "num_tokens": 232891159.0, + "step": 9324 + }, + { + "epoch": 1.024050076872392, + "grad_norm": 2.1657004356384277, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7198017835617065, + "num_tokens": 232919504.0, + "step": 9325 + }, + { + "epoch": 1.0241598945750054, + "grad_norm": 2.2452375888824463, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7242424488067627, + "num_tokens": 232943932.0, + "step": 9326 + }, + { + "epoch": 1.0242697122776192, + "grad_norm": 2.276028871536255, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7061538100242615, + "num_tokens": 232967315.0, + "step": 9327 + }, + { + "epoch": 1.0243795299802327, + "grad_norm": 2.39347767829895, + "learning_rate": 1e-06, + "loss": 0.8067, + "mean_token_accuracy": 0.7460092306137085, + "num_tokens": 232987542.0, + "step": 9328 + }, + { + "epoch": 1.0244893476828465, + "grad_norm": 2.287752389907837, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7121692895889282, + "num_tokens": 233010579.0, + "step": 9329 + }, + { + "epoch": 1.02459916538546, + "grad_norm": 2.4029812812805176, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.725386381149292, + "num_tokens": 233031480.0, + "step": 9330 + }, + { + "epoch": 1.0247089830880738, + "grad_norm": 2.6755292415618896, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7595727443695068, + "num_tokens": 233048976.0, + "step": 9331 + }, + { + "epoch": 
1.0248188007906875, + "grad_norm": 2.0386364459991455, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7128517031669617, + "num_tokens": 233077113.0, + "step": 9332 + }, + { + "epoch": 1.024928618493301, + "grad_norm": 2.2465312480926514, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7472147941589355, + "num_tokens": 233100298.0, + "step": 9333 + }, + { + "epoch": 1.0250384361959148, + "grad_norm": 2.3370158672332764, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7083885669708252, + "num_tokens": 233123428.0, + "step": 9334 + }, + { + "epoch": 1.0251482538985284, + "grad_norm": 1.9829928874969482, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7358542680740356, + "num_tokens": 233153070.0, + "step": 9335 + }, + { + "epoch": 1.0252580716011421, + "grad_norm": 2.4711365699768066, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7209305763244629, + "num_tokens": 233173499.0, + "step": 9336 + }, + { + "epoch": 1.0253678893037557, + "grad_norm": 2.0497663021087646, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7113593816757202, + "num_tokens": 233204706.0, + "step": 9337 + }, + { + "epoch": 1.0254777070063694, + "grad_norm": 2.344224214553833, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7351518869400024, + "num_tokens": 233227793.0, + "step": 9338 + }, + { + "epoch": 1.0255875247089832, + "grad_norm": 2.4600698947906494, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7361708879470825, + "num_tokens": 233248752.0, + "step": 9339 + }, + { + "epoch": 1.0256973424115967, + "grad_norm": 1.9841723442077637, + "learning_rate": 1e-06, + "loss": 0.9863, + "mean_token_accuracy": 0.69785475730896, + "num_tokens": 233282012.0, + "step": 9340 + }, + { + "epoch": 1.0258071601142105, + "grad_norm": 2.2444283962249756, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 
0.724562406539917, + "num_tokens": 233308019.0, + "step": 9341 + }, + { + "epoch": 1.025916977816824, + "grad_norm": 2.1401784420013428, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7108722925186157, + "num_tokens": 233337648.0, + "step": 9342 + }, + { + "epoch": 1.0260267955194378, + "grad_norm": 2.2094926834106445, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.738418459892273, + "num_tokens": 233361950.0, + "step": 9343 + }, + { + "epoch": 1.0261366132220513, + "grad_norm": 2.106039047241211, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7244559526443481, + "num_tokens": 233388900.0, + "step": 9344 + }, + { + "epoch": 1.026246430924665, + "grad_norm": 1.9745163917541504, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7018402218818665, + "num_tokens": 233421542.0, + "step": 9345 + }, + { + "epoch": 1.0263562486272788, + "grad_norm": 2.353123903274536, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7245751619338989, + "num_tokens": 233446296.0, + "step": 9346 + }, + { + "epoch": 1.0264660663298923, + "grad_norm": 1.9839229583740234, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7223387360572815, + "num_tokens": 233474703.0, + "step": 9347 + }, + { + "epoch": 1.026575884032506, + "grad_norm": 2.540271282196045, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7237024307250977, + "num_tokens": 233495134.0, + "step": 9348 + }, + { + "epoch": 1.0266857017351196, + "grad_norm": 2.114821434020996, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.702365517616272, + "num_tokens": 233523586.0, + "step": 9349 + }, + { + "epoch": 1.0267955194377334, + "grad_norm": 2.3564016819000244, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7052462697029114, + "num_tokens": 233548887.0, + "step": 9350 + }, + { + "epoch": 1.026905337140347, + "grad_norm": 2.294699192047119, 
+ "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.739443838596344, + "num_tokens": 233572437.0, + "step": 9351 + }, + { + "epoch": 1.0270151548429607, + "grad_norm": 2.1913185119628906, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7123638391494751, + "num_tokens": 233600150.0, + "step": 9352 + }, + { + "epoch": 1.0271249725455744, + "grad_norm": 2.2711522579193115, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7405900955200195, + "num_tokens": 233623876.0, + "step": 9353 + }, + { + "epoch": 1.027234790248188, + "grad_norm": 2.013129949569702, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7243474721908569, + "num_tokens": 233651492.0, + "step": 9354 + }, + { + "epoch": 1.0273446079508017, + "grad_norm": 2.2566120624542236, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7225408554077148, + "num_tokens": 233677451.0, + "step": 9355 + }, + { + "epoch": 1.0274544256534153, + "grad_norm": 2.4726197719573975, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7087344527244568, + "num_tokens": 233699513.0, + "step": 9356 + }, + { + "epoch": 1.027564243356029, + "grad_norm": 2.031432867050171, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7084987759590149, + "num_tokens": 233728805.0, + "step": 9357 + }, + { + "epoch": 1.0276740610586426, + "grad_norm": 2.640212059020996, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.700067400932312, + "num_tokens": 233749409.0, + "step": 9358 + }, + { + "epoch": 1.0277838787612563, + "grad_norm": 2.2047643661499023, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7073745131492615, + "num_tokens": 233774496.0, + "step": 9359 + }, + { + "epoch": 1.02789369646387, + "grad_norm": 2.262831211090088, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7163262367248535, + "num_tokens": 233802177.0, + "step": 9360 + }, + 
{ + "epoch": 1.0280035141664836, + "grad_norm": 2.0505683422088623, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.702020525932312, + "num_tokens": 233836471.0, + "step": 9361 + }, + { + "epoch": 1.0281133318690974, + "grad_norm": 2.1282520294189453, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7282243967056274, + "num_tokens": 233864122.0, + "step": 9362 + }, + { + "epoch": 1.028223149571711, + "grad_norm": 2.1387085914611816, + "learning_rate": 1e-06, + "loss": 1.0304, + "mean_token_accuracy": 0.6889309883117676, + "num_tokens": 233892420.0, + "step": 9363 + }, + { + "epoch": 1.0283329672743247, + "grad_norm": 2.317659616470337, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7351688742637634, + "num_tokens": 233914944.0, + "step": 9364 + }, + { + "epoch": 1.0284427849769382, + "grad_norm": 2.1641392707824707, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7050092816352844, + "num_tokens": 233945101.0, + "step": 9365 + }, + { + "epoch": 1.028552602679552, + "grad_norm": 2.818084716796875, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7199794054031372, + "num_tokens": 233963078.0, + "step": 9366 + }, + { + "epoch": 1.0286624203821657, + "grad_norm": 2.509274959564209, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7380039691925049, + "num_tokens": 233983385.0, + "step": 9367 + }, + { + "epoch": 1.0287722380847792, + "grad_norm": 2.6558423042297363, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7380126714706421, + "num_tokens": 234008292.0, + "step": 9368 + }, + { + "epoch": 1.028882055787393, + "grad_norm": 2.341935396194458, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7441275119781494, + "num_tokens": 234030118.0, + "step": 9369 + }, + { + "epoch": 1.0289918734900065, + "grad_norm": 2.3439879417419434, + "learning_rate": 1e-06, + "loss": 0.8872, + 
"mean_token_accuracy": 0.7302523255348206, + "num_tokens": 234057032.0, + "step": 9370 + }, + { + "epoch": 1.0291016911926203, + "grad_norm": 2.2032713890075684, + "learning_rate": 1e-06, + "loss": 1.0371, + "mean_token_accuracy": 0.682701826095581, + "num_tokens": 234087002.0, + "step": 9371 + }, + { + "epoch": 1.0292115088952338, + "grad_norm": 2.1267507076263428, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7194070816040039, + "num_tokens": 234116475.0, + "step": 9372 + }, + { + "epoch": 1.0293213265978476, + "grad_norm": 2.574136257171631, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7339857816696167, + "num_tokens": 234137752.0, + "step": 9373 + }, + { + "epoch": 1.0294311443004613, + "grad_norm": 2.2262864112854004, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7196531891822815, + "num_tokens": 234163760.0, + "step": 9374 + }, + { + "epoch": 1.0295409620030749, + "grad_norm": 2.087411642074585, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.706164538860321, + "num_tokens": 234193359.0, + "step": 9375 + }, + { + "epoch": 1.0296507797056886, + "grad_norm": 2.0755584239959717, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.708073079586029, + "num_tokens": 234221650.0, + "step": 9376 + }, + { + "epoch": 1.0297605974083022, + "grad_norm": 2.394810676574707, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.714220404624939, + "num_tokens": 234245134.0, + "step": 9377 + }, + { + "epoch": 1.029870415110916, + "grad_norm": 2.321127414703369, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7096115350723267, + "num_tokens": 234271344.0, + "step": 9378 + }, + { + "epoch": 1.0299802328135295, + "grad_norm": 2.5398294925689697, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7297877073287964, + "num_tokens": 234291577.0, + "step": 9379 + }, + { + "epoch": 1.0300900505161432, + 
"grad_norm": 2.0810863971710205, + "learning_rate": 1e-06, + "loss": 1.0314, + "mean_token_accuracy": 0.6967266798019409, + "num_tokens": 234322643.0, + "step": 9380 + }, + { + "epoch": 1.030199868218757, + "grad_norm": 2.035245180130005, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7039738893508911, + "num_tokens": 234350026.0, + "step": 9381 + }, + { + "epoch": 1.0303096859213705, + "grad_norm": 2.3530657291412354, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7159645557403564, + "num_tokens": 234373207.0, + "step": 9382 + }, + { + "epoch": 1.0304195036239843, + "grad_norm": 2.094942808151245, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6934919357299805, + "num_tokens": 234403145.0, + "step": 9383 + }, + { + "epoch": 1.0305293213265978, + "grad_norm": 2.2000200748443604, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7152772545814514, + "num_tokens": 234430256.0, + "step": 9384 + }, + { + "epoch": 1.0306391390292116, + "grad_norm": 2.2928051948547363, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7095109224319458, + "num_tokens": 234455052.0, + "step": 9385 + }, + { + "epoch": 1.030748956731825, + "grad_norm": 2.337517023086548, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7020583152770996, + "num_tokens": 234479125.0, + "step": 9386 + }, + { + "epoch": 1.0308587744344389, + "grad_norm": 2.342512369155884, + "learning_rate": 1e-06, + "loss": 1.0782, + "mean_token_accuracy": 0.6804571151733398, + "num_tokens": 234504294.0, + "step": 9387 + }, + { + "epoch": 1.0309685921370524, + "grad_norm": 2.332939624786377, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7118987441062927, + "num_tokens": 234527947.0, + "step": 9388 + }, + { + "epoch": 1.0310784098396661, + "grad_norm": 2.52793550491333, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7293637990951538, + "num_tokens": 
234549516.0, + "step": 9389 + }, + { + "epoch": 1.03118822754228, + "grad_norm": 2.1667404174804688, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7104932069778442, + "num_tokens": 234578593.0, + "step": 9390 + }, + { + "epoch": 1.0312980452448934, + "grad_norm": 2.1881003379821777, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7082680463790894, + "num_tokens": 234604861.0, + "step": 9391 + }, + { + "epoch": 1.0314078629475072, + "grad_norm": 1.908113956451416, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7205485701560974, + "num_tokens": 234638524.0, + "step": 9392 + }, + { + "epoch": 1.0315176806501207, + "grad_norm": 2.2846288681030273, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7320647835731506, + "num_tokens": 234660050.0, + "step": 9393 + }, + { + "epoch": 1.0316274983527345, + "grad_norm": 2.417445659637451, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7114959955215454, + "num_tokens": 234682273.0, + "step": 9394 + }, + { + "epoch": 1.031737316055348, + "grad_norm": 2.488659381866455, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7247277498245239, + "num_tokens": 234704471.0, + "step": 9395 + }, + { + "epoch": 1.0318471337579618, + "grad_norm": 2.6808159351348877, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7433837056159973, + "num_tokens": 234723863.0, + "step": 9396 + }, + { + "epoch": 1.0319569514605755, + "grad_norm": 2.188023567199707, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7058982849121094, + "num_tokens": 234751991.0, + "step": 9397 + }, + { + "epoch": 1.032066769163189, + "grad_norm": 2.396512746810913, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7198089957237244, + "num_tokens": 234774742.0, + "step": 9398 + }, + { + "epoch": 1.0321765868658028, + "grad_norm": 2.4286246299743652, + "learning_rate": 1e-06, + "loss": 
1.0154, + "mean_token_accuracy": 0.6909433603286743, + "num_tokens": 234799541.0, + "step": 9399 + }, + { + "epoch": 1.0322864045684164, + "grad_norm": 2.4441916942596436, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7261384725570679, + "num_tokens": 234822127.0, + "step": 9400 + }, + { + "epoch": 1.0323962222710301, + "grad_norm": 2.5803775787353516, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7110702395439148, + "num_tokens": 234844922.0, + "step": 9401 + }, + { + "epoch": 1.0325060399736437, + "grad_norm": 2.2862393856048584, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7198700904846191, + "num_tokens": 234869180.0, + "step": 9402 + }, + { + "epoch": 1.0326158576762574, + "grad_norm": 2.320056200027466, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7238420248031616, + "num_tokens": 234894222.0, + "step": 9403 + }, + { + "epoch": 1.0327256753788712, + "grad_norm": 2.0284974575042725, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7073404788970947, + "num_tokens": 234923813.0, + "step": 9404 + }, + { + "epoch": 1.0328354930814847, + "grad_norm": 2.216618537902832, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7318975329399109, + "num_tokens": 234949369.0, + "step": 9405 + }, + { + "epoch": 1.0329453107840985, + "grad_norm": 2.2951409816741943, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7125455141067505, + "num_tokens": 234973151.0, + "step": 9406 + }, + { + "epoch": 1.033055128486712, + "grad_norm": 2.198117971420288, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7104659676551819, + "num_tokens": 234998203.0, + "step": 9407 + }, + { + "epoch": 1.0331649461893258, + "grad_norm": 2.0251033306121826, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7174787521362305, + "num_tokens": 235031146.0, + "step": 9408 + }, + { + "epoch": 
1.0332747638919393, + "grad_norm": 2.305060863494873, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7116563320159912, + "num_tokens": 235055854.0, + "step": 9409 + }, + { + "epoch": 1.033384581594553, + "grad_norm": 2.2319579124450684, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7069895267486572, + "num_tokens": 235082236.0, + "step": 9410 + }, + { + "epoch": 1.0334943992971668, + "grad_norm": 2.3327043056488037, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7221663594245911, + "num_tokens": 235106096.0, + "step": 9411 + }, + { + "epoch": 1.0336042169997803, + "grad_norm": 2.027867555618286, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7135289311408997, + "num_tokens": 235134292.0, + "step": 9412 + }, + { + "epoch": 1.033714034702394, + "grad_norm": 2.3125133514404297, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.702207088470459, + "num_tokens": 235159399.0, + "step": 9413 + }, + { + "epoch": 1.0338238524050076, + "grad_norm": 2.322767972946167, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7164509296417236, + "num_tokens": 235183355.0, + "step": 9414 + }, + { + "epoch": 1.0339336701076214, + "grad_norm": 2.6532161235809326, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7170023918151855, + "num_tokens": 235201439.0, + "step": 9415 + }, + { + "epoch": 1.034043487810235, + "grad_norm": 2.359060049057007, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.705736517906189, + "num_tokens": 235225080.0, + "step": 9416 + }, + { + "epoch": 1.0341533055128487, + "grad_norm": 2.5309767723083496, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7382147312164307, + "num_tokens": 235243895.0, + "step": 9417 + }, + { + "epoch": 1.0342631232154624, + "grad_norm": 2.391099691390991, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 
0.7168720960617065, + "num_tokens": 235266401.0, + "step": 9418 + }, + { + "epoch": 1.034372940918076, + "grad_norm": 2.4782803058624268, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7207093238830566, + "num_tokens": 235289733.0, + "step": 9419 + }, + { + "epoch": 1.0344827586206897, + "grad_norm": 2.0747857093811035, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7302735447883606, + "num_tokens": 235317351.0, + "step": 9420 + }, + { + "epoch": 1.0345925763233033, + "grad_norm": 2.008143186569214, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.6989480257034302, + "num_tokens": 235346951.0, + "step": 9421 + }, + { + "epoch": 1.034702394025917, + "grad_norm": 2.5575623512268066, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7230374813079834, + "num_tokens": 235367253.0, + "step": 9422 + }, + { + "epoch": 1.0348122117285306, + "grad_norm": 2.468468189239502, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7115417718887329, + "num_tokens": 235389566.0, + "step": 9423 + }, + { + "epoch": 1.0349220294311443, + "grad_norm": 2.6557087898254395, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7353172898292542, + "num_tokens": 235408262.0, + "step": 9424 + }, + { + "epoch": 1.035031847133758, + "grad_norm": 2.1484038829803467, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6983849406242371, + "num_tokens": 235437074.0, + "step": 9425 + }, + { + "epoch": 1.0351416648363716, + "grad_norm": 2.1203696727752686, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7257763147354126, + "num_tokens": 235462913.0, + "step": 9426 + }, + { + "epoch": 1.0352514825389854, + "grad_norm": 2.035917043685913, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6956954598426819, + "num_tokens": 235491496.0, + "step": 9427 + }, + { + "epoch": 1.035361300241599, + "grad_norm": 
2.4667844772338867, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7196157574653625, + "num_tokens": 235513590.0, + "step": 9428 + }, + { + "epoch": 1.0354711179442126, + "grad_norm": 1.9680778980255127, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.701583743095398, + "num_tokens": 235543998.0, + "step": 9429 + }, + { + "epoch": 1.0355809356468262, + "grad_norm": 2.3067142963409424, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7258141040802002, + "num_tokens": 235567987.0, + "step": 9430 + }, + { + "epoch": 1.03569075334944, + "grad_norm": 2.4566473960876465, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7071704864501953, + "num_tokens": 235590470.0, + "step": 9431 + }, + { + "epoch": 1.0358005710520537, + "grad_norm": 2.0881760120391846, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6992682218551636, + "num_tokens": 235618201.0, + "step": 9432 + }, + { + "epoch": 1.0359103887546672, + "grad_norm": 2.1406662464141846, + "learning_rate": 1e-06, + "loss": 0.9942, + "mean_token_accuracy": 0.6955782771110535, + "num_tokens": 235646210.0, + "step": 9433 + }, + { + "epoch": 1.036020206457281, + "grad_norm": 2.229356527328491, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7061048150062561, + "num_tokens": 235672662.0, + "step": 9434 + }, + { + "epoch": 1.0361300241598945, + "grad_norm": 2.0521581172943115, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7083101272583008, + "num_tokens": 235706261.0, + "step": 9435 + }, + { + "epoch": 1.0362398418625083, + "grad_norm": 2.479996919631958, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7186861038208008, + "num_tokens": 235728237.0, + "step": 9436 + }, + { + "epoch": 1.0363496595651218, + "grad_norm": 2.211637020111084, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7254418134689331, + "num_tokens": 
235755227.0, + "step": 9437 + }, + { + "epoch": 1.0364594772677356, + "grad_norm": 2.33046817779541, + "learning_rate": 1e-06, + "loss": 0.8, + "mean_token_accuracy": 0.7536882162094116, + "num_tokens": 235776383.0, + "step": 9438 + }, + { + "epoch": 1.036569294970349, + "grad_norm": 2.17744779586792, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.719349205493927, + "num_tokens": 235803341.0, + "step": 9439 + }, + { + "epoch": 1.0366791126729629, + "grad_norm": 2.4284451007843018, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7176027297973633, + "num_tokens": 235824321.0, + "step": 9440 + }, + { + "epoch": 1.0367889303755766, + "grad_norm": 2.1473515033721924, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.712461531162262, + "num_tokens": 235852404.0, + "step": 9441 + }, + { + "epoch": 1.0368987480781902, + "grad_norm": 2.160010576248169, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7399365901947021, + "num_tokens": 235876594.0, + "step": 9442 + }, + { + "epoch": 1.037008565780804, + "grad_norm": 2.1172263622283936, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7026731371879578, + "num_tokens": 235908010.0, + "step": 9443 + }, + { + "epoch": 1.0371183834834174, + "grad_norm": 1.8945072889328003, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7033853530883789, + "num_tokens": 235943158.0, + "step": 9444 + }, + { + "epoch": 1.0372282011860312, + "grad_norm": 2.207051992416382, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.722321093082428, + "num_tokens": 235970886.0, + "step": 9445 + }, + { + "epoch": 1.037338018888645, + "grad_norm": 2.4471733570098877, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7135406732559204, + "num_tokens": 235993283.0, + "step": 9446 + }, + { + "epoch": 1.0374478365912585, + "grad_norm": 2.513868808746338, + "learning_rate": 1e-06, + "loss": 
0.8433, + "mean_token_accuracy": 0.7309464812278748, + "num_tokens": 236014503.0, + "step": 9447 + }, + { + "epoch": 1.0375576542938723, + "grad_norm": 2.5324349403381348, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7203414440155029, + "num_tokens": 236035072.0, + "step": 9448 + }, + { + "epoch": 1.0376674719964858, + "grad_norm": 2.3195669651031494, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.703478991985321, + "num_tokens": 236059326.0, + "step": 9449 + }, + { + "epoch": 1.0377772896990995, + "grad_norm": 2.243596315383911, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7264320254325867, + "num_tokens": 236083320.0, + "step": 9450 + }, + { + "epoch": 1.037887107401713, + "grad_norm": 2.3232979774475098, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7124482989311218, + "num_tokens": 236106484.0, + "step": 9451 + }, + { + "epoch": 1.0379969251043268, + "grad_norm": 2.315514326095581, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7169718742370605, + "num_tokens": 236130194.0, + "step": 9452 + }, + { + "epoch": 1.0381067428069404, + "grad_norm": 2.1985926628112793, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7313969135284424, + "num_tokens": 236155337.0, + "step": 9453 + }, + { + "epoch": 1.0382165605095541, + "grad_norm": 2.2906761169433594, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7157726287841797, + "num_tokens": 236180753.0, + "step": 9454 + }, + { + "epoch": 1.0383263782121679, + "grad_norm": 2.5668892860412598, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7262237071990967, + "num_tokens": 236199927.0, + "step": 9455 + }, + { + "epoch": 1.0384361959147814, + "grad_norm": 2.4000208377838135, + "learning_rate": 1e-06, + "loss": 0.8098, + "mean_token_accuracy": 0.736479640007019, + "num_tokens": 236220872.0, + "step": 9456 + }, + { + "epoch": 
1.0385460136173952, + "grad_norm": 2.54876708984375, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.720166802406311, + "num_tokens": 236239478.0, + "step": 9457 + }, + { + "epoch": 1.0386558313200087, + "grad_norm": 2.5483744144439697, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7256460785865784, + "num_tokens": 236260167.0, + "step": 9458 + }, + { + "epoch": 1.0387656490226225, + "grad_norm": 2.0269434452056885, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.718411386013031, + "num_tokens": 236290707.0, + "step": 9459 + }, + { + "epoch": 1.038875466725236, + "grad_norm": 2.5682716369628906, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7079159021377563, + "num_tokens": 236310612.0, + "step": 9460 + }, + { + "epoch": 1.0389852844278498, + "grad_norm": 2.5248780250549316, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7372646331787109, + "num_tokens": 236330845.0, + "step": 9461 + }, + { + "epoch": 1.0390951021304635, + "grad_norm": 2.2332475185394287, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7106784582138062, + "num_tokens": 236357443.0, + "step": 9462 + }, + { + "epoch": 1.039204919833077, + "grad_norm": 2.294259786605835, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.718589186668396, + "num_tokens": 236380412.0, + "step": 9463 + }, + { + "epoch": 1.0393147375356908, + "grad_norm": 2.687062978744507, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7306151390075684, + "num_tokens": 236401076.0, + "step": 9464 + }, + { + "epoch": 1.0394245552383043, + "grad_norm": 2.3594181537628174, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7252891659736633, + "num_tokens": 236426797.0, + "step": 9465 + }, + { + "epoch": 1.039534372940918, + "grad_norm": 2.1401114463806152, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 
0.708774983882904, + "num_tokens": 236455889.0, + "step": 9466 + }, + { + "epoch": 1.0396441906435316, + "grad_norm": 2.324714183807373, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.726012110710144, + "num_tokens": 236480518.0, + "step": 9467 + }, + { + "epoch": 1.0397540083461454, + "grad_norm": 2.229433536529541, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7297600507736206, + "num_tokens": 236507146.0, + "step": 9468 + }, + { + "epoch": 1.0398638260487592, + "grad_norm": 2.141643524169922, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6887848377227783, + "num_tokens": 236534547.0, + "step": 9469 + }, + { + "epoch": 1.0399736437513727, + "grad_norm": 2.0440542697906494, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7099138498306274, + "num_tokens": 236565317.0, + "step": 9470 + }, + { + "epoch": 1.0400834614539864, + "grad_norm": 2.7053771018981934, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7270694971084595, + "num_tokens": 236584611.0, + "step": 9471 + }, + { + "epoch": 1.0401932791566, + "grad_norm": 2.102229356765747, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.703704833984375, + "num_tokens": 236613917.0, + "step": 9472 + }, + { + "epoch": 1.0403030968592137, + "grad_norm": 2.2494637966156006, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.702197790145874, + "num_tokens": 236639551.0, + "step": 9473 + }, + { + "epoch": 1.0404129145618273, + "grad_norm": 2.340520143508911, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.734125018119812, + "num_tokens": 236662248.0, + "step": 9474 + }, + { + "epoch": 1.040522732264441, + "grad_norm": 2.424872875213623, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7248321175575256, + "num_tokens": 236684122.0, + "step": 9475 + }, + { + "epoch": 1.0406325499670548, + "grad_norm": 2.8264219760894775, + 
"learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7109854817390442, + "num_tokens": 236701355.0, + "step": 9476 + }, + { + "epoch": 1.0407423676696683, + "grad_norm": 2.123227119445801, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7113369703292847, + "num_tokens": 236728370.0, + "step": 9477 + }, + { + "epoch": 1.040852185372282, + "grad_norm": 1.9271653890609741, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7169874906539917, + "num_tokens": 236761439.0, + "step": 9478 + }, + { + "epoch": 1.0409620030748956, + "grad_norm": 2.4051666259765625, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.709521472454071, + "num_tokens": 236784623.0, + "step": 9479 + }, + { + "epoch": 1.0410718207775094, + "grad_norm": 2.4914088249206543, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7036721706390381, + "num_tokens": 236810851.0, + "step": 9480 + }, + { + "epoch": 1.041181638480123, + "grad_norm": 2.0533697605133057, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7240850925445557, + "num_tokens": 236839914.0, + "step": 9481 + }, + { + "epoch": 1.0412914561827367, + "grad_norm": 2.1082370281219482, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7204011678695679, + "num_tokens": 236869032.0, + "step": 9482 + }, + { + "epoch": 1.0414012738853504, + "grad_norm": 2.2084124088287354, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7136335372924805, + "num_tokens": 236894525.0, + "step": 9483 + }, + { + "epoch": 1.041511091587964, + "grad_norm": 2.2953531742095947, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.6998635530471802, + "num_tokens": 236918929.0, + "step": 9484 + }, + { + "epoch": 1.0416209092905777, + "grad_norm": 1.9860711097717285, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7151048183441162, + "num_tokens": 236950461.0, + "step": 9485 + 
}, + { + "epoch": 1.0417307269931912, + "grad_norm": 2.4353818893432617, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7155320644378662, + "num_tokens": 236973419.0, + "step": 9486 + }, + { + "epoch": 1.041840544695805, + "grad_norm": 2.3280036449432373, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7414258718490601, + "num_tokens": 236995274.0, + "step": 9487 + }, + { + "epoch": 1.0419503623984185, + "grad_norm": 1.9991979598999023, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7036067247390747, + "num_tokens": 237025569.0, + "step": 9488 + }, + { + "epoch": 1.0420601801010323, + "grad_norm": 2.2384679317474365, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7271840572357178, + "num_tokens": 237050267.0, + "step": 9489 + }, + { + "epoch": 1.042169997803646, + "grad_norm": 2.375086784362793, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7103376984596252, + "num_tokens": 237072713.0, + "step": 9490 + }, + { + "epoch": 1.0422798155062596, + "grad_norm": 2.436478614807129, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7155342102050781, + "num_tokens": 237096715.0, + "step": 9491 + }, + { + "epoch": 1.0423896332088733, + "grad_norm": 2.3446848392486572, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7197321653366089, + "num_tokens": 237119234.0, + "step": 9492 + }, + { + "epoch": 1.0424994509114869, + "grad_norm": 2.655644416809082, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7049785852432251, + "num_tokens": 237139463.0, + "step": 9493 + }, + { + "epoch": 1.0426092686141006, + "grad_norm": 2.477163314819336, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.6962552070617676, + "num_tokens": 237159461.0, + "step": 9494 + }, + { + "epoch": 1.0427190863167142, + "grad_norm": 1.9025498628616333, + "learning_rate": 1e-06, + "loss": 0.9455, + 
"mean_token_accuracy": 0.7202738523483276, + "num_tokens": 237191246.0, + "step": 9495 + }, + { + "epoch": 1.042828904019328, + "grad_norm": 2.124405860900879, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7319350242614746, + "num_tokens": 237217661.0, + "step": 9496 + }, + { + "epoch": 1.0429387217219417, + "grad_norm": 2.42246413230896, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7346900701522827, + "num_tokens": 237240387.0, + "step": 9497 + }, + { + "epoch": 1.0430485394245552, + "grad_norm": 2.5081727504730225, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7379977703094482, + "num_tokens": 237260958.0, + "step": 9498 + }, + { + "epoch": 1.043158357127169, + "grad_norm": 2.1036159992218018, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7138442993164062, + "num_tokens": 237290615.0, + "step": 9499 + }, + { + "epoch": 1.0432681748297825, + "grad_norm": 2.407883882522583, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7236628532409668, + "num_tokens": 237313404.0, + "step": 9500 + }, + { + "epoch": 1.0433779925323963, + "grad_norm": 2.2562952041625977, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.7048555612564087, + "num_tokens": 237337613.0, + "step": 9501 + }, + { + "epoch": 1.0434878102350098, + "grad_norm": 2.3239176273345947, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7302903532981873, + "num_tokens": 237359298.0, + "step": 9502 + }, + { + "epoch": 1.0435976279376236, + "grad_norm": 2.4783620834350586, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7136608362197876, + "num_tokens": 237381136.0, + "step": 9503 + }, + { + "epoch": 1.043707445640237, + "grad_norm": 2.226249933242798, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7323805093765259, + "num_tokens": 237405220.0, + "step": 9504 + }, + { + "epoch": 1.0438172633428509, + 
"grad_norm": 2.4488272666931152, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.721970796585083, + "num_tokens": 237426944.0, + "step": 9505 + }, + { + "epoch": 1.0439270810454646, + "grad_norm": 2.4832847118377686, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7230485081672668, + "num_tokens": 237449737.0, + "step": 9506 + }, + { + "epoch": 1.0440368987480781, + "grad_norm": 2.2575268745422363, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7203724980354309, + "num_tokens": 237474708.0, + "step": 9507 + }, + { + "epoch": 1.044146716450692, + "grad_norm": 2.5081188678741455, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7215027809143066, + "num_tokens": 237496968.0, + "step": 9508 + }, + { + "epoch": 1.0442565341533054, + "grad_norm": 2.007786512374878, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7182393074035645, + "num_tokens": 237527088.0, + "step": 9509 + }, + { + "epoch": 1.0443663518559192, + "grad_norm": 2.361900568008423, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.7366064786911011, + "num_tokens": 237549876.0, + "step": 9510 + }, + { + "epoch": 1.0444761695585327, + "grad_norm": 2.2214739322662354, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7319466471672058, + "num_tokens": 237573523.0, + "step": 9511 + }, + { + "epoch": 1.0445859872611465, + "grad_norm": 2.6713736057281494, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7369044423103333, + "num_tokens": 237591793.0, + "step": 9512 + }, + { + "epoch": 1.0446958049637602, + "grad_norm": 2.23195743560791, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7271525263786316, + "num_tokens": 237616976.0, + "step": 9513 + }, + { + "epoch": 1.0448056226663738, + "grad_norm": 2.2489657402038574, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7278202176094055, + 
"num_tokens": 237640486.0, + "step": 9514 + }, + { + "epoch": 1.0449154403689875, + "grad_norm": 2.1447598934173584, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6900737285614014, + "num_tokens": 237667382.0, + "step": 9515 + }, + { + "epoch": 1.045025258071601, + "grad_norm": 2.196784019470215, + "learning_rate": 1e-06, + "loss": 0.8323, + "mean_token_accuracy": 0.7356109619140625, + "num_tokens": 237692241.0, + "step": 9516 + }, + { + "epoch": 1.0451350757742148, + "grad_norm": 2.481811046600342, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7246729135513306, + "num_tokens": 237715490.0, + "step": 9517 + }, + { + "epoch": 1.0452448934768284, + "grad_norm": 2.31413197517395, + "learning_rate": 1e-06, + "loss": 1.0152, + "mean_token_accuracy": 0.6879925727844238, + "num_tokens": 237742760.0, + "step": 9518 + }, + { + "epoch": 1.0453547111794421, + "grad_norm": 2.4084231853485107, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7014509439468384, + "num_tokens": 237766827.0, + "step": 9519 + }, + { + "epoch": 1.0454645288820559, + "grad_norm": 2.0628573894500732, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7039576768875122, + "num_tokens": 237796625.0, + "step": 9520 + }, + { + "epoch": 1.0455743465846694, + "grad_norm": 2.1935505867004395, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.703073263168335, + "num_tokens": 237826307.0, + "step": 9521 + }, + { + "epoch": 1.0456841642872832, + "grad_norm": 2.187648057937622, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.712409496307373, + "num_tokens": 237852868.0, + "step": 9522 + }, + { + "epoch": 1.0457939819898967, + "grad_norm": 2.157155752182007, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.6927921772003174, + "num_tokens": 237881462.0, + "step": 9523 + }, + { + "epoch": 1.0459037996925105, + "grad_norm": 1.934906005859375, + "learning_rate": 
1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7372241020202637, + "num_tokens": 237912135.0, + "step": 9524 + }, + { + "epoch": 1.046013617395124, + "grad_norm": 2.455500602722168, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7199660539627075, + "num_tokens": 237932716.0, + "step": 9525 + }, + { + "epoch": 1.0461234350977378, + "grad_norm": 2.1476516723632812, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7275177240371704, + "num_tokens": 237959457.0, + "step": 9526 + }, + { + "epoch": 1.0462332528003515, + "grad_norm": 2.3800618648529053, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7179690599441528, + "num_tokens": 237983650.0, + "step": 9527 + }, + { + "epoch": 1.046343070502965, + "grad_norm": 2.218693971633911, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7153576612472534, + "num_tokens": 238008506.0, + "step": 9528 + }, + { + "epoch": 1.0464528882055788, + "grad_norm": 2.0624098777770996, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7289617657661438, + "num_tokens": 238037032.0, + "step": 9529 + }, + { + "epoch": 1.0465627059081923, + "grad_norm": 2.2180697917938232, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7210604548454285, + "num_tokens": 238062647.0, + "step": 9530 + }, + { + "epoch": 1.046672523610806, + "grad_norm": 2.0430314540863037, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.715787947177887, + "num_tokens": 238092389.0, + "step": 9531 + }, + { + "epoch": 1.0467823413134196, + "grad_norm": 2.594595432281494, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7382268309593201, + "num_tokens": 238111327.0, + "step": 9532 + }, + { + "epoch": 1.0468921590160334, + "grad_norm": 2.450453281402588, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7368260025978088, + "num_tokens": 238133888.0, + "step": 9533 + }, + { + "epoch": 
1.0470019767186471, + "grad_norm": 2.1892075538635254, + "learning_rate": 1e-06, + "loss": 0.8096, + "mean_token_accuracy": 0.745265007019043, + "num_tokens": 238158669.0, + "step": 9534 + }, + { + "epoch": 1.0471117944212607, + "grad_norm": 2.4564361572265625, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7370225191116333, + "num_tokens": 238180382.0, + "step": 9535 + }, + { + "epoch": 1.0472216121238744, + "grad_norm": 2.3194563388824463, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7159234881401062, + "num_tokens": 238203951.0, + "step": 9536 + }, + { + "epoch": 1.047331429826488, + "grad_norm": 2.1732962131500244, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7202224731445312, + "num_tokens": 238229564.0, + "step": 9537 + }, + { + "epoch": 1.0474412475291017, + "grad_norm": 2.2359297275543213, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7219620943069458, + "num_tokens": 238257309.0, + "step": 9538 + }, + { + "epoch": 1.0475510652317153, + "grad_norm": 2.2054522037506104, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.710593581199646, + "num_tokens": 238284787.0, + "step": 9539 + }, + { + "epoch": 1.047660882934329, + "grad_norm": 2.235170364379883, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7277516722679138, + "num_tokens": 238311491.0, + "step": 9540 + }, + { + "epoch": 1.0477707006369428, + "grad_norm": 2.4827985763549805, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7251905798912048, + "num_tokens": 238331423.0, + "step": 9541 + }, + { + "epoch": 1.0478805183395563, + "grad_norm": 2.8290116786956787, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7147642970085144, + "num_tokens": 238348727.0, + "step": 9542 + }, + { + "epoch": 1.04799033604217, + "grad_norm": 2.1634387969970703, + "learning_rate": 1e-06, + "loss": 1.0456, + "mean_token_accuracy": 
0.6791145205497742, + "num_tokens": 238379771.0, + "step": 9543 + }, + { + "epoch": 1.0481001537447836, + "grad_norm": 2.2069356441497803, + "learning_rate": 1e-06, + "loss": 1.0232, + "mean_token_accuracy": 0.687094509601593, + "num_tokens": 238407114.0, + "step": 9544 + }, + { + "epoch": 1.0482099714473974, + "grad_norm": 2.420496940612793, + "learning_rate": 1e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.7458050847053528, + "num_tokens": 238428231.0, + "step": 9545 + }, + { + "epoch": 1.048319789150011, + "grad_norm": 2.273308277130127, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.697820782661438, + "num_tokens": 238453571.0, + "step": 9546 + }, + { + "epoch": 1.0484296068526247, + "grad_norm": 2.1561942100524902, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7045400142669678, + "num_tokens": 238482249.0, + "step": 9547 + }, + { + "epoch": 1.0485394245552384, + "grad_norm": 2.177264451980591, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7034719586372375, + "num_tokens": 238508359.0, + "step": 9548 + }, + { + "epoch": 1.048649242257852, + "grad_norm": 2.1233835220336914, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7371673583984375, + "num_tokens": 238534805.0, + "step": 9549 + }, + { + "epoch": 1.0487590599604657, + "grad_norm": 2.487215757369995, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7124063372612, + "num_tokens": 238554864.0, + "step": 9550 + }, + { + "epoch": 1.0488688776630792, + "grad_norm": 2.3549060821533203, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7146052122116089, + "num_tokens": 238577375.0, + "step": 9551 + }, + { + "epoch": 1.048978695365693, + "grad_norm": 2.372640609741211, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.712363600730896, + "num_tokens": 238599326.0, + "step": 9552 + }, + { + "epoch": 1.0490885130683065, + "grad_norm": 2.5225636959075928, + 
"learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7387920618057251, + "num_tokens": 238619675.0, + "step": 9553 + }, + { + "epoch": 1.0491983307709203, + "grad_norm": 2.4295551776885986, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.741516649723053, + "num_tokens": 238641064.0, + "step": 9554 + }, + { + "epoch": 1.0493081484735338, + "grad_norm": 2.1942877769470215, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7018136978149414, + "num_tokens": 238668827.0, + "step": 9555 + }, + { + "epoch": 1.0494179661761476, + "grad_norm": 2.3714118003845215, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.6989086866378784, + "num_tokens": 238692360.0, + "step": 9556 + }, + { + "epoch": 1.0495277838787613, + "grad_norm": 1.9631983041763306, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.71513831615448, + "num_tokens": 238724505.0, + "step": 9557 + }, + { + "epoch": 1.0496376015813749, + "grad_norm": 2.233703136444092, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7028522491455078, + "num_tokens": 238751043.0, + "step": 9558 + }, + { + "epoch": 1.0497474192839886, + "grad_norm": 2.2360565662384033, + "learning_rate": 1e-06, + "loss": 0.965, + "mean_token_accuracy": 0.7067940831184387, + "num_tokens": 238777354.0, + "step": 9559 + }, + { + "epoch": 1.0498572369866022, + "grad_norm": 2.4835574626922607, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7171813249588013, + "num_tokens": 238797859.0, + "step": 9560 + }, + { + "epoch": 1.049967054689216, + "grad_norm": 2.4274468421936035, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7211955785751343, + "num_tokens": 238821027.0, + "step": 9561 + }, + { + "epoch": 1.0500768723918297, + "grad_norm": 1.848842978477478, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7110474705696106, + "num_tokens": 238855414.0, + "step": 9562 + }, 
+ { + "epoch": 1.0501866900944432, + "grad_norm": 2.4160354137420654, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.728670597076416, + "num_tokens": 238876698.0, + "step": 9563 + }, + { + "epoch": 1.050296507797057, + "grad_norm": 2.5604026317596436, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7240578532218933, + "num_tokens": 238897319.0, + "step": 9564 + }, + { + "epoch": 1.0504063254996705, + "grad_norm": 2.040330171585083, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7187556028366089, + "num_tokens": 238929744.0, + "step": 9565 + }, + { + "epoch": 1.0505161432022843, + "grad_norm": 2.0027828216552734, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7012181282043457, + "num_tokens": 238962279.0, + "step": 9566 + }, + { + "epoch": 1.0506259609048978, + "grad_norm": 2.3037643432617188, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7160362601280212, + "num_tokens": 238986145.0, + "step": 9567 + }, + { + "epoch": 1.0507357786075116, + "grad_norm": 2.5195846557617188, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7319499254226685, + "num_tokens": 239007836.0, + "step": 9568 + }, + { + "epoch": 1.050845596310125, + "grad_norm": 2.388435125350952, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7100766897201538, + "num_tokens": 239030965.0, + "step": 9569 + }, + { + "epoch": 1.0509554140127388, + "grad_norm": 2.570622682571411, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7445017099380493, + "num_tokens": 239049871.0, + "step": 9570 + }, + { + "epoch": 1.0510652317153526, + "grad_norm": 2.3951494693756104, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6946668028831482, + "num_tokens": 239075333.0, + "step": 9571 + }, + { + "epoch": 1.0511750494179661, + "grad_norm": 2.3521010875701904, + "learning_rate": 1e-06, + "loss": 0.9109, + 
"mean_token_accuracy": 0.7159014344215393, + "num_tokens": 239098504.0, + "step": 9572 + }, + { + "epoch": 1.05128486712058, + "grad_norm": 2.131563425064087, + "learning_rate": 1e-06, + "loss": 0.7564, + "mean_token_accuracy": 0.7566618919372559, + "num_tokens": 239125184.0, + "step": 9573 + }, + { + "epoch": 1.0513946848231934, + "grad_norm": 2.1391823291778564, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7139952778816223, + "num_tokens": 239151271.0, + "step": 9574 + }, + { + "epoch": 1.0515045025258072, + "grad_norm": 2.4617197513580322, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7254171371459961, + "num_tokens": 239174016.0, + "step": 9575 + }, + { + "epoch": 1.0516143202284207, + "grad_norm": 2.190537691116333, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7335816621780396, + "num_tokens": 239199685.0, + "step": 9576 + }, + { + "epoch": 1.0517241379310345, + "grad_norm": 2.212496519088745, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7131671905517578, + "num_tokens": 239225523.0, + "step": 9577 + }, + { + "epoch": 1.0518339556336482, + "grad_norm": 2.2581331729888916, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7176815271377563, + "num_tokens": 239250583.0, + "step": 9578 + }, + { + "epoch": 1.0519437733362618, + "grad_norm": 2.534381866455078, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7388581037521362, + "num_tokens": 239269445.0, + "step": 9579 + }, + { + "epoch": 1.0520535910388755, + "grad_norm": 2.2874836921691895, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7164795398712158, + "num_tokens": 239293575.0, + "step": 9580 + }, + { + "epoch": 1.052163408741489, + "grad_norm": 2.403038501739502, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7310292720794678, + "num_tokens": 239315522.0, + "step": 9581 + }, + { + "epoch": 1.0522732264441028, + 
"grad_norm": 2.2256462574005127, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7210978269577026, + "num_tokens": 239342222.0, + "step": 9582 + }, + { + "epoch": 1.0523830441467164, + "grad_norm": 2.1653881072998047, + "learning_rate": 1e-06, + "loss": 1.0212, + "mean_token_accuracy": 0.6924902200698853, + "num_tokens": 239369541.0, + "step": 9583 + }, + { + "epoch": 1.05249286184933, + "grad_norm": 2.09246563911438, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6963210105895996, + "num_tokens": 239399640.0, + "step": 9584 + }, + { + "epoch": 1.0526026795519439, + "grad_norm": 2.1421408653259277, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.7234163284301758, + "num_tokens": 239425944.0, + "step": 9585 + }, + { + "epoch": 1.0527124972545574, + "grad_norm": 2.56508731842041, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7096986174583435, + "num_tokens": 239447117.0, + "step": 9586 + }, + { + "epoch": 1.0528223149571712, + "grad_norm": 2.457627058029175, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7180565595626831, + "num_tokens": 239469531.0, + "step": 9587 + }, + { + "epoch": 1.0529321326597847, + "grad_norm": 2.081958532333374, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7196080684661865, + "num_tokens": 239497955.0, + "step": 9588 + }, + { + "epoch": 1.0530419503623984, + "grad_norm": 2.7887659072875977, + "learning_rate": 1e-06, + "loss": 0.7786, + "mean_token_accuracy": 0.7472093105316162, + "num_tokens": 239515948.0, + "step": 9589 + }, + { + "epoch": 1.053151768065012, + "grad_norm": 1.968583583831787, + "learning_rate": 1e-06, + "loss": 1.0201, + "mean_token_accuracy": 0.6897432804107666, + "num_tokens": 239548699.0, + "step": 9590 + }, + { + "epoch": 1.0532615857676257, + "grad_norm": 2.4620168209075928, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7244011163711548, + "num_tokens": 
239571949.0, + "step": 9591 + }, + { + "epoch": 1.0533714034702395, + "grad_norm": 2.385956048965454, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.707557737827301, + "num_tokens": 239595908.0, + "step": 9592 + }, + { + "epoch": 1.053481221172853, + "grad_norm": 2.3053386211395264, + "learning_rate": 1e-06, + "loss": 0.7573, + "mean_token_accuracy": 0.7562496066093445, + "num_tokens": 239619112.0, + "step": 9593 + }, + { + "epoch": 1.0535910388754668, + "grad_norm": 2.143021821975708, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7015453577041626, + "num_tokens": 239647523.0, + "step": 9594 + }, + { + "epoch": 1.0537008565780803, + "grad_norm": 1.9795901775360107, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6831099987030029, + "num_tokens": 239681218.0, + "step": 9595 + }, + { + "epoch": 1.053810674280694, + "grad_norm": 2.069775342941284, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7321816682815552, + "num_tokens": 239709370.0, + "step": 9596 + }, + { + "epoch": 1.0539204919833076, + "grad_norm": 2.249476432800293, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7298721671104431, + "num_tokens": 239734057.0, + "step": 9597 + }, + { + "epoch": 1.0540303096859214, + "grad_norm": 2.2491707801818848, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.6966601610183716, + "num_tokens": 239759811.0, + "step": 9598 + }, + { + "epoch": 1.0541401273885351, + "grad_norm": 2.0755646228790283, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7135626077651978, + "num_tokens": 239788197.0, + "step": 9599 + }, + { + "epoch": 1.0542499450911487, + "grad_norm": 1.7716894149780273, + "learning_rate": 1e-06, + "loss": 1.0124, + "mean_token_accuracy": 0.6874634623527527, + "num_tokens": 239825760.0, + "step": 9600 + }, + { + "epoch": 1.0543597627937624, + "grad_norm": 2.5009238719940186, + "learning_rate": 1e-06, + 
"loss": 0.9019, + "mean_token_accuracy": 0.7173468470573425, + "num_tokens": 239847301.0, + "step": 9601 + }, + { + "epoch": 1.054469580496376, + "grad_norm": 2.411098003387451, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7245990037918091, + "num_tokens": 239870808.0, + "step": 9602 + }, + { + "epoch": 1.0545793981989897, + "grad_norm": 2.445457696914673, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7178813219070435, + "num_tokens": 239893734.0, + "step": 9603 + }, + { + "epoch": 1.0546892159016032, + "grad_norm": 2.3097705841064453, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7236524820327759, + "num_tokens": 239917377.0, + "step": 9604 + }, + { + "epoch": 1.054799033604217, + "grad_norm": 2.586756467819214, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7102233171463013, + "num_tokens": 239939196.0, + "step": 9605 + }, + { + "epoch": 1.0549088513068308, + "grad_norm": 2.4508256912231445, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.727933406829834, + "num_tokens": 239960973.0, + "step": 9606 + }, + { + "epoch": 1.0550186690094443, + "grad_norm": 2.44948673248291, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7425030469894409, + "num_tokens": 239983690.0, + "step": 9607 + }, + { + "epoch": 1.055128486712058, + "grad_norm": 2.2921669483184814, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7039994597434998, + "num_tokens": 240010122.0, + "step": 9608 + }, + { + "epoch": 1.0552383044146716, + "grad_norm": 2.0859601497650146, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.739740252494812, + "num_tokens": 240039079.0, + "step": 9609 + }, + { + "epoch": 1.0553481221172853, + "grad_norm": 2.4069015979766846, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7349361777305603, + "num_tokens": 240060082.0, + "step": 9610 + }, + { + "epoch": 
1.0554579398198989, + "grad_norm": 2.257904291152954, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7188993692398071, + "num_tokens": 240086065.0, + "step": 9611 + }, + { + "epoch": 1.0555677575225126, + "grad_norm": 2.3555080890655518, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.713779091835022, + "num_tokens": 240109045.0, + "step": 9612 + }, + { + "epoch": 1.0556775752251264, + "grad_norm": 2.234466552734375, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.73679518699646, + "num_tokens": 240132953.0, + "step": 9613 + }, + { + "epoch": 1.05578739292774, + "grad_norm": 2.1641855239868164, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6950650215148926, + "num_tokens": 240160745.0, + "step": 9614 + }, + { + "epoch": 1.0558972106303537, + "grad_norm": 2.628648519515991, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.711093008518219, + "num_tokens": 240178953.0, + "step": 9615 + }, + { + "epoch": 1.0560070283329672, + "grad_norm": 2.113917112350464, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7305581569671631, + "num_tokens": 240208128.0, + "step": 9616 + }, + { + "epoch": 1.056116846035581, + "grad_norm": 2.179710626602173, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.712705135345459, + "num_tokens": 240236299.0, + "step": 9617 + }, + { + "epoch": 1.0562266637381945, + "grad_norm": 2.1169283390045166, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.7075475454330444, + "num_tokens": 240265737.0, + "step": 9618 + }, + { + "epoch": 1.0563364814408083, + "grad_norm": 2.276256561279297, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7198518514633179, + "num_tokens": 240288708.0, + "step": 9619 + }, + { + "epoch": 1.0564462991434218, + "grad_norm": 1.9804883003234863, + "learning_rate": 1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6915332674980164, 
+ "num_tokens": 240319883.0, + "step": 9620 + }, + { + "epoch": 1.0565561168460356, + "grad_norm": 2.280311107635498, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7123684287071228, + "num_tokens": 240345083.0, + "step": 9621 + }, + { + "epoch": 1.0566659345486493, + "grad_norm": 2.1562132835388184, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7107900381088257, + "num_tokens": 240372535.0, + "step": 9622 + }, + { + "epoch": 1.0567757522512629, + "grad_norm": 2.2784719467163086, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7400714755058289, + "num_tokens": 240395133.0, + "step": 9623 + }, + { + "epoch": 1.0568855699538766, + "grad_norm": 2.5516319274902344, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7189643383026123, + "num_tokens": 240416253.0, + "step": 9624 + }, + { + "epoch": 1.0569953876564901, + "grad_norm": 2.0350749492645264, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7042574286460876, + "num_tokens": 240446493.0, + "step": 9625 + }, + { + "epoch": 1.057105205359104, + "grad_norm": 2.3394997119903564, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7197154760360718, + "num_tokens": 240469964.0, + "step": 9626 + }, + { + "epoch": 1.0572150230617177, + "grad_norm": 2.2490906715393066, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7130246162414551, + "num_tokens": 240496228.0, + "step": 9627 + }, + { + "epoch": 1.0573248407643312, + "grad_norm": 2.4629569053649902, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7390786409378052, + "num_tokens": 240518793.0, + "step": 9628 + }, + { + "epoch": 1.057434658466945, + "grad_norm": 2.489283561706543, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7358887195587158, + "num_tokens": 240539944.0, + "step": 9629 + }, + { + "epoch": 1.0575444761695585, + "grad_norm": 2.111741065979004, + 
"learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7393880486488342, + "num_tokens": 240566442.0, + "step": 9630 + }, + { + "epoch": 1.0576542938721722, + "grad_norm": 2.5794436931610107, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7382121682167053, + "num_tokens": 240584835.0, + "step": 9631 + }, + { + "epoch": 1.0577641115747858, + "grad_norm": 2.275084972381592, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7104164361953735, + "num_tokens": 240609288.0, + "step": 9632 + }, + { + "epoch": 1.0578739292773995, + "grad_norm": 2.6355748176574707, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7074754238128662, + "num_tokens": 240628965.0, + "step": 9633 + }, + { + "epoch": 1.057983746980013, + "grad_norm": 2.270131826400757, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.697270929813385, + "num_tokens": 240655657.0, + "step": 9634 + }, + { + "epoch": 1.0580935646826268, + "grad_norm": 2.408822774887085, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7501213550567627, + "num_tokens": 240677553.0, + "step": 9635 + }, + { + "epoch": 1.0582033823852406, + "grad_norm": 2.3332080841064453, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6981261968612671, + "num_tokens": 240701994.0, + "step": 9636 + }, + { + "epoch": 1.0583132000878541, + "grad_norm": 2.184283971786499, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7113620042800903, + "num_tokens": 240728568.0, + "step": 9637 + }, + { + "epoch": 1.0584230177904679, + "grad_norm": 2.289686441421509, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7239456176757812, + "num_tokens": 240751841.0, + "step": 9638 + }, + { + "epoch": 1.0585328354930814, + "grad_norm": 2.3390774726867676, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.709642767906189, + "num_tokens": 240776964.0, + "step": 9639 + }, + 
{ + "epoch": 1.0586426531956952, + "grad_norm": 2.0136377811431885, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7062344551086426, + "num_tokens": 240808184.0, + "step": 9640 + }, + { + "epoch": 1.0587524708983087, + "grad_norm": 2.4303627014160156, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7386639714241028, + "num_tokens": 240830150.0, + "step": 9641 + }, + { + "epoch": 1.0588622886009225, + "grad_norm": 2.270270586013794, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7223272323608398, + "num_tokens": 240855320.0, + "step": 9642 + }, + { + "epoch": 1.0589721063035362, + "grad_norm": 2.494135618209839, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7369430065155029, + "num_tokens": 240877589.0, + "step": 9643 + }, + { + "epoch": 1.0590819240061498, + "grad_norm": 2.585575580596924, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7354521751403809, + "num_tokens": 240898004.0, + "step": 9644 + }, + { + "epoch": 1.0591917417087635, + "grad_norm": 2.2046353816986084, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7310007810592651, + "num_tokens": 240924287.0, + "step": 9645 + }, + { + "epoch": 1.059301559411377, + "grad_norm": 2.297182321548462, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6990386247634888, + "num_tokens": 240950575.0, + "step": 9646 + }, + { + "epoch": 1.0594113771139908, + "grad_norm": 2.3206818103790283, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.721307098865509, + "num_tokens": 240975054.0, + "step": 9647 + }, + { + "epoch": 1.0595211948166043, + "grad_norm": 2.21319580078125, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.72206711769104, + "num_tokens": 241000164.0, + "step": 9648 + }, + { + "epoch": 1.059631012519218, + "grad_norm": 2.156550884246826, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 
0.6931090354919434, + "num_tokens": 241028370.0, + "step": 9649 + }, + { + "epoch": 1.0597408302218319, + "grad_norm": 2.5730807781219482, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7074450850486755, + "num_tokens": 241048511.0, + "step": 9650 + }, + { + "epoch": 1.0598506479244454, + "grad_norm": 2.0569088459014893, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6987255215644836, + "num_tokens": 241080623.0, + "step": 9651 + }, + { + "epoch": 1.0599604656270591, + "grad_norm": 2.2703843116760254, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7209936380386353, + "num_tokens": 241106534.0, + "step": 9652 + }, + { + "epoch": 1.0600702833296727, + "grad_norm": 1.932643175125122, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6979260444641113, + "num_tokens": 241139092.0, + "step": 9653 + }, + { + "epoch": 1.0601801010322864, + "grad_norm": 2.3670783042907715, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7150797843933105, + "num_tokens": 241162507.0, + "step": 9654 + }, + { + "epoch": 1.0602899187349, + "grad_norm": 2.5037336349487305, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7137218713760376, + "num_tokens": 241184261.0, + "step": 9655 + }, + { + "epoch": 1.0603997364375137, + "grad_norm": 2.4686453342437744, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7113842964172363, + "num_tokens": 241207862.0, + "step": 9656 + }, + { + "epoch": 1.0605095541401275, + "grad_norm": 2.140637159347534, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.719519853591919, + "num_tokens": 241235314.0, + "step": 9657 + }, + { + "epoch": 1.060619371842741, + "grad_norm": 2.094616174697876, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7101878523826599, + "num_tokens": 241262317.0, + "step": 9658 + }, + { + "epoch": 1.0607291895453548, + "grad_norm": 
2.8034889698028564, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7231432199478149, + "num_tokens": 241281367.0, + "step": 9659 + }, + { + "epoch": 1.0608390072479683, + "grad_norm": 2.1866352558135986, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7044083476066589, + "num_tokens": 241309939.0, + "step": 9660 + }, + { + "epoch": 1.060948824950582, + "grad_norm": 2.4696600437164307, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7284694314002991, + "num_tokens": 241330796.0, + "step": 9661 + }, + { + "epoch": 1.0610586426531956, + "grad_norm": 2.032040596008301, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7009124755859375, + "num_tokens": 241360703.0, + "step": 9662 + }, + { + "epoch": 1.0611684603558094, + "grad_norm": 2.22501277923584, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7319775223731995, + "num_tokens": 241383585.0, + "step": 9663 + }, + { + "epoch": 1.0612782780584231, + "grad_norm": 2.274981737136841, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7283979654312134, + "num_tokens": 241407978.0, + "step": 9664 + }, + { + "epoch": 1.0613880957610367, + "grad_norm": 1.9915790557861328, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.710204541683197, + "num_tokens": 241439850.0, + "step": 9665 + }, + { + "epoch": 1.0614979134636504, + "grad_norm": 2.4792017936706543, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7112530469894409, + "num_tokens": 241462931.0, + "step": 9666 + }, + { + "epoch": 1.061607731166264, + "grad_norm": 2.124378204345703, + "learning_rate": 1e-06, + "loss": 0.834, + "mean_token_accuracy": 0.73382568359375, + "num_tokens": 241490209.0, + "step": 9667 + }, + { + "epoch": 1.0617175488688777, + "grad_norm": 2.211848497390747, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7105298638343811, + "num_tokens": 241517093.0, + 
"step": 9668 + }, + { + "epoch": 1.0618273665714912, + "grad_norm": 2.3083810806274414, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7239469885826111, + "num_tokens": 241539837.0, + "step": 9669 + }, + { + "epoch": 1.061937184274105, + "grad_norm": 2.3502070903778076, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7213236093521118, + "num_tokens": 241562820.0, + "step": 9670 + }, + { + "epoch": 1.0620470019767188, + "grad_norm": 2.2884621620178223, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.73810875415802, + "num_tokens": 241585515.0, + "step": 9671 + }, + { + "epoch": 1.0621568196793323, + "grad_norm": 2.208315372467041, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.730254054069519, + "num_tokens": 241611915.0, + "step": 9672 + }, + { + "epoch": 1.062266637381946, + "grad_norm": 2.1216800212860107, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7166597843170166, + "num_tokens": 241640091.0, + "step": 9673 + }, + { + "epoch": 1.0623764550845596, + "grad_norm": 2.1943721771240234, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6935217976570129, + "num_tokens": 241670890.0, + "step": 9674 + }, + { + "epoch": 1.0624862727871733, + "grad_norm": 2.6191749572753906, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7194317579269409, + "num_tokens": 241692188.0, + "step": 9675 + }, + { + "epoch": 1.0625960904897869, + "grad_norm": 2.520582437515259, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7326076030731201, + "num_tokens": 241713687.0, + "step": 9676 + }, + { + "epoch": 1.0627059081924006, + "grad_norm": 2.3653202056884766, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7135673761367798, + "num_tokens": 241737553.0, + "step": 9677 + }, + { + "epoch": 1.0628157258950144, + "grad_norm": 2.221374034881592, + "learning_rate": 1e-06, + "loss": 0.9333, + 
"mean_token_accuracy": 0.709221601486206, + "num_tokens": 241762748.0, + "step": 9678 + }, + { + "epoch": 1.062925543597628, + "grad_norm": 1.8840922117233276, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7142733335494995, + "num_tokens": 241797624.0, + "step": 9679 + }, + { + "epoch": 1.0630353613002417, + "grad_norm": 2.5471744537353516, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7369452118873596, + "num_tokens": 241819060.0, + "step": 9680 + }, + { + "epoch": 1.0631451790028552, + "grad_norm": 2.4874541759490967, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7186466455459595, + "num_tokens": 241841295.0, + "step": 9681 + }, + { + "epoch": 1.063254996705469, + "grad_norm": 2.3047938346862793, + "learning_rate": 1e-06, + "loss": 1.045, + "mean_token_accuracy": 0.6865147948265076, + "num_tokens": 241868672.0, + "step": 9682 + }, + { + "epoch": 1.0633648144080825, + "grad_norm": 2.188446283340454, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7088650465011597, + "num_tokens": 241898181.0, + "step": 9683 + }, + { + "epoch": 1.0634746321106963, + "grad_norm": 2.091229200363159, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.6977843046188354, + "num_tokens": 241926252.0, + "step": 9684 + }, + { + "epoch": 1.0635844498133098, + "grad_norm": 2.3590087890625, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7186264991760254, + "num_tokens": 241950440.0, + "step": 9685 + }, + { + "epoch": 1.0636942675159236, + "grad_norm": 2.69686222076416, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7275484800338745, + "num_tokens": 241970532.0, + "step": 9686 + }, + { + "epoch": 1.0638040852185373, + "grad_norm": 2.160367727279663, + "learning_rate": 1e-06, + "loss": 1.0335, + "mean_token_accuracy": 0.6834352016448975, + "num_tokens": 241998789.0, + "step": 9687 + }, + { + "epoch": 1.0639139029211508, + 
"grad_norm": 2.3777341842651367, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7207131385803223, + "num_tokens": 242023425.0, + "step": 9688 + }, + { + "epoch": 1.0640237206237646, + "grad_norm": 2.070042133331299, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.734377920627594, + "num_tokens": 242052059.0, + "step": 9689 + }, + { + "epoch": 1.0641335383263781, + "grad_norm": 2.6789379119873047, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7435903549194336, + "num_tokens": 242069711.0, + "step": 9690 + }, + { + "epoch": 1.064243356028992, + "grad_norm": 2.3562252521514893, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7321857213973999, + "num_tokens": 242091308.0, + "step": 9691 + }, + { + "epoch": 1.0643531737316057, + "grad_norm": 2.723447322845459, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.72732013463974, + "num_tokens": 242108789.0, + "step": 9692 + }, + { + "epoch": 1.0644629914342192, + "grad_norm": 2.4668169021606445, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7044248580932617, + "num_tokens": 242131851.0, + "step": 9693 + }, + { + "epoch": 1.064572809136833, + "grad_norm": 2.138117551803589, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7210429906845093, + "num_tokens": 242161006.0, + "step": 9694 + }, + { + "epoch": 1.0646826268394465, + "grad_norm": 2.6614835262298584, + "learning_rate": 1e-06, + "loss": 0.8064, + "mean_token_accuracy": 0.7444427609443665, + "num_tokens": 242179121.0, + "step": 9695 + }, + { + "epoch": 1.0647924445420602, + "grad_norm": 2.498603343963623, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.727908194065094, + "num_tokens": 242198962.0, + "step": 9696 + }, + { + "epoch": 1.0649022622446738, + "grad_norm": 2.227853775024414, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7201824188232422, + "num_tokens": 
242223229.0, + "step": 9697 + }, + { + "epoch": 1.0650120799472875, + "grad_norm": 2.6381452083587646, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.716278612613678, + "num_tokens": 242242485.0, + "step": 9698 + }, + { + "epoch": 1.065121897649901, + "grad_norm": 2.6010942459106445, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7145177721977234, + "num_tokens": 242262151.0, + "step": 9699 + }, + { + "epoch": 1.0652317153525148, + "grad_norm": 2.2142250537872314, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7310653924942017, + "num_tokens": 242288588.0, + "step": 9700 + }, + { + "epoch": 1.0653415330551286, + "grad_norm": 2.1167638301849365, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.701526403427124, + "num_tokens": 242317605.0, + "step": 9701 + }, + { + "epoch": 1.065451350757742, + "grad_norm": 1.9944733381271362, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7349109649658203, + "num_tokens": 242347373.0, + "step": 9702 + }, + { + "epoch": 1.0655611684603559, + "grad_norm": 2.253544807434082, + "learning_rate": 1e-06, + "loss": 0.7425, + "mean_token_accuracy": 0.756912887096405, + "num_tokens": 242369405.0, + "step": 9703 + }, + { + "epoch": 1.0656709861629694, + "grad_norm": 1.9642972946166992, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7341814041137695, + "num_tokens": 242399939.0, + "step": 9704 + }, + { + "epoch": 1.0657808038655832, + "grad_norm": 2.517578125, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.73384690284729, + "num_tokens": 242419908.0, + "step": 9705 + }, + { + "epoch": 1.0658906215681967, + "grad_norm": 2.16062331199646, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7406302094459534, + "num_tokens": 242446858.0, + "step": 9706 + }, + { + "epoch": 1.0660004392708105, + "grad_norm": 2.783222198486328, + "learning_rate": 1e-06, + "loss": 0.839, 
+ "mean_token_accuracy": 0.7364494800567627, + "num_tokens": 242464863.0, + "step": 9707 + }, + { + "epoch": 1.0661102569734242, + "grad_norm": 1.916353464126587, + "learning_rate": 1e-06, + "loss": 1.0161, + "mean_token_accuracy": 0.6891822218894958, + "num_tokens": 242502551.0, + "step": 9708 + }, + { + "epoch": 1.0662200746760377, + "grad_norm": 1.9983336925506592, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7170456647872925, + "num_tokens": 242533276.0, + "step": 9709 + }, + { + "epoch": 1.0663298923786515, + "grad_norm": 2.392474889755249, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7153072357177734, + "num_tokens": 242555275.0, + "step": 9710 + }, + { + "epoch": 1.066439710081265, + "grad_norm": 2.581080913543701, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7150496244430542, + "num_tokens": 242577111.0, + "step": 9711 + }, + { + "epoch": 1.0665495277838788, + "grad_norm": 2.0556483268737793, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7296702265739441, + "num_tokens": 242603837.0, + "step": 9712 + }, + { + "epoch": 1.0666593454864923, + "grad_norm": 2.254488468170166, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7271074056625366, + "num_tokens": 242628419.0, + "step": 9713 + }, + { + "epoch": 1.066769163189106, + "grad_norm": 2.5029609203338623, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7197818756103516, + "num_tokens": 242649668.0, + "step": 9714 + }, + { + "epoch": 1.0668789808917198, + "grad_norm": 2.499302864074707, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.701988697052002, + "num_tokens": 242673327.0, + "step": 9715 + }, + { + "epoch": 1.0669887985943334, + "grad_norm": 2.06611704826355, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7102739810943604, + "num_tokens": 242702584.0, + "step": 9716 + }, + { + "epoch": 1.0670986162969471, + 
"grad_norm": 2.331137180328369, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7149368524551392, + "num_tokens": 242727469.0, + "step": 9717 + }, + { + "epoch": 1.0672084339995607, + "grad_norm": 1.976340889930725, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7076714038848877, + "num_tokens": 242757498.0, + "step": 9718 + }, + { + "epoch": 1.0673182517021744, + "grad_norm": 2.387667417526245, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7258539795875549, + "num_tokens": 242779050.0, + "step": 9719 + }, + { + "epoch": 1.067428069404788, + "grad_norm": 2.2321231365203857, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7034839391708374, + "num_tokens": 242803859.0, + "step": 9720 + }, + { + "epoch": 1.0675378871074017, + "grad_norm": 2.341414451599121, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7179322242736816, + "num_tokens": 242827587.0, + "step": 9721 + }, + { + "epoch": 1.0676477048100155, + "grad_norm": 2.546574354171753, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7152881622314453, + "num_tokens": 242850218.0, + "step": 9722 + }, + { + "epoch": 1.067757522512629, + "grad_norm": 2.518357753753662, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7068202495574951, + "num_tokens": 242873062.0, + "step": 9723 + }, + { + "epoch": 1.0678673402152428, + "grad_norm": 2.3434460163116455, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.71219801902771, + "num_tokens": 242897034.0, + "step": 9724 + }, + { + "epoch": 1.0679771579178563, + "grad_norm": 1.995129942893982, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7034099102020264, + "num_tokens": 242925954.0, + "step": 9725 + }, + { + "epoch": 1.06808697562047, + "grad_norm": 2.330960988998413, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7294766306877136, + "num_tokens": 
242949311.0, + "step": 9726 + }, + { + "epoch": 1.0681967933230836, + "grad_norm": 2.2920925617218018, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7105076313018799, + "num_tokens": 242975394.0, + "step": 9727 + }, + { + "epoch": 1.0683066110256974, + "grad_norm": 2.216212749481201, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7199466228485107, + "num_tokens": 243001728.0, + "step": 9728 + }, + { + "epoch": 1.068416428728311, + "grad_norm": 2.1109907627105713, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7040795087814331, + "num_tokens": 243029533.0, + "step": 9729 + }, + { + "epoch": 1.0685262464309246, + "grad_norm": 2.399268865585327, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7279731631278992, + "num_tokens": 243051481.0, + "step": 9730 + }, + { + "epoch": 1.0686360641335384, + "grad_norm": 2.2319467067718506, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7299094796180725, + "num_tokens": 243075373.0, + "step": 9731 + }, + { + "epoch": 1.068745881836152, + "grad_norm": 2.453251600265503, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7205864191055298, + "num_tokens": 243098077.0, + "step": 9732 + }, + { + "epoch": 1.0688556995387657, + "grad_norm": 2.345705509185791, + "learning_rate": 1e-06, + "loss": 0.9796, + "mean_token_accuracy": 0.7016507387161255, + "num_tokens": 243123978.0, + "step": 9733 + }, + { + "epoch": 1.0689655172413792, + "grad_norm": 2.334232807159424, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7269871830940247, + "num_tokens": 243147619.0, + "step": 9734 + }, + { + "epoch": 1.069075334943993, + "grad_norm": 2.3584237098693848, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7245094776153564, + "num_tokens": 243172102.0, + "step": 9735 + }, + { + "epoch": 1.0691851526466065, + "grad_norm": 2.3583552837371826, + "learning_rate": 1e-06, + 
"loss": 0.9682, + "mean_token_accuracy": 0.7033025622367859, + "num_tokens": 243195963.0, + "step": 9736 + }, + { + "epoch": 1.0692949703492203, + "grad_norm": 2.2500457763671875, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7097426056861877, + "num_tokens": 243220889.0, + "step": 9737 + }, + { + "epoch": 1.069404788051834, + "grad_norm": 2.6584062576293945, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7238954901695251, + "num_tokens": 243239746.0, + "step": 9738 + }, + { + "epoch": 1.0695146057544476, + "grad_norm": 2.5342347621917725, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7367566823959351, + "num_tokens": 243260052.0, + "step": 9739 + }, + { + "epoch": 1.0696244234570613, + "grad_norm": 2.467550277709961, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7279996275901794, + "num_tokens": 243284475.0, + "step": 9740 + }, + { + "epoch": 1.0697342411596749, + "grad_norm": 2.3208553791046143, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7192709445953369, + "num_tokens": 243310767.0, + "step": 9741 + }, + { + "epoch": 1.0698440588622886, + "grad_norm": 2.1003196239471436, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.717449426651001, + "num_tokens": 243338704.0, + "step": 9742 + }, + { + "epoch": 1.0699538765649024, + "grad_norm": 2.0802361965179443, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7232555747032166, + "num_tokens": 243367012.0, + "step": 9743 + }, + { + "epoch": 1.070063694267516, + "grad_norm": 1.9850817918777466, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7085145711898804, + "num_tokens": 243396631.0, + "step": 9744 + }, + { + "epoch": 1.0701735119701297, + "grad_norm": 2.353224754333496, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7128713726997375, + "num_tokens": 243418276.0, + "step": 9745 + }, + { + "epoch": 
1.0702833296727432, + "grad_norm": 2.3202483654022217, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.69638991355896, + "num_tokens": 243443501.0, + "step": 9746 + }, + { + "epoch": 1.070393147375357, + "grad_norm": 2.2840933799743652, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7203634977340698, + "num_tokens": 243467061.0, + "step": 9747 + }, + { + "epoch": 1.0705029650779705, + "grad_norm": 2.1678266525268555, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7341632843017578, + "num_tokens": 243492009.0, + "step": 9748 + }, + { + "epoch": 1.0706127827805842, + "grad_norm": 2.3814022541046143, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7351059913635254, + "num_tokens": 243514858.0, + "step": 9749 + }, + { + "epoch": 1.0707226004831978, + "grad_norm": 2.6745693683624268, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7523387670516968, + "num_tokens": 243531137.0, + "step": 9750 + }, + { + "epoch": 1.0708324181858115, + "grad_norm": 2.37050199508667, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7227620482444763, + "num_tokens": 243558351.0, + "step": 9751 + }, + { + "epoch": 1.0709422358884253, + "grad_norm": 2.7133545875549316, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7076067924499512, + "num_tokens": 243576614.0, + "step": 9752 + }, + { + "epoch": 1.0710520535910388, + "grad_norm": 1.9169002771377563, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7068675756454468, + "num_tokens": 243613079.0, + "step": 9753 + }, + { + "epoch": 1.0711618712936526, + "grad_norm": 2.3608295917510986, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.7074158787727356, + "num_tokens": 243635652.0, + "step": 9754 + }, + { + "epoch": 1.0712716889962661, + "grad_norm": 2.2768523693084717, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 
0.7050715684890747, + "num_tokens": 243661272.0, + "step": 9755 + }, + { + "epoch": 1.0713815066988799, + "grad_norm": 2.5048458576202393, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7397201061248779, + "num_tokens": 243681124.0, + "step": 9756 + }, + { + "epoch": 1.0714913244014934, + "grad_norm": 2.0855159759521484, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7055571675300598, + "num_tokens": 243711187.0, + "step": 9757 + }, + { + "epoch": 1.0716011421041072, + "grad_norm": 2.5197856426239014, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7248777151107788, + "num_tokens": 243730702.0, + "step": 9758 + }, + { + "epoch": 1.071710959806721, + "grad_norm": 2.3035590648651123, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7243672609329224, + "num_tokens": 243753541.0, + "step": 9759 + }, + { + "epoch": 1.0718207775093345, + "grad_norm": 2.0084896087646484, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7104701399803162, + "num_tokens": 243782862.0, + "step": 9760 + }, + { + "epoch": 1.0719305952119482, + "grad_norm": 2.2838871479034424, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7391088008880615, + "num_tokens": 243805360.0, + "step": 9761 + }, + { + "epoch": 1.0720404129145618, + "grad_norm": 2.470911741256714, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7185719013214111, + "num_tokens": 243828391.0, + "step": 9762 + }, + { + "epoch": 1.0721502306171755, + "grad_norm": 2.2119762897491455, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7007964253425598, + "num_tokens": 243853562.0, + "step": 9763 + }, + { + "epoch": 1.072260048319789, + "grad_norm": 2.4706759452819824, + "learning_rate": 1e-06, + "loss": 0.8258, + "mean_token_accuracy": 0.7373933792114258, + "num_tokens": 243873316.0, + "step": 9764 + }, + { + "epoch": 1.0723698660224028, + "grad_norm": 
2.6037673950195312, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7186778783798218, + "num_tokens": 243892516.0, + "step": 9765 + }, + { + "epoch": 1.0724796837250166, + "grad_norm": 2.226611852645874, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7129652500152588, + "num_tokens": 243917716.0, + "step": 9766 + }, + { + "epoch": 1.07258950142763, + "grad_norm": 2.2219204902648926, + "learning_rate": 1e-06, + "loss": 0.9691, + "mean_token_accuracy": 0.7020535469055176, + "num_tokens": 243943480.0, + "step": 9767 + }, + { + "epoch": 1.0726993191302439, + "grad_norm": 2.121760129928589, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7021913528442383, + "num_tokens": 243971523.0, + "step": 9768 + }, + { + "epoch": 1.0728091368328574, + "grad_norm": 2.2865548133850098, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6939166784286499, + "num_tokens": 243998319.0, + "step": 9769 + }, + { + "epoch": 1.0729189545354711, + "grad_norm": 2.4228098392486572, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7411192059516907, + "num_tokens": 244018211.0, + "step": 9770 + }, + { + "epoch": 1.0730287722380847, + "grad_norm": 2.0774199962615967, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7132505178451538, + "num_tokens": 244047092.0, + "step": 9771 + }, + { + "epoch": 1.0731385899406984, + "grad_norm": 2.5391976833343506, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.695396363735199, + "num_tokens": 244068459.0, + "step": 9772 + }, + { + "epoch": 1.0732484076433122, + "grad_norm": 2.19164776802063, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7353668808937073, + "num_tokens": 244093406.0, + "step": 9773 + }, + { + "epoch": 1.0733582253459257, + "grad_norm": 2.2961950302124023, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7195339202880859, + "num_tokens": 
244117619.0, + "step": 9774 + }, + { + "epoch": 1.0734680430485395, + "grad_norm": 2.2290260791778564, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7288628816604614, + "num_tokens": 244141783.0, + "step": 9775 + }, + { + "epoch": 1.073577860751153, + "grad_norm": 2.297309637069702, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7171545028686523, + "num_tokens": 244166337.0, + "step": 9776 + }, + { + "epoch": 1.0736876784537668, + "grad_norm": 2.357276201248169, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7351740598678589, + "num_tokens": 244189769.0, + "step": 9777 + }, + { + "epoch": 1.0737974961563803, + "grad_norm": 2.22773814201355, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7435399293899536, + "num_tokens": 244214490.0, + "step": 9778 + }, + { + "epoch": 1.073907313858994, + "grad_norm": 2.7075135707855225, + "learning_rate": 1e-06, + "loss": 0.7807, + "mean_token_accuracy": 0.7468736171722412, + "num_tokens": 244232685.0, + "step": 9779 + }, + { + "epoch": 1.0740171315616078, + "grad_norm": 1.9704527854919434, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.717511773109436, + "num_tokens": 244264289.0, + "step": 9780 + }, + { + "epoch": 1.0741269492642214, + "grad_norm": 2.0385708808898926, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7188597917556763, + "num_tokens": 244293197.0, + "step": 9781 + }, + { + "epoch": 1.0742367669668351, + "grad_norm": 2.0998029708862305, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.728080153465271, + "num_tokens": 244320630.0, + "step": 9782 + }, + { + "epoch": 1.0743465846694487, + "grad_norm": 2.5777711868286133, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7230819463729858, + "num_tokens": 244340220.0, + "step": 9783 + }, + { + "epoch": 1.0744564023720624, + "grad_norm": 2.2067394256591797, + "learning_rate": 1e-06, + 
"loss": 1.0065, + "mean_token_accuracy": 0.6871749758720398, + "num_tokens": 244369708.0, + "step": 9784 + }, + { + "epoch": 1.074566220074676, + "grad_norm": 2.0999393463134766, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7193472385406494, + "num_tokens": 244396492.0, + "step": 9785 + }, + { + "epoch": 1.0746760377772897, + "grad_norm": 2.591165542602539, + "learning_rate": 1e-06, + "loss": 0.9682, + "mean_token_accuracy": 0.708489179611206, + "num_tokens": 244417365.0, + "step": 9786 + }, + { + "epoch": 1.0747858554799035, + "grad_norm": 2.274146556854248, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7161673903465271, + "num_tokens": 244443577.0, + "step": 9787 + }, + { + "epoch": 1.074895673182517, + "grad_norm": 2.361262321472168, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7033723592758179, + "num_tokens": 244469701.0, + "step": 9788 + }, + { + "epoch": 1.0750054908851308, + "grad_norm": 2.1673505306243896, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7083616852760315, + "num_tokens": 244497561.0, + "step": 9789 + }, + { + "epoch": 1.0751153085877443, + "grad_norm": 2.441338062286377, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.714431643486023, + "num_tokens": 244519169.0, + "step": 9790 + }, + { + "epoch": 1.075225126290358, + "grad_norm": 2.036651372909546, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.709825873374939, + "num_tokens": 244550680.0, + "step": 9791 + }, + { + "epoch": 1.0753349439929716, + "grad_norm": 2.147787570953369, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7075166702270508, + "num_tokens": 244577702.0, + "step": 9792 + }, + { + "epoch": 1.0754447616955853, + "grad_norm": 2.0377607345581055, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.712775707244873, + "num_tokens": 244607209.0, + "step": 9793 + }, + { + "epoch": 
1.075554579398199, + "grad_norm": 2.1136419773101807, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7091093063354492, + "num_tokens": 244633251.0, + "step": 9794 + }, + { + "epoch": 1.0756643971008126, + "grad_norm": 2.3644566535949707, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7093052864074707, + "num_tokens": 244656988.0, + "step": 9795 + }, + { + "epoch": 1.0757742148034264, + "grad_norm": 2.3223845958709717, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7055299282073975, + "num_tokens": 244682957.0, + "step": 9796 + }, + { + "epoch": 1.07588403250604, + "grad_norm": 2.4596893787384033, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7184550762176514, + "num_tokens": 244704640.0, + "step": 9797 + }, + { + "epoch": 1.0759938502086537, + "grad_norm": 2.120335102081299, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7307885885238647, + "num_tokens": 244732731.0, + "step": 9798 + }, + { + "epoch": 1.0761036679112672, + "grad_norm": 2.4102680683135986, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6950908303260803, + "num_tokens": 244758406.0, + "step": 9799 + }, + { + "epoch": 1.076213485613881, + "grad_norm": 2.601731538772583, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7369248270988464, + "num_tokens": 244776900.0, + "step": 9800 + }, + { + "epoch": 1.0763233033164945, + "grad_norm": 2.3612239360809326, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7088531255722046, + "num_tokens": 244800388.0, + "step": 9801 + }, + { + "epoch": 1.0764331210191083, + "grad_norm": 2.259674310684204, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7191950678825378, + "num_tokens": 244825770.0, + "step": 9802 + }, + { + "epoch": 1.076542938721722, + "grad_norm": 2.518132448196411, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 
0.7384136915206909, + "num_tokens": 244846345.0, + "step": 9803 + }, + { + "epoch": 1.0766527564243356, + "grad_norm": 2.1324117183685303, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7078026533126831, + "num_tokens": 244875194.0, + "step": 9804 + }, + { + "epoch": 1.0767625741269493, + "grad_norm": 2.5691051483154297, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7380405068397522, + "num_tokens": 244894290.0, + "step": 9805 + }, + { + "epoch": 1.0768723918295628, + "grad_norm": 2.4326555728912354, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7209790945053101, + "num_tokens": 244917849.0, + "step": 9806 + }, + { + "epoch": 1.0769822095321766, + "grad_norm": 2.708366870880127, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7156858444213867, + "num_tokens": 244937468.0, + "step": 9807 + }, + { + "epoch": 1.0770920272347904, + "grad_norm": 2.0934579372406006, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7413791418075562, + "num_tokens": 244964370.0, + "step": 9808 + }, + { + "epoch": 1.077201844937404, + "grad_norm": 2.0015318393707275, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.72386634349823, + "num_tokens": 244995307.0, + "step": 9809 + }, + { + "epoch": 1.0773116626400177, + "grad_norm": 2.020725965499878, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7388193607330322, + "num_tokens": 245023853.0, + "step": 9810 + }, + { + "epoch": 1.0774214803426312, + "grad_norm": 1.8404960632324219, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.721039891242981, + "num_tokens": 245057923.0, + "step": 9811 + }, + { + "epoch": 1.077531298045245, + "grad_norm": 2.176384687423706, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7246527671813965, + "num_tokens": 245083371.0, + "step": 9812 + }, + { + "epoch": 1.0776411157478585, + "grad_norm": 2.1646006107330322, 
+ "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7321656942367554, + "num_tokens": 245109959.0, + "step": 9813 + }, + { + "epoch": 1.0777509334504722, + "grad_norm": 2.1808977127075195, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7126206159591675, + "num_tokens": 245135756.0, + "step": 9814 + }, + { + "epoch": 1.0778607511530858, + "grad_norm": 2.347759485244751, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7425916194915771, + "num_tokens": 245156446.0, + "step": 9815 + }, + { + "epoch": 1.0779705688556995, + "grad_norm": 2.251616954803467, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7168472409248352, + "num_tokens": 245182847.0, + "step": 9816 + }, + { + "epoch": 1.0780803865583133, + "grad_norm": 2.430379867553711, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.714492917060852, + "num_tokens": 245204955.0, + "step": 9817 + }, + { + "epoch": 1.0781902042609268, + "grad_norm": 2.2033095359802246, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7094785571098328, + "num_tokens": 245230795.0, + "step": 9818 + }, + { + "epoch": 1.0783000219635406, + "grad_norm": 2.3632397651672363, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7330309152603149, + "num_tokens": 245253158.0, + "step": 9819 + }, + { + "epoch": 1.0784098396661541, + "grad_norm": 2.13861346244812, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7152615189552307, + "num_tokens": 245279549.0, + "step": 9820 + }, + { + "epoch": 1.0785196573687679, + "grad_norm": 2.3657634258270264, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7212303280830383, + "num_tokens": 245303736.0, + "step": 9821 + }, + { + "epoch": 1.0786294750713814, + "grad_norm": 2.1797099113464355, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.6988974809646606, + "num_tokens": 245331396.0, + "step": 9822 + 
}, + { + "epoch": 1.0787392927739952, + "grad_norm": 2.2265048027038574, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6907643675804138, + "num_tokens": 245359744.0, + "step": 9823 + }, + { + "epoch": 1.078849110476609, + "grad_norm": 1.865384817123413, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6887170076370239, + "num_tokens": 245394257.0, + "step": 9824 + }, + { + "epoch": 1.0789589281792225, + "grad_norm": 2.339597463607788, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7138939499855042, + "num_tokens": 245417186.0, + "step": 9825 + }, + { + "epoch": 1.0790687458818362, + "grad_norm": 2.439359426498413, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7295066714286804, + "num_tokens": 245439264.0, + "step": 9826 + }, + { + "epoch": 1.0791785635844497, + "grad_norm": 2.6896326541900635, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7326310276985168, + "num_tokens": 245458976.0, + "step": 9827 + }, + { + "epoch": 1.0792883812870635, + "grad_norm": 1.999338984489441, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7110609412193298, + "num_tokens": 245491136.0, + "step": 9828 + }, + { + "epoch": 1.079398198989677, + "grad_norm": 2.3260884284973145, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7382903695106506, + "num_tokens": 245516882.0, + "step": 9829 + }, + { + "epoch": 1.0795080166922908, + "grad_norm": 2.0904879570007324, + "learning_rate": 1e-06, + "loss": 0.9878, + "mean_token_accuracy": 0.6999523639678955, + "num_tokens": 245546313.0, + "step": 9830 + }, + { + "epoch": 1.0796178343949046, + "grad_norm": 2.77142596244812, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7159007787704468, + "num_tokens": 245565905.0, + "step": 9831 + }, + { + "epoch": 1.079727652097518, + "grad_norm": 2.5063555240631104, + "learning_rate": 1e-06, + "loss": 0.8693, + 
"mean_token_accuracy": 0.7321636080741882, + "num_tokens": 245586920.0, + "step": 9832 + }, + { + "epoch": 1.0798374698001318, + "grad_norm": 2.43179988861084, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7256079912185669, + "num_tokens": 245611935.0, + "step": 9833 + }, + { + "epoch": 1.0799472875027454, + "grad_norm": 2.145712375640869, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7187304496765137, + "num_tokens": 245640049.0, + "step": 9834 + }, + { + "epoch": 1.0800571052053591, + "grad_norm": 2.2015881538391113, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7251222133636475, + "num_tokens": 245667184.0, + "step": 9835 + }, + { + "epoch": 1.0801669229079727, + "grad_norm": 2.152451753616333, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7115883231163025, + "num_tokens": 245696056.0, + "step": 9836 + }, + { + "epoch": 1.0802767406105864, + "grad_norm": 2.46685528755188, + "learning_rate": 1e-06, + "loss": 0.7994, + "mean_token_accuracy": 0.7351676225662231, + "num_tokens": 245716283.0, + "step": 9837 + }, + { + "epoch": 1.0803865583132002, + "grad_norm": 2.485969305038452, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7354555130004883, + "num_tokens": 245737054.0, + "step": 9838 + }, + { + "epoch": 1.0804963760158137, + "grad_norm": 2.198666572570801, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6979326605796814, + "num_tokens": 245767239.0, + "step": 9839 + }, + { + "epoch": 1.0806061937184275, + "grad_norm": 2.1435413360595703, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7122803926467896, + "num_tokens": 245796221.0, + "step": 9840 + }, + { + "epoch": 1.080716011421041, + "grad_norm": 2.176577091217041, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7435499429702759, + "num_tokens": 245820688.0, + "step": 9841 + }, + { + "epoch": 1.0808258291236548, + 
"grad_norm": 2.2494797706604004, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7318295240402222, + "num_tokens": 245846882.0, + "step": 9842 + }, + { + "epoch": 1.0809356468262683, + "grad_norm": 2.0867109298706055, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7292177677154541, + "num_tokens": 245875505.0, + "step": 9843 + }, + { + "epoch": 1.081045464528882, + "grad_norm": 2.368975877761841, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.71331787109375, + "num_tokens": 245898951.0, + "step": 9844 + }, + { + "epoch": 1.0811552822314958, + "grad_norm": 2.257779121398926, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7243427038192749, + "num_tokens": 245924964.0, + "step": 9845 + }, + { + "epoch": 1.0812650999341094, + "grad_norm": 2.562046527862549, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7382848858833313, + "num_tokens": 245945215.0, + "step": 9846 + }, + { + "epoch": 1.081374917636723, + "grad_norm": 2.327545404434204, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7384689450263977, + "num_tokens": 245968452.0, + "step": 9847 + }, + { + "epoch": 1.0814847353393366, + "grad_norm": 2.3098766803741455, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7174191474914551, + "num_tokens": 245994057.0, + "step": 9848 + }, + { + "epoch": 1.0815945530419504, + "grad_norm": 2.37534761428833, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.712649941444397, + "num_tokens": 246017555.0, + "step": 9849 + }, + { + "epoch": 1.081704370744564, + "grad_norm": 2.0979464054107666, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7049579620361328, + "num_tokens": 246046078.0, + "step": 9850 + }, + { + "epoch": 1.0818141884471777, + "grad_norm": 2.454050302505493, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.734165608882904, + "num_tokens": 
246067061.0, + "step": 9851 + }, + { + "epoch": 1.0819240061497912, + "grad_norm": 2.0368640422821045, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.705662190914154, + "num_tokens": 246097241.0, + "step": 9852 + }, + { + "epoch": 1.082033823852405, + "grad_norm": 2.3268048763275146, + "learning_rate": 1e-06, + "loss": 0.824, + "mean_token_accuracy": 0.7395058274269104, + "num_tokens": 246123538.0, + "step": 9853 + }, + { + "epoch": 1.0821436415550187, + "grad_norm": 2.841198444366455, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7255795001983643, + "num_tokens": 246141527.0, + "step": 9854 + }, + { + "epoch": 1.0822534592576323, + "grad_norm": 2.1864516735076904, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7168616056442261, + "num_tokens": 246168706.0, + "step": 9855 + }, + { + "epoch": 1.082363276960246, + "grad_norm": 1.9555197954177856, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7117469906806946, + "num_tokens": 246200904.0, + "step": 9856 + }, + { + "epoch": 1.0824730946628596, + "grad_norm": 2.2280097007751465, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7192124724388123, + "num_tokens": 246227277.0, + "step": 9857 + }, + { + "epoch": 1.0825829123654733, + "grad_norm": 2.056292772293091, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.690477728843689, + "num_tokens": 246259999.0, + "step": 9858 + }, + { + "epoch": 1.082692730068087, + "grad_norm": 2.4566545486450195, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7142559885978699, + "num_tokens": 246281472.0, + "step": 9859 + }, + { + "epoch": 1.0828025477707006, + "grad_norm": 1.9944088459014893, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7353549003601074, + "num_tokens": 246311254.0, + "step": 9860 + }, + { + "epoch": 1.0829123654733144, + "grad_norm": 2.3252923488616943, + "learning_rate": 1e-06, + 
"loss": 0.9384, + "mean_token_accuracy": 0.7044099569320679, + "num_tokens": 246336009.0, + "step": 9861 + }, + { + "epoch": 1.083022183175928, + "grad_norm": 2.1193699836730957, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.718458890914917, + "num_tokens": 246363127.0, + "step": 9862 + }, + { + "epoch": 1.0831320008785417, + "grad_norm": 2.4277865886688232, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7337668538093567, + "num_tokens": 246386791.0, + "step": 9863 + }, + { + "epoch": 1.0832418185811552, + "grad_norm": 2.3269450664520264, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7353633642196655, + "num_tokens": 246410713.0, + "step": 9864 + }, + { + "epoch": 1.083351636283769, + "grad_norm": 2.5297536849975586, + "learning_rate": 1e-06, + "loss": 0.7917, + "mean_token_accuracy": 0.7493014335632324, + "num_tokens": 246429797.0, + "step": 9865 + }, + { + "epoch": 1.0834614539863825, + "grad_norm": 2.605635643005371, + "learning_rate": 1e-06, + "loss": 0.7392, + "mean_token_accuracy": 0.7605077028274536, + "num_tokens": 246447846.0, + "step": 9866 + }, + { + "epoch": 1.0835712716889963, + "grad_norm": 2.240701675415039, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7347369194030762, + "num_tokens": 246471720.0, + "step": 9867 + }, + { + "epoch": 1.08368108939161, + "grad_norm": 2.5309979915618896, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7275604009628296, + "num_tokens": 246491958.0, + "step": 9868 + }, + { + "epoch": 1.0837909070942235, + "grad_norm": 2.122990846633911, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7285212278366089, + "num_tokens": 246520227.0, + "step": 9869 + }, + { + "epoch": 1.0839007247968373, + "grad_norm": 2.233470916748047, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7065994739532471, + "num_tokens": 246545520.0, + "step": 9870 + }, + { + "epoch": 
1.0840105424994508, + "grad_norm": 1.9949148893356323, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7421306371688843, + "num_tokens": 246572771.0, + "step": 9871 + }, + { + "epoch": 1.0841203602020646, + "grad_norm": 2.783062696456909, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7145848870277405, + "num_tokens": 246590262.0, + "step": 9872 + }, + { + "epoch": 1.0842301779046783, + "grad_norm": 2.4923477172851562, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.694994330406189, + "num_tokens": 246613239.0, + "step": 9873 + }, + { + "epoch": 1.0843399956072919, + "grad_norm": 2.269272565841675, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7138472199440002, + "num_tokens": 246640103.0, + "step": 9874 + }, + { + "epoch": 1.0844498133099056, + "grad_norm": 2.2335782051086426, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.699042558670044, + "num_tokens": 246665832.0, + "step": 9875 + }, + { + "epoch": 1.0845596310125192, + "grad_norm": 2.2553958892822266, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7037359476089478, + "num_tokens": 246692248.0, + "step": 9876 + }, + { + "epoch": 1.084669448715133, + "grad_norm": 2.051586627960205, + "learning_rate": 1e-06, + "loss": 1.0058, + "mean_token_accuracy": 0.6997453570365906, + "num_tokens": 246724059.0, + "step": 9877 + }, + { + "epoch": 1.0847792664177465, + "grad_norm": 2.274372100830078, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.689976692199707, + "num_tokens": 246750956.0, + "step": 9878 + }, + { + "epoch": 1.0848890841203602, + "grad_norm": 2.334045886993408, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7355710864067078, + "num_tokens": 246774325.0, + "step": 9879 + }, + { + "epoch": 1.0849989018229738, + "grad_norm": 2.146667003631592, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 
0.7361329793930054, + "num_tokens": 246800574.0, + "step": 9880 + }, + { + "epoch": 1.0851087195255875, + "grad_norm": 2.501577377319336, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7154223322868347, + "num_tokens": 246822475.0, + "step": 9881 + }, + { + "epoch": 1.0852185372282013, + "grad_norm": 1.8801196813583374, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7044922113418579, + "num_tokens": 246860148.0, + "step": 9882 + }, + { + "epoch": 1.0853283549308148, + "grad_norm": 2.5422163009643555, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7204736471176147, + "num_tokens": 246882199.0, + "step": 9883 + }, + { + "epoch": 1.0854381726334286, + "grad_norm": 2.4096527099609375, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7190920114517212, + "num_tokens": 246906215.0, + "step": 9884 + }, + { + "epoch": 1.085547990336042, + "grad_norm": 2.1734306812286377, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6991610527038574, + "num_tokens": 246934453.0, + "step": 9885 + }, + { + "epoch": 1.0856578080386559, + "grad_norm": 2.295231580734253, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7199803590774536, + "num_tokens": 246959171.0, + "step": 9886 + }, + { + "epoch": 1.0857676257412694, + "grad_norm": 2.440500497817993, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7423809766769409, + "num_tokens": 246981966.0, + "step": 9887 + }, + { + "epoch": 1.0858774434438832, + "grad_norm": 2.267364263534546, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7109891176223755, + "num_tokens": 247006806.0, + "step": 9888 + }, + { + "epoch": 1.085987261146497, + "grad_norm": 2.4422755241394043, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7521827816963196, + "num_tokens": 247027719.0, + "step": 9889 + }, + { + "epoch": 1.0860970788491104, + "grad_norm": 
2.2239253520965576, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.727556586265564, + "num_tokens": 247052818.0, + "step": 9890 + }, + { + "epoch": 1.0862068965517242, + "grad_norm": 2.338108777999878, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7028542757034302, + "num_tokens": 247076252.0, + "step": 9891 + }, + { + "epoch": 1.0863167142543377, + "grad_norm": 2.0688841342926025, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6986109614372253, + "num_tokens": 247106683.0, + "step": 9892 + }, + { + "epoch": 1.0864265319569515, + "grad_norm": 2.1594467163085938, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7067055702209473, + "num_tokens": 247134051.0, + "step": 9893 + }, + { + "epoch": 1.086536349659565, + "grad_norm": 2.4001834392547607, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7266245484352112, + "num_tokens": 247156187.0, + "step": 9894 + }, + { + "epoch": 1.0866461673621788, + "grad_norm": 3.0459842681884766, + "learning_rate": 1e-06, + "loss": 0.8159, + "mean_token_accuracy": 0.7394919395446777, + "num_tokens": 247172483.0, + "step": 9895 + }, + { + "epoch": 1.0867559850647925, + "grad_norm": 2.186657428741455, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7277922630310059, + "num_tokens": 247199113.0, + "step": 9896 + }, + { + "epoch": 1.086865802767406, + "grad_norm": 2.3559720516204834, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7088733911514282, + "num_tokens": 247220958.0, + "step": 9897 + }, + { + "epoch": 1.0869756204700198, + "grad_norm": 2.1003775596618652, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.723699152469635, + "num_tokens": 247246568.0, + "step": 9898 + }, + { + "epoch": 1.0870854381726334, + "grad_norm": 2.060610294342041, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7065150141716003, + "num_tokens": 
247278156.0, + "step": 9899 + }, + { + "epoch": 1.0871952558752471, + "grad_norm": 2.575059175491333, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7344958186149597, + "num_tokens": 247300003.0, + "step": 9900 + }, + { + "epoch": 1.0873050735778607, + "grad_norm": 2.171372175216675, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7123425006866455, + "num_tokens": 247329168.0, + "step": 9901 + }, + { + "epoch": 1.0874148912804744, + "grad_norm": 2.2646799087524414, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.720649242401123, + "num_tokens": 247354638.0, + "step": 9902 + }, + { + "epoch": 1.0875247089830882, + "grad_norm": 2.2588326930999756, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7276942729949951, + "num_tokens": 247380468.0, + "step": 9903 + }, + { + "epoch": 1.0876345266857017, + "grad_norm": 2.1234803199768066, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7172325253486633, + "num_tokens": 247409521.0, + "step": 9904 + }, + { + "epoch": 1.0877443443883155, + "grad_norm": 2.1500658988952637, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7183182239532471, + "num_tokens": 247436747.0, + "step": 9905 + }, + { + "epoch": 1.087854162090929, + "grad_norm": 2.2427480220794678, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7261897921562195, + "num_tokens": 247462588.0, + "step": 9906 + }, + { + "epoch": 1.0879639797935428, + "grad_norm": 2.3656017780303955, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7275623083114624, + "num_tokens": 247485135.0, + "step": 9907 + }, + { + "epoch": 1.0880737974961563, + "grad_norm": 1.9695674180984497, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7273664474487305, + "num_tokens": 247517065.0, + "step": 9908 + }, + { + "epoch": 1.08818361519877, + "grad_norm": 2.0730011463165283, + "learning_rate": 1e-06, + 
"loss": 0.8424, + "mean_token_accuracy": 0.7372402548789978, + "num_tokens": 247545724.0, + "step": 9909 + }, + { + "epoch": 1.0882934329013838, + "grad_norm": 2.2289934158325195, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.704861044883728, + "num_tokens": 247572774.0, + "step": 9910 + }, + { + "epoch": 1.0884032506039973, + "grad_norm": 2.177314519882202, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.724149763584137, + "num_tokens": 247599177.0, + "step": 9911 + }, + { + "epoch": 1.088513068306611, + "grad_norm": 2.688507080078125, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7408283948898315, + "num_tokens": 247617417.0, + "step": 9912 + }, + { + "epoch": 1.0886228860092246, + "grad_norm": 2.450279474258423, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.726172685623169, + "num_tokens": 247638121.0, + "step": 9913 + }, + { + "epoch": 1.0887327037118384, + "grad_norm": 2.101606845855713, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7169222831726074, + "num_tokens": 247667388.0, + "step": 9914 + }, + { + "epoch": 1.088842521414452, + "grad_norm": 2.197626829147339, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7150291204452515, + "num_tokens": 247693996.0, + "step": 9915 + }, + { + "epoch": 1.0889523391170657, + "grad_norm": 2.1309261322021484, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7125102877616882, + "num_tokens": 247725358.0, + "step": 9916 + }, + { + "epoch": 1.0890621568196792, + "grad_norm": 2.190664291381836, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 0.7213635444641113, + "num_tokens": 247751127.0, + "step": 9917 + }, + { + "epoch": 1.089171974522293, + "grad_norm": 2.6473331451416016, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7204688787460327, + "num_tokens": 247770294.0, + "step": 9918 + }, + { + "epoch": 
1.0892817922249067, + "grad_norm": 2.3210642337799072, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6988586783409119, + "num_tokens": 247796775.0, + "step": 9919 + }, + { + "epoch": 1.0893916099275203, + "grad_norm": 2.3388113975524902, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7058178186416626, + "num_tokens": 247821577.0, + "step": 9920 + }, + { + "epoch": 1.089501427630134, + "grad_norm": 2.2585673332214355, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7091801762580872, + "num_tokens": 247846870.0, + "step": 9921 + }, + { + "epoch": 1.0896112453327476, + "grad_norm": 2.08170485496521, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.7079854011535645, + "num_tokens": 247876314.0, + "step": 9922 + }, + { + "epoch": 1.0897210630353613, + "grad_norm": 2.2831826210021973, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7417440414428711, + "num_tokens": 247899796.0, + "step": 9923 + }, + { + "epoch": 1.089830880737975, + "grad_norm": 2.301069736480713, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7245110869407654, + "num_tokens": 247923762.0, + "step": 9924 + }, + { + "epoch": 1.0899406984405886, + "grad_norm": 2.4442856311798096, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7116807699203491, + "num_tokens": 247947527.0, + "step": 9925 + }, + { + "epoch": 1.0900505161432024, + "grad_norm": 2.0927631855010986, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7105770111083984, + "num_tokens": 247975704.0, + "step": 9926 + }, + { + "epoch": 1.090160333845816, + "grad_norm": 2.122248411178589, + "learning_rate": 1e-06, + "loss": 1.0407, + "mean_token_accuracy": 0.6782827973365784, + "num_tokens": 248007163.0, + "step": 9927 + }, + { + "epoch": 1.0902701515484297, + "grad_norm": 2.2012228965759277, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 
0.6977361440658569, + "num_tokens": 248034234.0, + "step": 9928 + }, + { + "epoch": 1.0903799692510432, + "grad_norm": 2.3787741661071777, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7358055114746094, + "num_tokens": 248055783.0, + "step": 9929 + }, + { + "epoch": 1.090489786953657, + "grad_norm": 2.2163610458374023, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7170825004577637, + "num_tokens": 248083941.0, + "step": 9930 + }, + { + "epoch": 1.0905996046562705, + "grad_norm": 2.211980104446411, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.723741352558136, + "num_tokens": 248107390.0, + "step": 9931 + }, + { + "epoch": 1.0907094223588842, + "grad_norm": 2.289245128631592, + "learning_rate": 1e-06, + "loss": 0.9771, + "mean_token_accuracy": 0.7044510841369629, + "num_tokens": 248134354.0, + "step": 9932 + }, + { + "epoch": 1.090819240061498, + "grad_norm": 2.1795902252197266, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7113332748413086, + "num_tokens": 248162558.0, + "step": 9933 + }, + { + "epoch": 1.0909290577641115, + "grad_norm": 2.1482930183410645, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7381052374839783, + "num_tokens": 248187378.0, + "step": 9934 + }, + { + "epoch": 1.0910388754667253, + "grad_norm": 2.0903193950653076, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6937394142150879, + "num_tokens": 248216526.0, + "step": 9935 + }, + { + "epoch": 1.0911486931693388, + "grad_norm": 2.3342502117156982, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7050307989120483, + "num_tokens": 248240059.0, + "step": 9936 + }, + { + "epoch": 1.0912585108719526, + "grad_norm": 2.2550926208496094, + "learning_rate": 1e-06, + "loss": 0.8829, + "mean_token_accuracy": 0.7290974259376526, + "num_tokens": 248266395.0, + "step": 9937 + }, + { + "epoch": 1.0913683285745663, + "grad_norm": 
2.28631329536438, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7193308472633362, + "num_tokens": 248291808.0, + "step": 9938 + }, + { + "epoch": 1.0914781462771799, + "grad_norm": 2.3489022254943848, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7084985375404358, + "num_tokens": 248316417.0, + "step": 9939 + }, + { + "epoch": 1.0915879639797936, + "grad_norm": 2.3403513431549072, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.6920729279518127, + "num_tokens": 248341563.0, + "step": 9940 + }, + { + "epoch": 1.0916977816824072, + "grad_norm": 2.5371596813201904, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7152284383773804, + "num_tokens": 248363302.0, + "step": 9941 + }, + { + "epoch": 1.091807599385021, + "grad_norm": 2.108071804046631, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7044172286987305, + "num_tokens": 248390255.0, + "step": 9942 + }, + { + "epoch": 1.0919174170876345, + "grad_norm": 2.287158489227295, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7215547561645508, + "num_tokens": 248415264.0, + "step": 9943 + }, + { + "epoch": 1.0920272347902482, + "grad_norm": 2.2091379165649414, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6957324743270874, + "num_tokens": 248443060.0, + "step": 9944 + }, + { + "epoch": 1.0921370524928617, + "grad_norm": 2.2628912925720215, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7144230604171753, + "num_tokens": 248466936.0, + "step": 9945 + }, + { + "epoch": 1.0922468701954755, + "grad_norm": 2.2907767295837402, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7231173515319824, + "num_tokens": 248493485.0, + "step": 9946 + }, + { + "epoch": 1.0923566878980893, + "grad_norm": 2.0739970207214355, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7133861780166626, + "num_tokens": 
248522518.0, + "step": 9947 + }, + { + "epoch": 1.0924665056007028, + "grad_norm": 2.2032811641693115, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7151427268981934, + "num_tokens": 248549903.0, + "step": 9948 + }, + { + "epoch": 1.0925763233033166, + "grad_norm": 2.2140145301818848, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7324331402778625, + "num_tokens": 248575492.0, + "step": 9949 + }, + { + "epoch": 1.09268614100593, + "grad_norm": 2.468611001968384, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7223442792892456, + "num_tokens": 248596369.0, + "step": 9950 + }, + { + "epoch": 1.0927959587085438, + "grad_norm": 2.4726192951202393, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7257914543151855, + "num_tokens": 248617448.0, + "step": 9951 + }, + { + "epoch": 1.0929057764111574, + "grad_norm": 2.2774057388305664, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.724563717842102, + "num_tokens": 248641213.0, + "step": 9952 + }, + { + "epoch": 1.0930155941137711, + "grad_norm": 2.4721717834472656, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7202280759811401, + "num_tokens": 248662902.0, + "step": 9953 + }, + { + "epoch": 1.093125411816385, + "grad_norm": 2.4979164600372314, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7237474322319031, + "num_tokens": 248684315.0, + "step": 9954 + }, + { + "epoch": 1.0932352295189984, + "grad_norm": 2.722644329071045, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7418761849403381, + "num_tokens": 248701758.0, + "step": 9955 + }, + { + "epoch": 1.0933450472216122, + "grad_norm": 2.4869673252105713, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7102057337760925, + "num_tokens": 248723162.0, + "step": 9956 + }, + { + "epoch": 1.0934548649242257, + "grad_norm": 2.163580894470215, + "learning_rate": 1e-06, + 
"loss": 0.9556, + "mean_token_accuracy": 0.7080657482147217, + "num_tokens": 248751797.0, + "step": 9957 + }, + { + "epoch": 1.0935646826268395, + "grad_norm": 2.1875803470611572, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7169730067253113, + "num_tokens": 248778228.0, + "step": 9958 + }, + { + "epoch": 1.093674500329453, + "grad_norm": 2.0571727752685547, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7210138440132141, + "num_tokens": 248809724.0, + "step": 9959 + }, + { + "epoch": 1.0937843180320668, + "grad_norm": 2.4341068267822266, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7299289703369141, + "num_tokens": 248832168.0, + "step": 9960 + }, + { + "epoch": 1.0938941357346805, + "grad_norm": 2.1594228744506836, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7142937779426575, + "num_tokens": 248860626.0, + "step": 9961 + }, + { + "epoch": 1.094003953437294, + "grad_norm": 2.441512107849121, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7274935245513916, + "num_tokens": 248883544.0, + "step": 9962 + }, + { + "epoch": 1.0941137711399078, + "grad_norm": 2.377699851989746, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7296774387359619, + "num_tokens": 248906526.0, + "step": 9963 + }, + { + "epoch": 1.0942235888425214, + "grad_norm": 2.454305648803711, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7221893072128296, + "num_tokens": 248929304.0, + "step": 9964 + }, + { + "epoch": 1.0943334065451351, + "grad_norm": 2.2443196773529053, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7123238444328308, + "num_tokens": 248954607.0, + "step": 9965 + }, + { + "epoch": 1.0944432242477486, + "grad_norm": 2.530200719833374, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.717702329158783, + "num_tokens": 248976268.0, + "step": 9966 + }, + { + "epoch": 
1.0945530419503624, + "grad_norm": 2.3024075031280518, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7085961699485779, + "num_tokens": 249000646.0, + "step": 9967 + }, + { + "epoch": 1.0946628596529762, + "grad_norm": 2.1091480255126953, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7193285226821899, + "num_tokens": 249028552.0, + "step": 9968 + }, + { + "epoch": 1.0947726773555897, + "grad_norm": 2.327186107635498, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7258805632591248, + "num_tokens": 249053014.0, + "step": 9969 + }, + { + "epoch": 1.0948824950582035, + "grad_norm": 2.4125189781188965, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7290352582931519, + "num_tokens": 249073324.0, + "step": 9970 + }, + { + "epoch": 1.094992312760817, + "grad_norm": 2.345839738845825, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7243466377258301, + "num_tokens": 249099218.0, + "step": 9971 + }, + { + "epoch": 1.0951021304634307, + "grad_norm": 2.5946059226989746, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7317119836807251, + "num_tokens": 249119238.0, + "step": 9972 + }, + { + "epoch": 1.0952119481660443, + "grad_norm": 2.336472988128662, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7235316038131714, + "num_tokens": 249143196.0, + "step": 9973 + }, + { + "epoch": 1.095321765868658, + "grad_norm": 2.2750232219696045, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7148298025131226, + "num_tokens": 249168414.0, + "step": 9974 + }, + { + "epoch": 1.0954315835712718, + "grad_norm": 2.426664352416992, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.6964004039764404, + "num_tokens": 249193963.0, + "step": 9975 + }, + { + "epoch": 1.0955414012738853, + "grad_norm": 2.2155959606170654, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 
0.7166858315467834, + "num_tokens": 249222756.0, + "step": 9976 + }, + { + "epoch": 1.095651218976499, + "grad_norm": 2.2689671516418457, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7143352031707764, + "num_tokens": 249249934.0, + "step": 9977 + }, + { + "epoch": 1.0957610366791126, + "grad_norm": 2.5176589488983154, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7178362607955933, + "num_tokens": 249271445.0, + "step": 9978 + }, + { + "epoch": 1.0958708543817264, + "grad_norm": 2.337782859802246, + "learning_rate": 1e-06, + "loss": 0.8008, + "mean_token_accuracy": 0.746078610420227, + "num_tokens": 249295182.0, + "step": 9979 + }, + { + "epoch": 1.09598067208434, + "grad_norm": 2.1433136463165283, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7029632925987244, + "num_tokens": 249324206.0, + "step": 9980 + }, + { + "epoch": 1.0960904897869537, + "grad_norm": 2.091550827026367, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7055894136428833, + "num_tokens": 249352814.0, + "step": 9981 + }, + { + "epoch": 1.0962003074895672, + "grad_norm": 2.3146989345550537, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7314153909683228, + "num_tokens": 249376448.0, + "step": 9982 + }, + { + "epoch": 1.096310125192181, + "grad_norm": 2.3242478370666504, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7263240218162537, + "num_tokens": 249399477.0, + "step": 9983 + }, + { + "epoch": 1.0964199428947947, + "grad_norm": 2.021106481552124, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7121971249580383, + "num_tokens": 249431504.0, + "step": 9984 + }, + { + "epoch": 1.0965297605974083, + "grad_norm": 2.094888687133789, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7067543268203735, + "num_tokens": 249463044.0, + "step": 9985 + }, + { + "epoch": 1.096639578300022, + "grad_norm": 2.117111921310425, 
+ "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7218602895736694, + "num_tokens": 249492292.0, + "step": 9986 + }, + { + "epoch": 1.0967493960026355, + "grad_norm": 2.447659730911255, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7055178880691528, + "num_tokens": 249515790.0, + "step": 9987 + }, + { + "epoch": 1.0968592137052493, + "grad_norm": 1.9183043241500854, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7211783528327942, + "num_tokens": 249546510.0, + "step": 9988 + }, + { + "epoch": 1.096969031407863, + "grad_norm": 2.5083858966827393, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7154255509376526, + "num_tokens": 249568875.0, + "step": 9989 + }, + { + "epoch": 1.0970788491104766, + "grad_norm": 2.463836193084717, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7065175771713257, + "num_tokens": 249592980.0, + "step": 9990 + }, + { + "epoch": 1.0971886668130904, + "grad_norm": 2.41577410697937, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7073036432266235, + "num_tokens": 249618493.0, + "step": 9991 + }, + { + "epoch": 1.0972984845157039, + "grad_norm": 2.212146043777466, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.75255286693573, + "num_tokens": 249645050.0, + "step": 9992 + }, + { + "epoch": 1.0974083022183176, + "grad_norm": 2.7114527225494385, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7124663591384888, + "num_tokens": 249664871.0, + "step": 9993 + }, + { + "epoch": 1.0975181199209312, + "grad_norm": 2.0812251567840576, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7208915948867798, + "num_tokens": 249692069.0, + "step": 9994 + }, + { + "epoch": 1.097627937623545, + "grad_norm": 1.9748222827911377, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7283734083175659, + "num_tokens": 249722649.0, + "step": 9995 + 
}, + { + "epoch": 1.0977377553261585, + "grad_norm": 2.6881301403045654, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7411580085754395, + "num_tokens": 249740365.0, + "step": 9996 + }, + { + "epoch": 1.0978475730287722, + "grad_norm": 2.7660071849823, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7232419848442078, + "num_tokens": 249761437.0, + "step": 9997 + }, + { + "epoch": 1.097957390731386, + "grad_norm": 2.2447385787963867, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7142534852027893, + "num_tokens": 249786637.0, + "step": 9998 + }, + { + "epoch": 1.0980672084339995, + "grad_norm": 2.0998826026916504, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7168787717819214, + "num_tokens": 249816268.0, + "step": 9999 + }, + { + "epoch": 1.0981770261366133, + "grad_norm": 2.13638973236084, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7305505275726318, + "num_tokens": 249843373.0, + "step": 10000 + }, + { + "epoch": 1.0982868438392268, + "grad_norm": 2.485414743423462, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7225223779678345, + "num_tokens": 249865556.0, + "step": 10001 + }, + { + "epoch": 1.0983966615418406, + "grad_norm": 2.390516757965088, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.695320725440979, + "num_tokens": 249891204.0, + "step": 10002 + }, + { + "epoch": 1.098506479244454, + "grad_norm": 1.74717116355896, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7236934900283813, + "num_tokens": 249930497.0, + "step": 10003 + }, + { + "epoch": 1.0986162969470679, + "grad_norm": 2.502730369567871, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7224795818328857, + "num_tokens": 249951180.0, + "step": 10004 + }, + { + "epoch": 1.0987261146496816, + "grad_norm": 2.1496403217315674, + "learning_rate": 1e-06, + "loss": 0.8564, + 
"mean_token_accuracy": 0.7372497320175171, + "num_tokens": 249976643.0, + "step": 10005 + }, + { + "epoch": 1.0988359323522952, + "grad_norm": 2.5593326091766357, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7408868074417114, + "num_tokens": 249995548.0, + "step": 10006 + }, + { + "epoch": 1.098945750054909, + "grad_norm": 2.167649507522583, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7164802551269531, + "num_tokens": 250021455.0, + "step": 10007 + }, + { + "epoch": 1.0990555677575224, + "grad_norm": 2.3408868312835693, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7355215549468994, + "num_tokens": 250044386.0, + "step": 10008 + }, + { + "epoch": 1.0991653854601362, + "grad_norm": 2.5266451835632324, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7169133424758911, + "num_tokens": 250064912.0, + "step": 10009 + }, + { + "epoch": 1.0992752031627497, + "grad_norm": 2.398117780685425, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7073554396629333, + "num_tokens": 250090284.0, + "step": 10010 + }, + { + "epoch": 1.0993850208653635, + "grad_norm": 2.163963556289673, + "learning_rate": 1e-06, + "loss": 1.0278, + "mean_token_accuracy": 0.6833648681640625, + "num_tokens": 250121918.0, + "step": 10011 + }, + { + "epoch": 1.0994948385679773, + "grad_norm": 2.461682081222534, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7073830962181091, + "num_tokens": 250144263.0, + "step": 10012 + }, + { + "epoch": 1.0996046562705908, + "grad_norm": 2.1090192794799805, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7210231423377991, + "num_tokens": 250170412.0, + "step": 10013 + }, + { + "epoch": 1.0997144739732045, + "grad_norm": 2.2140204906463623, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.71517014503479, + "num_tokens": 250196097.0, + "step": 10014 + }, + { + "epoch": 
1.099824291675818, + "grad_norm": 2.7028112411499023, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7409818768501282, + "num_tokens": 250214389.0, + "step": 10015 + }, + { + "epoch": 1.0999341093784318, + "grad_norm": 2.29746413230896, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7026058435440063, + "num_tokens": 250241819.0, + "step": 10016 + }, + { + "epoch": 1.1000439270810454, + "grad_norm": 2.441920280456543, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7002720236778259, + "num_tokens": 250266180.0, + "step": 10017 + }, + { + "epoch": 1.1001537447836591, + "grad_norm": 2.1852378845214844, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7214507460594177, + "num_tokens": 250292589.0, + "step": 10018 + }, + { + "epoch": 1.1002635624862729, + "grad_norm": 2.513185739517212, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7300254702568054, + "num_tokens": 250313051.0, + "step": 10019 + }, + { + "epoch": 1.1003733801888864, + "grad_norm": 2.2795004844665527, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7216954231262207, + "num_tokens": 250339020.0, + "step": 10020 + }, + { + "epoch": 1.1004831978915002, + "grad_norm": 2.5161337852478027, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7088029384613037, + "num_tokens": 250362820.0, + "step": 10021 + }, + { + "epoch": 1.1005930155941137, + "grad_norm": 2.0628931522369385, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7239833474159241, + "num_tokens": 250391813.0, + "step": 10022 + }, + { + "epoch": 1.1007028332967275, + "grad_norm": 2.1698122024536133, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7000700235366821, + "num_tokens": 250417749.0, + "step": 10023 + }, + { + "epoch": 1.100812650999341, + "grad_norm": 2.396078109741211, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 
0.7176997065544128, + "num_tokens": 250439592.0, + "step": 10024 + }, + { + "epoch": 1.1009224687019548, + "grad_norm": 2.77414870262146, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7286456823348999, + "num_tokens": 250456735.0, + "step": 10025 + }, + { + "epoch": 1.1010322864045685, + "grad_norm": 2.5756146907806396, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7160497903823853, + "num_tokens": 250477448.0, + "step": 10026 + }, + { + "epoch": 1.101142104107182, + "grad_norm": 2.40962815284729, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7232921123504639, + "num_tokens": 250501228.0, + "step": 10027 + }, + { + "epoch": 1.1012519218097958, + "grad_norm": 2.717156410217285, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7158401012420654, + "num_tokens": 250520337.0, + "step": 10028 + }, + { + "epoch": 1.1013617395124093, + "grad_norm": 2.210146188735962, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.719825804233551, + "num_tokens": 250545311.0, + "step": 10029 + }, + { + "epoch": 1.101471557215023, + "grad_norm": 1.9285454750061035, + "learning_rate": 1e-06, + "loss": 1.0133, + "mean_token_accuracy": 0.7009458541870117, + "num_tokens": 250577804.0, + "step": 10030 + }, + { + "epoch": 1.1015813749176366, + "grad_norm": 1.866439938545227, + "learning_rate": 1e-06, + "loss": 1.1149, + "mean_token_accuracy": 0.6640346050262451, + "num_tokens": 250613184.0, + "step": 10031 + }, + { + "epoch": 1.1016911926202504, + "grad_norm": 2.147704839706421, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7175880074501038, + "num_tokens": 250639340.0, + "step": 10032 + }, + { + "epoch": 1.1018010103228641, + "grad_norm": 2.4474167823791504, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7184128165245056, + "num_tokens": 250661316.0, + "step": 10033 + }, + { + "epoch": 1.1019108280254777, + "grad_norm": 
2.275031805038452, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7072314023971558, + "num_tokens": 250687478.0, + "step": 10034 + }, + { + "epoch": 1.1020206457280914, + "grad_norm": 1.9895154237747192, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7082730531692505, + "num_tokens": 250719103.0, + "step": 10035 + }, + { + "epoch": 1.102130463430705, + "grad_norm": 2.759903907775879, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7186385989189148, + "num_tokens": 250737894.0, + "step": 10036 + }, + { + "epoch": 1.1022402811333187, + "grad_norm": 2.561063289642334, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7324029207229614, + "num_tokens": 250757831.0, + "step": 10037 + }, + { + "epoch": 1.1023500988359323, + "grad_norm": 2.5115065574645996, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7358496785163879, + "num_tokens": 250779024.0, + "step": 10038 + }, + { + "epoch": 1.102459916538546, + "grad_norm": 2.0453035831451416, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7301247119903564, + "num_tokens": 250806107.0, + "step": 10039 + }, + { + "epoch": 1.1025697342411598, + "grad_norm": 2.2462449073791504, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7167302370071411, + "num_tokens": 250832024.0, + "step": 10040 + }, + { + "epoch": 1.1026795519437733, + "grad_norm": 2.2276148796081543, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7099689245223999, + "num_tokens": 250857967.0, + "step": 10041 + }, + { + "epoch": 1.102789369646387, + "grad_norm": 2.236995220184326, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7195990085601807, + "num_tokens": 250881501.0, + "step": 10042 + }, + { + "epoch": 1.1028991873490006, + "grad_norm": 2.680006265640259, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7160031795501709, + "num_tokens": 
250900418.0, + "step": 10043 + }, + { + "epoch": 1.1030090050516144, + "grad_norm": 2.2913427352905273, + "learning_rate": 1e-06, + "loss": 0.7358, + "mean_token_accuracy": 0.7645233869552612, + "num_tokens": 250922191.0, + "step": 10044 + }, + { + "epoch": 1.103118822754228, + "grad_norm": 2.3559281826019287, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7300387620925903, + "num_tokens": 250944956.0, + "step": 10045 + }, + { + "epoch": 1.1032286404568417, + "grad_norm": 2.413745880126953, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7279483079910278, + "num_tokens": 250968634.0, + "step": 10046 + }, + { + "epoch": 1.1033384581594552, + "grad_norm": 2.3678059577941895, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7162199020385742, + "num_tokens": 250993017.0, + "step": 10047 + }, + { + "epoch": 1.103448275862069, + "grad_norm": 2.2927815914154053, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7028999328613281, + "num_tokens": 251020156.0, + "step": 10048 + }, + { + "epoch": 1.1035580935646827, + "grad_norm": 2.1857354640960693, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7070125937461853, + "num_tokens": 251047581.0, + "step": 10049 + }, + { + "epoch": 1.1036679112672962, + "grad_norm": 2.5468320846557617, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7518820762634277, + "num_tokens": 251068213.0, + "step": 10050 + }, + { + "epoch": 1.10377772896991, + "grad_norm": 2.2948062419891357, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7324867248535156, + "num_tokens": 251091549.0, + "step": 10051 + }, + { + "epoch": 1.1038875466725235, + "grad_norm": 2.3165769577026367, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.721017599105835, + "num_tokens": 251114774.0, + "step": 10052 + }, + { + "epoch": 1.1039973643751373, + "grad_norm": 2.5999298095703125, + "learning_rate": 
1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7082604169845581, + "num_tokens": 251134531.0, + "step": 10053 + }, + { + "epoch": 1.104107182077751, + "grad_norm": 2.0064430236816406, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7230027914047241, + "num_tokens": 251164774.0, + "step": 10054 + }, + { + "epoch": 1.1042169997803646, + "grad_norm": 2.291654348373413, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.727297842502594, + "num_tokens": 251189404.0, + "step": 10055 + }, + { + "epoch": 1.1043268174829783, + "grad_norm": 2.1542892456054688, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7174972295761108, + "num_tokens": 251216020.0, + "step": 10056 + }, + { + "epoch": 1.1044366351855919, + "grad_norm": 2.1766910552978516, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.7009946703910828, + "num_tokens": 251244094.0, + "step": 10057 + }, + { + "epoch": 1.1045464528882056, + "grad_norm": 2.227327823638916, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7098375558853149, + "num_tokens": 251270460.0, + "step": 10058 + }, + { + "epoch": 1.1046562705908192, + "grad_norm": 1.9973599910736084, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7090858221054077, + "num_tokens": 251301550.0, + "step": 10059 + }, + { + "epoch": 1.104766088293433, + "grad_norm": 2.3276753425598145, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7273163795471191, + "num_tokens": 251323100.0, + "step": 10060 + }, + { + "epoch": 1.1048759059960465, + "grad_norm": 2.4466772079467773, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7235422134399414, + "num_tokens": 251344571.0, + "step": 10061 + }, + { + "epoch": 1.1049857236986602, + "grad_norm": 2.26214337348938, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7214921712875366, + "num_tokens": 251368009.0, + "step": 10062 + }, + { + 
"epoch": 1.105095541401274, + "grad_norm": 2.014150381088257, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.70734703540802, + "num_tokens": 251398636.0, + "step": 10063 + }, + { + "epoch": 1.1052053591038875, + "grad_norm": 2.277636766433716, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7257786989212036, + "num_tokens": 251424331.0, + "step": 10064 + }, + { + "epoch": 1.1053151768065013, + "grad_norm": 2.4354865550994873, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7468451857566833, + "num_tokens": 251445247.0, + "step": 10065 + }, + { + "epoch": 1.1054249945091148, + "grad_norm": 2.477483034133911, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7155233025550842, + "num_tokens": 251467391.0, + "step": 10066 + }, + { + "epoch": 1.1055348122117286, + "grad_norm": 2.362557888031006, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7306559085845947, + "num_tokens": 251488280.0, + "step": 10067 + }, + { + "epoch": 1.105644629914342, + "grad_norm": 1.9857311248779297, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7248737812042236, + "num_tokens": 251520701.0, + "step": 10068 + }, + { + "epoch": 1.1057544476169558, + "grad_norm": 2.1300432682037354, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7049636840820312, + "num_tokens": 251548237.0, + "step": 10069 + }, + { + "epoch": 1.1058642653195696, + "grad_norm": 2.158660650253296, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7197723984718323, + "num_tokens": 251575319.0, + "step": 10070 + }, + { + "epoch": 1.1059740830221831, + "grad_norm": 2.1011998653411865, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7170188426971436, + "num_tokens": 251603615.0, + "step": 10071 + }, + { + "epoch": 1.106083900724797, + "grad_norm": 2.16690993309021, + "learning_rate": 1e-06, + "loss": 0.9055, + 
"mean_token_accuracy": 0.7164596319198608, + "num_tokens": 251629482.0, + "step": 10072 + }, + { + "epoch": 1.1061937184274104, + "grad_norm": 1.9903169870376587, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7295931577682495, + "num_tokens": 251660823.0, + "step": 10073 + }, + { + "epoch": 1.1063035361300242, + "grad_norm": 2.0432653427124023, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.726316511631012, + "num_tokens": 251691134.0, + "step": 10074 + }, + { + "epoch": 1.1064133538326377, + "grad_norm": 2.0936031341552734, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7115564942359924, + "num_tokens": 251719996.0, + "step": 10075 + }, + { + "epoch": 1.1065231715352515, + "grad_norm": 2.4729156494140625, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7220276594161987, + "num_tokens": 251740593.0, + "step": 10076 + }, + { + "epoch": 1.1066329892378652, + "grad_norm": 2.279653549194336, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7132295370101929, + "num_tokens": 251766854.0, + "step": 10077 + }, + { + "epoch": 1.1067428069404788, + "grad_norm": 2.5382657051086426, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7325714826583862, + "num_tokens": 251786835.0, + "step": 10078 + }, + { + "epoch": 1.1068526246430925, + "grad_norm": 2.291997194290161, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7243866920471191, + "num_tokens": 251811781.0, + "step": 10079 + }, + { + "epoch": 1.106962442345706, + "grad_norm": 2.6159682273864746, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7107952833175659, + "num_tokens": 251831640.0, + "step": 10080 + }, + { + "epoch": 1.1070722600483198, + "grad_norm": 2.5047566890716553, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7300893068313599, + "num_tokens": 251852250.0, + "step": 10081 + }, + { + "epoch": 
1.1071820777509334, + "grad_norm": 2.5733487606048584, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7388640642166138, + "num_tokens": 251872603.0, + "step": 10082 + }, + { + "epoch": 1.1072918954535471, + "grad_norm": 2.2422869205474854, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7223048210144043, + "num_tokens": 251896292.0, + "step": 10083 + }, + { + "epoch": 1.1074017131561609, + "grad_norm": 2.2787766456604004, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7385331988334656, + "num_tokens": 251920381.0, + "step": 10084 + }, + { + "epoch": 1.1075115308587744, + "grad_norm": 2.3156397342681885, + "learning_rate": 1e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7389419078826904, + "num_tokens": 251943498.0, + "step": 10085 + }, + { + "epoch": 1.1076213485613882, + "grad_norm": 2.189175844192505, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7019269466400146, + "num_tokens": 251970576.0, + "step": 10086 + }, + { + "epoch": 1.1077311662640017, + "grad_norm": 2.2798397541046143, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7212841510772705, + "num_tokens": 251995294.0, + "step": 10087 + }, + { + "epoch": 1.1078409839666155, + "grad_norm": 2.1382360458374023, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7380481958389282, + "num_tokens": 252020843.0, + "step": 10088 + }, + { + "epoch": 1.107950801669229, + "grad_norm": 2.1650660037994385, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.718990683555603, + "num_tokens": 252045428.0, + "step": 10089 + }, + { + "epoch": 1.1080606193718427, + "grad_norm": 2.5455400943756104, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7235912084579468, + "num_tokens": 252066718.0, + "step": 10090 + }, + { + "epoch": 1.1081704370744565, + "grad_norm": 2.6380162239074707, + "learning_rate": 1e-06, + "loss": 0.7779, + 
"mean_token_accuracy": 0.7536068558692932, + "num_tokens": 252084432.0, + "step": 10091 + }, + { + "epoch": 1.10828025477707, + "grad_norm": 2.376051902770996, + "learning_rate": 1e-06, + "loss": 0.8236, + "mean_token_accuracy": 0.7362257242202759, + "num_tokens": 252106025.0, + "step": 10092 + }, + { + "epoch": 1.1083900724796838, + "grad_norm": 2.0621984004974365, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7221425771713257, + "num_tokens": 252132974.0, + "step": 10093 + }, + { + "epoch": 1.1084998901822973, + "grad_norm": 2.3490021228790283, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6951327323913574, + "num_tokens": 252158871.0, + "step": 10094 + }, + { + "epoch": 1.108609707884911, + "grad_norm": 2.47952938079834, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7070553302764893, + "num_tokens": 252182868.0, + "step": 10095 + }, + { + "epoch": 1.1087195255875246, + "grad_norm": 2.2677457332611084, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7126359939575195, + "num_tokens": 252208065.0, + "step": 10096 + }, + { + "epoch": 1.1088293432901384, + "grad_norm": 2.374136209487915, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7328372001647949, + "num_tokens": 252229707.0, + "step": 10097 + }, + { + "epoch": 1.108939160992752, + "grad_norm": 2.2419748306274414, + "learning_rate": 1e-06, + "loss": 1.016, + "mean_token_accuracy": 0.7054014801979065, + "num_tokens": 252257112.0, + "step": 10098 + }, + { + "epoch": 1.1090489786953657, + "grad_norm": 2.315364122390747, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7237362861633301, + "num_tokens": 252281523.0, + "step": 10099 + }, + { + "epoch": 1.1091587963979794, + "grad_norm": 2.2725300788879395, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7219387888908386, + "num_tokens": 252305888.0, + "step": 10100 + }, + { + "epoch": 1.109268614100593, + 
"grad_norm": 2.231287956237793, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.70758056640625, + "num_tokens": 252330543.0, + "step": 10101 + }, + { + "epoch": 1.1093784318032067, + "grad_norm": 2.0541982650756836, + "learning_rate": 1e-06, + "loss": 1.0287, + "mean_token_accuracy": 0.6853688955307007, + "num_tokens": 252360976.0, + "step": 10102 + }, + { + "epoch": 1.1094882495058203, + "grad_norm": 2.2054078578948975, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7081104516983032, + "num_tokens": 252388120.0, + "step": 10103 + }, + { + "epoch": 1.109598067208434, + "grad_norm": 2.3429348468780518, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7467914819717407, + "num_tokens": 252411083.0, + "step": 10104 + }, + { + "epoch": 1.1097078849110478, + "grad_norm": 2.6396050453186035, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.70663982629776, + "num_tokens": 252432384.0, + "step": 10105 + }, + { + "epoch": 1.1098177026136613, + "grad_norm": 2.201306104660034, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6946921348571777, + "num_tokens": 252458763.0, + "step": 10106 + }, + { + "epoch": 1.109927520316275, + "grad_norm": 2.228646993637085, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7274539470672607, + "num_tokens": 252482793.0, + "step": 10107 + }, + { + "epoch": 1.1100373380188886, + "grad_norm": 2.3950390815734863, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7114787101745605, + "num_tokens": 252506102.0, + "step": 10108 + }, + { + "epoch": 1.1101471557215024, + "grad_norm": 2.4420008659362793, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.707085907459259, + "num_tokens": 252534578.0, + "step": 10109 + }, + { + "epoch": 1.110256973424116, + "grad_norm": 2.200103998184204, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7100067734718323, + 
"num_tokens": 252563614.0, + "step": 10110 + }, + { + "epoch": 1.1103667911267296, + "grad_norm": 2.693639039993286, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.740654706954956, + "num_tokens": 252581371.0, + "step": 10111 + }, + { + "epoch": 1.1104766088293432, + "grad_norm": 2.211714506149292, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.73048996925354, + "num_tokens": 252608008.0, + "step": 10112 + }, + { + "epoch": 1.110586426531957, + "grad_norm": 2.4349701404571533, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7236553430557251, + "num_tokens": 252630557.0, + "step": 10113 + }, + { + "epoch": 1.1106962442345707, + "grad_norm": 2.258347749710083, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.711895227432251, + "num_tokens": 252656547.0, + "step": 10114 + }, + { + "epoch": 1.1108060619371842, + "grad_norm": 2.4295926094055176, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7267856597900391, + "num_tokens": 252679846.0, + "step": 10115 + }, + { + "epoch": 1.110915879639798, + "grad_norm": 2.450282335281372, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7209979891777039, + "num_tokens": 252701750.0, + "step": 10116 + }, + { + "epoch": 1.1110256973424115, + "grad_norm": 2.075695514678955, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7241089344024658, + "num_tokens": 252731176.0, + "step": 10117 + }, + { + "epoch": 1.1111355150450253, + "grad_norm": 2.2606136798858643, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7388933897018433, + "num_tokens": 252756414.0, + "step": 10118 + }, + { + "epoch": 1.111245332747639, + "grad_norm": 2.1727802753448486, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7286011576652527, + "num_tokens": 252782049.0, + "step": 10119 + }, + { + "epoch": 1.1113551504502526, + "grad_norm": 2.169111728668213, + 
"learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7296158075332642, + "num_tokens": 252809904.0, + "step": 10120 + }, + { + "epoch": 1.1114649681528663, + "grad_norm": 2.0517852306365967, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7228394150733948, + "num_tokens": 252837645.0, + "step": 10121 + }, + { + "epoch": 1.1115747858554799, + "grad_norm": 2.3649489879608154, + "learning_rate": 1e-06, + "loss": 0.9845, + "mean_token_accuracy": 0.6967597007751465, + "num_tokens": 252862271.0, + "step": 10122 + }, + { + "epoch": 1.1116846035580936, + "grad_norm": 2.544149160385132, + "learning_rate": 1e-06, + "loss": 0.8786, + "mean_token_accuracy": 0.7329545021057129, + "num_tokens": 252881612.0, + "step": 10123 + }, + { + "epoch": 1.1117944212607072, + "grad_norm": 2.1884241104125977, + "learning_rate": 1e-06, + "loss": 0.9906, + "mean_token_accuracy": 0.6966948509216309, + "num_tokens": 252909275.0, + "step": 10124 + }, + { + "epoch": 1.111904238963321, + "grad_norm": 1.9005072116851807, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7180715799331665, + "num_tokens": 252942555.0, + "step": 10125 + }, + { + "epoch": 1.1120140566659344, + "grad_norm": 2.0345730781555176, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.722952127456665, + "num_tokens": 252971030.0, + "step": 10126 + }, + { + "epoch": 1.1121238743685482, + "grad_norm": 2.239790439605713, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7309998273849487, + "num_tokens": 252995720.0, + "step": 10127 + }, + { + "epoch": 1.112233692071162, + "grad_norm": 2.3706858158111572, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.726109504699707, + "num_tokens": 253019993.0, + "step": 10128 + }, + { + "epoch": 1.1123435097737755, + "grad_norm": 2.288985252380371, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6936715245246887, + "num_tokens": 253048935.0, + "step": 
10129 + }, + { + "epoch": 1.1124533274763893, + "grad_norm": 2.261765241622925, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7334040999412537, + "num_tokens": 253072434.0, + "step": 10130 + }, + { + "epoch": 1.1125631451790028, + "grad_norm": 2.316229820251465, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7171357274055481, + "num_tokens": 253096189.0, + "step": 10131 + }, + { + "epoch": 1.1126729628816165, + "grad_norm": 2.5040900707244873, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7228304743766785, + "num_tokens": 253116571.0, + "step": 10132 + }, + { + "epoch": 1.11278278058423, + "grad_norm": 2.444322109222412, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7146077156066895, + "num_tokens": 253137769.0, + "step": 10133 + }, + { + "epoch": 1.1128925982868438, + "grad_norm": 2.4556074142456055, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7135279178619385, + "num_tokens": 253160648.0, + "step": 10134 + }, + { + "epoch": 1.1130024159894576, + "grad_norm": 2.3450632095336914, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7343708276748657, + "num_tokens": 253184126.0, + "step": 10135 + }, + { + "epoch": 1.1131122336920711, + "grad_norm": 2.444646120071411, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7157891392707825, + "num_tokens": 253205928.0, + "step": 10136 + }, + { + "epoch": 1.1132220513946849, + "grad_norm": 2.496108293533325, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7221382856369019, + "num_tokens": 253227348.0, + "step": 10137 + }, + { + "epoch": 1.1133318690972984, + "grad_norm": 2.2710466384887695, + "learning_rate": 1e-06, + "loss": 1.0549, + "mean_token_accuracy": 0.6921122074127197, + "num_tokens": 253251564.0, + "step": 10138 + }, + { + "epoch": 1.1134416867999122, + "grad_norm": 2.539193630218506, + "learning_rate": 1e-06, + "loss": 0.9178, + 
"mean_token_accuracy": 0.7197949290275574, + "num_tokens": 253273775.0, + "step": 10139 + }, + { + "epoch": 1.1135515045025257, + "grad_norm": 2.161297082901001, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7372128367424011, + "num_tokens": 253299767.0, + "step": 10140 + }, + { + "epoch": 1.1136613222051395, + "grad_norm": 2.7700886726379395, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7252830266952515, + "num_tokens": 253320145.0, + "step": 10141 + }, + { + "epoch": 1.1137711399077532, + "grad_norm": 2.462455987930298, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7329310178756714, + "num_tokens": 253341233.0, + "step": 10142 + }, + { + "epoch": 1.1138809576103668, + "grad_norm": 2.53595232963562, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7170277237892151, + "num_tokens": 253363262.0, + "step": 10143 + }, + { + "epoch": 1.1139907753129805, + "grad_norm": 2.3687705993652344, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7321616411209106, + "num_tokens": 253386084.0, + "step": 10144 + }, + { + "epoch": 1.114100593015594, + "grad_norm": 2.731419563293457, + "learning_rate": 1e-06, + "loss": 0.7685, + "mean_token_accuracy": 0.7481366395950317, + "num_tokens": 253403169.0, + "step": 10145 + }, + { + "epoch": 1.1142104107182078, + "grad_norm": 2.1892123222351074, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7100620269775391, + "num_tokens": 253431822.0, + "step": 10146 + }, + { + "epoch": 1.1143202284208213, + "grad_norm": 2.195741653442383, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.714326798915863, + "num_tokens": 253455850.0, + "step": 10147 + }, + { + "epoch": 1.114430046123435, + "grad_norm": 2.238617420196533, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7185515761375427, + "num_tokens": 253480420.0, + "step": 10148 + }, + { + "epoch": 1.1145398638260489, + 
"grad_norm": 2.246715784072876, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7190488576889038, + "num_tokens": 253505581.0, + "step": 10149 + }, + { + "epoch": 1.1146496815286624, + "grad_norm": 2.284862518310547, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7382171750068665, + "num_tokens": 253528755.0, + "step": 10150 + }, + { + "epoch": 1.1147594992312762, + "grad_norm": 2.3945724964141846, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7017357349395752, + "num_tokens": 253552475.0, + "step": 10151 + }, + { + "epoch": 1.1148693169338897, + "grad_norm": 2.2878401279449463, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7166441679000854, + "num_tokens": 253579827.0, + "step": 10152 + }, + { + "epoch": 1.1149791346365034, + "grad_norm": 2.4767568111419678, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7219185829162598, + "num_tokens": 253602833.0, + "step": 10153 + }, + { + "epoch": 1.115088952339117, + "grad_norm": 2.1856913566589355, + "learning_rate": 1e-06, + "loss": 1.0525, + "mean_token_accuracy": 0.6893318891525269, + "num_tokens": 253631771.0, + "step": 10154 + }, + { + "epoch": 1.1151987700417307, + "grad_norm": 2.369602680206299, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7123700380325317, + "num_tokens": 253655488.0, + "step": 10155 + }, + { + "epoch": 1.1153085877443445, + "grad_norm": 2.3737998008728027, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7026215195655823, + "num_tokens": 253680660.0, + "step": 10156 + }, + { + "epoch": 1.115418405446958, + "grad_norm": 2.469597578048706, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7257529497146606, + "num_tokens": 253701881.0, + "step": 10157 + }, + { + "epoch": 1.1155282231495718, + "grad_norm": 2.3669707775115967, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7330330610275269, + 
"num_tokens": 253724208.0, + "step": 10158 + }, + { + "epoch": 1.1156380408521853, + "grad_norm": 2.350975513458252, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7188761234283447, + "num_tokens": 253747015.0, + "step": 10159 + }, + { + "epoch": 1.115747858554799, + "grad_norm": 2.0407369136810303, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7106937170028687, + "num_tokens": 253775093.0, + "step": 10160 + }, + { + "epoch": 1.1158576762574126, + "grad_norm": 2.5722382068634033, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7159701585769653, + "num_tokens": 253796134.0, + "step": 10161 + }, + { + "epoch": 1.1159674939600264, + "grad_norm": 2.55281925201416, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7235879898071289, + "num_tokens": 253817949.0, + "step": 10162 + }, + { + "epoch": 1.11607731166264, + "grad_norm": 2.2245538234710693, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7009050250053406, + "num_tokens": 253845010.0, + "step": 10163 + }, + { + "epoch": 1.1161871293652537, + "grad_norm": 2.0410149097442627, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7054414749145508, + "num_tokens": 253874810.0, + "step": 10164 + }, + { + "epoch": 1.1162969470678674, + "grad_norm": 2.240555763244629, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7170300483703613, + "num_tokens": 253900304.0, + "step": 10165 + }, + { + "epoch": 1.116406764770481, + "grad_norm": 2.1856579780578613, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7123277187347412, + "num_tokens": 253927789.0, + "step": 10166 + }, + { + "epoch": 1.1165165824730947, + "grad_norm": 2.3966212272644043, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7016597986221313, + "num_tokens": 253948583.0, + "step": 10167 + }, + { + "epoch": 1.1166264001757082, + "grad_norm": 2.4145760536193848, + 
"learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7163048982620239, + "num_tokens": 253972031.0, + "step": 10168 + }, + { + "epoch": 1.116736217878322, + "grad_norm": 2.306304931640625, + "learning_rate": 1e-06, + "loss": 1.022, + "mean_token_accuracy": 0.686039924621582, + "num_tokens": 253999561.0, + "step": 10169 + }, + { + "epoch": 1.1168460355809358, + "grad_norm": 2.268054485321045, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7280960083007812, + "num_tokens": 254026774.0, + "step": 10170 + }, + { + "epoch": 1.1169558532835493, + "grad_norm": 2.1994426250457764, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7071704864501953, + "num_tokens": 254054106.0, + "step": 10171 + }, + { + "epoch": 1.117065670986163, + "grad_norm": 2.600057363510132, + "learning_rate": 1e-06, + "loss": 0.8249, + "mean_token_accuracy": 0.7449034452438354, + "num_tokens": 254074006.0, + "step": 10172 + }, + { + "epoch": 1.1171754886887766, + "grad_norm": 2.5582189559936523, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7182018160820007, + "num_tokens": 254096478.0, + "step": 10173 + }, + { + "epoch": 1.1172853063913903, + "grad_norm": 2.3526830673217773, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7029352188110352, + "num_tokens": 254122447.0, + "step": 10174 + }, + { + "epoch": 1.1173951240940039, + "grad_norm": 2.2887461185455322, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7133272886276245, + "num_tokens": 254149233.0, + "step": 10175 + }, + { + "epoch": 1.1175049417966176, + "grad_norm": 2.1320383548736572, + "learning_rate": 1e-06, + "loss": 0.8509, + "mean_token_accuracy": 0.7316022515296936, + "num_tokens": 254176276.0, + "step": 10176 + }, + { + "epoch": 1.1176147594992312, + "grad_norm": 2.5895767211914062, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7431703209877014, + "num_tokens": 254197296.0, + "step": 
10177 + }, + { + "epoch": 1.117724577201845, + "grad_norm": 2.4748592376708984, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.7333148121833801, + "num_tokens": 254219692.0, + "step": 10178 + }, + { + "epoch": 1.1178343949044587, + "grad_norm": 2.38189959526062, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7392216920852661, + "num_tokens": 254244069.0, + "step": 10179 + }, + { + "epoch": 1.1179442126070722, + "grad_norm": 2.246919870376587, + "learning_rate": 1e-06, + "loss": 0.9729, + "mean_token_accuracy": 0.7011626958847046, + "num_tokens": 254271375.0, + "step": 10180 + }, + { + "epoch": 1.118054030309686, + "grad_norm": 2.428133964538574, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7055737972259521, + "num_tokens": 254296363.0, + "step": 10181 + }, + { + "epoch": 1.1181638480122995, + "grad_norm": 2.4545791149139404, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.709990382194519, + "num_tokens": 254318019.0, + "step": 10182 + }, + { + "epoch": 1.1182736657149133, + "grad_norm": 2.0444936752319336, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7298220992088318, + "num_tokens": 254346898.0, + "step": 10183 + }, + { + "epoch": 1.1183834834175268, + "grad_norm": 2.4535152912139893, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.7001549005508423, + "num_tokens": 254368786.0, + "step": 10184 + }, + { + "epoch": 1.1184933011201406, + "grad_norm": 2.3130481243133545, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7203203439712524, + "num_tokens": 254391949.0, + "step": 10185 + }, + { + "epoch": 1.1186031188227543, + "grad_norm": 2.4428868293762207, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7145949006080627, + "num_tokens": 254415754.0, + "step": 10186 + }, + { + "epoch": 1.1187129365253679, + "grad_norm": 2.4203341007232666, + "learning_rate": 1e-06, + "loss": 0.9978, 
+ "mean_token_accuracy": 0.6899821162223816, + "num_tokens": 254440784.0, + "step": 10187 + }, + { + "epoch": 1.1188227542279816, + "grad_norm": 2.630887746810913, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7324342727661133, + "num_tokens": 254461183.0, + "step": 10188 + }, + { + "epoch": 1.1189325719305951, + "grad_norm": 2.0730550289154053, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.705803632736206, + "num_tokens": 254489493.0, + "step": 10189 + }, + { + "epoch": 1.119042389633209, + "grad_norm": 2.2930679321289062, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7157048583030701, + "num_tokens": 254513368.0, + "step": 10190 + }, + { + "epoch": 1.1191522073358224, + "grad_norm": 2.203625202178955, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7245136499404907, + "num_tokens": 254538043.0, + "step": 10191 + }, + { + "epoch": 1.1192620250384362, + "grad_norm": 2.566760301589966, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7107625007629395, + "num_tokens": 254557255.0, + "step": 10192 + }, + { + "epoch": 1.11937184274105, + "grad_norm": 2.4820635318756104, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7377080917358398, + "num_tokens": 254578428.0, + "step": 10193 + }, + { + "epoch": 1.1194816604436635, + "grad_norm": 2.3352248668670654, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7246420383453369, + "num_tokens": 254601536.0, + "step": 10194 + }, + { + "epoch": 1.1195914781462772, + "grad_norm": 2.159362554550171, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.6994805335998535, + "num_tokens": 254630231.0, + "step": 10195 + }, + { + "epoch": 1.1197012958488908, + "grad_norm": 2.4039688110351562, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7165794372558594, + "num_tokens": 254652526.0, + "step": 10196 + }, + { + "epoch": 
1.1198111135515045, + "grad_norm": 2.451932191848755, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7320631742477417, + "num_tokens": 254676270.0, + "step": 10197 + }, + { + "epoch": 1.119920931254118, + "grad_norm": 2.199826717376709, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7258116006851196, + "num_tokens": 254702646.0, + "step": 10198 + }, + { + "epoch": 1.1200307489567318, + "grad_norm": 2.1613099575042725, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7042126059532166, + "num_tokens": 254731268.0, + "step": 10199 + }, + { + "epoch": 1.1201405666593456, + "grad_norm": 2.521092653274536, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.7402952909469604, + "num_tokens": 254752026.0, + "step": 10200 + }, + { + "epoch": 1.1202503843619591, + "grad_norm": 1.9889228343963623, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.681603193283081, + "num_tokens": 254785017.0, + "step": 10201 + }, + { + "epoch": 1.1203602020645729, + "grad_norm": 2.2055513858795166, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7058346271514893, + "num_tokens": 254812922.0, + "step": 10202 + }, + { + "epoch": 1.1204700197671864, + "grad_norm": 2.360755205154419, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7074931263923645, + "num_tokens": 254840148.0, + "step": 10203 + }, + { + "epoch": 1.1205798374698002, + "grad_norm": 2.3265795707702637, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7224622964859009, + "num_tokens": 254864766.0, + "step": 10204 + }, + { + "epoch": 1.1206896551724137, + "grad_norm": 2.1071441173553467, + "learning_rate": 1e-06, + "loss": 1.004, + "mean_token_accuracy": 0.6991634368896484, + "num_tokens": 254894018.0, + "step": 10205 + }, + { + "epoch": 1.1207994728750275, + "grad_norm": 2.086089611053467, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 
0.7105078101158142, + "num_tokens": 254923717.0, + "step": 10206 + }, + { + "epoch": 1.1209092905776412, + "grad_norm": 2.5336999893188477, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7428231835365295, + "num_tokens": 254943946.0, + "step": 10207 + }, + { + "epoch": 1.1210191082802548, + "grad_norm": 2.3840885162353516, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7023515701293945, + "num_tokens": 254967888.0, + "step": 10208 + }, + { + "epoch": 1.1211289259828685, + "grad_norm": 2.2118895053863525, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7094011306762695, + "num_tokens": 254995750.0, + "step": 10209 + }, + { + "epoch": 1.121238743685482, + "grad_norm": 2.4808554649353027, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7205929160118103, + "num_tokens": 255017415.0, + "step": 10210 + }, + { + "epoch": 1.1213485613880958, + "grad_norm": 2.2081897258758545, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7205380797386169, + "num_tokens": 255043321.0, + "step": 10211 + }, + { + "epoch": 1.1214583790907093, + "grad_norm": 2.2070014476776123, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7105162143707275, + "num_tokens": 255069697.0, + "step": 10212 + }, + { + "epoch": 1.121568196793323, + "grad_norm": 2.312142848968506, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7310912609100342, + "num_tokens": 255091972.0, + "step": 10213 + }, + { + "epoch": 1.1216780144959368, + "grad_norm": 2.0848214626312256, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7110147476196289, + "num_tokens": 255121639.0, + "step": 10214 + }, + { + "epoch": 1.1217878321985504, + "grad_norm": 2.3940768241882324, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7212288975715637, + "num_tokens": 255144537.0, + "step": 10215 + }, + { + "epoch": 1.1218976499011641, + "grad_norm": 
2.217906951904297, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7003014087677002, + "num_tokens": 255169697.0, + "step": 10216 + }, + { + "epoch": 1.1220074676037777, + "grad_norm": 2.7662599086761475, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7232035398483276, + "num_tokens": 255185964.0, + "step": 10217 + }, + { + "epoch": 1.1221172853063914, + "grad_norm": 2.179354190826416, + "learning_rate": 1e-06, + "loss": 1.0034, + "mean_token_accuracy": 0.689837634563446, + "num_tokens": 255216181.0, + "step": 10218 + }, + { + "epoch": 1.122227103009005, + "grad_norm": 2.404175281524658, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7331122159957886, + "num_tokens": 255237970.0, + "step": 10219 + }, + { + "epoch": 1.1223369207116187, + "grad_norm": 2.1555113792419434, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7051377296447754, + "num_tokens": 255263945.0, + "step": 10220 + }, + { + "epoch": 1.1224467384142325, + "grad_norm": 2.1275484561920166, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7140414714813232, + "num_tokens": 255290168.0, + "step": 10221 + }, + { + "epoch": 1.122556556116846, + "grad_norm": 2.111469268798828, + "learning_rate": 1e-06, + "loss": 1.0467, + "mean_token_accuracy": 0.6806703209877014, + "num_tokens": 255321404.0, + "step": 10222 + }, + { + "epoch": 1.1226663738194598, + "grad_norm": 1.805788278579712, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7168999910354614, + "num_tokens": 255355326.0, + "step": 10223 + }, + { + "epoch": 1.1227761915220733, + "grad_norm": 2.219059467315674, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7022343873977661, + "num_tokens": 255380074.0, + "step": 10224 + }, + { + "epoch": 1.122886009224687, + "grad_norm": 2.1230273246765137, + "learning_rate": 1e-06, + "loss": 0.9735, + "mean_token_accuracy": 0.7094700932502747, + "num_tokens": 
255410045.0, + "step": 10225 + }, + { + "epoch": 1.1229958269273006, + "grad_norm": 2.158388376235962, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6982574462890625, + "num_tokens": 255440910.0, + "step": 10226 + }, + { + "epoch": 1.1231056446299144, + "grad_norm": 2.6593525409698486, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7192295789718628, + "num_tokens": 255459320.0, + "step": 10227 + }, + { + "epoch": 1.123215462332528, + "grad_norm": 2.196070432662964, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7248039841651917, + "num_tokens": 255486041.0, + "step": 10228 + }, + { + "epoch": 1.1233252800351416, + "grad_norm": 2.142831802368164, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7191331386566162, + "num_tokens": 255513375.0, + "step": 10229 + }, + { + "epoch": 1.1234350977377554, + "grad_norm": 2.204949140548706, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7245650291442871, + "num_tokens": 255538653.0, + "step": 10230 + }, + { + "epoch": 1.123544915440369, + "grad_norm": 2.1257972717285156, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6926466226577759, + "num_tokens": 255568350.0, + "step": 10231 + }, + { + "epoch": 1.1236547331429827, + "grad_norm": 2.681018352508545, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7212449312210083, + "num_tokens": 255588639.0, + "step": 10232 + }, + { + "epoch": 1.1237645508455962, + "grad_norm": 2.1616756916046143, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7168858051300049, + "num_tokens": 255614170.0, + "step": 10233 + }, + { + "epoch": 1.12387436854821, + "grad_norm": 2.2845942974090576, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7158965468406677, + "num_tokens": 255638152.0, + "step": 10234 + }, + { + "epoch": 1.1239841862508237, + "grad_norm": 2.5425784587860107, + "learning_rate": 
1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7227832078933716, + "num_tokens": 255659149.0, + "step": 10235 + }, + { + "epoch": 1.1240940039534373, + "grad_norm": 2.4802587032318115, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7341006398200989, + "num_tokens": 255679337.0, + "step": 10236 + }, + { + "epoch": 1.124203821656051, + "grad_norm": 2.321014404296875, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7377527952194214, + "num_tokens": 255702775.0, + "step": 10237 + }, + { + "epoch": 1.1243136393586646, + "grad_norm": 2.350800037384033, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7131394147872925, + "num_tokens": 255725665.0, + "step": 10238 + }, + { + "epoch": 1.1244234570612783, + "grad_norm": 2.2513535022735596, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6986520290374756, + "num_tokens": 255751167.0, + "step": 10239 + }, + { + "epoch": 1.1245332747638919, + "grad_norm": 2.155731439590454, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7035886645317078, + "num_tokens": 255781012.0, + "step": 10240 + }, + { + "epoch": 1.1246430924665056, + "grad_norm": 2.0827364921569824, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7161658406257629, + "num_tokens": 255809536.0, + "step": 10241 + }, + { + "epoch": 1.1247529101691192, + "grad_norm": 2.2416629791259766, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7180132865905762, + "num_tokens": 255836797.0, + "step": 10242 + }, + { + "epoch": 1.124862727871733, + "grad_norm": 2.2556724548339844, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7255098819732666, + "num_tokens": 255862112.0, + "step": 10243 + }, + { + "epoch": 1.1249725455743467, + "grad_norm": 2.2404158115386963, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7240493297576904, + "num_tokens": 255886993.0, + "step": 10244 + }, + { 
+ "epoch": 1.1250823632769602, + "grad_norm": 2.4329569339752197, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7106050252914429, + "num_tokens": 255910319.0, + "step": 10245 + }, + { + "epoch": 1.125192180979574, + "grad_norm": 2.204456090927124, + "learning_rate": 1e-06, + "loss": 1.0148, + "mean_token_accuracy": 0.6872526407241821, + "num_tokens": 255940358.0, + "step": 10246 + }, + { + "epoch": 1.1253019986821875, + "grad_norm": 2.4661448001861572, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7251124382019043, + "num_tokens": 255962221.0, + "step": 10247 + }, + { + "epoch": 1.1254118163848013, + "grad_norm": 2.1869523525238037, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7177175283432007, + "num_tokens": 255988404.0, + "step": 10248 + }, + { + "epoch": 1.125521634087415, + "grad_norm": 2.336381435394287, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7182204723358154, + "num_tokens": 256014148.0, + "step": 10249 + }, + { + "epoch": 1.1256314517900285, + "grad_norm": 2.5413403511047363, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7398778200149536, + "num_tokens": 256033787.0, + "step": 10250 + }, + { + "epoch": 1.1257412694926423, + "grad_norm": 2.243476629257202, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.7002439498901367, + "num_tokens": 256060136.0, + "step": 10251 + }, + { + "epoch": 1.1258510871952558, + "grad_norm": 2.1941189765930176, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7173817157745361, + "num_tokens": 256087713.0, + "step": 10252 + }, + { + "epoch": 1.1259609048978696, + "grad_norm": 2.1199610233306885, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7300759553909302, + "num_tokens": 256114442.0, + "step": 10253 + }, + { + "epoch": 1.1260707226004831, + "grad_norm": 2.4761624336242676, + "learning_rate": 1e-06, + "loss": 0.8746, + 
"mean_token_accuracy": 0.7292835712432861, + "num_tokens": 256136344.0, + "step": 10254 + }, + { + "epoch": 1.1261805403030969, + "grad_norm": 2.1507041454315186, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7103722095489502, + "num_tokens": 256165667.0, + "step": 10255 + }, + { + "epoch": 1.1262903580057104, + "grad_norm": 2.523486852645874, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7410963773727417, + "num_tokens": 256186633.0, + "step": 10256 + }, + { + "epoch": 1.1264001757083242, + "grad_norm": 2.1982383728027344, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7121928930282593, + "num_tokens": 256213162.0, + "step": 10257 + }, + { + "epoch": 1.126509993410938, + "grad_norm": 2.253990411758423, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7138522863388062, + "num_tokens": 256239477.0, + "step": 10258 + }, + { + "epoch": 1.1266198111135515, + "grad_norm": 2.2455763816833496, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6892531514167786, + "num_tokens": 256267256.0, + "step": 10259 + }, + { + "epoch": 1.1267296288161652, + "grad_norm": 2.2216312885284424, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7056209444999695, + "num_tokens": 256293645.0, + "step": 10260 + }, + { + "epoch": 1.1268394465187788, + "grad_norm": 2.1761019229888916, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7141780853271484, + "num_tokens": 256321041.0, + "step": 10261 + }, + { + "epoch": 1.1269492642213925, + "grad_norm": 2.3169898986816406, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7303395867347717, + "num_tokens": 256345006.0, + "step": 10262 + }, + { + "epoch": 1.127059081924006, + "grad_norm": 2.582162618637085, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7267608046531677, + "num_tokens": 256364695.0, + "step": 10263 + }, + { + "epoch": 
1.1271688996266198, + "grad_norm": 2.4800353050231934, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7224223613739014, + "num_tokens": 256386858.0, + "step": 10264 + }, + { + "epoch": 1.1272787173292333, + "grad_norm": 2.2642719745635986, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7084067463874817, + "num_tokens": 256412706.0, + "step": 10265 + }, + { + "epoch": 1.127388535031847, + "grad_norm": 2.1381001472473145, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7508554458618164, + "num_tokens": 256437479.0, + "step": 10266 + }, + { + "epoch": 1.1274983527344609, + "grad_norm": 2.2647688388824463, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7096012830734253, + "num_tokens": 256464015.0, + "step": 10267 + }, + { + "epoch": 1.1276081704370744, + "grad_norm": 2.5540757179260254, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.734376847743988, + "num_tokens": 256484380.0, + "step": 10268 + }, + { + "epoch": 1.1277179881396882, + "grad_norm": 2.637223482131958, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7104588747024536, + "num_tokens": 256503173.0, + "step": 10269 + }, + { + "epoch": 1.1278278058423017, + "grad_norm": 2.4314157962799072, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7224805951118469, + "num_tokens": 256526352.0, + "step": 10270 + }, + { + "epoch": 1.1279376235449154, + "grad_norm": 2.242006301879883, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.6978365778923035, + "num_tokens": 256551460.0, + "step": 10271 + }, + { + "epoch": 1.1280474412475292, + "grad_norm": 2.1297836303710938, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.716547429561615, + "num_tokens": 256581730.0, + "step": 10272 + }, + { + "epoch": 1.1281572589501427, + "grad_norm": 2.271472454071045, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 
0.7158675193786621, + "num_tokens": 256606076.0, + "step": 10273 + }, + { + "epoch": 1.1282670766527565, + "grad_norm": 2.5287110805511475, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7117738723754883, + "num_tokens": 256627899.0, + "step": 10274 + }, + { + "epoch": 1.12837689435537, + "grad_norm": 2.2480504512786865, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.7255878448486328, + "num_tokens": 256652828.0, + "step": 10275 + }, + { + "epoch": 1.1284867120579838, + "grad_norm": 2.1319539546966553, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7253420352935791, + "num_tokens": 256678155.0, + "step": 10276 + }, + { + "epoch": 1.1285965297605973, + "grad_norm": 2.7527434825897217, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7213820219039917, + "num_tokens": 256695685.0, + "step": 10277 + }, + { + "epoch": 1.128706347463211, + "grad_norm": 2.5752573013305664, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7067642211914062, + "num_tokens": 256717095.0, + "step": 10278 + }, + { + "epoch": 1.1288161651658246, + "grad_norm": 2.152195930480957, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6946851015090942, + "num_tokens": 256745502.0, + "step": 10279 + }, + { + "epoch": 1.1289259828684384, + "grad_norm": 2.4455292224884033, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7174836993217468, + "num_tokens": 256769044.0, + "step": 10280 + }, + { + "epoch": 1.1290358005710521, + "grad_norm": 2.063149929046631, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.7167337536811829, + "num_tokens": 256797180.0, + "step": 10281 + }, + { + "epoch": 1.1291456182736657, + "grad_norm": 2.5124404430389404, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7205319404602051, + "num_tokens": 256819015.0, + "step": 10282 + }, + { + "epoch": 1.1292554359762794, + "grad_norm": 
2.4156711101531982, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6982669830322266, + "num_tokens": 256847415.0, + "step": 10283 + }, + { + "epoch": 1.129365253678893, + "grad_norm": 2.109665870666504, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6943736672401428, + "num_tokens": 256877685.0, + "step": 10284 + }, + { + "epoch": 1.1294750713815067, + "grad_norm": 2.06927227973938, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7265292406082153, + "num_tokens": 256909284.0, + "step": 10285 + }, + { + "epoch": 1.1295848890841205, + "grad_norm": 2.403996229171753, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7154366374015808, + "num_tokens": 256933426.0, + "step": 10286 + }, + { + "epoch": 1.129694706786734, + "grad_norm": 2.518143892288208, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.734163761138916, + "num_tokens": 256953949.0, + "step": 10287 + }, + { + "epoch": 1.1298045244893478, + "grad_norm": 2.666301727294922, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7424702644348145, + "num_tokens": 256971602.0, + "step": 10288 + }, + { + "epoch": 1.1299143421919613, + "grad_norm": 2.3047561645507812, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7303026914596558, + "num_tokens": 256995755.0, + "step": 10289 + }, + { + "epoch": 1.130024159894575, + "grad_norm": 2.064640522003174, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6848349571228027, + "num_tokens": 257025279.0, + "step": 10290 + }, + { + "epoch": 1.1301339775971886, + "grad_norm": 2.3858237266540527, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7196317911148071, + "num_tokens": 257047706.0, + "step": 10291 + }, + { + "epoch": 1.1302437952998023, + "grad_norm": 2.1023945808410645, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7200015187263489, + "num_tokens": 
257076597.0, + "step": 10292 + }, + { + "epoch": 1.1303536130024159, + "grad_norm": 2.125903844833374, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7183510065078735, + "num_tokens": 257103982.0, + "step": 10293 + }, + { + "epoch": 1.1304634307050296, + "grad_norm": 2.66884446144104, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7100895047187805, + "num_tokens": 257124452.0, + "step": 10294 + }, + { + "epoch": 1.1305732484076434, + "grad_norm": 2.466381311416626, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7234182357788086, + "num_tokens": 257146134.0, + "step": 10295 + }, + { + "epoch": 1.130683066110257, + "grad_norm": 2.0106449127197266, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7101383209228516, + "num_tokens": 257177999.0, + "step": 10296 + }, + { + "epoch": 1.1307928838128707, + "grad_norm": 2.407346725463867, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7370854616165161, + "num_tokens": 257199821.0, + "step": 10297 + }, + { + "epoch": 1.1309027015154842, + "grad_norm": 2.442751169204712, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7065010070800781, + "num_tokens": 257221196.0, + "step": 10298 + }, + { + "epoch": 1.131012519218098, + "grad_norm": 2.308870315551758, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7127266526222229, + "num_tokens": 257246139.0, + "step": 10299 + }, + { + "epoch": 1.1311223369207117, + "grad_norm": 2.3236887454986572, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7027575373649597, + "num_tokens": 257269625.0, + "step": 10300 + }, + { + "epoch": 1.1312321546233253, + "grad_norm": 2.30169939994812, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7048256993293762, + "num_tokens": 257294942.0, + "step": 10301 + }, + { + "epoch": 1.131341972325939, + "grad_norm": 2.238452911376953, + "learning_rate": 1e-06, + 
"loss": 0.8946, + "mean_token_accuracy": 0.7203540802001953, + "num_tokens": 257319564.0, + "step": 10302 + }, + { + "epoch": 1.1314517900285526, + "grad_norm": 2.3304924964904785, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7051452398300171, + "num_tokens": 257346529.0, + "step": 10303 + }, + { + "epoch": 1.1315616077311663, + "grad_norm": 2.6305010318756104, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.723629355430603, + "num_tokens": 257367482.0, + "step": 10304 + }, + { + "epoch": 1.1316714254337799, + "grad_norm": 2.3402464389801025, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7297451496124268, + "num_tokens": 257390387.0, + "step": 10305 + }, + { + "epoch": 1.1317812431363936, + "grad_norm": 2.0083487033843994, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7238183617591858, + "num_tokens": 257421855.0, + "step": 10306 + }, + { + "epoch": 1.1318910608390071, + "grad_norm": 2.1575887203216553, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7070246934890747, + "num_tokens": 257449827.0, + "step": 10307 + }, + { + "epoch": 1.132000878541621, + "grad_norm": 2.635481357574463, + "learning_rate": 1e-06, + "loss": 0.7865, + "mean_token_accuracy": 0.744462788105011, + "num_tokens": 257468718.0, + "step": 10308 + }, + { + "epoch": 1.1321106962442347, + "grad_norm": 2.1750011444091797, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.706673264503479, + "num_tokens": 257495782.0, + "step": 10309 + }, + { + "epoch": 1.1322205139468482, + "grad_norm": 2.12087082862854, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7367449998855591, + "num_tokens": 257522151.0, + "step": 10310 + }, + { + "epoch": 1.132330331649462, + "grad_norm": 2.172043800354004, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.708673357963562, + "num_tokens": 257550386.0, + "step": 10311 + }, + { + "epoch": 
1.1324401493520755, + "grad_norm": 2.494978904724121, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7248295545578003, + "num_tokens": 257572819.0, + "step": 10312 + }, + { + "epoch": 1.1325499670546892, + "grad_norm": 2.1093716621398926, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.702530562877655, + "num_tokens": 257602014.0, + "step": 10313 + }, + { + "epoch": 1.132659784757303, + "grad_norm": 2.7931623458862305, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7217217683792114, + "num_tokens": 257620673.0, + "step": 10314 + }, + { + "epoch": 1.1327696024599165, + "grad_norm": 2.7677254676818848, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7102481722831726, + "num_tokens": 257640292.0, + "step": 10315 + }, + { + "epoch": 1.1328794201625303, + "grad_norm": 2.2186226844787598, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7195037007331848, + "num_tokens": 257666885.0, + "step": 10316 + }, + { + "epoch": 1.1329892378651438, + "grad_norm": 2.293888807296753, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7124709486961365, + "num_tokens": 257691291.0, + "step": 10317 + }, + { + "epoch": 1.1330990555677576, + "grad_norm": 2.0364184379577637, + "learning_rate": 1e-06, + "loss": 0.9896, + "mean_token_accuracy": 0.7079364061355591, + "num_tokens": 257720680.0, + "step": 10318 + }, + { + "epoch": 1.1332088732703711, + "grad_norm": 2.2558865547180176, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7122949361801147, + "num_tokens": 257747156.0, + "step": 10319 + }, + { + "epoch": 1.1333186909729849, + "grad_norm": 2.1852245330810547, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7375259399414062, + "num_tokens": 257772663.0, + "step": 10320 + }, + { + "epoch": 1.1334285086755984, + "grad_norm": 2.4264371395111084, + "learning_rate": 1e-06, + "loss": 0.9678, + 
"mean_token_accuracy": 0.709185004234314, + "num_tokens": 257799653.0, + "step": 10321 + }, + { + "epoch": 1.1335383263782122, + "grad_norm": 2.7487337589263916, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7268795967102051, + "num_tokens": 257817089.0, + "step": 10322 + }, + { + "epoch": 1.133648144080826, + "grad_norm": 2.0249171257019043, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7100048065185547, + "num_tokens": 257847239.0, + "step": 10323 + }, + { + "epoch": 1.1337579617834395, + "grad_norm": 2.53039288520813, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7097316384315491, + "num_tokens": 257869706.0, + "step": 10324 + }, + { + "epoch": 1.1338677794860532, + "grad_norm": 2.1668431758880615, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6980069279670715, + "num_tokens": 257901449.0, + "step": 10325 + }, + { + "epoch": 1.1339775971886668, + "grad_norm": 2.3848888874053955, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7005334496498108, + "num_tokens": 257927446.0, + "step": 10326 + }, + { + "epoch": 1.1340874148912805, + "grad_norm": 2.402379035949707, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.710424542427063, + "num_tokens": 257951796.0, + "step": 10327 + }, + { + "epoch": 1.134197232593894, + "grad_norm": 2.4067859649658203, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7232507467269897, + "num_tokens": 257974253.0, + "step": 10328 + }, + { + "epoch": 1.1343070502965078, + "grad_norm": 2.189282178878784, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.7007439136505127, + "num_tokens": 258003194.0, + "step": 10329 + }, + { + "epoch": 1.1344168679991213, + "grad_norm": 2.5095980167388916, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.729926586151123, + "num_tokens": 258024269.0, + "step": 10330 + }, + { + "epoch": 1.134526685701735, + 
"grad_norm": 2.201552629470825, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7243043184280396, + "num_tokens": 258049134.0, + "step": 10331 + }, + { + "epoch": 1.1346365034043489, + "grad_norm": 2.216557502746582, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7092719674110413, + "num_tokens": 258075802.0, + "step": 10332 + }, + { + "epoch": 1.1347463211069624, + "grad_norm": 2.3847339153289795, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7326144576072693, + "num_tokens": 258097271.0, + "step": 10333 + }, + { + "epoch": 1.1348561388095761, + "grad_norm": 2.2941582202911377, + "learning_rate": 1e-06, + "loss": 0.9458, + "mean_token_accuracy": 0.7047153115272522, + "num_tokens": 258123969.0, + "step": 10334 + }, + { + "epoch": 1.1349659565121897, + "grad_norm": 2.2052342891693115, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.6997096538543701, + "num_tokens": 258154945.0, + "step": 10335 + }, + { + "epoch": 1.1350757742148034, + "grad_norm": 2.226306438446045, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7111386060714722, + "num_tokens": 258181163.0, + "step": 10336 + }, + { + "epoch": 1.1351855919174172, + "grad_norm": 2.11838436126709, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7345727682113647, + "num_tokens": 258208448.0, + "step": 10337 + }, + { + "epoch": 1.1352954096200307, + "grad_norm": 2.1745190620422363, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7176807522773743, + "num_tokens": 258235947.0, + "step": 10338 + }, + { + "epoch": 1.1354052273226445, + "grad_norm": 2.5890214443206787, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7351614236831665, + "num_tokens": 258255113.0, + "step": 10339 + }, + { + "epoch": 1.135515045025258, + "grad_norm": 2.2523422241210938, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.7032788395881653, + 
"num_tokens": 258280775.0, + "step": 10340 + }, + { + "epoch": 1.1356248627278718, + "grad_norm": 2.1003150939941406, + "learning_rate": 1e-06, + "loss": 0.9895, + "mean_token_accuracy": 0.6961345076560974, + "num_tokens": 258310296.0, + "step": 10341 + }, + { + "epoch": 1.1357346804304853, + "grad_norm": 2.698986053466797, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7323346138000488, + "num_tokens": 258330440.0, + "step": 10342 + }, + { + "epoch": 1.135844498133099, + "grad_norm": 2.026930809020996, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7155979871749878, + "num_tokens": 258360753.0, + "step": 10343 + }, + { + "epoch": 1.1359543158357126, + "grad_norm": 2.60190486907959, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.7044912576675415, + "num_tokens": 258385423.0, + "step": 10344 + }, + { + "epoch": 1.1360641335383264, + "grad_norm": 2.2302210330963135, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7156379818916321, + "num_tokens": 258410956.0, + "step": 10345 + }, + { + "epoch": 1.1361739512409401, + "grad_norm": 2.1815571784973145, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7196912169456482, + "num_tokens": 258436334.0, + "step": 10346 + }, + { + "epoch": 1.1362837689435537, + "grad_norm": 2.3431735038757324, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7305231094360352, + "num_tokens": 258458947.0, + "step": 10347 + }, + { + "epoch": 1.1363935866461674, + "grad_norm": 2.1660797595977783, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.687117874622345, + "num_tokens": 258488659.0, + "step": 10348 + }, + { + "epoch": 1.136503404348781, + "grad_norm": 2.3993515968322754, + "learning_rate": 1e-06, + "loss": 0.779, + "mean_token_accuracy": 0.7552854418754578, + "num_tokens": 258510637.0, + "step": 10349 + }, + { + "epoch": 1.1366132220513947, + "grad_norm": 2.4214870929718018, + 
"learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7160029411315918, + "num_tokens": 258533710.0, + "step": 10350 + }, + { + "epoch": 1.1367230397540085, + "grad_norm": 2.504626750946045, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7240472435951233, + "num_tokens": 258554130.0, + "step": 10351 + }, + { + "epoch": 1.136832857456622, + "grad_norm": 2.3923707008361816, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.718575656414032, + "num_tokens": 258577841.0, + "step": 10352 + }, + { + "epoch": 1.1369426751592357, + "grad_norm": 2.295243501663208, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7155699729919434, + "num_tokens": 258601609.0, + "step": 10353 + }, + { + "epoch": 1.1370524928618493, + "grad_norm": 2.6232733726501465, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6932080388069153, + "num_tokens": 258623852.0, + "step": 10354 + }, + { + "epoch": 1.137162310564463, + "grad_norm": 2.181210994720459, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7377020120620728, + "num_tokens": 258647407.0, + "step": 10355 + }, + { + "epoch": 1.1372721282670766, + "grad_norm": 2.355872631072998, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.734758734703064, + "num_tokens": 258669495.0, + "step": 10356 + }, + { + "epoch": 1.1373819459696903, + "grad_norm": 1.9508293867111206, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7302800416946411, + "num_tokens": 258699717.0, + "step": 10357 + }, + { + "epoch": 1.1374917636723039, + "grad_norm": 2.383577346801758, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7206774353981018, + "num_tokens": 258721673.0, + "step": 10358 + }, + { + "epoch": 1.1376015813749176, + "grad_norm": 2.2201428413391113, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.703875720500946, + "num_tokens": 258747349.0, + "step": 
10359 + }, + { + "epoch": 1.1377113990775314, + "grad_norm": 2.1641623973846436, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7101523876190186, + "num_tokens": 258775315.0, + "step": 10360 + }, + { + "epoch": 1.137821216780145, + "grad_norm": 2.2220945358276367, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.701026976108551, + "num_tokens": 258802075.0, + "step": 10361 + }, + { + "epoch": 1.1379310344827587, + "grad_norm": 2.3930118083953857, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7380717396736145, + "num_tokens": 258825352.0, + "step": 10362 + }, + { + "epoch": 1.1380408521853722, + "grad_norm": 2.66316819190979, + "learning_rate": 1e-06, + "loss": 0.8369, + "mean_token_accuracy": 0.7281284332275391, + "num_tokens": 258843850.0, + "step": 10363 + }, + { + "epoch": 1.138150669887986, + "grad_norm": 2.215311288833618, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7202861309051514, + "num_tokens": 258868741.0, + "step": 10364 + }, + { + "epoch": 1.1382604875905997, + "grad_norm": 2.1195521354675293, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7102744579315186, + "num_tokens": 258898488.0, + "step": 10365 + }, + { + "epoch": 1.1383703052932133, + "grad_norm": 2.3370327949523926, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7153530120849609, + "num_tokens": 258921745.0, + "step": 10366 + }, + { + "epoch": 1.138480122995827, + "grad_norm": 1.9406524896621704, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7168852686882019, + "num_tokens": 258954609.0, + "step": 10367 + }, + { + "epoch": 1.1385899406984406, + "grad_norm": 2.1964313983917236, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7123793959617615, + "num_tokens": 258981450.0, + "step": 10368 + }, + { + "epoch": 1.1386997584010543, + "grad_norm": 2.2939770221710205, + "learning_rate": 1e-06, + "loss": 0.9163, + 
"mean_token_accuracy": 0.7058221101760864, + "num_tokens": 259005627.0, + "step": 10369 + }, + { + "epoch": 1.1388095761036678, + "grad_norm": 2.4041919708251953, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7208249568939209, + "num_tokens": 259028675.0, + "step": 10370 + }, + { + "epoch": 1.1389193938062816, + "grad_norm": 2.3262939453125, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7074720859527588, + "num_tokens": 259052773.0, + "step": 10371 + }, + { + "epoch": 1.1390292115088951, + "grad_norm": 2.2923336029052734, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7326277494430542, + "num_tokens": 259078466.0, + "step": 10372 + }, + { + "epoch": 1.139139029211509, + "grad_norm": 2.1240909099578857, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6947611570358276, + "num_tokens": 259106914.0, + "step": 10373 + }, + { + "epoch": 1.1392488469141226, + "grad_norm": 2.145009994506836, + "learning_rate": 1e-06, + "loss": 0.9996, + "mean_token_accuracy": 0.6906476616859436, + "num_tokens": 259135919.0, + "step": 10374 + }, + { + "epoch": 1.1393586646167362, + "grad_norm": 2.2815823554992676, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7147499322891235, + "num_tokens": 259160426.0, + "step": 10375 + }, + { + "epoch": 1.13946848231935, + "grad_norm": 2.3182363510131836, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7226160764694214, + "num_tokens": 259184364.0, + "step": 10376 + }, + { + "epoch": 1.1395783000219635, + "grad_norm": 2.3405981063842773, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.723899245262146, + "num_tokens": 259206834.0, + "step": 10377 + }, + { + "epoch": 1.1396881177245772, + "grad_norm": 2.044003486633301, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7035664916038513, + "num_tokens": 259237693.0, + "step": 10378 + }, + { + "epoch": 1.1397979354271908, 
+ "grad_norm": 2.197432041168213, + "learning_rate": 1e-06, + "loss": 0.7893, + "mean_token_accuracy": 0.7622830867767334, + "num_tokens": 259262166.0, + "step": 10379 + }, + { + "epoch": 1.1399077531298045, + "grad_norm": 2.6475374698638916, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7364264130592346, + "num_tokens": 259281624.0, + "step": 10380 + }, + { + "epoch": 1.1400175708324183, + "grad_norm": 2.1360037326812744, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7131496071815491, + "num_tokens": 259309053.0, + "step": 10381 + }, + { + "epoch": 1.1401273885350318, + "grad_norm": 2.232055425643921, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7147375345230103, + "num_tokens": 259334803.0, + "step": 10382 + }, + { + "epoch": 1.1402372062376456, + "grad_norm": 2.4070770740509033, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7138598561286926, + "num_tokens": 259360020.0, + "step": 10383 + }, + { + "epoch": 1.140347023940259, + "grad_norm": 2.0163960456848145, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7048128843307495, + "num_tokens": 259391856.0, + "step": 10384 + }, + { + "epoch": 1.1404568416428729, + "grad_norm": 2.5254664421081543, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.723357081413269, + "num_tokens": 259412380.0, + "step": 10385 + }, + { + "epoch": 1.1405666593454864, + "grad_norm": 2.5944674015045166, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7425826191902161, + "num_tokens": 259431338.0, + "step": 10386 + }, + { + "epoch": 1.1406764770481002, + "grad_norm": 2.2038679122924805, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7143802046775818, + "num_tokens": 259457500.0, + "step": 10387 + }, + { + "epoch": 1.140786294750714, + "grad_norm": 2.18214750289917, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7132266759872437, + 
"num_tokens": 259483595.0, + "step": 10388 + }, + { + "epoch": 1.1408961124533274, + "grad_norm": 2.436901569366455, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7348066568374634, + "num_tokens": 259503303.0, + "step": 10389 + }, + { + "epoch": 1.1410059301559412, + "grad_norm": 2.2575595378875732, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7371361255645752, + "num_tokens": 259527885.0, + "step": 10390 + }, + { + "epoch": 1.1411157478585547, + "grad_norm": 2.078864336013794, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7112525105476379, + "num_tokens": 259559656.0, + "step": 10391 + }, + { + "epoch": 1.1412255655611685, + "grad_norm": 2.160309314727783, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6924486756324768, + "num_tokens": 259587164.0, + "step": 10392 + }, + { + "epoch": 1.141335383263782, + "grad_norm": 2.362959146499634, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7068265676498413, + "num_tokens": 259614232.0, + "step": 10393 + }, + { + "epoch": 1.1414452009663958, + "grad_norm": 2.4308950901031494, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7328931093215942, + "num_tokens": 259636461.0, + "step": 10394 + }, + { + "epoch": 1.1415550186690093, + "grad_norm": 2.619015693664551, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.717410683631897, + "num_tokens": 259658982.0, + "step": 10395 + }, + { + "epoch": 1.141664836371623, + "grad_norm": 2.1084742546081543, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7107218503952026, + "num_tokens": 259687664.0, + "step": 10396 + }, + { + "epoch": 1.1417746540742368, + "grad_norm": 2.4514763355255127, + "learning_rate": 1e-06, + "loss": 0.7678, + "mean_token_accuracy": 0.7624103426933289, + "num_tokens": 259708757.0, + "step": 10397 + }, + { + "epoch": 1.1418844717768504, + "grad_norm": 2.199179172515869, + 
"learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7266696095466614, + "num_tokens": 259735545.0, + "step": 10398 + }, + { + "epoch": 1.1419942894794641, + "grad_norm": 2.1987128257751465, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7193319201469421, + "num_tokens": 259760840.0, + "step": 10399 + }, + { + "epoch": 1.1421041071820777, + "grad_norm": 2.4730238914489746, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7061295509338379, + "num_tokens": 259785591.0, + "step": 10400 + }, + { + "epoch": 1.1422139248846914, + "grad_norm": 2.511504888534546, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7128380537033081, + "num_tokens": 259811916.0, + "step": 10401 + }, + { + "epoch": 1.1423237425873052, + "grad_norm": 2.3351192474365234, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6890586614608765, + "num_tokens": 259838943.0, + "step": 10402 + }, + { + "epoch": 1.1424335602899187, + "grad_norm": 2.403106689453125, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7010773420333862, + "num_tokens": 259862337.0, + "step": 10403 + }, + { + "epoch": 1.1425433779925325, + "grad_norm": 2.3179752826690674, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7060405015945435, + "num_tokens": 259888120.0, + "step": 10404 + }, + { + "epoch": 1.142653195695146, + "grad_norm": 2.448111057281494, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7134412527084351, + "num_tokens": 259909511.0, + "step": 10405 + }, + { + "epoch": 1.1427630133977598, + "grad_norm": 2.138657331466675, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7246758937835693, + "num_tokens": 259936324.0, + "step": 10406 + }, + { + "epoch": 1.1428728311003733, + "grad_norm": 2.2577717304229736, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7122815847396851, + "num_tokens": 259964217.0, + 
"step": 10407 + }, + { + "epoch": 1.142982648802987, + "grad_norm": 2.5837528705596924, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.713421106338501, + "num_tokens": 259986570.0, + "step": 10408 + }, + { + "epoch": 1.1430924665056006, + "grad_norm": 2.1887054443359375, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7118662595748901, + "num_tokens": 260013514.0, + "step": 10409 + }, + { + "epoch": 1.1432022842082143, + "grad_norm": 2.3233706951141357, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7219531536102295, + "num_tokens": 260037195.0, + "step": 10410 + }, + { + "epoch": 1.143312101910828, + "grad_norm": 2.5151095390319824, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7188833355903625, + "num_tokens": 260059200.0, + "step": 10411 + }, + { + "epoch": 1.1434219196134416, + "grad_norm": 2.722668170928955, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7148985862731934, + "num_tokens": 260079148.0, + "step": 10412 + }, + { + "epoch": 1.1435317373160554, + "grad_norm": 2.6261801719665527, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7354397177696228, + "num_tokens": 260098777.0, + "step": 10413 + }, + { + "epoch": 1.143641555018669, + "grad_norm": 2.300098180770874, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7329676151275635, + "num_tokens": 260125375.0, + "step": 10414 + }, + { + "epoch": 1.1437513727212827, + "grad_norm": 2.3180418014526367, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7205707430839539, + "num_tokens": 260149241.0, + "step": 10415 + }, + { + "epoch": 1.1438611904238964, + "grad_norm": 2.2724103927612305, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7009803056716919, + "num_tokens": 260177325.0, + "step": 10416 + }, + { + "epoch": 1.14397100812651, + "grad_norm": 2.1091396808624268, + "learning_rate": 1e-06, + "loss": 
0.8937, + "mean_token_accuracy": 0.7275768518447876, + "num_tokens": 260204967.0, + "step": 10417 + }, + { + "epoch": 1.1440808258291237, + "grad_norm": 2.2998228073120117, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.7012842893600464, + "num_tokens": 260228098.0, + "step": 10418 + }, + { + "epoch": 1.1441906435317373, + "grad_norm": 2.3764216899871826, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7375009655952454, + "num_tokens": 260250647.0, + "step": 10419 + }, + { + "epoch": 1.144300461234351, + "grad_norm": 2.0401065349578857, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7016264796257019, + "num_tokens": 260281967.0, + "step": 10420 + }, + { + "epoch": 1.1444102789369646, + "grad_norm": 2.3773679733276367, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7242186069488525, + "num_tokens": 260305203.0, + "step": 10421 + }, + { + "epoch": 1.1445200966395783, + "grad_norm": 2.0831518173217773, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7151211500167847, + "num_tokens": 260334541.0, + "step": 10422 + }, + { + "epoch": 1.1446299143421919, + "grad_norm": 2.2818312644958496, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7200402617454529, + "num_tokens": 260359499.0, + "step": 10423 + }, + { + "epoch": 1.1447397320448056, + "grad_norm": 2.2666962146759033, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7242947816848755, + "num_tokens": 260385815.0, + "step": 10424 + }, + { + "epoch": 1.1448495497474194, + "grad_norm": 2.540154218673706, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7198635339736938, + "num_tokens": 260407867.0, + "step": 10425 + }, + { + "epoch": 1.144959367450033, + "grad_norm": 2.6113698482513428, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7302157282829285, + "num_tokens": 260429429.0, + "step": 10426 + }, + { + "epoch": 
1.1450691851526467, + "grad_norm": 2.377741813659668, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7121472954750061, + "num_tokens": 260454000.0, + "step": 10427 + }, + { + "epoch": 1.1451790028552602, + "grad_norm": 2.6032235622406006, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7055819034576416, + "num_tokens": 260473400.0, + "step": 10428 + }, + { + "epoch": 1.145288820557874, + "grad_norm": 2.3235480785369873, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7407402396202087, + "num_tokens": 260498626.0, + "step": 10429 + }, + { + "epoch": 1.1453986382604877, + "grad_norm": 2.2859601974487305, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7116777896881104, + "num_tokens": 260524700.0, + "step": 10430 + }, + { + "epoch": 1.1455084559631012, + "grad_norm": 2.213698625564575, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7167149186134338, + "num_tokens": 260550465.0, + "step": 10431 + }, + { + "epoch": 1.145618273665715, + "grad_norm": 2.249709129333496, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7126654982566833, + "num_tokens": 260576336.0, + "step": 10432 + }, + { + "epoch": 1.1457280913683285, + "grad_norm": 2.447274684906006, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7274727821350098, + "num_tokens": 260597133.0, + "step": 10433 + }, + { + "epoch": 1.1458379090709423, + "grad_norm": 2.5401899814605713, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7321137189865112, + "num_tokens": 260617334.0, + "step": 10434 + }, + { + "epoch": 1.1459477267735558, + "grad_norm": 2.3102681636810303, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7344335913658142, + "num_tokens": 260641585.0, + "step": 10435 + }, + { + "epoch": 1.1460575444761696, + "grad_norm": 2.181751012802124, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 
0.7234209775924683, + "num_tokens": 260668291.0, + "step": 10436 + }, + { + "epoch": 1.1461673621787831, + "grad_norm": 2.363830089569092, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7112661600112915, + "num_tokens": 260692429.0, + "step": 10437 + }, + { + "epoch": 1.1462771798813969, + "grad_norm": 2.6026206016540527, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7188034057617188, + "num_tokens": 260713340.0, + "step": 10438 + }, + { + "epoch": 1.1463869975840106, + "grad_norm": 2.5800139904022217, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7153859734535217, + "num_tokens": 260734075.0, + "step": 10439 + }, + { + "epoch": 1.1464968152866242, + "grad_norm": 2.393730878829956, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7275078892707825, + "num_tokens": 260756566.0, + "step": 10440 + }, + { + "epoch": 1.146606632989238, + "grad_norm": 2.36531662940979, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7113804817199707, + "num_tokens": 260781759.0, + "step": 10441 + }, + { + "epoch": 1.1467164506918515, + "grad_norm": 2.4966866970062256, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7126745581626892, + "num_tokens": 260805800.0, + "step": 10442 + }, + { + "epoch": 1.1468262683944652, + "grad_norm": 2.3826026916503906, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7048931121826172, + "num_tokens": 260829187.0, + "step": 10443 + }, + { + "epoch": 1.1469360860970788, + "grad_norm": 2.2465786933898926, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7524535655975342, + "num_tokens": 260853975.0, + "step": 10444 + }, + { + "epoch": 1.1470459037996925, + "grad_norm": 2.1585850715637207, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7364495992660522, + "num_tokens": 260877838.0, + "step": 10445 + }, + { + "epoch": 1.147155721502306, + "grad_norm": 
2.1858434677124023, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7109168767929077, + "num_tokens": 260904509.0, + "step": 10446 + }, + { + "epoch": 1.1472655392049198, + "grad_norm": 2.0990066528320312, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.727505624294281, + "num_tokens": 260932793.0, + "step": 10447 + }, + { + "epoch": 1.1473753569075336, + "grad_norm": 2.7789483070373535, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.6940334439277649, + "num_tokens": 260951897.0, + "step": 10448 + }, + { + "epoch": 1.147485174610147, + "grad_norm": 2.3231940269470215, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.728750467300415, + "num_tokens": 260974982.0, + "step": 10449 + }, + { + "epoch": 1.1475949923127609, + "grad_norm": 2.305427074432373, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7116128206253052, + "num_tokens": 260999621.0, + "step": 10450 + }, + { + "epoch": 1.1477048100153744, + "grad_norm": 2.2417263984680176, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7197139859199524, + "num_tokens": 261025524.0, + "step": 10451 + }, + { + "epoch": 1.1478146277179881, + "grad_norm": 2.118882656097412, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6997404098510742, + "num_tokens": 261053879.0, + "step": 10452 + }, + { + "epoch": 1.147924445420602, + "grad_norm": 2.6838438510894775, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7162846326828003, + "num_tokens": 261073919.0, + "step": 10453 + }, + { + "epoch": 1.1480342631232154, + "grad_norm": 2.5459394454956055, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7350915670394897, + "num_tokens": 261095530.0, + "step": 10454 + }, + { + "epoch": 1.1481440808258292, + "grad_norm": 2.414227247238159, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.73625648021698, + "num_tokens": 
261118236.0, + "step": 10455 + }, + { + "epoch": 1.1482538985284427, + "grad_norm": 2.198601722717285, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7248679995536804, + "num_tokens": 261143960.0, + "step": 10456 + }, + { + "epoch": 1.1483637162310565, + "grad_norm": 2.1575348377227783, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7063159942626953, + "num_tokens": 261174311.0, + "step": 10457 + }, + { + "epoch": 1.14847353393367, + "grad_norm": 2.362905979156494, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6959983110427856, + "num_tokens": 261198992.0, + "step": 10458 + }, + { + "epoch": 1.1485833516362838, + "grad_norm": 2.1802544593811035, + "learning_rate": 1e-06, + "loss": 0.9821, + "mean_token_accuracy": 0.6960751414299011, + "num_tokens": 261227945.0, + "step": 10459 + }, + { + "epoch": 1.1486931693388973, + "grad_norm": 2.5996031761169434, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7236741781234741, + "num_tokens": 261248850.0, + "step": 10460 + }, + { + "epoch": 1.148802987041511, + "grad_norm": 2.810479164123535, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7258937358856201, + "num_tokens": 261267499.0, + "step": 10461 + }, + { + "epoch": 1.1489128047441248, + "grad_norm": 2.594771385192871, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7397280931472778, + "num_tokens": 261285067.0, + "step": 10462 + }, + { + "epoch": 1.1490226224467384, + "grad_norm": 2.2462072372436523, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7393245697021484, + "num_tokens": 261308796.0, + "step": 10463 + }, + { + "epoch": 1.1491324401493521, + "grad_norm": 2.344703197479248, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7203800082206726, + "num_tokens": 261331875.0, + "step": 10464 + }, + { + "epoch": 1.1492422578519657, + "grad_norm": 2.442477226257324, + "learning_rate": 
1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7406760454177856, + "num_tokens": 261353860.0, + "step": 10465 + }, + { + "epoch": 1.1493520755545794, + "grad_norm": 2.116697072982788, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.716867208480835, + "num_tokens": 261381855.0, + "step": 10466 + }, + { + "epoch": 1.1494618932571932, + "grad_norm": 2.404580593109131, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7313579320907593, + "num_tokens": 261406527.0, + "step": 10467 + }, + { + "epoch": 1.1495717109598067, + "grad_norm": 2.2979679107666016, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7371900677680969, + "num_tokens": 261429527.0, + "step": 10468 + }, + { + "epoch": 1.1496815286624205, + "grad_norm": 2.3554627895355225, + "learning_rate": 1e-06, + "loss": 0.7755, + "mean_token_accuracy": 0.7546466588973999, + "num_tokens": 261451793.0, + "step": 10469 + }, + { + "epoch": 1.149791346365034, + "grad_norm": 2.2902257442474365, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7349258661270142, + "num_tokens": 261475518.0, + "step": 10470 + }, + { + "epoch": 1.1499011640676478, + "grad_norm": 2.382692337036133, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7131410241127014, + "num_tokens": 261498875.0, + "step": 10471 + }, + { + "epoch": 1.1500109817702613, + "grad_norm": 2.3109214305877686, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7264649271965027, + "num_tokens": 261522140.0, + "step": 10472 + }, + { + "epoch": 1.150120799472875, + "grad_norm": 2.4049408435821533, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7370368242263794, + "num_tokens": 261544187.0, + "step": 10473 + }, + { + "epoch": 1.1502306171754886, + "grad_norm": 2.0814480781555176, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7174259424209595, + "num_tokens": 261574938.0, + "step": 10474 + }, + { + 
"epoch": 1.1503404348781023, + "grad_norm": 2.3221969604492188, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7220690250396729, + "num_tokens": 261600103.0, + "step": 10475 + }, + { + "epoch": 1.150450252580716, + "grad_norm": 2.144411087036133, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7179710865020752, + "num_tokens": 261626597.0, + "step": 10476 + }, + { + "epoch": 1.1505600702833296, + "grad_norm": 2.202648878097534, + "learning_rate": 1e-06, + "loss": 1.021, + "mean_token_accuracy": 0.6873717308044434, + "num_tokens": 261654114.0, + "step": 10477 + }, + { + "epoch": 1.1506698879859434, + "grad_norm": 2.383131742477417, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7318499088287354, + "num_tokens": 261678423.0, + "step": 10478 + }, + { + "epoch": 1.150779705688557, + "grad_norm": 2.4306838512420654, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7132892608642578, + "num_tokens": 261701476.0, + "step": 10479 + }, + { + "epoch": 1.1508895233911707, + "grad_norm": 2.318946599960327, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7100502252578735, + "num_tokens": 261725744.0, + "step": 10480 + }, + { + "epoch": 1.1509993410937844, + "grad_norm": 2.1280148029327393, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7111855745315552, + "num_tokens": 261756318.0, + "step": 10481 + }, + { + "epoch": 1.151109158796398, + "grad_norm": 2.2761125564575195, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7191122770309448, + "num_tokens": 261780250.0, + "step": 10482 + }, + { + "epoch": 1.1512189764990117, + "grad_norm": 2.393021583557129, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7355153560638428, + "num_tokens": 261802539.0, + "step": 10483 + }, + { + "epoch": 1.1513287942016253, + "grad_norm": 2.4498114585876465, + "learning_rate": 1e-06, + "loss": 0.8937, + 
"mean_token_accuracy": 0.7187522649765015, + "num_tokens": 261824245.0, + "step": 10484 + }, + { + "epoch": 1.151438611904239, + "grad_norm": 2.118136167526245, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7436357140541077, + "num_tokens": 261850221.0, + "step": 10485 + }, + { + "epoch": 1.1515484296068526, + "grad_norm": 2.0189616680145264, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.705070972442627, + "num_tokens": 261882541.0, + "step": 10486 + }, + { + "epoch": 1.1516582473094663, + "grad_norm": 2.7031307220458984, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.726715087890625, + "num_tokens": 261901861.0, + "step": 10487 + }, + { + "epoch": 1.1517680650120798, + "grad_norm": 2.348855495452881, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.7002371549606323, + "num_tokens": 261926410.0, + "step": 10488 + }, + { + "epoch": 1.1518778827146936, + "grad_norm": 2.093323230743408, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7248159646987915, + "num_tokens": 261953397.0, + "step": 10489 + }, + { + "epoch": 1.1519877004173074, + "grad_norm": 2.3290231227874756, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6889932751655579, + "num_tokens": 261978909.0, + "step": 10490 + }, + { + "epoch": 1.152097518119921, + "grad_norm": 2.7672536373138428, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7500262260437012, + "num_tokens": 261996145.0, + "step": 10491 + }, + { + "epoch": 1.1522073358225347, + "grad_norm": 2.166276693344116, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7095436453819275, + "num_tokens": 262023059.0, + "step": 10492 + }, + { + "epoch": 1.1523171535251482, + "grad_norm": 2.384572744369507, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.715114951133728, + "num_tokens": 262046305.0, + "step": 10493 + }, + { + "epoch": 1.152426971227762, + 
"grad_norm": 2.2603139877319336, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7009164094924927, + "num_tokens": 262071433.0, + "step": 10494 + }, + { + "epoch": 1.1525367889303757, + "grad_norm": 2.288516044616699, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7143884301185608, + "num_tokens": 262094613.0, + "step": 10495 + }, + { + "epoch": 1.1526466066329892, + "grad_norm": 2.110083818435669, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7084711790084839, + "num_tokens": 262125562.0, + "step": 10496 + }, + { + "epoch": 1.152756424335603, + "grad_norm": 2.252121925354004, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7111451625823975, + "num_tokens": 262149346.0, + "step": 10497 + }, + { + "epoch": 1.1528662420382165, + "grad_norm": 2.2656447887420654, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7118018865585327, + "num_tokens": 262174092.0, + "step": 10498 + }, + { + "epoch": 1.1529760597408303, + "grad_norm": 2.480360984802246, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7217607498168945, + "num_tokens": 262196093.0, + "step": 10499 + }, + { + "epoch": 1.1530858774434438, + "grad_norm": 2.3589632511138916, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7235493063926697, + "num_tokens": 262218552.0, + "step": 10500 + }, + { + "epoch": 1.1531956951460576, + "grad_norm": 2.406595230102539, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.7043068408966064, + "num_tokens": 262242728.0, + "step": 10501 + }, + { + "epoch": 1.153305512848671, + "grad_norm": 2.160541296005249, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7135343551635742, + "num_tokens": 262269720.0, + "step": 10502 + }, + { + "epoch": 1.1534153305512849, + "grad_norm": 2.4036853313446045, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7053346037864685, + 
"num_tokens": 262292554.0, + "step": 10503 + }, + { + "epoch": 1.1535251482538986, + "grad_norm": 2.486307144165039, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7150179147720337, + "num_tokens": 262316792.0, + "step": 10504 + }, + { + "epoch": 1.1536349659565122, + "grad_norm": 2.2496285438537598, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7191234230995178, + "num_tokens": 262341656.0, + "step": 10505 + }, + { + "epoch": 1.153744783659126, + "grad_norm": 2.355870008468628, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7372231483459473, + "num_tokens": 262363397.0, + "step": 10506 + }, + { + "epoch": 1.1538546013617395, + "grad_norm": 2.355051279067993, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7127251625061035, + "num_tokens": 262386200.0, + "step": 10507 + }, + { + "epoch": 1.1539644190643532, + "grad_norm": 2.392317056655884, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7223795652389526, + "num_tokens": 262408328.0, + "step": 10508 + }, + { + "epoch": 1.1540742367669667, + "grad_norm": 2.1242942810058594, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7198262214660645, + "num_tokens": 262434846.0, + "step": 10509 + }, + { + "epoch": 1.1541840544695805, + "grad_norm": 2.5755603313446045, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7207798361778259, + "num_tokens": 262455161.0, + "step": 10510 + }, + { + "epoch": 1.154293872172194, + "grad_norm": 2.125758647918701, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7319616675376892, + "num_tokens": 262483102.0, + "step": 10511 + }, + { + "epoch": 1.1544036898748078, + "grad_norm": 2.292276382446289, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7128047347068787, + "num_tokens": 262507604.0, + "step": 10512 + }, + { + "epoch": 1.1545135075774215, + "grad_norm": 2.0142054557800293, + 
"learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6899844408035278, + "num_tokens": 262539605.0, + "step": 10513 + }, + { + "epoch": 1.154623325280035, + "grad_norm": 2.578249931335449, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7342036366462708, + "num_tokens": 262560942.0, + "step": 10514 + }, + { + "epoch": 1.1547331429826488, + "grad_norm": 2.261829137802124, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7069762349128723, + "num_tokens": 262586148.0, + "step": 10515 + }, + { + "epoch": 1.1548429606852624, + "grad_norm": 2.1823532581329346, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7105154395103455, + "num_tokens": 262613777.0, + "step": 10516 + }, + { + "epoch": 1.1549527783878761, + "grad_norm": 2.1812069416046143, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7264354228973389, + "num_tokens": 262640550.0, + "step": 10517 + }, + { + "epoch": 1.15506259609049, + "grad_norm": 2.3599071502685547, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7353383302688599, + "num_tokens": 262662205.0, + "step": 10518 + }, + { + "epoch": 1.1551724137931034, + "grad_norm": 2.159579277038574, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.7001092433929443, + "num_tokens": 262693761.0, + "step": 10519 + }, + { + "epoch": 1.1552822314957172, + "grad_norm": 2.2681632041931152, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.751923680305481, + "num_tokens": 262718044.0, + "step": 10520 + }, + { + "epoch": 1.1553920491983307, + "grad_norm": 2.753817081451416, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7307310104370117, + "num_tokens": 262735416.0, + "step": 10521 + }, + { + "epoch": 1.1555018669009445, + "grad_norm": 2.1713485717773438, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7344051599502563, + "num_tokens": 262761317.0, + "step": 
10522 + }, + { + "epoch": 1.155611684603558, + "grad_norm": 2.1393916606903076, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7172578573226929, + "num_tokens": 262788989.0, + "step": 10523 + }, + { + "epoch": 1.1557215023061718, + "grad_norm": 2.2627336978912354, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7119675278663635, + "num_tokens": 262814251.0, + "step": 10524 + }, + { + "epoch": 1.1558313200087853, + "grad_norm": 2.321770191192627, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7175135612487793, + "num_tokens": 262838437.0, + "step": 10525 + }, + { + "epoch": 1.155941137711399, + "grad_norm": 2.063664197921753, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7156762480735779, + "num_tokens": 262868055.0, + "step": 10526 + }, + { + "epoch": 1.1560509554140128, + "grad_norm": 2.39275860786438, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.6989952325820923, + "num_tokens": 262890879.0, + "step": 10527 + }, + { + "epoch": 1.1561607731166264, + "grad_norm": 2.7762415409088135, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7351988554000854, + "num_tokens": 262909820.0, + "step": 10528 + }, + { + "epoch": 1.15627059081924, + "grad_norm": 2.3149356842041016, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7048901319503784, + "num_tokens": 262933636.0, + "step": 10529 + }, + { + "epoch": 1.1563804085218536, + "grad_norm": 2.379605293273926, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7248799800872803, + "num_tokens": 262956388.0, + "step": 10530 + }, + { + "epoch": 1.1564902262244674, + "grad_norm": 2.4317283630371094, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7191255688667297, + "num_tokens": 262979494.0, + "step": 10531 + }, + { + "epoch": 1.1566000439270812, + "grad_norm": 2.34859037399292, + "learning_rate": 1e-06, + "loss": 0.9428, + 
"mean_token_accuracy": 0.7023633122444153, + "num_tokens": 263004086.0, + "step": 10532 + }, + { + "epoch": 1.1567098616296947, + "grad_norm": 2.4397501945495605, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7357525825500488, + "num_tokens": 263026090.0, + "step": 10533 + }, + { + "epoch": 1.1568196793323084, + "grad_norm": 2.134629487991333, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7048911452293396, + "num_tokens": 263054588.0, + "step": 10534 + }, + { + "epoch": 1.156929497034922, + "grad_norm": 2.256331443786621, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7390852570533752, + "num_tokens": 263077991.0, + "step": 10535 + }, + { + "epoch": 1.1570393147375357, + "grad_norm": 2.212550401687622, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.7027661204338074, + "num_tokens": 263105991.0, + "step": 10536 + }, + { + "epoch": 1.1571491324401493, + "grad_norm": 2.412461042404175, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.714277982711792, + "num_tokens": 263129708.0, + "step": 10537 + }, + { + "epoch": 1.157258950142763, + "grad_norm": 2.423274517059326, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7537751197814941, + "num_tokens": 263151234.0, + "step": 10538 + }, + { + "epoch": 1.1573687678453766, + "grad_norm": 2.4425241947174072, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7273188829421997, + "num_tokens": 263174631.0, + "step": 10539 + }, + { + "epoch": 1.1574785855479903, + "grad_norm": 2.5699844360351562, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7399992942810059, + "num_tokens": 263196173.0, + "step": 10540 + }, + { + "epoch": 1.157588403250604, + "grad_norm": 2.523326873779297, + "learning_rate": 1e-06, + "loss": 0.7977, + "mean_token_accuracy": 0.7409329414367676, + "num_tokens": 263216231.0, + "step": 10541 + }, + { + "epoch": 1.1576982209532176, + 
"grad_norm": 2.624812364578247, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7152931690216064, + "num_tokens": 263235611.0, + "step": 10542 + }, + { + "epoch": 1.1578080386558314, + "grad_norm": 2.579493999481201, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7259924411773682, + "num_tokens": 263257275.0, + "step": 10543 + }, + { + "epoch": 1.157917856358445, + "grad_norm": 2.1232564449310303, + "learning_rate": 1e-06, + "loss": 0.9759, + "mean_token_accuracy": 0.7084082365036011, + "num_tokens": 263287449.0, + "step": 10544 + }, + { + "epoch": 1.1580276740610587, + "grad_norm": 2.232898235321045, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.6959764957427979, + "num_tokens": 263312680.0, + "step": 10545 + }, + { + "epoch": 1.1581374917636724, + "grad_norm": 2.0975639820098877, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7475494742393494, + "num_tokens": 263339300.0, + "step": 10546 + }, + { + "epoch": 1.158247309466286, + "grad_norm": 2.2148733139038086, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7184088826179504, + "num_tokens": 263366536.0, + "step": 10547 + }, + { + "epoch": 1.1583571271688997, + "grad_norm": 2.573765516281128, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7123266458511353, + "num_tokens": 263386426.0, + "step": 10548 + }, + { + "epoch": 1.1584669448715132, + "grad_norm": 2.2221922874450684, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7206745147705078, + "num_tokens": 263413004.0, + "step": 10549 + }, + { + "epoch": 1.158576762574127, + "grad_norm": 2.4517812728881836, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7134713530540466, + "num_tokens": 263435918.0, + "step": 10550 + }, + { + "epoch": 1.1586865802767405, + "grad_norm": 2.3438992500305176, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7272449731826782, + 
"num_tokens": 263460745.0, + "step": 10551 + }, + { + "epoch": 1.1587963979793543, + "grad_norm": 2.3410732746124268, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7415798306465149, + "num_tokens": 263483756.0, + "step": 10552 + }, + { + "epoch": 1.1589062156819678, + "grad_norm": 2.1798102855682373, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.714939534664154, + "num_tokens": 263511071.0, + "step": 10553 + }, + { + "epoch": 1.1590160333845816, + "grad_norm": 2.5686566829681396, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7336761951446533, + "num_tokens": 263529845.0, + "step": 10554 + }, + { + "epoch": 1.1591258510871953, + "grad_norm": 2.530771017074585, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7375971674919128, + "num_tokens": 263549213.0, + "step": 10555 + }, + { + "epoch": 1.1592356687898089, + "grad_norm": 2.1455225944519043, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7296996116638184, + "num_tokens": 263577223.0, + "step": 10556 + }, + { + "epoch": 1.1593454864924226, + "grad_norm": 2.3577699661254883, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7161402106285095, + "num_tokens": 263600977.0, + "step": 10557 + }, + { + "epoch": 1.1594553041950362, + "grad_norm": 2.3378701210021973, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.6952347755432129, + "num_tokens": 263624749.0, + "step": 10558 + }, + { + "epoch": 1.15956512189765, + "grad_norm": 2.3891568183898926, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7455529570579529, + "num_tokens": 263647679.0, + "step": 10559 + }, + { + "epoch": 1.1596749396002635, + "grad_norm": 2.5292561054229736, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7264385223388672, + "num_tokens": 263670092.0, + "step": 10560 + }, + { + "epoch": 1.1597847573028772, + "grad_norm": 2.489271879196167, + 
"learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.745197057723999, + "num_tokens": 263692207.0, + "step": 10561 + }, + { + "epoch": 1.159894575005491, + "grad_norm": 2.415299892425537, + "learning_rate": 1e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7439876198768616, + "num_tokens": 263715400.0, + "step": 10562 + }, + { + "epoch": 1.1600043927081045, + "grad_norm": 2.3745601177215576, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7233599424362183, + "num_tokens": 263738851.0, + "step": 10563 + }, + { + "epoch": 1.1601142104107183, + "grad_norm": 2.25360107421875, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7057344913482666, + "num_tokens": 263764504.0, + "step": 10564 + }, + { + "epoch": 1.1602240281133318, + "grad_norm": 2.206045389175415, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7195640802383423, + "num_tokens": 263791733.0, + "step": 10565 + }, + { + "epoch": 1.1603338458159456, + "grad_norm": 2.278308868408203, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.692129373550415, + "num_tokens": 263819648.0, + "step": 10566 + }, + { + "epoch": 1.160443663518559, + "grad_norm": 2.1935644149780273, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7212031483650208, + "num_tokens": 263848520.0, + "step": 10567 + }, + { + "epoch": 1.1605534812211729, + "grad_norm": 2.6555213928222656, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.720246434211731, + "num_tokens": 263868853.0, + "step": 10568 + }, + { + "epoch": 1.1606632989237866, + "grad_norm": 2.1843478679656982, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7196986079216003, + "num_tokens": 263895863.0, + "step": 10569 + }, + { + "epoch": 1.1607731166264001, + "grad_norm": 2.621281147003174, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7124152779579163, + "num_tokens": 263914314.0, + "step": 10570 
+ }, + { + "epoch": 1.160882934329014, + "grad_norm": 2.1127805709838867, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7477986216545105, + "num_tokens": 263941234.0, + "step": 10571 + }, + { + "epoch": 1.1609927520316274, + "grad_norm": 2.2848894596099854, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.6953915357589722, + "num_tokens": 263965782.0, + "step": 10572 + }, + { + "epoch": 1.1611025697342412, + "grad_norm": 2.373666524887085, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7232469916343689, + "num_tokens": 263988808.0, + "step": 10573 + }, + { + "epoch": 1.1612123874368547, + "grad_norm": 2.329691171646118, + "learning_rate": 1e-06, + "loss": 0.785, + "mean_token_accuracy": 0.7399368286132812, + "num_tokens": 264011047.0, + "step": 10574 + }, + { + "epoch": 1.1613222051394685, + "grad_norm": 2.140899658203125, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.73058021068573, + "num_tokens": 264038102.0, + "step": 10575 + }, + { + "epoch": 1.161432022842082, + "grad_norm": 1.9591494798660278, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.7118897438049316, + "num_tokens": 264071797.0, + "step": 10576 + }, + { + "epoch": 1.1615418405446958, + "grad_norm": 2.2895913124084473, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7406339645385742, + "num_tokens": 264094787.0, + "step": 10577 + }, + { + "epoch": 1.1616516582473095, + "grad_norm": 2.4970779418945312, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7451911568641663, + "num_tokens": 264115992.0, + "step": 10578 + }, + { + "epoch": 1.161761475949923, + "grad_norm": 2.084153413772583, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7052009105682373, + "num_tokens": 264146407.0, + "step": 10579 + }, + { + "epoch": 1.1618712936525368, + "grad_norm": 2.2351818084716797, + "learning_rate": 1e-06, + "loss": 0.9337, + 
"mean_token_accuracy": 0.7135065793991089, + "num_tokens": 264173705.0, + "step": 10580 + }, + { + "epoch": 1.1619811113551504, + "grad_norm": 2.0422630310058594, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7087328433990479, + "num_tokens": 264204568.0, + "step": 10581 + }, + { + "epoch": 1.1620909290577641, + "grad_norm": 2.3140735626220703, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7176490426063538, + "num_tokens": 264227151.0, + "step": 10582 + }, + { + "epoch": 1.1622007467603779, + "grad_norm": 2.805722236633301, + "learning_rate": 1e-06, + "loss": 0.7373, + "mean_token_accuracy": 0.7555913925170898, + "num_tokens": 264243542.0, + "step": 10583 + }, + { + "epoch": 1.1623105644629914, + "grad_norm": 2.482337474822998, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7274581789970398, + "num_tokens": 264265576.0, + "step": 10584 + }, + { + "epoch": 1.1624203821656052, + "grad_norm": 2.474165201187134, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7404114007949829, + "num_tokens": 264286936.0, + "step": 10585 + }, + { + "epoch": 1.1625301998682187, + "grad_norm": 2.3130059242248535, + "learning_rate": 1e-06, + "loss": 1.0338, + "mean_token_accuracy": 0.6888554096221924, + "num_tokens": 264314390.0, + "step": 10586 + }, + { + "epoch": 1.1626400175708325, + "grad_norm": 2.307992935180664, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7382153868675232, + "num_tokens": 264336054.0, + "step": 10587 + }, + { + "epoch": 1.162749835273446, + "grad_norm": 2.269287109375, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7114573121070862, + "num_tokens": 264363136.0, + "step": 10588 + }, + { + "epoch": 1.1628596529760598, + "grad_norm": 2.169377088546753, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7307173609733582, + "num_tokens": 264389997.0, + "step": 10589 + }, + { + "epoch": 1.1629694706786733, 
+ "grad_norm": 2.3647687435150146, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.6990700364112854, + "num_tokens": 264413996.0, + "step": 10590 + }, + { + "epoch": 1.163079288381287, + "grad_norm": 2.2766785621643066, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7101044654846191, + "num_tokens": 264438426.0, + "step": 10591 + }, + { + "epoch": 1.1631891060839008, + "grad_norm": 2.33076810836792, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7200872302055359, + "num_tokens": 264461351.0, + "step": 10592 + }, + { + "epoch": 1.1632989237865143, + "grad_norm": 2.145764112472534, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7204433679580688, + "num_tokens": 264488392.0, + "step": 10593 + }, + { + "epoch": 1.163408741489128, + "grad_norm": 2.340616226196289, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7301702499389648, + "num_tokens": 264510219.0, + "step": 10594 + }, + { + "epoch": 1.1635185591917416, + "grad_norm": 2.4771459102630615, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7455135583877563, + "num_tokens": 264529548.0, + "step": 10595 + }, + { + "epoch": 1.1636283768943554, + "grad_norm": 2.333822011947632, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7026345133781433, + "num_tokens": 264553944.0, + "step": 10596 + }, + { + "epoch": 1.1637381945969691, + "grad_norm": 2.599907875061035, + "learning_rate": 1e-06, + "loss": 0.792, + "mean_token_accuracy": 0.7454215288162231, + "num_tokens": 264573193.0, + "step": 10597 + }, + { + "epoch": 1.1638480122995827, + "grad_norm": 2.4250001907348633, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7127479314804077, + "num_tokens": 264594633.0, + "step": 10598 + }, + { + "epoch": 1.1639578300021964, + "grad_norm": 2.4279391765594482, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7156246900558472, + 
"num_tokens": 264617835.0, + "step": 10599 + }, + { + "epoch": 1.16406764770481, + "grad_norm": 2.493194580078125, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7336820363998413, + "num_tokens": 264639799.0, + "step": 10600 + }, + { + "epoch": 1.1641774654074237, + "grad_norm": 2.553170919418335, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7051228880882263, + "num_tokens": 264660936.0, + "step": 10601 + }, + { + "epoch": 1.1642872831100373, + "grad_norm": 2.2116000652313232, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7444487810134888, + "num_tokens": 264686610.0, + "step": 10602 + }, + { + "epoch": 1.164397100812651, + "grad_norm": 2.1170966625213623, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.706307590007782, + "num_tokens": 264715889.0, + "step": 10603 + }, + { + "epoch": 1.1645069185152646, + "grad_norm": 2.1381092071533203, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7194223999977112, + "num_tokens": 264744075.0, + "step": 10604 + }, + { + "epoch": 1.1646167362178783, + "grad_norm": 2.2887566089630127, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7273561954498291, + "num_tokens": 264767622.0, + "step": 10605 + }, + { + "epoch": 1.164726553920492, + "grad_norm": 2.0011825561523438, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7505682706832886, + "num_tokens": 264794397.0, + "step": 10606 + }, + { + "epoch": 1.1648363716231056, + "grad_norm": 2.5946462154388428, + "learning_rate": 1e-06, + "loss": 0.8158, + "mean_token_accuracy": 0.7377089858055115, + "num_tokens": 264814940.0, + "step": 10607 + }, + { + "epoch": 1.1649461893257194, + "grad_norm": 2.3102316856384277, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.7027488946914673, + "num_tokens": 264839079.0, + "step": 10608 + }, + { + "epoch": 1.165056007028333, + "grad_norm": 2.2343451976776123, + 
"learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7045760154724121, + "num_tokens": 264864086.0, + "step": 10609 + }, + { + "epoch": 1.1651658247309467, + "grad_norm": 2.382671356201172, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7266892194747925, + "num_tokens": 264885817.0, + "step": 10610 + }, + { + "epoch": 1.1652756424335604, + "grad_norm": 2.7386767864227295, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.744868278503418, + "num_tokens": 264904011.0, + "step": 10611 + }, + { + "epoch": 1.165385460136174, + "grad_norm": 2.339566469192505, + "learning_rate": 1e-06, + "loss": 0.9331, + "mean_token_accuracy": 0.7200235724449158, + "num_tokens": 264928836.0, + "step": 10612 + }, + { + "epoch": 1.1654952778387877, + "grad_norm": 2.6248066425323486, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7379266023635864, + "num_tokens": 264947611.0, + "step": 10613 + }, + { + "epoch": 1.1656050955414012, + "grad_norm": 2.156956911087036, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7357333898544312, + "num_tokens": 264973962.0, + "step": 10614 + }, + { + "epoch": 1.165714913244015, + "grad_norm": 2.4466381072998047, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.72777259349823, + "num_tokens": 264995934.0, + "step": 10615 + }, + { + "epoch": 1.1658247309466285, + "grad_norm": 2.3119187355041504, + "learning_rate": 1e-06, + "loss": 0.8104, + "mean_token_accuracy": 0.7386321425437927, + "num_tokens": 265019403.0, + "step": 10616 + }, + { + "epoch": 1.1659345486492423, + "grad_norm": 2.4272103309631348, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7345921993255615, + "num_tokens": 265040343.0, + "step": 10617 + }, + { + "epoch": 1.1660443663518558, + "grad_norm": 2.5069048404693604, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7316584587097168, + "num_tokens": 265061124.0, + "step": 
10618 + }, + { + "epoch": 1.1661541840544696, + "grad_norm": 2.251746654510498, + "learning_rate": 1e-06, + "loss": 0.8169, + "mean_token_accuracy": 0.7409486770629883, + "num_tokens": 265087959.0, + "step": 10619 + }, + { + "epoch": 1.1662640017570833, + "grad_norm": 2.6142563819885254, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7059129476547241, + "num_tokens": 265110451.0, + "step": 10620 + }, + { + "epoch": 1.1663738194596969, + "grad_norm": 2.260417938232422, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.718567967414856, + "num_tokens": 265136088.0, + "step": 10621 + }, + { + "epoch": 1.1664836371623106, + "grad_norm": 2.0732667446136475, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7296236753463745, + "num_tokens": 265164444.0, + "step": 10622 + }, + { + "epoch": 1.1665934548649242, + "grad_norm": 2.3021323680877686, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.736915111541748, + "num_tokens": 265187892.0, + "step": 10623 + }, + { + "epoch": 1.166703272567538, + "grad_norm": 2.3262033462524414, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.719345211982727, + "num_tokens": 265213238.0, + "step": 10624 + }, + { + "epoch": 1.1668130902701515, + "grad_norm": 2.3398168087005615, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7315374612808228, + "num_tokens": 265236793.0, + "step": 10625 + }, + { + "epoch": 1.1669229079727652, + "grad_norm": 2.2754592895507812, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.6963667869567871, + "num_tokens": 265262741.0, + "step": 10626 + }, + { + "epoch": 1.167032725675379, + "grad_norm": 2.17696475982666, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7086270451545715, + "num_tokens": 265291028.0, + "step": 10627 + }, + { + "epoch": 1.1671425433779925, + "grad_norm": 2.101158380508423, + "learning_rate": 1e-06, + "loss": 0.91, + 
"mean_token_accuracy": 0.7238024473190308, + "num_tokens": 265319343.0, + "step": 10628 + }, + { + "epoch": 1.1672523610806063, + "grad_norm": 2.096191644668579, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7089933156967163, + "num_tokens": 265348404.0, + "step": 10629 + }, + { + "epoch": 1.1673621787832198, + "grad_norm": 2.683502435684204, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7183051109313965, + "num_tokens": 265368411.0, + "step": 10630 + }, + { + "epoch": 1.1674719964858336, + "grad_norm": 2.2237701416015625, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7076451778411865, + "num_tokens": 265395128.0, + "step": 10631 + }, + { + "epoch": 1.167581814188447, + "grad_norm": 2.2003798484802246, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7249135375022888, + "num_tokens": 265421408.0, + "step": 10632 + }, + { + "epoch": 1.1676916318910608, + "grad_norm": 2.2449772357940674, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7244013547897339, + "num_tokens": 265445123.0, + "step": 10633 + }, + { + "epoch": 1.1678014495936746, + "grad_norm": 2.5211143493652344, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7413431406021118, + "num_tokens": 265464773.0, + "step": 10634 + }, + { + "epoch": 1.1679112672962881, + "grad_norm": 2.470043659210205, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7215874195098877, + "num_tokens": 265486373.0, + "step": 10635 + }, + { + "epoch": 1.168021084998902, + "grad_norm": 2.2310099601745605, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7037715315818787, + "num_tokens": 265513605.0, + "step": 10636 + }, + { + "epoch": 1.1681309027015154, + "grad_norm": 2.1360340118408203, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7238970398902893, + "num_tokens": 265540888.0, + "step": 10637 + }, + { + "epoch": 
1.1682407204041292, + "grad_norm": 2.0688419342041016, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.714358389377594, + "num_tokens": 265568771.0, + "step": 10638 + }, + { + "epoch": 1.1683505381067427, + "grad_norm": 2.6367006301879883, + "learning_rate": 1e-06, + "loss": 0.8129, + "mean_token_accuracy": 0.7516580820083618, + "num_tokens": 265587363.0, + "step": 10639 + }, + { + "epoch": 1.1684603558093565, + "grad_norm": 2.3517954349517822, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7025662660598755, + "num_tokens": 265612217.0, + "step": 10640 + }, + { + "epoch": 1.16857017351197, + "grad_norm": 2.3982901573181152, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.725391149520874, + "num_tokens": 265635190.0, + "step": 10641 + }, + { + "epoch": 1.1686799912145838, + "grad_norm": 2.6688857078552246, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7162781953811646, + "num_tokens": 265654166.0, + "step": 10642 + }, + { + "epoch": 1.1687898089171975, + "grad_norm": 2.0770106315612793, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7096809148788452, + "num_tokens": 265685025.0, + "step": 10643 + }, + { + "epoch": 1.168899626619811, + "grad_norm": 2.451287269592285, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7401531934738159, + "num_tokens": 265705046.0, + "step": 10644 + }, + { + "epoch": 1.1690094443224248, + "grad_norm": 2.244990587234497, + "learning_rate": 1e-06, + "loss": 1.0687, + "mean_token_accuracy": 0.6802581548690796, + "num_tokens": 265734942.0, + "step": 10645 + }, + { + "epoch": 1.1691192620250384, + "grad_norm": 2.4920449256896973, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7224075794219971, + "num_tokens": 265755707.0, + "step": 10646 + }, + { + "epoch": 1.169229079727652, + "grad_norm": 2.078138828277588, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 
0.7019214630126953, + "num_tokens": 265785120.0, + "step": 10647 + }, + { + "epoch": 1.1693388974302659, + "grad_norm": 2.2683629989624023, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.723469614982605, + "num_tokens": 265810140.0, + "step": 10648 + }, + { + "epoch": 1.1694487151328794, + "grad_norm": 2.406059503555298, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7140604257583618, + "num_tokens": 265833868.0, + "step": 10649 + }, + { + "epoch": 1.1695585328354932, + "grad_norm": 2.6552484035491943, + "learning_rate": 1e-06, + "loss": 0.7723, + "mean_token_accuracy": 0.7559942603111267, + "num_tokens": 265852832.0, + "step": 10650 + }, + { + "epoch": 1.1696683505381067, + "grad_norm": 2.0807337760925293, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7116256952285767, + "num_tokens": 265883007.0, + "step": 10651 + }, + { + "epoch": 1.1697781682407205, + "grad_norm": 2.04498553276062, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7280319929122925, + "num_tokens": 265913421.0, + "step": 10652 + }, + { + "epoch": 1.169887985943334, + "grad_norm": 1.8936047554016113, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.731015682220459, + "num_tokens": 265947319.0, + "step": 10653 + }, + { + "epoch": 1.1699978036459477, + "grad_norm": 2.273611068725586, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.715860903263092, + "num_tokens": 265973940.0, + "step": 10654 + }, + { + "epoch": 1.1701076213485613, + "grad_norm": 2.4274353981018066, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7403614521026611, + "num_tokens": 265995777.0, + "step": 10655 + }, + { + "epoch": 1.170217439051175, + "grad_norm": 2.1730895042419434, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7158172130584717, + "num_tokens": 266023069.0, + "step": 10656 + }, + { + "epoch": 1.1703272567537888, + "grad_norm": 
2.2729687690734863, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7249190807342529, + "num_tokens": 266046415.0, + "step": 10657 + }, + { + "epoch": 1.1704370744564023, + "grad_norm": 2.2840394973754883, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7447773814201355, + "num_tokens": 266069460.0, + "step": 10658 + }, + { + "epoch": 1.170546892159016, + "grad_norm": 2.8388259410858154, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6924312114715576, + "num_tokens": 266090153.0, + "step": 10659 + }, + { + "epoch": 1.1706567098616296, + "grad_norm": 2.5443620681762695, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7303438186645508, + "num_tokens": 266109849.0, + "step": 10660 + }, + { + "epoch": 1.1707665275642434, + "grad_norm": 2.2854182720184326, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7287689447402954, + "num_tokens": 266133487.0, + "step": 10661 + }, + { + "epoch": 1.1708763452668571, + "grad_norm": 2.1908552646636963, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7212909460067749, + "num_tokens": 266158267.0, + "step": 10662 + }, + { + "epoch": 1.1709861629694707, + "grad_norm": 2.2293946743011475, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7122336030006409, + "num_tokens": 266183497.0, + "step": 10663 + }, + { + "epoch": 1.1710959806720844, + "grad_norm": 2.8567583560943604, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7335169315338135, + "num_tokens": 266201725.0, + "step": 10664 + }, + { + "epoch": 1.171205798374698, + "grad_norm": 2.4290056228637695, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7285782098770142, + "num_tokens": 266222899.0, + "step": 10665 + }, + { + "epoch": 1.1713156160773117, + "grad_norm": 2.173874855041504, + "learning_rate": 1e-06, + "loss": 1.0402, + "mean_token_accuracy": 0.6871522665023804, + 
"num_tokens": 266251053.0, + "step": 10666 + }, + { + "epoch": 1.1714254337799253, + "grad_norm": 2.212479591369629, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7060852646827698, + "num_tokens": 266279497.0, + "step": 10667 + }, + { + "epoch": 1.171535251482539, + "grad_norm": 2.358102321624756, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7090460658073425, + "num_tokens": 266304090.0, + "step": 10668 + }, + { + "epoch": 1.1716450691851525, + "grad_norm": 2.2383790016174316, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7338985204696655, + "num_tokens": 266329921.0, + "step": 10669 + }, + { + "epoch": 1.1717548868877663, + "grad_norm": 2.3616726398468018, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.730728268623352, + "num_tokens": 266351365.0, + "step": 10670 + }, + { + "epoch": 1.17186470459038, + "grad_norm": 2.520373582839966, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7420215606689453, + "num_tokens": 266371936.0, + "step": 10671 + }, + { + "epoch": 1.1719745222929936, + "grad_norm": 2.7744128704071045, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.71741783618927, + "num_tokens": 266390317.0, + "step": 10672 + }, + { + "epoch": 1.1720843399956073, + "grad_norm": 2.253030300140381, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7069997787475586, + "num_tokens": 266415126.0, + "step": 10673 + }, + { + "epoch": 1.1721941576982209, + "grad_norm": 1.981563687324524, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7257356643676758, + "num_tokens": 266444975.0, + "step": 10674 + }, + { + "epoch": 1.1723039754008346, + "grad_norm": 2.2942793369293213, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.709242582321167, + "num_tokens": 266469342.0, + "step": 10675 + }, + { + "epoch": 1.1724137931034484, + "grad_norm": 2.5186595916748047, + 
"learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7563892602920532, + "num_tokens": 266487860.0, + "step": 10676 + }, + { + "epoch": 1.172523610806062, + "grad_norm": 2.0768728256225586, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7124658226966858, + "num_tokens": 266519387.0, + "step": 10677 + }, + { + "epoch": 1.1726334285086757, + "grad_norm": 2.4527714252471924, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7089555263519287, + "num_tokens": 266542769.0, + "step": 10678 + }, + { + "epoch": 1.1727432462112892, + "grad_norm": 2.403940439224243, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7190065383911133, + "num_tokens": 266566633.0, + "step": 10679 + }, + { + "epoch": 1.172853063913903, + "grad_norm": 2.3164515495300293, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7057069540023804, + "num_tokens": 266593036.0, + "step": 10680 + }, + { + "epoch": 1.1729628816165165, + "grad_norm": 2.542003631591797, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7325955629348755, + "num_tokens": 266617885.0, + "step": 10681 + }, + { + "epoch": 1.1730726993191303, + "grad_norm": 2.6617369651794434, + "learning_rate": 1e-06, + "loss": 0.8135, + "mean_token_accuracy": 0.738006055355072, + "num_tokens": 266635496.0, + "step": 10682 + }, + { + "epoch": 1.1731825170217438, + "grad_norm": 2.1391329765319824, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7195302248001099, + "num_tokens": 266662041.0, + "step": 10683 + }, + { + "epoch": 1.1732923347243576, + "grad_norm": 2.2784440517425537, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7104262113571167, + "num_tokens": 266687440.0, + "step": 10684 + }, + { + "epoch": 1.1734021524269713, + "grad_norm": 2.362618923187256, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7228862643241882, + "num_tokens": 266712323.0, + "step": 
10685 + }, + { + "epoch": 1.1735119701295849, + "grad_norm": 2.390376567840576, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7055105566978455, + "num_tokens": 266736507.0, + "step": 10686 + }, + { + "epoch": 1.1736217878321986, + "grad_norm": 2.305434465408325, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7284730076789856, + "num_tokens": 266758661.0, + "step": 10687 + }, + { + "epoch": 1.1737316055348122, + "grad_norm": 2.1863820552825928, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6956116557121277, + "num_tokens": 266785463.0, + "step": 10688 + }, + { + "epoch": 1.173841423237426, + "grad_norm": 2.11486554145813, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7520722150802612, + "num_tokens": 266810046.0, + "step": 10689 + }, + { + "epoch": 1.1739512409400394, + "grad_norm": 2.3458359241485596, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.732080340385437, + "num_tokens": 266833195.0, + "step": 10690 + }, + { + "epoch": 1.1740610586426532, + "grad_norm": 2.067955255508423, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7070220112800598, + "num_tokens": 266861037.0, + "step": 10691 + }, + { + "epoch": 1.1741708763452667, + "grad_norm": 2.34263277053833, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7362439632415771, + "num_tokens": 266885099.0, + "step": 10692 + }, + { + "epoch": 1.1742806940478805, + "grad_norm": 2.5824506282806396, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7446062564849854, + "num_tokens": 266906384.0, + "step": 10693 + }, + { + "epoch": 1.1743905117504942, + "grad_norm": 2.396789312362671, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7172444462776184, + "num_tokens": 266930953.0, + "step": 10694 + }, + { + "epoch": 1.1745003294531078, + "grad_norm": 2.279043197631836, + "learning_rate": 1e-06, + "loss": 0.9676, + 
"mean_token_accuracy": 0.7023184299468994, + "num_tokens": 266958272.0, + "step": 10695 + }, + { + "epoch": 1.1746101471557215, + "grad_norm": 2.4338643550872803, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7075026631355286, + "num_tokens": 266982321.0, + "step": 10696 + }, + { + "epoch": 1.174719964858335, + "grad_norm": 1.9646188020706177, + "learning_rate": 1e-06, + "loss": 0.9995, + "mean_token_accuracy": 0.6925843954086304, + "num_tokens": 267013912.0, + "step": 10697 + }, + { + "epoch": 1.1748297825609488, + "grad_norm": 1.8960187435150146, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7173064351081848, + "num_tokens": 267049099.0, + "step": 10698 + }, + { + "epoch": 1.1749396002635626, + "grad_norm": 2.0916380882263184, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7159554958343506, + "num_tokens": 267079947.0, + "step": 10699 + }, + { + "epoch": 1.1750494179661761, + "grad_norm": 2.1605112552642822, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.700071394443512, + "num_tokens": 267106212.0, + "step": 10700 + }, + { + "epoch": 1.1751592356687899, + "grad_norm": 2.3871376514434814, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7295643091201782, + "num_tokens": 267128848.0, + "step": 10701 + }, + { + "epoch": 1.1752690533714034, + "grad_norm": 2.371244430541992, + "learning_rate": 1e-06, + "loss": 0.8035, + "mean_token_accuracy": 0.7457475662231445, + "num_tokens": 267150241.0, + "step": 10702 + }, + { + "epoch": 1.1753788710740172, + "grad_norm": 2.13374662399292, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7265859842300415, + "num_tokens": 267176897.0, + "step": 10703 + }, + { + "epoch": 1.1754886887766307, + "grad_norm": 2.1851747035980225, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.6960599422454834, + "num_tokens": 267203978.0, + "step": 10704 + }, + { + "epoch": 
1.1755985064792445, + "grad_norm": 2.449841260910034, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7253279685974121, + "num_tokens": 267227130.0, + "step": 10705 + }, + { + "epoch": 1.175708324181858, + "grad_norm": 2.1877553462982178, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7429338693618774, + "num_tokens": 267253523.0, + "step": 10706 + }, + { + "epoch": 1.1758181418844718, + "grad_norm": 2.129455804824829, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7215244770050049, + "num_tokens": 267281166.0, + "step": 10707 + }, + { + "epoch": 1.1759279595870855, + "grad_norm": 2.4132473468780518, + "learning_rate": 1e-06, + "loss": 0.7849, + "mean_token_accuracy": 0.7434638738632202, + "num_tokens": 267301333.0, + "step": 10708 + }, + { + "epoch": 1.176037777289699, + "grad_norm": 2.6413955688476562, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7224781513214111, + "num_tokens": 267321307.0, + "step": 10709 + }, + { + "epoch": 1.1761475949923128, + "grad_norm": 2.2191286087036133, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7081901431083679, + "num_tokens": 267346813.0, + "step": 10710 + }, + { + "epoch": 1.1762574126949263, + "grad_norm": 1.9306405782699585, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7232801914215088, + "num_tokens": 267377386.0, + "step": 10711 + }, + { + "epoch": 1.17636723039754, + "grad_norm": 2.72849440574646, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.716072678565979, + "num_tokens": 267395247.0, + "step": 10712 + }, + { + "epoch": 1.1764770481001539, + "grad_norm": 2.2638652324676514, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7236158847808838, + "num_tokens": 267420060.0, + "step": 10713 + }, + { + "epoch": 1.1765868658027674, + "grad_norm": 2.8433473110198975, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 
0.723969578742981, + "num_tokens": 267437826.0, + "step": 10714 + }, + { + "epoch": 1.1766966835053811, + "grad_norm": 2.0532023906707764, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.709033727645874, + "num_tokens": 267467017.0, + "step": 10715 + }, + { + "epoch": 1.1768065012079947, + "grad_norm": 2.349187135696411, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7206833362579346, + "num_tokens": 267491906.0, + "step": 10716 + }, + { + "epoch": 1.1769163189106084, + "grad_norm": 2.268347978591919, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7136287093162537, + "num_tokens": 267516977.0, + "step": 10717 + }, + { + "epoch": 1.177026136613222, + "grad_norm": 2.792386770248413, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7253758907318115, + "num_tokens": 267536109.0, + "step": 10718 + }, + { + "epoch": 1.1771359543158357, + "grad_norm": 2.4189558029174805, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7154419422149658, + "num_tokens": 267559383.0, + "step": 10719 + }, + { + "epoch": 1.1772457720184493, + "grad_norm": 2.265284538269043, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7069485783576965, + "num_tokens": 267585013.0, + "step": 10720 + }, + { + "epoch": 1.177355589721063, + "grad_norm": 2.3737831115722656, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7227503061294556, + "num_tokens": 267607010.0, + "step": 10721 + }, + { + "epoch": 1.1774654074236768, + "grad_norm": 2.3182873725891113, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.6975778341293335, + "num_tokens": 267633086.0, + "step": 10722 + }, + { + "epoch": 1.1775752251262903, + "grad_norm": 2.278203010559082, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7408447265625, + "num_tokens": 267659864.0, + "step": 10723 + }, + { + "epoch": 1.177685042828904, + "grad_norm": 
2.1337296962738037, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.727981448173523, + "num_tokens": 267687643.0, + "step": 10724 + }, + { + "epoch": 1.1777948605315176, + "grad_norm": 2.425062894821167, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7262653708457947, + "num_tokens": 267710428.0, + "step": 10725 + }, + { + "epoch": 1.1779046782341314, + "grad_norm": 2.667884588241577, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.733226478099823, + "num_tokens": 267729684.0, + "step": 10726 + }, + { + "epoch": 1.1780144959367451, + "grad_norm": 2.346613645553589, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7037467956542969, + "num_tokens": 267756499.0, + "step": 10727 + }, + { + "epoch": 1.1781243136393587, + "grad_norm": 2.11446213722229, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7115024328231812, + "num_tokens": 267784705.0, + "step": 10728 + }, + { + "epoch": 1.1782341313419724, + "grad_norm": 2.1759531497955322, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7618688344955444, + "num_tokens": 267810074.0, + "step": 10729 + }, + { + "epoch": 1.178343949044586, + "grad_norm": 2.1349120140075684, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7534456253051758, + "num_tokens": 267834494.0, + "step": 10730 + }, + { + "epoch": 1.1784537667471997, + "grad_norm": 2.213618278503418, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7264779806137085, + "num_tokens": 267860962.0, + "step": 10731 + }, + { + "epoch": 1.1785635844498132, + "grad_norm": 2.301116943359375, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7124441266059875, + "num_tokens": 267886149.0, + "step": 10732 + }, + { + "epoch": 1.178673402152427, + "grad_norm": 2.2128007411956787, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7052936553955078, + "num_tokens": 
267914521.0, + "step": 10733 + }, + { + "epoch": 1.1787832198550405, + "grad_norm": 2.3174355030059814, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7156391143798828, + "num_tokens": 267939800.0, + "step": 10734 + }, + { + "epoch": 1.1788930375576543, + "grad_norm": 2.3461978435516357, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7069306373596191, + "num_tokens": 267964420.0, + "step": 10735 + }, + { + "epoch": 1.179002855260268, + "grad_norm": 2.490797281265259, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7319344878196716, + "num_tokens": 267987814.0, + "step": 10736 + }, + { + "epoch": 1.1791126729628816, + "grad_norm": 2.288778066635132, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7230509519577026, + "num_tokens": 268011694.0, + "step": 10737 + }, + { + "epoch": 1.1792224906654953, + "grad_norm": 2.7502124309539795, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7207053899765015, + "num_tokens": 268030647.0, + "step": 10738 + }, + { + "epoch": 1.1793323083681089, + "grad_norm": 2.0761537551879883, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7141784429550171, + "num_tokens": 268058632.0, + "step": 10739 + }, + { + "epoch": 1.1794421260707226, + "grad_norm": 1.9466763734817505, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7164624929428101, + "num_tokens": 268088773.0, + "step": 10740 + }, + { + "epoch": 1.1795519437733364, + "grad_norm": 2.140315294265747, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7009227275848389, + "num_tokens": 268117784.0, + "step": 10741 + }, + { + "epoch": 1.17966176147595, + "grad_norm": 2.444988489151001, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7019547820091248, + "num_tokens": 268142275.0, + "step": 10742 + }, + { + "epoch": 1.1797715791785637, + "grad_norm": 2.3623287677764893, + "learning_rate": 
1e-06, + "loss": 0.7996, + "mean_token_accuracy": 0.7427748441696167, + "num_tokens": 268163885.0, + "step": 10743 + }, + { + "epoch": 1.1798813968811772, + "grad_norm": 2.457350969314575, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7124136090278625, + "num_tokens": 268187582.0, + "step": 10744 + }, + { + "epoch": 1.179991214583791, + "grad_norm": 2.6198043823242188, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7235819697380066, + "num_tokens": 268206421.0, + "step": 10745 + }, + { + "epoch": 1.1801010322864045, + "grad_norm": 2.4340932369232178, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7347919940948486, + "num_tokens": 268228315.0, + "step": 10746 + }, + { + "epoch": 1.1802108499890183, + "grad_norm": 1.9881218671798706, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7006583213806152, + "num_tokens": 268261687.0, + "step": 10747 + }, + { + "epoch": 1.1803206676916318, + "grad_norm": 1.9384989738464355, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7225698232650757, + "num_tokens": 268293239.0, + "step": 10748 + }, + { + "epoch": 1.1804304853942456, + "grad_norm": 2.5609548091888428, + "learning_rate": 1e-06, + "loss": 0.7885, + "mean_token_accuracy": 0.7526496052742004, + "num_tokens": 268311553.0, + "step": 10749 + }, + { + "epoch": 1.1805403030968593, + "grad_norm": 2.373199701309204, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7153533697128296, + "num_tokens": 268335108.0, + "step": 10750 + }, + { + "epoch": 1.1806501207994728, + "grad_norm": 1.9608888626098633, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.69858717918396, + "num_tokens": 268367684.0, + "step": 10751 + }, + { + "epoch": 1.1807599385020866, + "grad_norm": 2.4874444007873535, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7367362380027771, + "num_tokens": 268389653.0, + "step": 10752 + }, + { 
+ "epoch": 1.1808697562047001, + "grad_norm": 2.2948148250579834, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7100926637649536, + "num_tokens": 268413701.0, + "step": 10753 + }, + { + "epoch": 1.180979573907314, + "grad_norm": 2.254403591156006, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.702284574508667, + "num_tokens": 268441213.0, + "step": 10754 + }, + { + "epoch": 1.1810893916099274, + "grad_norm": 2.1161949634552, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7079157829284668, + "num_tokens": 268470165.0, + "step": 10755 + }, + { + "epoch": 1.1811992093125412, + "grad_norm": 2.4231929779052734, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7271522879600525, + "num_tokens": 268495854.0, + "step": 10756 + }, + { + "epoch": 1.1813090270151547, + "grad_norm": 2.138288974761963, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7248371839523315, + "num_tokens": 268523746.0, + "step": 10757 + }, + { + "epoch": 1.1814188447177685, + "grad_norm": 2.327460527420044, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7375134229660034, + "num_tokens": 268548641.0, + "step": 10758 + }, + { + "epoch": 1.1815286624203822, + "grad_norm": 1.9743553400039673, + "learning_rate": 1e-06, + "loss": 0.8474, + "mean_token_accuracy": 0.7335991859436035, + "num_tokens": 268578267.0, + "step": 10759 + }, + { + "epoch": 1.1816384801229958, + "grad_norm": 2.274402141571045, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6910001039505005, + "num_tokens": 268604392.0, + "step": 10760 + }, + { + "epoch": 1.1817482978256095, + "grad_norm": 2.205976724624634, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7054462432861328, + "num_tokens": 268631746.0, + "step": 10761 + }, + { + "epoch": 1.181858115528223, + "grad_norm": 1.9816464185714722, + "learning_rate": 1e-06, + "loss": 0.9044, + 
"mean_token_accuracy": 0.717235803604126, + "num_tokens": 268663704.0, + "step": 10762 + }, + { + "epoch": 1.1819679332308368, + "grad_norm": 2.5086827278137207, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7209080457687378, + "num_tokens": 268686623.0, + "step": 10763 + }, + { + "epoch": 1.1820777509334506, + "grad_norm": 2.2441611289978027, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7254199385643005, + "num_tokens": 268710505.0, + "step": 10764 + }, + { + "epoch": 1.1821875686360641, + "grad_norm": 2.375419855117798, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7093111276626587, + "num_tokens": 268734430.0, + "step": 10765 + }, + { + "epoch": 1.1822973863386779, + "grad_norm": 2.3717148303985596, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7318212985992432, + "num_tokens": 268756441.0, + "step": 10766 + }, + { + "epoch": 1.1824072040412914, + "grad_norm": 2.513005495071411, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7333686947822571, + "num_tokens": 268776880.0, + "step": 10767 + }, + { + "epoch": 1.1825170217439052, + "grad_norm": 2.4561355113983154, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7101426124572754, + "num_tokens": 268798080.0, + "step": 10768 + }, + { + "epoch": 1.1826268394465187, + "grad_norm": 2.301438331604004, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7292332649230957, + "num_tokens": 268820657.0, + "step": 10769 + }, + { + "epoch": 1.1827366571491325, + "grad_norm": 2.183957576751709, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7172209620475769, + "num_tokens": 268846557.0, + "step": 10770 + }, + { + "epoch": 1.182846474851746, + "grad_norm": 2.2928924560546875, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7305442094802856, + "num_tokens": 268871152.0, + "step": 10771 + }, + { + "epoch": 
1.1829562925543597, + "grad_norm": 2.006516218185425, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7289005517959595, + "num_tokens": 268900540.0, + "step": 10772 + }, + { + "epoch": 1.1830661102569735, + "grad_norm": 2.540297746658325, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7100842595100403, + "num_tokens": 268920260.0, + "step": 10773 + }, + { + "epoch": 1.183175927959587, + "grad_norm": 2.282055139541626, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7143206000328064, + "num_tokens": 268949106.0, + "step": 10774 + }, + { + "epoch": 1.1832857456622008, + "grad_norm": 2.4135639667510986, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7056899070739746, + "num_tokens": 268971786.0, + "step": 10775 + }, + { + "epoch": 1.1833955633648143, + "grad_norm": 1.9054934978485107, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.6942598819732666, + "num_tokens": 269006992.0, + "step": 10776 + }, + { + "epoch": 1.183505381067428, + "grad_norm": 2.355591297149658, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.710404634475708, + "num_tokens": 269030763.0, + "step": 10777 + }, + { + "epoch": 1.1836151987700418, + "grad_norm": 2.3938989639282227, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7025012969970703, + "num_tokens": 269054224.0, + "step": 10778 + }, + { + "epoch": 1.1837250164726554, + "grad_norm": 2.2408440113067627, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7280006408691406, + "num_tokens": 269079970.0, + "step": 10779 + }, + { + "epoch": 1.1838348341752691, + "grad_norm": 2.0481934547424316, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7248295545578003, + "num_tokens": 269110150.0, + "step": 10780 + }, + { + "epoch": 1.1839446518778827, + "grad_norm": 2.399060010910034, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 
0.7019035816192627, + "num_tokens": 269134186.0, + "step": 10781 + }, + { + "epoch": 1.1840544695804964, + "grad_norm": 2.0183732509613037, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6905183792114258, + "num_tokens": 269167849.0, + "step": 10782 + }, + { + "epoch": 1.18416428728311, + "grad_norm": 2.308777332305908, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6946576833724976, + "num_tokens": 269194167.0, + "step": 10783 + }, + { + "epoch": 1.1842741049857237, + "grad_norm": 2.5982980728149414, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7084287405014038, + "num_tokens": 269214814.0, + "step": 10784 + }, + { + "epoch": 1.1843839226883373, + "grad_norm": 2.1566083431243896, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7241193652153015, + "num_tokens": 269241217.0, + "step": 10785 + }, + { + "epoch": 1.184493740390951, + "grad_norm": 2.463517904281616, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.720149576663971, + "num_tokens": 269262395.0, + "step": 10786 + }, + { + "epoch": 1.1846035580935648, + "grad_norm": 1.971439003944397, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7273963689804077, + "num_tokens": 269295070.0, + "step": 10787 + }, + { + "epoch": 1.1847133757961783, + "grad_norm": 2.2607531547546387, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7153627872467041, + "num_tokens": 269320602.0, + "step": 10788 + }, + { + "epoch": 1.184823193498792, + "grad_norm": 2.3524012565612793, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7059062123298645, + "num_tokens": 269345477.0, + "step": 10789 + }, + { + "epoch": 1.1849330112014056, + "grad_norm": 2.402052640914917, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7335228323936462, + "num_tokens": 269367984.0, + "step": 10790 + }, + { + "epoch": 1.1850428289040194, + "grad_norm": 
2.071472406387329, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7157106399536133, + "num_tokens": 269397137.0, + "step": 10791 + }, + { + "epoch": 1.185152646606633, + "grad_norm": 2.3853073120117188, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7291483879089355, + "num_tokens": 269419219.0, + "step": 10792 + }, + { + "epoch": 1.1852624643092466, + "grad_norm": 2.2414772510528564, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7066076397895813, + "num_tokens": 269444421.0, + "step": 10793 + }, + { + "epoch": 1.1853722820118604, + "grad_norm": 2.390880584716797, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7130323648452759, + "num_tokens": 269467251.0, + "step": 10794 + }, + { + "epoch": 1.185482099714474, + "grad_norm": 2.388986825942993, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7257254123687744, + "num_tokens": 269492694.0, + "step": 10795 + }, + { + "epoch": 1.1855919174170877, + "grad_norm": 2.193253517150879, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7146153450012207, + "num_tokens": 269518114.0, + "step": 10796 + }, + { + "epoch": 1.1857017351197012, + "grad_norm": 2.3568179607391357, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.729157567024231, + "num_tokens": 269542608.0, + "step": 10797 + }, + { + "epoch": 1.185811552822315, + "grad_norm": 2.149322509765625, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7256582975387573, + "num_tokens": 269569750.0, + "step": 10798 + }, + { + "epoch": 1.1859213705249285, + "grad_norm": 2.3612000942230225, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7042311429977417, + "num_tokens": 269594487.0, + "step": 10799 + }, + { + "epoch": 1.1860311882275423, + "grad_norm": 2.3579070568084717, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7212774157524109, + "num_tokens": 
269619674.0, + "step": 10800 + }, + { + "epoch": 1.186141005930156, + "grad_norm": 2.1906347274780273, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7329262495040894, + "num_tokens": 269646562.0, + "step": 10801 + }, + { + "epoch": 1.1862508236327696, + "grad_norm": 2.194525957107544, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6978715658187866, + "num_tokens": 269676607.0, + "step": 10802 + }, + { + "epoch": 1.1863606413353833, + "grad_norm": 2.4092531204223633, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7224846482276917, + "num_tokens": 269699516.0, + "step": 10803 + }, + { + "epoch": 1.1864704590379969, + "grad_norm": 2.0582997798919678, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6968706846237183, + "num_tokens": 269729940.0, + "step": 10804 + }, + { + "epoch": 1.1865802767406106, + "grad_norm": 2.330681562423706, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7360226511955261, + "num_tokens": 269754966.0, + "step": 10805 + }, + { + "epoch": 1.1866900944432242, + "grad_norm": 2.4222042560577393, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7351146340370178, + "num_tokens": 269776319.0, + "step": 10806 + }, + { + "epoch": 1.186799912145838, + "grad_norm": 2.033848762512207, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7344178557395935, + "num_tokens": 269807166.0, + "step": 10807 + }, + { + "epoch": 1.1869097298484517, + "grad_norm": 2.261824607849121, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.712412416934967, + "num_tokens": 269832475.0, + "step": 10808 + }, + { + "epoch": 1.1870195475510652, + "grad_norm": 2.556675910949707, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7476348876953125, + "num_tokens": 269853586.0, + "step": 10809 + }, + { + "epoch": 1.187129365253679, + "grad_norm": 2.3428118228912354, + "learning_rate": 
1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7241439819335938, + "num_tokens": 269878332.0, + "step": 10810 + }, + { + "epoch": 1.1872391829562925, + "grad_norm": 2.132234573364258, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7206370234489441, + "num_tokens": 269905935.0, + "step": 10811 + }, + { + "epoch": 1.1873490006589063, + "grad_norm": 2.211254835128784, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7052642107009888, + "num_tokens": 269933613.0, + "step": 10812 + }, + { + "epoch": 1.1874588183615198, + "grad_norm": 2.3577985763549805, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7335264682769775, + "num_tokens": 269957367.0, + "step": 10813 + }, + { + "epoch": 1.1875686360641335, + "grad_norm": 2.257310628890991, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7350548505783081, + "num_tokens": 269981281.0, + "step": 10814 + }, + { + "epoch": 1.1876784537667473, + "grad_norm": 2.5902228355407715, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7195664048194885, + "num_tokens": 270000943.0, + "step": 10815 + }, + { + "epoch": 1.1877882714693608, + "grad_norm": 2.432331085205078, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.741407036781311, + "num_tokens": 270022911.0, + "step": 10816 + }, + { + "epoch": 1.1878980891719746, + "grad_norm": 2.5148372650146484, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7126194834709167, + "num_tokens": 270043934.0, + "step": 10817 + }, + { + "epoch": 1.1880079068745881, + "grad_norm": 2.4028499126434326, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7217636704444885, + "num_tokens": 270068407.0, + "step": 10818 + }, + { + "epoch": 1.1881177245772019, + "grad_norm": 2.3565685749053955, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7240914702415466, + "num_tokens": 270092325.0, + "step": 10819 + }, + { 
+ "epoch": 1.1882275422798154, + "grad_norm": 2.3511905670166016, + "learning_rate": 1e-06, + "loss": 0.8294, + "mean_token_accuracy": 0.7410677671432495, + "num_tokens": 270114866.0, + "step": 10820 + }, + { + "epoch": 1.1883373599824292, + "grad_norm": 1.8810217380523682, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7078649997711182, + "num_tokens": 270149287.0, + "step": 10821 + }, + { + "epoch": 1.1884471776850427, + "grad_norm": 2.1491198539733887, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7227856516838074, + "num_tokens": 270174009.0, + "step": 10822 + }, + { + "epoch": 1.1885569953876565, + "grad_norm": 2.4534623622894287, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7389004230499268, + "num_tokens": 270195566.0, + "step": 10823 + }, + { + "epoch": 1.1886668130902702, + "grad_norm": 2.102146863937378, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7078860402107239, + "num_tokens": 270224271.0, + "step": 10824 + }, + { + "epoch": 1.1887766307928838, + "grad_norm": 2.241844654083252, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7427065968513489, + "num_tokens": 270248007.0, + "step": 10825 + }, + { + "epoch": 1.1888864484954975, + "grad_norm": 2.5455052852630615, + "learning_rate": 1e-06, + "loss": 0.7923, + "mean_token_accuracy": 0.7436468601226807, + "num_tokens": 270267222.0, + "step": 10826 + }, + { + "epoch": 1.188996266198111, + "grad_norm": 2.443000078201294, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7004649639129639, + "num_tokens": 270289359.0, + "step": 10827 + }, + { + "epoch": 1.1891060839007248, + "grad_norm": 2.714946746826172, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7394188642501831, + "num_tokens": 270307864.0, + "step": 10828 + }, + { + "epoch": 1.1892159016033386, + "grad_norm": 2.3433096408843994, + "learning_rate": 1e-06, + "loss": 0.8149, + 
"mean_token_accuracy": 0.7414212822914124, + "num_tokens": 270330155.0, + "step": 10829 + }, + { + "epoch": 1.189325719305952, + "grad_norm": 1.9445648193359375, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7050702571868896, + "num_tokens": 270362546.0, + "step": 10830 + }, + { + "epoch": 1.1894355370085659, + "grad_norm": 1.9580249786376953, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7115880250930786, + "num_tokens": 270395601.0, + "step": 10831 + }, + { + "epoch": 1.1895453547111794, + "grad_norm": 2.365818738937378, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.6921982765197754, + "num_tokens": 270419845.0, + "step": 10832 + }, + { + "epoch": 1.1896551724137931, + "grad_norm": 2.015437364578247, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7165395021438599, + "num_tokens": 270453429.0, + "step": 10833 + }, + { + "epoch": 1.1897649901164067, + "grad_norm": 2.2957983016967773, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7171910405158997, + "num_tokens": 270476922.0, + "step": 10834 + }, + { + "epoch": 1.1898748078190204, + "grad_norm": 2.3597888946533203, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7143090963363647, + "num_tokens": 270500235.0, + "step": 10835 + }, + { + "epoch": 1.189984625521634, + "grad_norm": 2.0917234420776367, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6877906322479248, + "num_tokens": 270532594.0, + "step": 10836 + }, + { + "epoch": 1.1900944432242477, + "grad_norm": 2.3194496631622314, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7233604788780212, + "num_tokens": 270559528.0, + "step": 10837 + }, + { + "epoch": 1.1902042609268615, + "grad_norm": 2.5130863189697266, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7261066436767578, + "num_tokens": 270580508.0, + "step": 10838 + }, + { + "epoch": 
1.190314078629475, + "grad_norm": 2.216991424560547, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.7013972401618958, + "num_tokens": 270605990.0, + "step": 10839 + }, + { + "epoch": 1.1904238963320888, + "grad_norm": 2.1602230072021484, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7110363245010376, + "num_tokens": 270634890.0, + "step": 10840 + }, + { + "epoch": 1.1905337140347023, + "grad_norm": 2.1434743404388428, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7147852182388306, + "num_tokens": 270662138.0, + "step": 10841 + }, + { + "epoch": 1.190643531737316, + "grad_norm": 2.3318779468536377, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7176374197006226, + "num_tokens": 270685487.0, + "step": 10842 + }, + { + "epoch": 1.1907533494399298, + "grad_norm": 2.2874889373779297, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7069827914237976, + "num_tokens": 270709612.0, + "step": 10843 + }, + { + "epoch": 1.1908631671425434, + "grad_norm": 2.134955883026123, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7009785175323486, + "num_tokens": 270738545.0, + "step": 10844 + }, + { + "epoch": 1.1909729848451571, + "grad_norm": 2.047025680541992, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7175719738006592, + "num_tokens": 270768283.0, + "step": 10845 + }, + { + "epoch": 1.1910828025477707, + "grad_norm": 2.109868049621582, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7410977482795715, + "num_tokens": 270793571.0, + "step": 10846 + }, + { + "epoch": 1.1911926202503844, + "grad_norm": 2.1067888736724854, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7251036167144775, + "num_tokens": 270823164.0, + "step": 10847 + }, + { + "epoch": 1.191302437952998, + "grad_norm": 2.1955418586730957, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 
0.7340801954269409, + "num_tokens": 270849776.0, + "step": 10848 + }, + { + "epoch": 1.1914122556556117, + "grad_norm": 2.3611578941345215, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7135998606681824, + "num_tokens": 270873538.0, + "step": 10849 + }, + { + "epoch": 1.1915220733582252, + "grad_norm": 2.2005748748779297, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.741502046585083, + "num_tokens": 270898387.0, + "step": 10850 + }, + { + "epoch": 1.191631891060839, + "grad_norm": 2.1761631965637207, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7064692378044128, + "num_tokens": 270927294.0, + "step": 10851 + }, + { + "epoch": 1.1917417087634528, + "grad_norm": 2.292504072189331, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7133650779724121, + "num_tokens": 270951972.0, + "step": 10852 + }, + { + "epoch": 1.1918515264660663, + "grad_norm": 2.381326198577881, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7314578294754028, + "num_tokens": 270975622.0, + "step": 10853 + }, + { + "epoch": 1.19196134416868, + "grad_norm": 2.3560564517974854, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7153663039207458, + "num_tokens": 270999481.0, + "step": 10854 + }, + { + "epoch": 1.1920711618712936, + "grad_norm": 1.9595682621002197, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7276263236999512, + "num_tokens": 271027998.0, + "step": 10855 + }, + { + "epoch": 1.1921809795739073, + "grad_norm": 2.335932970046997, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7245615720748901, + "num_tokens": 271051867.0, + "step": 10856 + }, + { + "epoch": 1.192290797276521, + "grad_norm": 2.0436630249023438, + "learning_rate": 1e-06, + "loss": 1.031, + "mean_token_accuracy": 0.683007538318634, + "num_tokens": 271082667.0, + "step": 10857 + }, + { + "epoch": 1.1924006149791346, + "grad_norm": 
2.388394594192505, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7140843868255615, + "num_tokens": 271105772.0, + "step": 10858 + }, + { + "epoch": 1.1925104326817484, + "grad_norm": 2.4319610595703125, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7508411407470703, + "num_tokens": 271125858.0, + "step": 10859 + }, + { + "epoch": 1.192620250384362, + "grad_norm": 2.3602960109710693, + "learning_rate": 1e-06, + "loss": 1.0329, + "mean_token_accuracy": 0.6818843483924866, + "num_tokens": 271152605.0, + "step": 10860 + }, + { + "epoch": 1.1927300680869757, + "grad_norm": 2.2818896770477295, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7176495790481567, + "num_tokens": 271177056.0, + "step": 10861 + }, + { + "epoch": 1.1928398857895892, + "grad_norm": 2.1175949573516846, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6997202634811401, + "num_tokens": 271204247.0, + "step": 10862 + }, + { + "epoch": 1.192949703492203, + "grad_norm": 2.48715877532959, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7098298072814941, + "num_tokens": 271225989.0, + "step": 10863 + }, + { + "epoch": 1.1930595211948165, + "grad_norm": 2.361797571182251, + "learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7378323078155518, + "num_tokens": 271248019.0, + "step": 10864 + }, + { + "epoch": 1.1931693388974303, + "grad_norm": 2.152937412261963, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7309277653694153, + "num_tokens": 271273583.0, + "step": 10865 + }, + { + "epoch": 1.193279156600044, + "grad_norm": 2.035719633102417, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7111185193061829, + "num_tokens": 271304247.0, + "step": 10866 + }, + { + "epoch": 1.1933889743026576, + "grad_norm": 2.0660881996154785, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6940577626228333, + "num_tokens": 
271334722.0, + "step": 10867 + }, + { + "epoch": 1.1934987920052713, + "grad_norm": 2.2787179946899414, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7093575596809387, + "num_tokens": 271362659.0, + "step": 10868 + }, + { + "epoch": 1.1936086097078848, + "grad_norm": 2.429241180419922, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7234776020050049, + "num_tokens": 271383410.0, + "step": 10869 + }, + { + "epoch": 1.1937184274104986, + "grad_norm": 2.2727372646331787, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7213577032089233, + "num_tokens": 271409876.0, + "step": 10870 + }, + { + "epoch": 1.1938282451131121, + "grad_norm": 2.2484331130981445, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.6889355778694153, + "num_tokens": 271435194.0, + "step": 10871 + }, + { + "epoch": 1.193938062815726, + "grad_norm": 2.3370704650878906, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.719744861125946, + "num_tokens": 271457609.0, + "step": 10872 + }, + { + "epoch": 1.1940478805183394, + "grad_norm": 2.1640353202819824, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7173670530319214, + "num_tokens": 271484690.0, + "step": 10873 + }, + { + "epoch": 1.1941576982209532, + "grad_norm": 2.0299437046051025, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.7080056667327881, + "num_tokens": 271515700.0, + "step": 10874 + }, + { + "epoch": 1.194267515923567, + "grad_norm": 2.307892322540283, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.708476185798645, + "num_tokens": 271541812.0, + "step": 10875 + }, + { + "epoch": 1.1943773336261805, + "grad_norm": 2.620513439178467, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7078086137771606, + "num_tokens": 271562239.0, + "step": 10876 + }, + { + "epoch": 1.1944871513287942, + "grad_norm": 2.20261287689209, + "learning_rate": 1e-06, 
+ "loss": 0.9018, + "mean_token_accuracy": 0.7242680788040161, + "num_tokens": 271587568.0, + "step": 10877 + }, + { + "epoch": 1.1945969690314078, + "grad_norm": 2.4044711589813232, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7192765474319458, + "num_tokens": 271610564.0, + "step": 10878 + }, + { + "epoch": 1.1947067867340215, + "grad_norm": 2.1080126762390137, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7125480771064758, + "num_tokens": 271639605.0, + "step": 10879 + }, + { + "epoch": 1.1948166044366353, + "grad_norm": 2.400527000427246, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7181582450866699, + "num_tokens": 271663727.0, + "step": 10880 + }, + { + "epoch": 1.1949264221392488, + "grad_norm": 2.203402280807495, + "learning_rate": 1e-06, + "loss": 1.0103, + "mean_token_accuracy": 0.6948591470718384, + "num_tokens": 271692612.0, + "step": 10881 + }, + { + "epoch": 1.1950362398418626, + "grad_norm": 2.237154960632324, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7337561845779419, + "num_tokens": 271715658.0, + "step": 10882 + }, + { + "epoch": 1.1951460575444761, + "grad_norm": 2.543147563934326, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7226821184158325, + "num_tokens": 271736592.0, + "step": 10883 + }, + { + "epoch": 1.1952558752470899, + "grad_norm": 2.3093819618225098, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7161821722984314, + "num_tokens": 271759733.0, + "step": 10884 + }, + { + "epoch": 1.1953656929497034, + "grad_norm": 2.504244089126587, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7133320569992065, + "num_tokens": 271781776.0, + "step": 10885 + }, + { + "epoch": 1.1954755106523172, + "grad_norm": 2.179816246032715, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7204868793487549, + "num_tokens": 271807462.0, + "step": 10886 + }, + { + 
"epoch": 1.1955853283549307, + "grad_norm": 2.8103106021881104, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7340784668922424, + "num_tokens": 271824659.0, + "step": 10887 + }, + { + "epoch": 1.1956951460575445, + "grad_norm": 2.3000192642211914, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7168769836425781, + "num_tokens": 271850189.0, + "step": 10888 + }, + { + "epoch": 1.1958049637601582, + "grad_norm": 2.248863697052002, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7312661409378052, + "num_tokens": 271872975.0, + "step": 10889 + }, + { + "epoch": 1.1959147814627717, + "grad_norm": 2.10463285446167, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7227012515068054, + "num_tokens": 271900055.0, + "step": 10890 + }, + { + "epoch": 1.1960245991653855, + "grad_norm": 2.174236536026001, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7518102526664734, + "num_tokens": 271927757.0, + "step": 10891 + }, + { + "epoch": 1.196134416867999, + "grad_norm": 2.278615713119507, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7484745383262634, + "num_tokens": 271950194.0, + "step": 10892 + }, + { + "epoch": 1.1962442345706128, + "grad_norm": 2.395216226577759, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7263228893280029, + "num_tokens": 271971338.0, + "step": 10893 + }, + { + "epoch": 1.1963540522732266, + "grad_norm": 2.0968093872070312, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7081135511398315, + "num_tokens": 271999656.0, + "step": 10894 + }, + { + "epoch": 1.19646386997584, + "grad_norm": 2.3646304607391357, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.714914083480835, + "num_tokens": 272023875.0, + "step": 10895 + }, + { + "epoch": 1.1965736876784538, + "grad_norm": 2.1868441104888916, + "learning_rate": 1e-06, + "loss": 0.9265, + 
"mean_token_accuracy": 0.7163248658180237, + "num_tokens": 272047904.0, + "step": 10896 + }, + { + "epoch": 1.1966835053810674, + "grad_norm": 2.257570505142212, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7179315090179443, + "num_tokens": 272072684.0, + "step": 10897 + }, + { + "epoch": 1.1967933230836811, + "grad_norm": 2.210458993911743, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7262982130050659, + "num_tokens": 272095815.0, + "step": 10898 + }, + { + "epoch": 1.1969031407862947, + "grad_norm": 1.9926517009735107, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.703107476234436, + "num_tokens": 272128121.0, + "step": 10899 + }, + { + "epoch": 1.1970129584889084, + "grad_norm": 2.1950385570526123, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7270234823226929, + "num_tokens": 272154523.0, + "step": 10900 + }, + { + "epoch": 1.197122776191522, + "grad_norm": 2.145137310028076, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7021452188491821, + "num_tokens": 272180031.0, + "step": 10901 + }, + { + "epoch": 1.1972325938941357, + "grad_norm": 2.3084895610809326, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7557318806648254, + "num_tokens": 272201695.0, + "step": 10902 + }, + { + "epoch": 1.1973424115967495, + "grad_norm": 2.365546226501465, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7368742227554321, + "num_tokens": 272224356.0, + "step": 10903 + }, + { + "epoch": 1.197452229299363, + "grad_norm": 2.476811647415161, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7055321931838989, + "num_tokens": 272246213.0, + "step": 10904 + }, + { + "epoch": 1.1975620470019768, + "grad_norm": 2.4234330654144287, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7239733934402466, + "num_tokens": 272268124.0, + "step": 10905 + }, + { + "epoch": 
1.1976718647045903, + "grad_norm": 2.827688455581665, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7141678929328918, + "num_tokens": 272285818.0, + "step": 10906 + }, + { + "epoch": 1.197781682407204, + "grad_norm": 2.049926996231079, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7329587936401367, + "num_tokens": 272313791.0, + "step": 10907 + }, + { + "epoch": 1.1978915001098178, + "grad_norm": 2.537346363067627, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7361000776290894, + "num_tokens": 272334349.0, + "step": 10908 + }, + { + "epoch": 1.1980013178124314, + "grad_norm": 2.276381731033325, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.72205650806427, + "num_tokens": 272360950.0, + "step": 10909 + }, + { + "epoch": 1.1981111355150451, + "grad_norm": 2.4414381980895996, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7112045884132385, + "num_tokens": 272382611.0, + "step": 10910 + }, + { + "epoch": 1.1982209532176586, + "grad_norm": 2.3167479038238525, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.7345978021621704, + "num_tokens": 272407722.0, + "step": 10911 + }, + { + "epoch": 1.1983307709202724, + "grad_norm": 2.0489792823791504, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7295047044754028, + "num_tokens": 272436187.0, + "step": 10912 + }, + { + "epoch": 1.198440588622886, + "grad_norm": 2.393986701965332, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7261902093887329, + "num_tokens": 272459227.0, + "step": 10913 + }, + { + "epoch": 1.1985504063254997, + "grad_norm": 1.9771864414215088, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.6991233229637146, + "num_tokens": 272490711.0, + "step": 10914 + }, + { + "epoch": 1.1986602240281132, + "grad_norm": 1.9721381664276123, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 
0.704535722732544, + "num_tokens": 272525103.0, + "step": 10915 + }, + { + "epoch": 1.198770041730727, + "grad_norm": 2.286261796951294, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7244493961334229, + "num_tokens": 272549135.0, + "step": 10916 + }, + { + "epoch": 1.1988798594333407, + "grad_norm": 2.2270290851593018, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7232121229171753, + "num_tokens": 272574054.0, + "step": 10917 + }, + { + "epoch": 1.1989896771359543, + "grad_norm": 2.1420369148254395, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7139153480529785, + "num_tokens": 272602349.0, + "step": 10918 + }, + { + "epoch": 1.199099494838568, + "grad_norm": 2.924178123474121, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7269374132156372, + "num_tokens": 272618744.0, + "step": 10919 + }, + { + "epoch": 1.1992093125411816, + "grad_norm": 2.1769702434539795, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7045323252677917, + "num_tokens": 272646407.0, + "step": 10920 + }, + { + "epoch": 1.1993191302437953, + "grad_norm": 2.023346185684204, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.721493661403656, + "num_tokens": 272675045.0, + "step": 10921 + }, + { + "epoch": 1.199428947946409, + "grad_norm": 2.360318422317505, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.723045289516449, + "num_tokens": 272698812.0, + "step": 10922 + }, + { + "epoch": 1.1995387656490226, + "grad_norm": 2.203251838684082, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7156234979629517, + "num_tokens": 272725275.0, + "step": 10923 + }, + { + "epoch": 1.1996485833516364, + "grad_norm": 2.2609496116638184, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7362649440765381, + "num_tokens": 272748631.0, + "step": 10924 + }, + { + "epoch": 1.19975840105425, + "grad_norm": 
2.405728816986084, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7331030964851379, + "num_tokens": 272771884.0, + "step": 10925 + }, + { + "epoch": 1.1998682187568637, + "grad_norm": 2.07425594329834, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7363142967224121, + "num_tokens": 272800654.0, + "step": 10926 + }, + { + "epoch": 1.1999780364594772, + "grad_norm": 2.1268153190612793, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7458088397979736, + "num_tokens": 272828042.0, + "step": 10927 + }, + { + "epoch": 1.200087854162091, + "grad_norm": 2.4054958820343018, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.704578161239624, + "num_tokens": 272852451.0, + "step": 10928 + }, + { + "epoch": 1.2001976718647045, + "grad_norm": 2.0714797973632812, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7091507315635681, + "num_tokens": 272882580.0, + "step": 10929 + }, + { + "epoch": 1.2003074895673183, + "grad_norm": 2.208066463470459, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.6985809206962585, + "num_tokens": 272908765.0, + "step": 10930 + }, + { + "epoch": 1.200417307269932, + "grad_norm": 2.670012950897217, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7373390197753906, + "num_tokens": 272927571.0, + "step": 10931 + }, + { + "epoch": 1.2005271249725455, + "grad_norm": 2.4248783588409424, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7278313636779785, + "num_tokens": 272949750.0, + "step": 10932 + }, + { + "epoch": 1.2006369426751593, + "grad_norm": 2.364351987838745, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.730601966381073, + "num_tokens": 272973097.0, + "step": 10933 + }, + { + "epoch": 1.2007467603777728, + "grad_norm": 2.198504686355591, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7132436037063599, + "num_tokens": 
273000364.0, + "step": 10934 + }, + { + "epoch": 1.2008565780803866, + "grad_norm": 2.757405996322632, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.696117639541626, + "num_tokens": 273023779.0, + "step": 10935 + }, + { + "epoch": 1.2009663957830001, + "grad_norm": 2.141657590866089, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7012123465538025, + "num_tokens": 273052433.0, + "step": 10936 + }, + { + "epoch": 1.2010762134856139, + "grad_norm": 2.5464727878570557, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7350534796714783, + "num_tokens": 273072140.0, + "step": 10937 + }, + { + "epoch": 1.2011860311882274, + "grad_norm": 2.374631643295288, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7161714434623718, + "num_tokens": 273095638.0, + "step": 10938 + }, + { + "epoch": 1.2012958488908412, + "grad_norm": 2.109048366546631, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7326300740242004, + "num_tokens": 273122250.0, + "step": 10939 + }, + { + "epoch": 1.201405666593455, + "grad_norm": 2.1695477962493896, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6957941651344299, + "num_tokens": 273153394.0, + "step": 10940 + }, + { + "epoch": 1.2015154842960685, + "grad_norm": 2.534862518310547, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7184560298919678, + "num_tokens": 273175636.0, + "step": 10941 + }, + { + "epoch": 1.2016253019986822, + "grad_norm": 2.0349843502044678, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7095472812652588, + "num_tokens": 273207975.0, + "step": 10942 + }, + { + "epoch": 1.2017351197012958, + "grad_norm": 2.492032289505005, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7353643178939819, + "num_tokens": 273228134.0, + "step": 10943 + }, + { + "epoch": 1.2018449374039095, + "grad_norm": 2.275968551635742, + "learning_rate": 
1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.729475736618042, + "num_tokens": 273252475.0, + "step": 10944 + }, + { + "epoch": 1.2019547551065233, + "grad_norm": 2.7575221061706543, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7146968841552734, + "num_tokens": 273272066.0, + "step": 10945 + }, + { + "epoch": 1.2020645728091368, + "grad_norm": 2.313563346862793, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7257637977600098, + "num_tokens": 273294874.0, + "step": 10946 + }, + { + "epoch": 1.2021743905117506, + "grad_norm": 2.250121593475342, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7417885065078735, + "num_tokens": 273318751.0, + "step": 10947 + }, + { + "epoch": 1.202284208214364, + "grad_norm": 2.5424625873565674, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7233976125717163, + "num_tokens": 273341137.0, + "step": 10948 + }, + { + "epoch": 1.2023940259169779, + "grad_norm": 2.257718801498413, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7104732394218445, + "num_tokens": 273366735.0, + "step": 10949 + }, + { + "epoch": 1.2025038436195914, + "grad_norm": 2.200392246246338, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7279495596885681, + "num_tokens": 273392228.0, + "step": 10950 + }, + { + "epoch": 1.2026136613222052, + "grad_norm": 2.093005418777466, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7335312366485596, + "num_tokens": 273420033.0, + "step": 10951 + }, + { + "epoch": 1.2027234790248187, + "grad_norm": 2.029341697692871, + "learning_rate": 1e-06, + "loss": 1.043, + "mean_token_accuracy": 0.6803413033485413, + "num_tokens": 273451527.0, + "step": 10952 + }, + { + "epoch": 1.2028332967274324, + "grad_norm": 2.3317251205444336, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7197544574737549, + "num_tokens": 273476562.0, + "step": 10953 + }, + { + 
"epoch": 1.2029431144300462, + "grad_norm": 2.305781841278076, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.742503821849823, + "num_tokens": 273500632.0, + "step": 10954 + }, + { + "epoch": 1.2030529321326597, + "grad_norm": 2.236067771911621, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.730194628238678, + "num_tokens": 273526739.0, + "step": 10955 + }, + { + "epoch": 1.2031627498352735, + "grad_norm": 2.253028631210327, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7137415409088135, + "num_tokens": 273553038.0, + "step": 10956 + }, + { + "epoch": 1.203272567537887, + "grad_norm": 2.168807029724121, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7064917683601379, + "num_tokens": 273580522.0, + "step": 10957 + }, + { + "epoch": 1.2033823852405008, + "grad_norm": 2.038160562515259, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7113512754440308, + "num_tokens": 273611923.0, + "step": 10958 + }, + { + "epoch": 1.2034922029431145, + "grad_norm": 2.326948881149292, + "learning_rate": 1e-06, + "loss": 0.8772, + "mean_token_accuracy": 0.7308283448219299, + "num_tokens": 273635332.0, + "step": 10959 + }, + { + "epoch": 1.203602020645728, + "grad_norm": 2.2615978717803955, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7038577795028687, + "num_tokens": 273659176.0, + "step": 10960 + }, + { + "epoch": 1.2037118383483418, + "grad_norm": 2.282911777496338, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7342565059661865, + "num_tokens": 273684035.0, + "step": 10961 + }, + { + "epoch": 1.2038216560509554, + "grad_norm": 2.183267593383789, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7110111713409424, + "num_tokens": 273712047.0, + "step": 10962 + }, + { + "epoch": 1.2039314737535691, + "grad_norm": 2.4663186073303223, + "learning_rate": 1e-06, + "loss": 0.9261, + 
"mean_token_accuracy": 0.7120423316955566, + "num_tokens": 273733246.0, + "step": 10963 + }, + { + "epoch": 1.2040412914561827, + "grad_norm": 2.409761428833008, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7173126339912415, + "num_tokens": 273755318.0, + "step": 10964 + }, + { + "epoch": 1.2041511091587964, + "grad_norm": 2.367036819458008, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7138803005218506, + "num_tokens": 273778328.0, + "step": 10965 + }, + { + "epoch": 1.20426092686141, + "grad_norm": 2.8839197158813477, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7203808426856995, + "num_tokens": 273796864.0, + "step": 10966 + }, + { + "epoch": 1.2043707445640237, + "grad_norm": 2.4523513317108154, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7242136597633362, + "num_tokens": 273818805.0, + "step": 10967 + }, + { + "epoch": 1.2044805622666375, + "grad_norm": 2.4671733379364014, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7268962860107422, + "num_tokens": 273840497.0, + "step": 10968 + }, + { + "epoch": 1.204590379969251, + "grad_norm": 2.1946723461151123, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7188578844070435, + "num_tokens": 273867436.0, + "step": 10969 + }, + { + "epoch": 1.2047001976718648, + "grad_norm": 2.305171012878418, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.6996035575866699, + "num_tokens": 273891808.0, + "step": 10970 + }, + { + "epoch": 1.2048100153744783, + "grad_norm": 2.357736825942993, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.733004093170166, + "num_tokens": 273915585.0, + "step": 10971 + }, + { + "epoch": 1.204919833077092, + "grad_norm": 2.3319714069366455, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7250112891197205, + "num_tokens": 273939925.0, + "step": 10972 + }, + { + "epoch": 1.2050296507797058, + 
"grad_norm": 2.1333940029144287, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7333517074584961, + "num_tokens": 273967441.0, + "step": 10973 + }, + { + "epoch": 1.2051394684823193, + "grad_norm": 2.7132480144500732, + "learning_rate": 1e-06, + "loss": 0.7209, + "mean_token_accuracy": 0.7648968696594238, + "num_tokens": 273985281.0, + "step": 10974 + }, + { + "epoch": 1.205249286184933, + "grad_norm": 2.550048351287842, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.736731767654419, + "num_tokens": 274003930.0, + "step": 10975 + }, + { + "epoch": 1.2053591038875466, + "grad_norm": 2.192758798599243, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.724008321762085, + "num_tokens": 274029284.0, + "step": 10976 + }, + { + "epoch": 1.2054689215901604, + "grad_norm": 2.303959369659424, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7079096436500549, + "num_tokens": 274056579.0, + "step": 10977 + }, + { + "epoch": 1.205578739292774, + "grad_norm": 2.166433572769165, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7002559900283813, + "num_tokens": 274083553.0, + "step": 10978 + }, + { + "epoch": 1.2056885569953877, + "grad_norm": 2.0679686069488525, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7060016393661499, + "num_tokens": 274114418.0, + "step": 10979 + }, + { + "epoch": 1.2057983746980012, + "grad_norm": 2.2603816986083984, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7222481966018677, + "num_tokens": 274140172.0, + "step": 10980 + }, + { + "epoch": 1.205908192400615, + "grad_norm": 2.000807762145996, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7307767868041992, + "num_tokens": 274169329.0, + "step": 10981 + }, + { + "epoch": 1.2060180101032287, + "grad_norm": 2.175579071044922, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7121937274932861, + 
"num_tokens": 274196016.0, + "step": 10982 + }, + { + "epoch": 1.2061278278058423, + "grad_norm": 2.2325448989868164, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7175278663635254, + "num_tokens": 274221966.0, + "step": 10983 + }, + { + "epoch": 1.206237645508456, + "grad_norm": 2.2755508422851562, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7110128402709961, + "num_tokens": 274245842.0, + "step": 10984 + }, + { + "epoch": 1.2063474632110696, + "grad_norm": 2.371067523956299, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7235535383224487, + "num_tokens": 274269104.0, + "step": 10985 + }, + { + "epoch": 1.2064572809136833, + "grad_norm": 2.34757399559021, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7082662582397461, + "num_tokens": 274292263.0, + "step": 10986 + }, + { + "epoch": 1.2065670986162969, + "grad_norm": 2.3167312145233154, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7253665924072266, + "num_tokens": 274316462.0, + "step": 10987 + }, + { + "epoch": 1.2066769163189106, + "grad_norm": 2.090653657913208, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7351778149604797, + "num_tokens": 274343310.0, + "step": 10988 + }, + { + "epoch": 1.2067867340215244, + "grad_norm": 2.4624180793762207, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7314144372940063, + "num_tokens": 274364010.0, + "step": 10989 + }, + { + "epoch": 1.206896551724138, + "grad_norm": 2.4796102046966553, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7182568311691284, + "num_tokens": 274386268.0, + "step": 10990 + }, + { + "epoch": 1.2070063694267517, + "grad_norm": 2.4411795139312744, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.717769980430603, + "num_tokens": 274408550.0, + "step": 10991 + }, + { + "epoch": 1.2071161871293652, + "grad_norm": 2.324371576309204, + 
"learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.704204797744751, + "num_tokens": 274432010.0, + "step": 10992 + }, + { + "epoch": 1.207226004831979, + "grad_norm": 2.408111333847046, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.729325532913208, + "num_tokens": 274454177.0, + "step": 10993 + }, + { + "epoch": 1.2073358225345925, + "grad_norm": 2.267843723297119, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.720160961151123, + "num_tokens": 274479971.0, + "step": 10994 + }, + { + "epoch": 1.2074456402372062, + "grad_norm": 2.4561092853546143, + "learning_rate": 1e-06, + "loss": 0.8564, + "mean_token_accuracy": 0.7301157712936401, + "num_tokens": 274503817.0, + "step": 10995 + }, + { + "epoch": 1.20755545793982, + "grad_norm": 2.648031711578369, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7372355461120605, + "num_tokens": 274524351.0, + "step": 10996 + }, + { + "epoch": 1.2076652756424335, + "grad_norm": 2.2968106269836426, + "learning_rate": 1e-06, + "loss": 0.9987, + "mean_token_accuracy": 0.7090287804603577, + "num_tokens": 274549589.0, + "step": 10997 + }, + { + "epoch": 1.2077750933450473, + "grad_norm": 2.217135190963745, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7212272882461548, + "num_tokens": 274576070.0, + "step": 10998 + }, + { + "epoch": 1.2078849110476608, + "grad_norm": 2.634697198867798, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7315139770507812, + "num_tokens": 274595866.0, + "step": 10999 + }, + { + "epoch": 1.2079947287502746, + "grad_norm": 2.088487148284912, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7503423690795898, + "num_tokens": 274621256.0, + "step": 11000 + }, + { + "epoch": 1.2081045464528881, + "grad_norm": 2.2674248218536377, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7227834463119507, + "num_tokens": 274645782.0, + "step": 11001 
+ }, + { + "epoch": 1.2082143641555019, + "grad_norm": 2.3111820220947266, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7259521484375, + "num_tokens": 274669224.0, + "step": 11002 + }, + { + "epoch": 1.2083241818581154, + "grad_norm": 2.2009522914886475, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7207465767860413, + "num_tokens": 274697168.0, + "step": 11003 + }, + { + "epoch": 1.2084339995607292, + "grad_norm": 2.3532180786132812, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6924110651016235, + "num_tokens": 274722272.0, + "step": 11004 + }, + { + "epoch": 1.208543817263343, + "grad_norm": 2.419215679168701, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7119837999343872, + "num_tokens": 274746729.0, + "step": 11005 + }, + { + "epoch": 1.2086536349659565, + "grad_norm": 2.2154266834259033, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.7296382188796997, + "num_tokens": 274771796.0, + "step": 11006 + }, + { + "epoch": 1.2087634526685702, + "grad_norm": 2.046914577484131, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7069187760353088, + "num_tokens": 274801186.0, + "step": 11007 + }, + { + "epoch": 1.2088732703711838, + "grad_norm": 2.2321298122406006, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7400784492492676, + "num_tokens": 274825002.0, + "step": 11008 + }, + { + "epoch": 1.2089830880737975, + "grad_norm": 2.1897332668304443, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7158891558647156, + "num_tokens": 274852999.0, + "step": 11009 + }, + { + "epoch": 1.2090929057764113, + "grad_norm": 2.412848949432373, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7314043045043945, + "num_tokens": 274877462.0, + "step": 11010 + }, + { + "epoch": 1.2092027234790248, + "grad_norm": 2.1103315353393555, + "learning_rate": 1e-06, + "loss": 0.8513, + 
"mean_token_accuracy": 0.7373694181442261, + "num_tokens": 274903734.0, + "step": 11011 + }, + { + "epoch": 1.2093125411816386, + "grad_norm": 2.210819721221924, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7216350436210632, + "num_tokens": 274928047.0, + "step": 11012 + }, + { + "epoch": 1.209422358884252, + "grad_norm": 2.2087690830230713, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7116776704788208, + "num_tokens": 274953739.0, + "step": 11013 + }, + { + "epoch": 1.2095321765868658, + "grad_norm": 2.1524150371551514, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7111511826515198, + "num_tokens": 274980999.0, + "step": 11014 + }, + { + "epoch": 1.2096419942894794, + "grad_norm": 2.2464399337768555, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7035894989967346, + "num_tokens": 275006104.0, + "step": 11015 + }, + { + "epoch": 1.2097518119920931, + "grad_norm": 2.2021644115448, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7291025519371033, + "num_tokens": 275031525.0, + "step": 11016 + }, + { + "epoch": 1.2098616296947067, + "grad_norm": 2.4459927082061768, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7524850964546204, + "num_tokens": 275052710.0, + "step": 11017 + }, + { + "epoch": 1.2099714473973204, + "grad_norm": 2.3827645778656006, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7123284339904785, + "num_tokens": 275074647.0, + "step": 11018 + }, + { + "epoch": 1.2100812650999342, + "grad_norm": 1.956188678741455, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7124489545822144, + "num_tokens": 275106471.0, + "step": 11019 + }, + { + "epoch": 1.2101910828025477, + "grad_norm": 2.3394083976745605, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7209513187408447, + "num_tokens": 275132016.0, + "step": 11020 + }, + { + "epoch": 1.2103009005051615, + 
"grad_norm": 2.2622225284576416, + "learning_rate": 1e-06, + "loss": 1.0264, + "mean_token_accuracy": 0.689885139465332, + "num_tokens": 275159203.0, + "step": 11021 + }, + { + "epoch": 1.210410718207775, + "grad_norm": 2.158534288406372, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7451990842819214, + "num_tokens": 275185087.0, + "step": 11022 + }, + { + "epoch": 1.2105205359103888, + "grad_norm": 2.076902389526367, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6967967748641968, + "num_tokens": 275213474.0, + "step": 11023 + }, + { + "epoch": 1.2106303536130025, + "grad_norm": 2.4178340435028076, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7208176851272583, + "num_tokens": 275234956.0, + "step": 11024 + }, + { + "epoch": 1.210740171315616, + "grad_norm": 2.3083584308624268, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7255933284759521, + "num_tokens": 275258570.0, + "step": 11025 + }, + { + "epoch": 1.2108499890182298, + "grad_norm": 2.316714286804199, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7151246070861816, + "num_tokens": 275283593.0, + "step": 11026 + }, + { + "epoch": 1.2109598067208434, + "grad_norm": 2.083099603652954, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7092330455780029, + "num_tokens": 275312394.0, + "step": 11027 + }, + { + "epoch": 1.2110696244234571, + "grad_norm": 2.450457811355591, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7405068874359131, + "num_tokens": 275334143.0, + "step": 11028 + }, + { + "epoch": 1.2111794421260706, + "grad_norm": 2.476691961288452, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7335854768753052, + "num_tokens": 275353992.0, + "step": 11029 + }, + { + "epoch": 1.2112892598286844, + "grad_norm": 2.44857120513916, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7186833620071411, + 
"num_tokens": 275377295.0, + "step": 11030 + }, + { + "epoch": 1.211399077531298, + "grad_norm": 2.019517660140991, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6951664090156555, + "num_tokens": 275408904.0, + "step": 11031 + }, + { + "epoch": 1.2115088952339117, + "grad_norm": 2.423111915588379, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7129783630371094, + "num_tokens": 275432363.0, + "step": 11032 + }, + { + "epoch": 1.2116187129365255, + "grad_norm": 2.316073179244995, + "learning_rate": 1e-06, + "loss": 0.82, + "mean_token_accuracy": 0.7420941591262817, + "num_tokens": 275455420.0, + "step": 11033 + }, + { + "epoch": 1.211728530639139, + "grad_norm": 2.089965343475342, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7034692764282227, + "num_tokens": 275486581.0, + "step": 11034 + }, + { + "epoch": 1.2118383483417527, + "grad_norm": 2.1701080799102783, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7022653818130493, + "num_tokens": 275514118.0, + "step": 11035 + }, + { + "epoch": 1.2119481660443663, + "grad_norm": 2.5812525749206543, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7183165550231934, + "num_tokens": 275535641.0, + "step": 11036 + }, + { + "epoch": 1.21205798374698, + "grad_norm": 2.158345937728882, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7153078317642212, + "num_tokens": 275563057.0, + "step": 11037 + }, + { + "epoch": 1.2121678014495938, + "grad_norm": 2.442598342895508, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7178740501403809, + "num_tokens": 275585880.0, + "step": 11038 + }, + { + "epoch": 1.2122776191522073, + "grad_norm": 2.694211721420288, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7313514947891235, + "num_tokens": 275605193.0, + "step": 11039 + }, + { + "epoch": 1.212387436854821, + "grad_norm": 2.237168073654175, + 
"learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.729215681552887, + "num_tokens": 275630990.0, + "step": 11040 + }, + { + "epoch": 1.2124972545574346, + "grad_norm": 2.4958620071411133, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7224074006080627, + "num_tokens": 275652502.0, + "step": 11041 + }, + { + "epoch": 1.2126070722600484, + "grad_norm": 2.212538957595825, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7077653408050537, + "num_tokens": 275679209.0, + "step": 11042 + }, + { + "epoch": 1.212716889962662, + "grad_norm": 1.9656147956848145, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7302976846694946, + "num_tokens": 275710728.0, + "step": 11043 + }, + { + "epoch": 1.2128267076652757, + "grad_norm": 2.6446332931518555, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7108463048934937, + "num_tokens": 275731570.0, + "step": 11044 + }, + { + "epoch": 1.2129365253678892, + "grad_norm": 2.4588263034820557, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7355918288230896, + "num_tokens": 275751622.0, + "step": 11045 + }, + { + "epoch": 1.213046343070503, + "grad_norm": 2.356423854827881, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7265876531600952, + "num_tokens": 275772940.0, + "step": 11046 + }, + { + "epoch": 1.2131561607731167, + "grad_norm": 2.0387089252471924, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7090498805046082, + "num_tokens": 275802385.0, + "step": 11047 + }, + { + "epoch": 1.2132659784757303, + "grad_norm": 2.3123607635498047, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7054598331451416, + "num_tokens": 275826818.0, + "step": 11048 + }, + { + "epoch": 1.213375796178344, + "grad_norm": 2.283287286758423, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.728887677192688, + "num_tokens": 275851262.0, + "step": 
11049 + }, + { + "epoch": 1.2134856138809575, + "grad_norm": 3.016096830368042, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7271695733070374, + "num_tokens": 275866300.0, + "step": 11050 + }, + { + "epoch": 1.2135954315835713, + "grad_norm": 2.4196605682373047, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6946002244949341, + "num_tokens": 275890071.0, + "step": 11051 + }, + { + "epoch": 1.2137052492861848, + "grad_norm": 2.2793123722076416, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7175425291061401, + "num_tokens": 275915310.0, + "step": 11052 + }, + { + "epoch": 1.2138150669887986, + "grad_norm": 2.187464952468872, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.6875498294830322, + "num_tokens": 275943738.0, + "step": 11053 + }, + { + "epoch": 1.2139248846914121, + "grad_norm": 1.9907691478729248, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7246742844581604, + "num_tokens": 275973642.0, + "step": 11054 + }, + { + "epoch": 1.2140347023940259, + "grad_norm": 2.064587116241455, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.7126939296722412, + "num_tokens": 276003109.0, + "step": 11055 + }, + { + "epoch": 1.2141445200966396, + "grad_norm": 2.3812618255615234, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7192320823669434, + "num_tokens": 276025556.0, + "step": 11056 + }, + { + "epoch": 1.2142543377992532, + "grad_norm": 2.316267490386963, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7276857495307922, + "num_tokens": 276050074.0, + "step": 11057 + }, + { + "epoch": 1.214364155501867, + "grad_norm": 2.5843265056610107, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7126147747039795, + "num_tokens": 276070771.0, + "step": 11058 + }, + { + "epoch": 1.2144739732044805, + "grad_norm": 2.3901681900024414, + "learning_rate": 1e-06, + "loss": 0.9286, 
+ "mean_token_accuracy": 0.7088217735290527, + "num_tokens": 276094410.0, + "step": 11059 + }, + { + "epoch": 1.2145837909070942, + "grad_norm": 2.181058168411255, + "learning_rate": 1e-06, + "loss": 1.0303, + "mean_token_accuracy": 0.6900514960289001, + "num_tokens": 276121832.0, + "step": 11060 + }, + { + "epoch": 1.214693608609708, + "grad_norm": 2.4523210525512695, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.729091465473175, + "num_tokens": 276144676.0, + "step": 11061 + }, + { + "epoch": 1.2148034263123215, + "grad_norm": 2.3451735973358154, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7128005027770996, + "num_tokens": 276168239.0, + "step": 11062 + }, + { + "epoch": 1.2149132440149353, + "grad_norm": 2.290020227432251, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7277269959449768, + "num_tokens": 276191903.0, + "step": 11063 + }, + { + "epoch": 1.2150230617175488, + "grad_norm": 2.7320642471313477, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7330347299575806, + "num_tokens": 276211954.0, + "step": 11064 + }, + { + "epoch": 1.2151328794201626, + "grad_norm": 2.2273054122924805, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7265743613243103, + "num_tokens": 276238219.0, + "step": 11065 + }, + { + "epoch": 1.215242697122776, + "grad_norm": 2.3867902755737305, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7092084288597107, + "num_tokens": 276261652.0, + "step": 11066 + }, + { + "epoch": 1.2153525148253899, + "grad_norm": 2.5809273719787598, + "learning_rate": 1e-06, + "loss": 0.9816, + "mean_token_accuracy": 0.7079981565475464, + "num_tokens": 276282155.0, + "step": 11067 + }, + { + "epoch": 1.2154623325280034, + "grad_norm": 2.5059635639190674, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7365953922271729, + "num_tokens": 276304978.0, + "step": 11068 + }, + { + "epoch": 
1.2155721502306172, + "grad_norm": 2.2640209197998047, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.7049744129180908, + "num_tokens": 276331907.0, + "step": 11069 + }, + { + "epoch": 1.215681967933231, + "grad_norm": 2.6621057987213135, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7385464310646057, + "num_tokens": 276350397.0, + "step": 11070 + }, + { + "epoch": 1.2157917856358444, + "grad_norm": 2.1550076007843018, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7078391313552856, + "num_tokens": 276378395.0, + "step": 11071 + }, + { + "epoch": 1.2159016033384582, + "grad_norm": 2.0726842880249023, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7143882513046265, + "num_tokens": 276407174.0, + "step": 11072 + }, + { + "epoch": 1.2160114210410717, + "grad_norm": 2.420415163040161, + "learning_rate": 1e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7385150194168091, + "num_tokens": 276428566.0, + "step": 11073 + }, + { + "epoch": 1.2161212387436855, + "grad_norm": 2.215665578842163, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7165831327438354, + "num_tokens": 276455565.0, + "step": 11074 + }, + { + "epoch": 1.2162310564462993, + "grad_norm": 2.591665744781494, + "learning_rate": 1e-06, + "loss": 0.8188, + "mean_token_accuracy": 0.7379575371742249, + "num_tokens": 276476966.0, + "step": 11075 + }, + { + "epoch": 1.2163408741489128, + "grad_norm": 2.331463575363159, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7183995246887207, + "num_tokens": 276500734.0, + "step": 11076 + }, + { + "epoch": 1.2164506918515265, + "grad_norm": 2.514955520629883, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7415269017219543, + "num_tokens": 276521711.0, + "step": 11077 + }, + { + "epoch": 1.21656050955414, + "grad_norm": 2.1738219261169434, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 
0.7289358973503113, + "num_tokens": 276547345.0, + "step": 11078 + }, + { + "epoch": 1.2166703272567538, + "grad_norm": 2.1614797115325928, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7351992130279541, + "num_tokens": 276572989.0, + "step": 11079 + }, + { + "epoch": 1.2167801449593674, + "grad_norm": 2.789689064025879, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7195086479187012, + "num_tokens": 276592878.0, + "step": 11080 + }, + { + "epoch": 1.2168899626619811, + "grad_norm": 2.411325693130493, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7228634357452393, + "num_tokens": 276618136.0, + "step": 11081 + }, + { + "epoch": 1.2169997803645947, + "grad_norm": 2.623748540878296, + "learning_rate": 1e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.74261474609375, + "num_tokens": 276638368.0, + "step": 11082 + }, + { + "epoch": 1.2171095980672084, + "grad_norm": 2.4257731437683105, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7098803520202637, + "num_tokens": 276662781.0, + "step": 11083 + }, + { + "epoch": 1.2172194157698222, + "grad_norm": 2.21124267578125, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7106654644012451, + "num_tokens": 276689387.0, + "step": 11084 + }, + { + "epoch": 1.2173292334724357, + "grad_norm": 2.3443751335144043, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7087926864624023, + "num_tokens": 276714598.0, + "step": 11085 + }, + { + "epoch": 1.2174390511750495, + "grad_norm": 2.629103183746338, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7209193706512451, + "num_tokens": 276733946.0, + "step": 11086 + }, + { + "epoch": 1.217548868877663, + "grad_norm": 2.4604945182800293, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7183910608291626, + "num_tokens": 276755829.0, + "step": 11087 + }, + { + "epoch": 1.2176586865802768, + "grad_norm": 
2.334312915802002, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7374675273895264, + "num_tokens": 276780552.0, + "step": 11088 + }, + { + "epoch": 1.2177685042828905, + "grad_norm": 2.5093915462493896, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7334762811660767, + "num_tokens": 276800874.0, + "step": 11089 + }, + { + "epoch": 1.217878321985504, + "grad_norm": 2.427088499069214, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7329292297363281, + "num_tokens": 276825082.0, + "step": 11090 + }, + { + "epoch": 1.2179881396881178, + "grad_norm": 2.4958560466766357, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7211141586303711, + "num_tokens": 276845683.0, + "step": 11091 + }, + { + "epoch": 1.2180979573907313, + "grad_norm": 2.4003825187683105, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7290294170379639, + "num_tokens": 276867201.0, + "step": 11092 + }, + { + "epoch": 1.218207775093345, + "grad_norm": 2.206665277481079, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7109954357147217, + "num_tokens": 276893628.0, + "step": 11093 + }, + { + "epoch": 1.2183175927959586, + "grad_norm": 1.8626519441604614, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7024509310722351, + "num_tokens": 276928018.0, + "step": 11094 + }, + { + "epoch": 1.2184274104985724, + "grad_norm": 2.5580756664276123, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7281314730644226, + "num_tokens": 276948242.0, + "step": 11095 + }, + { + "epoch": 1.218537228201186, + "grad_norm": 2.43157958984375, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7092652916908264, + "num_tokens": 276971611.0, + "step": 11096 + }, + { + "epoch": 1.2186470459037997, + "grad_norm": 2.400690793991089, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.707815408706665, + "num_tokens": 
276995645.0, + "step": 11097 + }, + { + "epoch": 1.2187568636064134, + "grad_norm": 2.524867057800293, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7121084928512573, + "num_tokens": 277021184.0, + "step": 11098 + }, + { + "epoch": 1.218866681309027, + "grad_norm": 2.4938457012176514, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.732081413269043, + "num_tokens": 277041506.0, + "step": 11099 + }, + { + "epoch": 1.2189764990116407, + "grad_norm": 2.542152166366577, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7324278950691223, + "num_tokens": 277063688.0, + "step": 11100 + }, + { + "epoch": 1.2190863167142543, + "grad_norm": 2.4375901222229004, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7114111185073853, + "num_tokens": 277086990.0, + "step": 11101 + }, + { + "epoch": 1.219196134416868, + "grad_norm": 2.3564701080322266, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7237694263458252, + "num_tokens": 277111048.0, + "step": 11102 + }, + { + "epoch": 1.2193059521194818, + "grad_norm": 2.3358473777770996, + "learning_rate": 1e-06, + "loss": 0.94, + "mean_token_accuracy": 0.6999662518501282, + "num_tokens": 277137076.0, + "step": 11103 + }, + { + "epoch": 1.2194157698220953, + "grad_norm": 2.35498046875, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7137483358383179, + "num_tokens": 277160075.0, + "step": 11104 + }, + { + "epoch": 1.219525587524709, + "grad_norm": 2.2534291744232178, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6919601559638977, + "num_tokens": 277184303.0, + "step": 11105 + }, + { + "epoch": 1.2196354052273226, + "grad_norm": 2.4730453491210938, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.6998903751373291, + "num_tokens": 277206645.0, + "step": 11106 + }, + { + "epoch": 1.2197452229299364, + "grad_norm": 2.027144432067871, + "learning_rate": 1e-06, + 
"loss": 0.9178, + "mean_token_accuracy": 0.7183955311775208, + "num_tokens": 277235728.0, + "step": 11107 + }, + { + "epoch": 1.21985504063255, + "grad_norm": 2.45600962638855, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7101761698722839, + "num_tokens": 277259349.0, + "step": 11108 + }, + { + "epoch": 1.2199648583351637, + "grad_norm": 2.1422154903411865, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7097921371459961, + "num_tokens": 277289325.0, + "step": 11109 + }, + { + "epoch": 1.2200746760377772, + "grad_norm": 2.605356216430664, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.723219633102417, + "num_tokens": 277309329.0, + "step": 11110 + }, + { + "epoch": 1.220184493740391, + "grad_norm": 2.4256222248077393, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7255204916000366, + "num_tokens": 277331557.0, + "step": 11111 + }, + { + "epoch": 1.2202943114430047, + "grad_norm": 2.687307834625244, + "learning_rate": 1e-06, + "loss": 0.9831, + "mean_token_accuracy": 0.6961159706115723, + "num_tokens": 277352365.0, + "step": 11112 + }, + { + "epoch": 1.2204041291456182, + "grad_norm": 2.272381544113159, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7357882261276245, + "num_tokens": 277377317.0, + "step": 11113 + }, + { + "epoch": 1.220513946848232, + "grad_norm": 2.2195730209350586, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.704879879951477, + "num_tokens": 277405814.0, + "step": 11114 + }, + { + "epoch": 1.2206237645508455, + "grad_norm": 2.2405893802642822, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.715050458908081, + "num_tokens": 277432832.0, + "step": 11115 + }, + { + "epoch": 1.2207335822534593, + "grad_norm": 2.586465835571289, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7197311520576477, + "num_tokens": 277452495.0, + "step": 11116 + }, + { + "epoch": 
1.2208433999560728, + "grad_norm": 2.2282214164733887, + "learning_rate": 1e-06, + "loss": 0.8674, + "mean_token_accuracy": 0.7279098033905029, + "num_tokens": 277478583.0, + "step": 11117 + }, + { + "epoch": 1.2209532176586866, + "grad_norm": 2.684889316558838, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7195416688919067, + "num_tokens": 277499431.0, + "step": 11118 + }, + { + "epoch": 1.2210630353613001, + "grad_norm": 2.360358953475952, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7348825931549072, + "num_tokens": 277522636.0, + "step": 11119 + }, + { + "epoch": 1.2211728530639139, + "grad_norm": 2.512892723083496, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7065630555152893, + "num_tokens": 277544478.0, + "step": 11120 + }, + { + "epoch": 1.2212826707665276, + "grad_norm": 2.043820858001709, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7220792770385742, + "num_tokens": 277575111.0, + "step": 11121 + }, + { + "epoch": 1.2213924884691412, + "grad_norm": 2.1796023845672607, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7032598257064819, + "num_tokens": 277602290.0, + "step": 11122 + }, + { + "epoch": 1.221502306171755, + "grad_norm": 2.2789368629455566, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7273210287094116, + "num_tokens": 277627546.0, + "step": 11123 + }, + { + "epoch": 1.2216121238743685, + "grad_norm": 2.256195306777954, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7096220254898071, + "num_tokens": 277653731.0, + "step": 11124 + }, + { + "epoch": 1.2217219415769822, + "grad_norm": 2.3631629943847656, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.698283314704895, + "num_tokens": 277677826.0, + "step": 11125 + }, + { + "epoch": 1.221831759279596, + "grad_norm": 2.1118621826171875, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 
0.7484568357467651, + "num_tokens": 277706019.0, + "step": 11126 + }, + { + "epoch": 1.2219415769822095, + "grad_norm": 2.20489764213562, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6953067779541016, + "num_tokens": 277734424.0, + "step": 11127 + }, + { + "epoch": 1.2220513946848233, + "grad_norm": 2.336911201477051, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7152599692344666, + "num_tokens": 277759064.0, + "step": 11128 + }, + { + "epoch": 1.2221612123874368, + "grad_norm": 2.1452460289001465, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.706458568572998, + "num_tokens": 277787655.0, + "step": 11129 + }, + { + "epoch": 1.2222710300900506, + "grad_norm": 2.5858774185180664, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7146158814430237, + "num_tokens": 277809180.0, + "step": 11130 + }, + { + "epoch": 1.222380847792664, + "grad_norm": 2.5002660751342773, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7259512543678284, + "num_tokens": 277831031.0, + "step": 11131 + }, + { + "epoch": 1.2224906654952779, + "grad_norm": 2.2679831981658936, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6950068473815918, + "num_tokens": 277856672.0, + "step": 11132 + }, + { + "epoch": 1.2226004831978914, + "grad_norm": 2.228878974914551, + "learning_rate": 1e-06, + "loss": 0.829, + "mean_token_accuracy": 0.7363479733467102, + "num_tokens": 277881758.0, + "step": 11133 + }, + { + "epoch": 1.2227103009005051, + "grad_norm": 2.4602279663085938, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7256736755371094, + "num_tokens": 277903616.0, + "step": 11134 + }, + { + "epoch": 1.222820118603119, + "grad_norm": 2.3904504776000977, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.730840265750885, + "num_tokens": 277927107.0, + "step": 11135 + }, + { + "epoch": 1.2229299363057324, + "grad_norm": 
2.1758549213409424, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7026312351226807, + "num_tokens": 277955973.0, + "step": 11136 + }, + { + "epoch": 1.2230397540083462, + "grad_norm": 1.8751362562179565, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7241654992103577, + "num_tokens": 277988179.0, + "step": 11137 + }, + { + "epoch": 1.2231495717109597, + "grad_norm": 1.909940242767334, + "learning_rate": 1e-06, + "loss": 1.0109, + "mean_token_accuracy": 0.6889201402664185, + "num_tokens": 278022801.0, + "step": 11138 + }, + { + "epoch": 1.2232593894135735, + "grad_norm": 2.3292105197906494, + "learning_rate": 1e-06, + "loss": 0.981, + "mean_token_accuracy": 0.7018474340438843, + "num_tokens": 278046092.0, + "step": 11139 + }, + { + "epoch": 1.2233692071161872, + "grad_norm": 2.200242042541504, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.6993486881256104, + "num_tokens": 278072418.0, + "step": 11140 + }, + { + "epoch": 1.2234790248188008, + "grad_norm": 2.351823329925537, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7220987677574158, + "num_tokens": 278094583.0, + "step": 11141 + }, + { + "epoch": 1.2235888425214145, + "grad_norm": 2.1316685676574707, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7028169631958008, + "num_tokens": 278122122.0, + "step": 11142 + }, + { + "epoch": 1.223698660224028, + "grad_norm": 2.3952982425689697, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7135854959487915, + "num_tokens": 278145002.0, + "step": 11143 + }, + { + "epoch": 1.2238084779266418, + "grad_norm": 2.256012439727783, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7285386323928833, + "num_tokens": 278170869.0, + "step": 11144 + }, + { + "epoch": 1.2239182956292554, + "grad_norm": 2.297017812728882, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7193717956542969, + "num_tokens": 
278196550.0, + "step": 11145 + }, + { + "epoch": 1.2240281133318691, + "grad_norm": 2.5188984870910645, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7255469560623169, + "num_tokens": 278219677.0, + "step": 11146 + }, + { + "epoch": 1.2241379310344827, + "grad_norm": 2.0515356063842773, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7376459836959839, + "num_tokens": 278246675.0, + "step": 11147 + }, + { + "epoch": 1.2242477487370964, + "grad_norm": 2.2704761028289795, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.745682418346405, + "num_tokens": 278270435.0, + "step": 11148 + }, + { + "epoch": 1.2243575664397102, + "grad_norm": 2.4353835582733154, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7150548696517944, + "num_tokens": 278295240.0, + "step": 11149 + }, + { + "epoch": 1.2244673841423237, + "grad_norm": 2.219911575317383, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7196322679519653, + "num_tokens": 278321021.0, + "step": 11150 + }, + { + "epoch": 1.2245772018449375, + "grad_norm": 2.4251043796539307, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7057337760925293, + "num_tokens": 278344842.0, + "step": 11151 + }, + { + "epoch": 1.224687019547551, + "grad_norm": 2.095214605331421, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7181184887886047, + "num_tokens": 278373760.0, + "step": 11152 + }, + { + "epoch": 1.2247968372501647, + "grad_norm": 2.336121082305908, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7278147339820862, + "num_tokens": 278397687.0, + "step": 11153 + }, + { + "epoch": 1.2249066549527785, + "grad_norm": 2.168314218521118, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7228208780288696, + "num_tokens": 278424069.0, + "step": 11154 + }, + { + "epoch": 1.225016472655392, + "grad_norm": 2.1139183044433594, + "learning_rate": 
1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7167394161224365, + "num_tokens": 278453107.0, + "step": 11155 + }, + { + "epoch": 1.2251262903580058, + "grad_norm": 2.6227502822875977, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7181125283241272, + "num_tokens": 278471112.0, + "step": 11156 + }, + { + "epoch": 1.2252361080606193, + "grad_norm": 2.2931504249572754, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.6994093060493469, + "num_tokens": 278496514.0, + "step": 11157 + }, + { + "epoch": 1.225345925763233, + "grad_norm": 2.2474007606506348, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7176215648651123, + "num_tokens": 278522850.0, + "step": 11158 + }, + { + "epoch": 1.2254557434658466, + "grad_norm": 2.181626558303833, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7108958959579468, + "num_tokens": 278548330.0, + "step": 11159 + }, + { + "epoch": 1.2255655611684604, + "grad_norm": 2.359257459640503, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7339568138122559, + "num_tokens": 278571594.0, + "step": 11160 + }, + { + "epoch": 1.225675378871074, + "grad_norm": 2.314399480819702, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7342160940170288, + "num_tokens": 278594107.0, + "step": 11161 + }, + { + "epoch": 1.2257851965736877, + "grad_norm": 2.202040433883667, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7170563340187073, + "num_tokens": 278619341.0, + "step": 11162 + }, + { + "epoch": 1.2258950142763014, + "grad_norm": 2.3217673301696777, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7249841690063477, + "num_tokens": 278643176.0, + "step": 11163 + }, + { + "epoch": 1.226004831978915, + "grad_norm": 2.47981858253479, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.7366232872009277, + "num_tokens": 278663811.0, + "step": 11164 + }, + { + 
"epoch": 1.2261146496815287, + "grad_norm": 2.5101559162139893, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7130432724952698, + "num_tokens": 278684613.0, + "step": 11165 + }, + { + "epoch": 1.2262244673841423, + "grad_norm": 2.1343557834625244, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7163800001144409, + "num_tokens": 278714313.0, + "step": 11166 + }, + { + "epoch": 1.226334285086756, + "grad_norm": 2.628619909286499, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7231588363647461, + "num_tokens": 278736177.0, + "step": 11167 + }, + { + "epoch": 1.2264441027893696, + "grad_norm": 2.050421953201294, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7201476097106934, + "num_tokens": 278765045.0, + "step": 11168 + }, + { + "epoch": 1.2265539204919833, + "grad_norm": 2.039395809173584, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7047563791275024, + "num_tokens": 278794134.0, + "step": 11169 + }, + { + "epoch": 1.226663738194597, + "grad_norm": 2.2114455699920654, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7100527286529541, + "num_tokens": 278820291.0, + "step": 11170 + }, + { + "epoch": 1.2267735558972106, + "grad_norm": 2.380753993988037, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7228667736053467, + "num_tokens": 278844005.0, + "step": 11171 + }, + { + "epoch": 1.2268833735998244, + "grad_norm": 2.2676587104797363, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7165316939353943, + "num_tokens": 278871015.0, + "step": 11172 + }, + { + "epoch": 1.226993191302438, + "grad_norm": 2.095266103744507, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6961387991905212, + "num_tokens": 278901108.0, + "step": 11173 + }, + { + "epoch": 1.2271030090050516, + "grad_norm": 2.272998809814453, + "learning_rate": 1e-06, + "loss": 0.9138, + 
"mean_token_accuracy": 0.7172194719314575, + "num_tokens": 278927410.0, + "step": 11174 + }, + { + "epoch": 1.2272128267076652, + "grad_norm": 2.3822505474090576, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7137657403945923, + "num_tokens": 278952691.0, + "step": 11175 + }, + { + "epoch": 1.227322644410279, + "grad_norm": 1.97239351272583, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.727238118648529, + "num_tokens": 278982929.0, + "step": 11176 + }, + { + "epoch": 1.2274324621128927, + "grad_norm": 2.208660364151001, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7043685913085938, + "num_tokens": 279010207.0, + "step": 11177 + }, + { + "epoch": 1.2275422798155062, + "grad_norm": 2.249363660812378, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7057543992996216, + "num_tokens": 279039443.0, + "step": 11178 + }, + { + "epoch": 1.22765209751812, + "grad_norm": 2.1325929164886475, + "learning_rate": 1e-06, + "loss": 0.9836, + "mean_token_accuracy": 0.6985573768615723, + "num_tokens": 279070063.0, + "step": 11179 + }, + { + "epoch": 1.2277619152207335, + "grad_norm": 2.2349724769592285, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7254674434661865, + "num_tokens": 279094969.0, + "step": 11180 + }, + { + "epoch": 1.2278717329233473, + "grad_norm": 2.1099486351013184, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6974576711654663, + "num_tokens": 279124369.0, + "step": 11181 + }, + { + "epoch": 1.2279815506259608, + "grad_norm": 2.475877285003662, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.715779721736908, + "num_tokens": 279146858.0, + "step": 11182 + }, + { + "epoch": 1.2280913683285746, + "grad_norm": 2.2892673015594482, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.713056206703186, + "num_tokens": 279171429.0, + "step": 11183 + }, + { + "epoch": 1.228201186031188, + 
"grad_norm": 2.3803582191467285, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7063547968864441, + "num_tokens": 279196209.0, + "step": 11184 + }, + { + "epoch": 1.2283110037338019, + "grad_norm": 2.7291674613952637, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7226907014846802, + "num_tokens": 279214132.0, + "step": 11185 + }, + { + "epoch": 1.2284208214364156, + "grad_norm": 2.058863401412964, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7213131785392761, + "num_tokens": 279244169.0, + "step": 11186 + }, + { + "epoch": 1.2285306391390292, + "grad_norm": 2.3021202087402344, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7118719816207886, + "num_tokens": 279268277.0, + "step": 11187 + }, + { + "epoch": 1.228640456841643, + "grad_norm": 2.638949155807495, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.702684760093689, + "num_tokens": 279289537.0, + "step": 11188 + }, + { + "epoch": 1.2287502745442564, + "grad_norm": 2.478274345397949, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7230970859527588, + "num_tokens": 279310479.0, + "step": 11189 + }, + { + "epoch": 1.2288600922468702, + "grad_norm": 2.2547497749328613, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7093793153762817, + "num_tokens": 279336566.0, + "step": 11190 + }, + { + "epoch": 1.228969909949484, + "grad_norm": 2.4312937259674072, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7098686099052429, + "num_tokens": 279361061.0, + "step": 11191 + }, + { + "epoch": 1.2290797276520975, + "grad_norm": 2.2696163654327393, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7187340259552002, + "num_tokens": 279386513.0, + "step": 11192 + }, + { + "epoch": 1.2291895453547113, + "grad_norm": 2.2408132553100586, + "learning_rate": 1e-06, + "loss": 1.0181, + "mean_token_accuracy": 0.6981284618377686, + 
"num_tokens": 279411751.0, + "step": 11193 + }, + { + "epoch": 1.2292993630573248, + "grad_norm": 2.1600053310394287, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7333824634552002, + "num_tokens": 279437383.0, + "step": 11194 + }, + { + "epoch": 1.2294091807599385, + "grad_norm": 2.0585782527923584, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7156912088394165, + "num_tokens": 279464041.0, + "step": 11195 + }, + { + "epoch": 1.229518998462552, + "grad_norm": 2.2998030185699463, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7047779560089111, + "num_tokens": 279489722.0, + "step": 11196 + }, + { + "epoch": 1.2296288161651658, + "grad_norm": 2.350457191467285, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7406915426254272, + "num_tokens": 279512827.0, + "step": 11197 + }, + { + "epoch": 1.2297386338677794, + "grad_norm": 2.2395577430725098, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7073411345481873, + "num_tokens": 279538420.0, + "step": 11198 + }, + { + "epoch": 1.2298484515703931, + "grad_norm": 2.637331247329712, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7258268594741821, + "num_tokens": 279557735.0, + "step": 11199 + }, + { + "epoch": 1.2299582692730069, + "grad_norm": 2.3090927600860596, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7148585319519043, + "num_tokens": 279581736.0, + "step": 11200 + }, + { + "epoch": 1.2300680869756204, + "grad_norm": 2.2627365589141846, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7138196229934692, + "num_tokens": 279608753.0, + "step": 11201 + }, + { + "epoch": 1.2301779046782342, + "grad_norm": 2.3081600666046143, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7052343487739563, + "num_tokens": 279632266.0, + "step": 11202 + }, + { + "epoch": 1.2302877223808477, + "grad_norm": 2.283507823944092, + 
"learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7187427282333374, + "num_tokens": 279657395.0, + "step": 11203 + }, + { + "epoch": 1.2303975400834615, + "grad_norm": 2.3268165588378906, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7103312015533447, + "num_tokens": 279681458.0, + "step": 11204 + }, + { + "epoch": 1.2305073577860752, + "grad_norm": 1.9559872150421143, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6991037726402283, + "num_tokens": 279716358.0, + "step": 11205 + }, + { + "epoch": 1.2306171754886888, + "grad_norm": 2.2677340507507324, + "learning_rate": 1e-06, + "loss": 0.857, + "mean_token_accuracy": 0.7322165966033936, + "num_tokens": 279741120.0, + "step": 11206 + }, + { + "epoch": 1.2307269931913025, + "grad_norm": 2.4850246906280518, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.6978470683097839, + "num_tokens": 279763716.0, + "step": 11207 + }, + { + "epoch": 1.230836810893916, + "grad_norm": 2.0172128677368164, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7073689699172974, + "num_tokens": 279794217.0, + "step": 11208 + }, + { + "epoch": 1.2309466285965298, + "grad_norm": 2.1131792068481445, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7364717721939087, + "num_tokens": 279820481.0, + "step": 11209 + }, + { + "epoch": 1.2310564462991433, + "grad_norm": 2.2705540657043457, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7063677906990051, + "num_tokens": 279846856.0, + "step": 11210 + }, + { + "epoch": 1.231166264001757, + "grad_norm": 2.616325616836548, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7283278703689575, + "num_tokens": 279865442.0, + "step": 11211 + }, + { + "epoch": 1.2312760817043706, + "grad_norm": 2.330970287322998, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7407944202423096, + "num_tokens": 279888256.0, + 
"step": 11212 + }, + { + "epoch": 1.2313858994069844, + "grad_norm": 2.5259058475494385, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.735293447971344, + "num_tokens": 279909257.0, + "step": 11213 + }, + { + "epoch": 1.2314957171095982, + "grad_norm": 2.2489681243896484, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.7114642262458801, + "num_tokens": 279933182.0, + "step": 11214 + }, + { + "epoch": 1.2316055348122117, + "grad_norm": 2.1893911361694336, + "learning_rate": 1e-06, + "loss": 0.9842, + "mean_token_accuracy": 0.6985284686088562, + "num_tokens": 279959853.0, + "step": 11215 + }, + { + "epoch": 1.2317153525148254, + "grad_norm": 2.4413161277770996, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7084223031997681, + "num_tokens": 279983469.0, + "step": 11216 + }, + { + "epoch": 1.231825170217439, + "grad_norm": 2.0023415088653564, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7205053567886353, + "num_tokens": 280015509.0, + "step": 11217 + }, + { + "epoch": 1.2319349879200527, + "grad_norm": 2.331057548522949, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7286586165428162, + "num_tokens": 280038109.0, + "step": 11218 + }, + { + "epoch": 1.2320448056226665, + "grad_norm": 2.527050018310547, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7276109457015991, + "num_tokens": 280059406.0, + "step": 11219 + }, + { + "epoch": 1.23215462332528, + "grad_norm": 2.1586191654205322, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7285927534103394, + "num_tokens": 280085395.0, + "step": 11220 + }, + { + "epoch": 1.2322644410278938, + "grad_norm": 2.436544179916382, + "learning_rate": 1e-06, + "loss": 0.8371, + "mean_token_accuracy": 0.7281672358512878, + "num_tokens": 280107144.0, + "step": 11221 + }, + { + "epoch": 1.2323742587305073, + "grad_norm": 2.516453981399536, + "learning_rate": 1e-06, + "loss": 
0.8309, + "mean_token_accuracy": 0.7481362819671631, + "num_tokens": 280129037.0, + "step": 11222 + }, + { + "epoch": 1.232484076433121, + "grad_norm": 2.7284324169158936, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7347633838653564, + "num_tokens": 280147186.0, + "step": 11223 + }, + { + "epoch": 1.2325938941357346, + "grad_norm": 2.3195507526397705, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7085156440734863, + "num_tokens": 280172332.0, + "step": 11224 + }, + { + "epoch": 1.2327037118383484, + "grad_norm": 2.061577081680298, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7184606194496155, + "num_tokens": 280201812.0, + "step": 11225 + }, + { + "epoch": 1.232813529540962, + "grad_norm": 2.143890619277954, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7130030989646912, + "num_tokens": 280232515.0, + "step": 11226 + }, + { + "epoch": 1.2329233472435757, + "grad_norm": 2.3016140460968018, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6940226554870605, + "num_tokens": 280258718.0, + "step": 11227 + }, + { + "epoch": 1.2330331649461894, + "grad_norm": 2.3554282188415527, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7176432013511658, + "num_tokens": 280281944.0, + "step": 11228 + }, + { + "epoch": 1.233142982648803, + "grad_norm": 2.3179755210876465, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7172748446464539, + "num_tokens": 280305657.0, + "step": 11229 + }, + { + "epoch": 1.2332528003514167, + "grad_norm": 2.6528213024139404, + "learning_rate": 1e-06, + "loss": 0.8187, + "mean_token_accuracy": 0.737787127494812, + "num_tokens": 280324756.0, + "step": 11230 + }, + { + "epoch": 1.2333626180540302, + "grad_norm": 2.75935697555542, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7223206758499146, + "num_tokens": 280343518.0, + "step": 11231 + }, + { + "epoch": 
1.233472435756644, + "grad_norm": 2.241631031036377, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.718721866607666, + "num_tokens": 280368912.0, + "step": 11232 + }, + { + "epoch": 1.2335822534592575, + "grad_norm": 2.222818374633789, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7097432613372803, + "num_tokens": 280393846.0, + "step": 11233 + }, + { + "epoch": 1.2336920711618713, + "grad_norm": 2.305666208267212, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7333558201789856, + "num_tokens": 280419769.0, + "step": 11234 + }, + { + "epoch": 1.2338018888644848, + "grad_norm": 2.2633249759674072, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7311619520187378, + "num_tokens": 280444751.0, + "step": 11235 + }, + { + "epoch": 1.2339117065670986, + "grad_norm": 2.5222177505493164, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6960593461990356, + "num_tokens": 280468414.0, + "step": 11236 + }, + { + "epoch": 1.2340215242697123, + "grad_norm": 2.465108871459961, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.6965224742889404, + "num_tokens": 280492204.0, + "step": 11237 + }, + { + "epoch": 1.2341313419723259, + "grad_norm": 2.3478732109069824, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7342268824577332, + "num_tokens": 280515284.0, + "step": 11238 + }, + { + "epoch": 1.2342411596749396, + "grad_norm": 2.3725218772888184, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7160661220550537, + "num_tokens": 280538534.0, + "step": 11239 + }, + { + "epoch": 1.2343509773775532, + "grad_norm": 2.7462704181671143, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7243958711624146, + "num_tokens": 280555755.0, + "step": 11240 + }, + { + "epoch": 1.234460795080167, + "grad_norm": 2.4583160877227783, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 
0.7050743699073792, + "num_tokens": 280579860.0, + "step": 11241 + }, + { + "epoch": 1.2345706127827807, + "grad_norm": 2.1056020259857178, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7246692180633545, + "num_tokens": 280606980.0, + "step": 11242 + }, + { + "epoch": 1.2346804304853942, + "grad_norm": 2.21597957611084, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.708896279335022, + "num_tokens": 280633760.0, + "step": 11243 + }, + { + "epoch": 1.234790248188008, + "grad_norm": 2.4146087169647217, + "learning_rate": 1e-06, + "loss": 0.9857, + "mean_token_accuracy": 0.6957871913909912, + "num_tokens": 280656895.0, + "step": 11244 + }, + { + "epoch": 1.2349000658906215, + "grad_norm": 2.3357903957366943, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7285213470458984, + "num_tokens": 280679440.0, + "step": 11245 + }, + { + "epoch": 1.2350098835932353, + "grad_norm": 2.3175125122070312, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7322214245796204, + "num_tokens": 280703671.0, + "step": 11246 + }, + { + "epoch": 1.2351197012958488, + "grad_norm": 2.060823440551758, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7084038257598877, + "num_tokens": 280734753.0, + "step": 11247 + }, + { + "epoch": 1.2352295189984626, + "grad_norm": 2.1958978176116943, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7037051916122437, + "num_tokens": 280760539.0, + "step": 11248 + }, + { + "epoch": 1.235339336701076, + "grad_norm": 2.4239354133605957, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7123874425888062, + "num_tokens": 280782004.0, + "step": 11249 + }, + { + "epoch": 1.2354491544036899, + "grad_norm": 2.4006690979003906, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.726581871509552, + "num_tokens": 280804754.0, + "step": 11250 + }, + { + "epoch": 1.2355589721063036, + "grad_norm": 
2.0811057090759277, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7288975715637207, + "num_tokens": 280836058.0, + "step": 11251 + }, + { + "epoch": 1.2356687898089171, + "grad_norm": 2.4550812244415283, + "learning_rate": 1e-06, + "loss": 1.0086, + "mean_token_accuracy": 0.6948490142822266, + "num_tokens": 280859296.0, + "step": 11252 + }, + { + "epoch": 1.235778607511531, + "grad_norm": 2.0422158241271973, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7068231105804443, + "num_tokens": 280889283.0, + "step": 11253 + }, + { + "epoch": 1.2358884252141444, + "grad_norm": 2.276477575302124, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.703914999961853, + "num_tokens": 280914621.0, + "step": 11254 + }, + { + "epoch": 1.2359982429167582, + "grad_norm": 2.446992874145508, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7086095213890076, + "num_tokens": 280939429.0, + "step": 11255 + }, + { + "epoch": 1.236108060619372, + "grad_norm": 2.723013162612915, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7319141626358032, + "num_tokens": 280956813.0, + "step": 11256 + }, + { + "epoch": 1.2362178783219855, + "grad_norm": 2.288404941558838, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7104431390762329, + "num_tokens": 280980645.0, + "step": 11257 + }, + { + "epoch": 1.2363276960245992, + "grad_norm": 2.131725311279297, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7110389471054077, + "num_tokens": 281009280.0, + "step": 11258 + }, + { + "epoch": 1.2364375137272128, + "grad_norm": 2.279853343963623, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7117072343826294, + "num_tokens": 281034291.0, + "step": 11259 + }, + { + "epoch": 1.2365473314298265, + "grad_norm": 2.181593179702759, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7095650434494019, + "num_tokens": 
281060304.0, + "step": 11260 + }, + { + "epoch": 1.23665714913244, + "grad_norm": 2.283229112625122, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7056592702865601, + "num_tokens": 281085554.0, + "step": 11261 + }, + { + "epoch": 1.2367669668350538, + "grad_norm": 2.1263787746429443, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7157177925109863, + "num_tokens": 281115011.0, + "step": 11262 + }, + { + "epoch": 1.2368767845376674, + "grad_norm": 2.2246010303497314, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7092490196228027, + "num_tokens": 281143204.0, + "step": 11263 + }, + { + "epoch": 1.2369866022402811, + "grad_norm": 2.3455069065093994, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7330118417739868, + "num_tokens": 281166926.0, + "step": 11264 + }, + { + "epoch": 1.2370964199428949, + "grad_norm": 2.2405030727386475, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6981299519538879, + "num_tokens": 281192161.0, + "step": 11265 + }, + { + "epoch": 1.2372062376455084, + "grad_norm": 2.649327039718628, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7510232925415039, + "num_tokens": 281211284.0, + "step": 11266 + }, + { + "epoch": 1.2373160553481222, + "grad_norm": 2.4800565242767334, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7083190083503723, + "num_tokens": 281232308.0, + "step": 11267 + }, + { + "epoch": 1.2374258730507357, + "grad_norm": 2.318310499191284, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7231341600418091, + "num_tokens": 281255571.0, + "step": 11268 + }, + { + "epoch": 1.2375356907533495, + "grad_norm": 2.2949419021606445, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7210402488708496, + "num_tokens": 281281441.0, + "step": 11269 + }, + { + "epoch": 1.2376455084559632, + "grad_norm": 2.2602903842926025, + "learning_rate": 
1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7188557386398315, + "num_tokens": 281306002.0, + "step": 11270 + }, + { + "epoch": 1.2377553261585768, + "grad_norm": 2.1898465156555176, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.710174560546875, + "num_tokens": 281332913.0, + "step": 11271 + }, + { + "epoch": 1.2378651438611905, + "grad_norm": 2.43662166595459, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7151807546615601, + "num_tokens": 281356183.0, + "step": 11272 + }, + { + "epoch": 1.237974961563804, + "grad_norm": 2.4031713008880615, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7131577134132385, + "num_tokens": 281379595.0, + "step": 11273 + }, + { + "epoch": 1.2380847792664178, + "grad_norm": 2.5169339179992676, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7220358848571777, + "num_tokens": 281400399.0, + "step": 11274 + }, + { + "epoch": 1.2381945969690313, + "grad_norm": 2.406400680541992, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7251008749008179, + "num_tokens": 281421331.0, + "step": 11275 + }, + { + "epoch": 1.238304414671645, + "grad_norm": 2.139937400817871, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7132852673530579, + "num_tokens": 281448612.0, + "step": 11276 + }, + { + "epoch": 1.2384142323742586, + "grad_norm": 2.2155110836029053, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7174637317657471, + "num_tokens": 281474675.0, + "step": 11277 + }, + { + "epoch": 1.2385240500768724, + "grad_norm": 2.6933889389038086, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7040958404541016, + "num_tokens": 281494734.0, + "step": 11278 + }, + { + "epoch": 1.2386338677794861, + "grad_norm": 2.637207269668579, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7494658827781677, + "num_tokens": 281513227.0, + "step": 11279 + }, + { + 
"epoch": 1.2387436854820997, + "grad_norm": 2.499851942062378, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7202588319778442, + "num_tokens": 281534582.0, + "step": 11280 + }, + { + "epoch": 1.2388535031847134, + "grad_norm": 2.0440375804901123, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7140636444091797, + "num_tokens": 281566078.0, + "step": 11281 + }, + { + "epoch": 1.238963320887327, + "grad_norm": 2.04643177986145, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7204184532165527, + "num_tokens": 281593483.0, + "step": 11282 + }, + { + "epoch": 1.2390731385899407, + "grad_norm": 2.6495203971862793, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7185677289962769, + "num_tokens": 281612574.0, + "step": 11283 + }, + { + "epoch": 1.2391829562925545, + "grad_norm": 2.2212889194488525, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7231552004814148, + "num_tokens": 281638024.0, + "step": 11284 + }, + { + "epoch": 1.239292773995168, + "grad_norm": 2.368410587310791, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7071377038955688, + "num_tokens": 281663655.0, + "step": 11285 + }, + { + "epoch": 1.2394025916977818, + "grad_norm": 2.1462972164154053, + "learning_rate": 1e-06, + "loss": 1.0373, + "mean_token_accuracy": 0.6791355609893799, + "num_tokens": 281694334.0, + "step": 11286 + }, + { + "epoch": 1.2395124094003953, + "grad_norm": 2.2510881423950195, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.7075331211090088, + "num_tokens": 281719567.0, + "step": 11287 + }, + { + "epoch": 1.239622227103009, + "grad_norm": 2.47672438621521, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7248497605323792, + "num_tokens": 281740405.0, + "step": 11288 + }, + { + "epoch": 1.2397320448056226, + "grad_norm": 2.1809849739074707, + "learning_rate": 1e-06, + "loss": 0.9555, + 
"mean_token_accuracy": 0.7141783237457275, + "num_tokens": 281767432.0, + "step": 11289 + }, + { + "epoch": 1.2398418625082364, + "grad_norm": 2.2584848403930664, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.725200891494751, + "num_tokens": 281796227.0, + "step": 11290 + }, + { + "epoch": 1.23995168021085, + "grad_norm": 2.0324461460113525, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7334080338478088, + "num_tokens": 281823643.0, + "step": 11291 + }, + { + "epoch": 1.2400614979134637, + "grad_norm": 2.3895745277404785, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7274441719055176, + "num_tokens": 281846994.0, + "step": 11292 + }, + { + "epoch": 1.2401713156160774, + "grad_norm": 2.300658941268921, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7274971008300781, + "num_tokens": 281871897.0, + "step": 11293 + }, + { + "epoch": 1.240281133318691, + "grad_norm": 2.3938779830932617, + "learning_rate": 1e-06, + "loss": 0.951, + "mean_token_accuracy": 0.7060931324958801, + "num_tokens": 281897751.0, + "step": 11294 + }, + { + "epoch": 1.2403909510213047, + "grad_norm": 2.200249433517456, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6925196647644043, + "num_tokens": 281926233.0, + "step": 11295 + }, + { + "epoch": 1.2405007687239182, + "grad_norm": 2.5086283683776855, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7265661358833313, + "num_tokens": 281948109.0, + "step": 11296 + }, + { + "epoch": 1.240610586426532, + "grad_norm": 2.332664966583252, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7473175525665283, + "num_tokens": 281973055.0, + "step": 11297 + }, + { + "epoch": 1.2407204041291455, + "grad_norm": 1.9988212585449219, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7066627740859985, + "num_tokens": 282004482.0, + "step": 11298 + }, + { + "epoch": 1.2408302218317593, 
+ "grad_norm": 2.2820568084716797, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7139350175857544, + "num_tokens": 282028655.0, + "step": 11299 + }, + { + "epoch": 1.2409400395343728, + "grad_norm": 2.35274338722229, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7244105935096741, + "num_tokens": 282052981.0, + "step": 11300 + }, + { + "epoch": 1.2410498572369866, + "grad_norm": 2.6645071506500244, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7207813858985901, + "num_tokens": 282073524.0, + "step": 11301 + }, + { + "epoch": 1.2411596749396003, + "grad_norm": 2.319615602493286, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7279835939407349, + "num_tokens": 282095998.0, + "step": 11302 + }, + { + "epoch": 1.2412694926422139, + "grad_norm": 2.3948557376861572, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.7052780985832214, + "num_tokens": 282122383.0, + "step": 11303 + }, + { + "epoch": 1.2413793103448276, + "grad_norm": 2.200679302215576, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6944475769996643, + "num_tokens": 282150623.0, + "step": 11304 + }, + { + "epoch": 1.2414891280474412, + "grad_norm": 2.354383707046509, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7185882329940796, + "num_tokens": 282173713.0, + "step": 11305 + }, + { + "epoch": 1.241598945750055, + "grad_norm": 2.5065605640411377, + "learning_rate": 1e-06, + "loss": 0.8056, + "mean_token_accuracy": 0.7427694797515869, + "num_tokens": 282193756.0, + "step": 11306 + }, + { + "epoch": 1.2417087634526687, + "grad_norm": 2.4218504428863525, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7154244184494019, + "num_tokens": 282215221.0, + "step": 11307 + }, + { + "epoch": 1.2418185811552822, + "grad_norm": 2.3709022998809814, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7332366704940796, + 
"num_tokens": 282238896.0, + "step": 11308 + }, + { + "epoch": 1.241928398857896, + "grad_norm": 2.1549556255340576, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7206413149833679, + "num_tokens": 282265475.0, + "step": 11309 + }, + { + "epoch": 1.2420382165605095, + "grad_norm": 2.188295364379883, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7218837738037109, + "num_tokens": 282291368.0, + "step": 11310 + }, + { + "epoch": 1.2421480342631233, + "grad_norm": 2.5138723850250244, + "learning_rate": 1e-06, + "loss": 1.0033, + "mean_token_accuracy": 0.6992673277854919, + "num_tokens": 282313627.0, + "step": 11311 + }, + { + "epoch": 1.2422578519657368, + "grad_norm": 2.3376660346984863, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7203880548477173, + "num_tokens": 282338546.0, + "step": 11312 + }, + { + "epoch": 1.2423676696683505, + "grad_norm": 2.177760362625122, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7410001158714294, + "num_tokens": 282367685.0, + "step": 11313 + }, + { + "epoch": 1.242477487370964, + "grad_norm": 1.9735875129699707, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7046916484832764, + "num_tokens": 282396047.0, + "step": 11314 + }, + { + "epoch": 1.2425873050735778, + "grad_norm": 2.3646836280822754, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7287468314170837, + "num_tokens": 282418807.0, + "step": 11315 + }, + { + "epoch": 1.2426971227761916, + "grad_norm": 2.160562753677368, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7204398512840271, + "num_tokens": 282446223.0, + "step": 11316 + }, + { + "epoch": 1.2428069404788051, + "grad_norm": 2.270526885986328, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7408651113510132, + "num_tokens": 282469071.0, + "step": 11317 + }, + { + "epoch": 1.242916758181419, + "grad_norm": 2.4923746585845947, + 
"learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7138495445251465, + "num_tokens": 282489875.0, + "step": 11318 + }, + { + "epoch": 1.2430265758840324, + "grad_norm": 2.263368606567383, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7421115636825562, + "num_tokens": 282513502.0, + "step": 11319 + }, + { + "epoch": 1.2431363935866462, + "grad_norm": 2.0558862686157227, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6978517174720764, + "num_tokens": 282543614.0, + "step": 11320 + }, + { + "epoch": 1.24324621128926, + "grad_norm": 2.1900458335876465, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.72315913438797, + "num_tokens": 282568908.0, + "step": 11321 + }, + { + "epoch": 1.2433560289918735, + "grad_norm": 2.6903135776519775, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7251261472702026, + "num_tokens": 282588852.0, + "step": 11322 + }, + { + "epoch": 1.2434658466944872, + "grad_norm": 2.1211187839508057, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7278025150299072, + "num_tokens": 282618634.0, + "step": 11323 + }, + { + "epoch": 1.2435756643971008, + "grad_norm": 2.0786116123199463, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.6992040872573853, + "num_tokens": 282648493.0, + "step": 11324 + }, + { + "epoch": 1.2436854820997145, + "grad_norm": 2.431835412979126, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7215794324874878, + "num_tokens": 282671749.0, + "step": 11325 + }, + { + "epoch": 1.243795299802328, + "grad_norm": 2.34487247467041, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7316384315490723, + "num_tokens": 282701071.0, + "step": 11326 + }, + { + "epoch": 1.2439051175049418, + "grad_norm": 2.2405591011047363, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7173053622245789, + "num_tokens": 282728206.0, + "step": 
11327 + }, + { + "epoch": 1.2440149352075554, + "grad_norm": 2.2681572437286377, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7256219983100891, + "num_tokens": 282752052.0, + "step": 11328 + }, + { + "epoch": 1.244124752910169, + "grad_norm": 2.2816104888916016, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.715131402015686, + "num_tokens": 282776776.0, + "step": 11329 + }, + { + "epoch": 1.2442345706127829, + "grad_norm": 2.255342483520508, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 0.7280421257019043, + "num_tokens": 282803348.0, + "step": 11330 + }, + { + "epoch": 1.2443443883153964, + "grad_norm": 2.6860814094543457, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7333023548126221, + "num_tokens": 282823472.0, + "step": 11331 + }, + { + "epoch": 1.2444542060180102, + "grad_norm": 2.1544899940490723, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7157705426216125, + "num_tokens": 282851192.0, + "step": 11332 + }, + { + "epoch": 1.2445640237206237, + "grad_norm": 2.191490650177002, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7255934476852417, + "num_tokens": 282877799.0, + "step": 11333 + }, + { + "epoch": 1.2446738414232374, + "grad_norm": 2.3102376461029053, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.722962498664856, + "num_tokens": 282900193.0, + "step": 11334 + }, + { + "epoch": 1.2447836591258512, + "grad_norm": 2.3925459384918213, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.722556471824646, + "num_tokens": 282924093.0, + "step": 11335 + }, + { + "epoch": 1.2448934768284647, + "grad_norm": 2.3660945892333984, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7271188497543335, + "num_tokens": 282947457.0, + "step": 11336 + }, + { + "epoch": 1.2450032945310785, + "grad_norm": 2.2156848907470703, + "learning_rate": 1e-06, + "loss": 0.8707, 
+ "mean_token_accuracy": 0.7252562642097473, + "num_tokens": 282973721.0, + "step": 11337 + }, + { + "epoch": 1.245113112233692, + "grad_norm": 2.2644941806793213, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7038283348083496, + "num_tokens": 282999308.0, + "step": 11338 + }, + { + "epoch": 1.2452229299363058, + "grad_norm": 2.429976463317871, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7293267250061035, + "num_tokens": 283019210.0, + "step": 11339 + }, + { + "epoch": 1.2453327476389193, + "grad_norm": 2.366562604904175, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7255188822746277, + "num_tokens": 283043397.0, + "step": 11340 + }, + { + "epoch": 1.245442565341533, + "grad_norm": 2.2962756156921387, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7060120105743408, + "num_tokens": 283066661.0, + "step": 11341 + }, + { + "epoch": 1.2455523830441466, + "grad_norm": 2.00805926322937, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7011075615882874, + "num_tokens": 283096167.0, + "step": 11342 + }, + { + "epoch": 1.2456622007467604, + "grad_norm": 2.6399049758911133, + "learning_rate": 1e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7499668002128601, + "num_tokens": 283115884.0, + "step": 11343 + }, + { + "epoch": 1.2457720184493741, + "grad_norm": 2.6003732681274414, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6993758678436279, + "num_tokens": 283139074.0, + "step": 11344 + }, + { + "epoch": 1.2458818361519877, + "grad_norm": 2.224135398864746, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7144206166267395, + "num_tokens": 283164684.0, + "step": 11345 + }, + { + "epoch": 1.2459916538546014, + "grad_norm": 2.370119333267212, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7120646238327026, + "num_tokens": 283187436.0, + "step": 11346 + }, + { + "epoch": 
1.246101471557215, + "grad_norm": 2.477144479751587, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7365539073944092, + "num_tokens": 283209059.0, + "step": 11347 + }, + { + "epoch": 1.2462112892598287, + "grad_norm": 2.177746534347534, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.722990870475769, + "num_tokens": 283235334.0, + "step": 11348 + }, + { + "epoch": 1.2463211069624422, + "grad_norm": 2.3163156509399414, + "learning_rate": 1e-06, + "loss": 0.9861, + "mean_token_accuracy": 0.6951301693916321, + "num_tokens": 283261451.0, + "step": 11349 + }, + { + "epoch": 1.246430924665056, + "grad_norm": 2.0489068031311035, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7013866901397705, + "num_tokens": 283291466.0, + "step": 11350 + }, + { + "epoch": 1.2465407423676698, + "grad_norm": 2.1389381885528564, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7172640562057495, + "num_tokens": 283318308.0, + "step": 11351 + }, + { + "epoch": 1.2466505600702833, + "grad_norm": 2.347676992416382, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7063882946968079, + "num_tokens": 283341992.0, + "step": 11352 + }, + { + "epoch": 1.246760377772897, + "grad_norm": 2.308781147003174, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7178961038589478, + "num_tokens": 283366474.0, + "step": 11353 + }, + { + "epoch": 1.2468701954755106, + "grad_norm": 2.3312132358551025, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7166557312011719, + "num_tokens": 283390383.0, + "step": 11354 + }, + { + "epoch": 1.2469800131781243, + "grad_norm": 2.5314083099365234, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.7330997586250305, + "num_tokens": 283410194.0, + "step": 11355 + }, + { + "epoch": 1.2470898308807379, + "grad_norm": 2.179436445236206, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 
0.7096956372261047, + "num_tokens": 283437881.0, + "step": 11356 + }, + { + "epoch": 1.2471996485833516, + "grad_norm": 1.9917094707489014, + "learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6981631517410278, + "num_tokens": 283471213.0, + "step": 11357 + }, + { + "epoch": 1.2473094662859654, + "grad_norm": 2.212122678756714, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7226043939590454, + "num_tokens": 283497535.0, + "step": 11358 + }, + { + "epoch": 1.247419283988579, + "grad_norm": 2.180685043334961, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7183157205581665, + "num_tokens": 283523134.0, + "step": 11359 + }, + { + "epoch": 1.2475291016911927, + "grad_norm": 2.111375331878662, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7021349668502808, + "num_tokens": 283552078.0, + "step": 11360 + }, + { + "epoch": 1.2476389193938062, + "grad_norm": 2.420367956161499, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7357211112976074, + "num_tokens": 283574413.0, + "step": 11361 + }, + { + "epoch": 1.24774873709642, + "grad_norm": 2.0536935329437256, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7107048630714417, + "num_tokens": 283602628.0, + "step": 11362 + }, + { + "epoch": 1.2478585547990335, + "grad_norm": 2.6852216720581055, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7492322325706482, + "num_tokens": 283621024.0, + "step": 11363 + }, + { + "epoch": 1.2479683725016473, + "grad_norm": 2.3973581790924072, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7292113304138184, + "num_tokens": 283644314.0, + "step": 11364 + }, + { + "epoch": 1.2480781902042608, + "grad_norm": 2.6974079608917236, + "learning_rate": 1e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.7339524030685425, + "num_tokens": 283662471.0, + "step": 11365 + }, + { + "epoch": 1.2481880079068746, + "grad_norm": 
2.2163188457489014, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7128384113311768, + "num_tokens": 283687478.0, + "step": 11366 + }, + { + "epoch": 1.2482978256094883, + "grad_norm": 2.088679075241089, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7103960514068604, + "num_tokens": 283717656.0, + "step": 11367 + }, + { + "epoch": 1.2484076433121019, + "grad_norm": 2.122936725616455, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.73191237449646, + "num_tokens": 283744753.0, + "step": 11368 + }, + { + "epoch": 1.2485174610147156, + "grad_norm": 2.4663681983947754, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7298417687416077, + "num_tokens": 283765868.0, + "step": 11369 + }, + { + "epoch": 1.2486272787173291, + "grad_norm": 2.321359634399414, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7067365646362305, + "num_tokens": 283790020.0, + "step": 11370 + }, + { + "epoch": 1.248737096419943, + "grad_norm": 2.094858169555664, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7090950012207031, + "num_tokens": 283819992.0, + "step": 11371 + }, + { + "epoch": 1.2488469141225567, + "grad_norm": 2.151711940765381, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7273341417312622, + "num_tokens": 283848269.0, + "step": 11372 + }, + { + "epoch": 1.2489567318251702, + "grad_norm": 2.2851011753082275, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6980005502700806, + "num_tokens": 283873201.0, + "step": 11373 + }, + { + "epoch": 1.249066549527784, + "grad_norm": 2.3169572353363037, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7484041452407837, + "num_tokens": 283896380.0, + "step": 11374 + }, + { + "epoch": 1.2491763672303975, + "grad_norm": 2.4727933406829834, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7203363180160522, + "num_tokens": 
283917806.0, + "step": 11375 + }, + { + "epoch": 1.2492861849330112, + "grad_norm": 2.196119785308838, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6889703869819641, + "num_tokens": 283945273.0, + "step": 11376 + }, + { + "epoch": 1.2493960026356248, + "grad_norm": 2.4579358100891113, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7464401721954346, + "num_tokens": 283966329.0, + "step": 11377 + }, + { + "epoch": 1.2495058203382385, + "grad_norm": 2.590850591659546, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7195908427238464, + "num_tokens": 283987105.0, + "step": 11378 + }, + { + "epoch": 1.249615638040852, + "grad_norm": 2.0814261436462402, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7132116556167603, + "num_tokens": 284014341.0, + "step": 11379 + }, + { + "epoch": 1.2497254557434658, + "grad_norm": 2.2027316093444824, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7312544584274292, + "num_tokens": 284039372.0, + "step": 11380 + }, + { + "epoch": 1.2498352734460796, + "grad_norm": 2.2144687175750732, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7083697319030762, + "num_tokens": 284064685.0, + "step": 11381 + }, + { + "epoch": 1.2499450911486931, + "grad_norm": 2.078580617904663, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7097259759902954, + "num_tokens": 284091320.0, + "step": 11382 + }, + { + "epoch": 1.2500549088513069, + "grad_norm": 2.1769211292266846, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7021529078483582, + "num_tokens": 284118597.0, + "step": 11383 + }, + { + "epoch": 1.2501647265539204, + "grad_norm": 2.0935897827148438, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.718451976776123, + "num_tokens": 284143856.0, + "step": 11384 + }, + { + "epoch": 1.2502745442565342, + "grad_norm": 2.3344507217407227, + "learning_rate": 
1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7181395888328552, + "num_tokens": 284166999.0, + "step": 11385 + }, + { + "epoch": 1.250384361959148, + "grad_norm": 2.6435160636901855, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7190921902656555, + "num_tokens": 284187762.0, + "step": 11386 + }, + { + "epoch": 1.2504941796617615, + "grad_norm": 2.3805737495422363, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7171048521995544, + "num_tokens": 284209619.0, + "step": 11387 + }, + { + "epoch": 1.2506039973643752, + "grad_norm": 2.462996006011963, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7188504338264465, + "num_tokens": 284231050.0, + "step": 11388 + }, + { + "epoch": 1.2507138150669888, + "grad_norm": 2.313448190689087, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7512398958206177, + "num_tokens": 284255405.0, + "step": 11389 + }, + { + "epoch": 1.2508236327696025, + "grad_norm": 1.994113564491272, + "learning_rate": 1e-06, + "loss": 1.0334, + "mean_token_accuracy": 0.6844643354415894, + "num_tokens": 284287689.0, + "step": 11390 + }, + { + "epoch": 1.250933450472216, + "grad_norm": 2.1266427040100098, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.707154393196106, + "num_tokens": 284318204.0, + "step": 11391 + }, + { + "epoch": 1.2510432681748298, + "grad_norm": 2.314396619796753, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7291460633277893, + "num_tokens": 284341485.0, + "step": 11392 + }, + { + "epoch": 1.2511530858774433, + "grad_norm": 2.566467046737671, + "learning_rate": 1e-06, + "loss": 0.8074, + "mean_token_accuracy": 0.7437121272087097, + "num_tokens": 284359830.0, + "step": 11393 + }, + { + "epoch": 1.251262903580057, + "grad_norm": 2.232271909713745, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7352991104125977, + "num_tokens": 284385310.0, + "step": 11394 + }, + { + 
"epoch": 1.2513727212826709, + "grad_norm": 2.5125067234039307, + "learning_rate": 1e-06, + "loss": 0.8307, + "mean_token_accuracy": 0.732699453830719, + "num_tokens": 284406204.0, + "step": 11395 + }, + { + "epoch": 1.2514825389852844, + "grad_norm": 2.5089025497436523, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7138217687606812, + "num_tokens": 284430035.0, + "step": 11396 + }, + { + "epoch": 1.2515923566878981, + "grad_norm": 2.4853932857513428, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7442001104354858, + "num_tokens": 284452050.0, + "step": 11397 + }, + { + "epoch": 1.2517021743905117, + "grad_norm": 2.0982372760772705, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7130300998687744, + "num_tokens": 284481102.0, + "step": 11398 + }, + { + "epoch": 1.2518119920931254, + "grad_norm": 2.2701776027679443, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7271569967269897, + "num_tokens": 284507826.0, + "step": 11399 + }, + { + "epoch": 1.2519218097957392, + "grad_norm": 2.382931709289551, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7315719127655029, + "num_tokens": 284531193.0, + "step": 11400 + }, + { + "epoch": 1.2520316274983527, + "grad_norm": 2.1933908462524414, + "learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7085826396942139, + "num_tokens": 284557559.0, + "step": 11401 + }, + { + "epoch": 1.2521414452009663, + "grad_norm": 2.2713797092437744, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7024020552635193, + "num_tokens": 284583031.0, + "step": 11402 + }, + { + "epoch": 1.25225126290358, + "grad_norm": 2.384535312652588, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7069859504699707, + "num_tokens": 284607838.0, + "step": 11403 + }, + { + "epoch": 1.2523610806061938, + "grad_norm": 2.445871114730835, + "learning_rate": 1e-06, + "loss": 0.8598, + 
"mean_token_accuracy": 0.7305994033813477, + "num_tokens": 284628718.0, + "step": 11404 + }, + { + "epoch": 1.2524708983088073, + "grad_norm": 2.164517879486084, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7211691737174988, + "num_tokens": 284655267.0, + "step": 11405 + }, + { + "epoch": 1.252580716011421, + "grad_norm": 2.289942741394043, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7350927591323853, + "num_tokens": 284679588.0, + "step": 11406 + }, + { + "epoch": 1.2526905337140346, + "grad_norm": 2.3837008476257324, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7298470735549927, + "num_tokens": 284703787.0, + "step": 11407 + }, + { + "epoch": 1.2528003514166484, + "grad_norm": 2.1632566452026367, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7377339601516724, + "num_tokens": 284731437.0, + "step": 11408 + }, + { + "epoch": 1.2529101691192621, + "grad_norm": 2.4576609134674072, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7267590761184692, + "num_tokens": 284752232.0, + "step": 11409 + }, + { + "epoch": 1.2530199868218757, + "grad_norm": 2.4371182918548584, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.719419002532959, + "num_tokens": 284774518.0, + "step": 11410 + }, + { + "epoch": 1.2531298045244894, + "grad_norm": 2.1562728881835938, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7220556735992432, + "num_tokens": 284800618.0, + "step": 11411 + }, + { + "epoch": 1.253239622227103, + "grad_norm": 2.5194733142852783, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.7007474899291992, + "num_tokens": 284822446.0, + "step": 11412 + }, + { + "epoch": 1.2533494399297167, + "grad_norm": 2.2605342864990234, + "learning_rate": 1e-06, + "loss": 0.8595, + "mean_token_accuracy": 0.7322750091552734, + "num_tokens": 284847705.0, + "step": 11413 + }, + { + "epoch": 
1.2534592576323305, + "grad_norm": 2.2332470417022705, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7189135551452637, + "num_tokens": 284874026.0, + "step": 11414 + }, + { + "epoch": 1.253569075334944, + "grad_norm": 2.118079900741577, + "learning_rate": 1e-06, + "loss": 0.8727, + "mean_token_accuracy": 0.7194682955741882, + "num_tokens": 284902509.0, + "step": 11415 + }, + { + "epoch": 1.2536788930375575, + "grad_norm": 2.215266466140747, + "learning_rate": 1e-06, + "loss": 0.9785, + "mean_token_accuracy": 0.7072904109954834, + "num_tokens": 284931061.0, + "step": 11416 + }, + { + "epoch": 1.2537887107401713, + "grad_norm": 2.070042610168457, + "learning_rate": 1e-06, + "loss": 0.9765, + "mean_token_accuracy": 0.7156256437301636, + "num_tokens": 284959657.0, + "step": 11417 + }, + { + "epoch": 1.253898528442785, + "grad_norm": 2.185009002685547, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.727279782295227, + "num_tokens": 284985686.0, + "step": 11418 + }, + { + "epoch": 1.2540083461453986, + "grad_norm": 2.2929651737213135, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7212288975715637, + "num_tokens": 285010790.0, + "step": 11419 + }, + { + "epoch": 1.2541181638480123, + "grad_norm": 2.1010539531707764, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7178084850311279, + "num_tokens": 285038370.0, + "step": 11420 + }, + { + "epoch": 1.2542279815506259, + "grad_norm": 2.1178460121154785, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7194085121154785, + "num_tokens": 285067357.0, + "step": 11421 + }, + { + "epoch": 1.2543377992532396, + "grad_norm": 2.0847818851470947, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.713015079498291, + "num_tokens": 285097275.0, + "step": 11422 + }, + { + "epoch": 1.2544476169558534, + "grad_norm": 2.211087942123413, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 
0.7252184152603149, + "num_tokens": 285122735.0, + "step": 11423 + }, + { + "epoch": 1.254557434658467, + "grad_norm": 2.0830440521240234, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7240851521492004, + "num_tokens": 285150163.0, + "step": 11424 + }, + { + "epoch": 1.2546672523610807, + "grad_norm": 2.4704461097717285, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7387291193008423, + "num_tokens": 285170761.0, + "step": 11425 + }, + { + "epoch": 1.2547770700636942, + "grad_norm": 2.5345451831817627, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7014011740684509, + "num_tokens": 285192837.0, + "step": 11426 + }, + { + "epoch": 1.254886887766308, + "grad_norm": 2.1771798133850098, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7121559381484985, + "num_tokens": 285220057.0, + "step": 11427 + }, + { + "epoch": 1.2549967054689217, + "grad_norm": 2.6148605346679688, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7285161018371582, + "num_tokens": 285238642.0, + "step": 11428 + }, + { + "epoch": 1.2551065231715353, + "grad_norm": 2.3432118892669678, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7257776260375977, + "num_tokens": 285263810.0, + "step": 11429 + }, + { + "epoch": 1.2552163408741488, + "grad_norm": 2.0711028575897217, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6951844692230225, + "num_tokens": 285295234.0, + "step": 11430 + }, + { + "epoch": 1.2553261585767626, + "grad_norm": 2.6511635780334473, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7254458665847778, + "num_tokens": 285313957.0, + "step": 11431 + }, + { + "epoch": 1.2554359762793763, + "grad_norm": 2.4196863174438477, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7349351644515991, + "num_tokens": 285336757.0, + "step": 11432 + }, + { + "epoch": 1.2555457939819898, + "grad_norm": 
2.1649010181427, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7241507768630981, + "num_tokens": 285363863.0, + "step": 11433 + }, + { + "epoch": 1.2556556116846036, + "grad_norm": 2.205073595046997, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7138883471488953, + "num_tokens": 285388994.0, + "step": 11434 + }, + { + "epoch": 1.2557654293872171, + "grad_norm": 2.2382514476776123, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7263476848602295, + "num_tokens": 285413646.0, + "step": 11435 + }, + { + "epoch": 1.255875247089831, + "grad_norm": 1.9993045330047607, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6923387050628662, + "num_tokens": 285448283.0, + "step": 11436 + }, + { + "epoch": 1.2559850647924446, + "grad_norm": 2.2214980125427246, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7296239137649536, + "num_tokens": 285473282.0, + "step": 11437 + }, + { + "epoch": 1.2560948824950582, + "grad_norm": 2.1252353191375732, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7090094089508057, + "num_tokens": 285502320.0, + "step": 11438 + }, + { + "epoch": 1.256204700197672, + "grad_norm": 2.1620430946350098, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7060173749923706, + "num_tokens": 285532082.0, + "step": 11439 + }, + { + "epoch": 1.2563145179002855, + "grad_norm": 2.595075845718384, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7061340808868408, + "num_tokens": 285552986.0, + "step": 11440 + }, + { + "epoch": 1.2564243356028992, + "grad_norm": 2.2375245094299316, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7172063589096069, + "num_tokens": 285577338.0, + "step": 11441 + }, + { + "epoch": 1.2565341533055128, + "grad_norm": 2.5945096015930176, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7263830900192261, + "num_tokens": 
285597896.0, + "step": 11442 + }, + { + "epoch": 1.2566439710081265, + "grad_norm": 2.0742435455322266, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7144303321838379, + "num_tokens": 285626994.0, + "step": 11443 + }, + { + "epoch": 1.25675378871074, + "grad_norm": 2.595107078552246, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7482792139053345, + "num_tokens": 285644929.0, + "step": 11444 + }, + { + "epoch": 1.2568636064133538, + "grad_norm": 2.249328851699829, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7325600385665894, + "num_tokens": 285669142.0, + "step": 11445 + }, + { + "epoch": 1.2569734241159676, + "grad_norm": 2.182708501815796, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7296841144561768, + "num_tokens": 285694854.0, + "step": 11446 + }, + { + "epoch": 1.257083241818581, + "grad_norm": 2.175320863723755, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7227679491043091, + "num_tokens": 285723076.0, + "step": 11447 + }, + { + "epoch": 1.2571930595211949, + "grad_norm": 2.3295912742614746, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7027578949928284, + "num_tokens": 285747182.0, + "step": 11448 + }, + { + "epoch": 1.2573028772238084, + "grad_norm": 2.120783805847168, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7228410840034485, + "num_tokens": 285773973.0, + "step": 11449 + }, + { + "epoch": 1.2574126949264222, + "grad_norm": 2.427213668823242, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7210184931755066, + "num_tokens": 285795251.0, + "step": 11450 + }, + { + "epoch": 1.257522512629036, + "grad_norm": 1.9674906730651855, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.712587833404541, + "num_tokens": 285826834.0, + "step": 11451 + }, + { + "epoch": 1.2576323303316495, + "grad_norm": 2.5211422443389893, + "learning_rate": 1e-06, 
+ "loss": 0.8749, + "mean_token_accuracy": 0.7317091822624207, + "num_tokens": 285848147.0, + "step": 11452 + }, + { + "epoch": 1.257742148034263, + "grad_norm": 2.3759844303131104, + "learning_rate": 1e-06, + "loss": 0.9569, + "mean_token_accuracy": 0.7093024253845215, + "num_tokens": 285872974.0, + "step": 11453 + }, + { + "epoch": 1.2578519657368767, + "grad_norm": 2.838268756866455, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7411000728607178, + "num_tokens": 285890708.0, + "step": 11454 + }, + { + "epoch": 1.2579617834394905, + "grad_norm": 2.0905821323394775, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7180880308151245, + "num_tokens": 285918163.0, + "step": 11455 + }, + { + "epoch": 1.258071601142104, + "grad_norm": 2.2090225219726562, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7276470065116882, + "num_tokens": 285944833.0, + "step": 11456 + }, + { + "epoch": 1.2581814188447178, + "grad_norm": 2.340980291366577, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7212873101234436, + "num_tokens": 285970082.0, + "step": 11457 + }, + { + "epoch": 1.2582912365473313, + "grad_norm": 2.131720781326294, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7302636504173279, + "num_tokens": 285998853.0, + "step": 11458 + }, + { + "epoch": 1.258401054249945, + "grad_norm": 1.9323116540908813, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7341382503509521, + "num_tokens": 286030839.0, + "step": 11459 + }, + { + "epoch": 1.2585108719525588, + "grad_norm": 2.0096986293792725, + "learning_rate": 1e-06, + "loss": 0.9479, + "mean_token_accuracy": 0.7080824375152588, + "num_tokens": 286063547.0, + "step": 11460 + }, + { + "epoch": 1.2586206896551724, + "grad_norm": 2.081307888031006, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7340209484100342, + "num_tokens": 286092160.0, + "step": 11461 + }, + { + 
"epoch": 1.2587305073577861, + "grad_norm": 2.4242844581604004, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7064309120178223, + "num_tokens": 286113812.0, + "step": 11462 + }, + { + "epoch": 1.2588403250603997, + "grad_norm": 2.2987349033355713, + "learning_rate": 1e-06, + "loss": 0.8847, + "mean_token_accuracy": 0.7196544408798218, + "num_tokens": 286139516.0, + "step": 11463 + }, + { + "epoch": 1.2589501427630134, + "grad_norm": 1.9750970602035522, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6926227807998657, + "num_tokens": 286172987.0, + "step": 11464 + }, + { + "epoch": 1.2590599604656272, + "grad_norm": 2.645684242248535, + "learning_rate": 1e-06, + "loss": 0.889, + "mean_token_accuracy": 0.7376499772071838, + "num_tokens": 286191769.0, + "step": 11465 + }, + { + "epoch": 1.2591697781682407, + "grad_norm": 2.2962703704833984, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7345253229141235, + "num_tokens": 286215929.0, + "step": 11466 + }, + { + "epoch": 1.2592795958708543, + "grad_norm": 2.010258436203003, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6987249851226807, + "num_tokens": 286245916.0, + "step": 11467 + }, + { + "epoch": 1.259389413573468, + "grad_norm": 2.271810531616211, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.742469310760498, + "num_tokens": 286267351.0, + "step": 11468 + }, + { + "epoch": 1.2594992312760818, + "grad_norm": 2.1456286907196045, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7171786427497864, + "num_tokens": 286292443.0, + "step": 11469 + }, + { + "epoch": 1.2596090489786953, + "grad_norm": 2.312648057937622, + "learning_rate": 1e-06, + "loss": 1.0048, + "mean_token_accuracy": 0.6943598985671997, + "num_tokens": 286317111.0, + "step": 11470 + }, + { + "epoch": 1.259718866681309, + "grad_norm": 2.351674795150757, + "learning_rate": 1e-06, + "loss": 0.8736, + 
"mean_token_accuracy": 0.7431870698928833, + "num_tokens": 286341256.0, + "step": 11471 + }, + { + "epoch": 1.2598286843839226, + "grad_norm": 2.1160454750061035, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7319168448448181, + "num_tokens": 286369160.0, + "step": 11472 + }, + { + "epoch": 1.2599385020865363, + "grad_norm": 2.100069284439087, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7104136943817139, + "num_tokens": 286396098.0, + "step": 11473 + }, + { + "epoch": 1.26004831978915, + "grad_norm": 2.208265542984009, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7193577289581299, + "num_tokens": 286420417.0, + "step": 11474 + }, + { + "epoch": 1.2601581374917636, + "grad_norm": 2.1780316829681396, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7288995385169983, + "num_tokens": 286446624.0, + "step": 11475 + }, + { + "epoch": 1.2602679551943774, + "grad_norm": 2.2105929851531982, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7007272243499756, + "num_tokens": 286474858.0, + "step": 11476 + }, + { + "epoch": 1.260377772896991, + "grad_norm": 2.4630351066589355, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7407699823379517, + "num_tokens": 286496504.0, + "step": 11477 + }, + { + "epoch": 1.2604875905996047, + "grad_norm": 2.1073567867279053, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7237368226051331, + "num_tokens": 286524631.0, + "step": 11478 + }, + { + "epoch": 1.2605974083022184, + "grad_norm": 2.584475040435791, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7061043381690979, + "num_tokens": 286545886.0, + "step": 11479 + }, + { + "epoch": 1.260707226004832, + "grad_norm": 2.3803539276123047, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.731583297252655, + "num_tokens": 286568726.0, + "step": 11480 + }, + { + "epoch": 1.2608170437074455, 
+ "grad_norm": 2.679565191268921, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7164375185966492, + "num_tokens": 286588744.0, + "step": 11481 + }, + { + "epoch": 1.2609268614100593, + "grad_norm": 2.186535120010376, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7168527841567993, + "num_tokens": 286616286.0, + "step": 11482 + }, + { + "epoch": 1.261036679112673, + "grad_norm": 2.2694904804229736, + "learning_rate": 1e-06, + "loss": 0.9628, + "mean_token_accuracy": 0.7003668546676636, + "num_tokens": 286642285.0, + "step": 11483 + }, + { + "epoch": 1.2611464968152866, + "grad_norm": 2.117192268371582, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7142518758773804, + "num_tokens": 286670379.0, + "step": 11484 + }, + { + "epoch": 1.2612563145179003, + "grad_norm": 2.2607967853546143, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7060984373092651, + "num_tokens": 286696778.0, + "step": 11485 + }, + { + "epoch": 1.2613661322205139, + "grad_norm": 2.427840232849121, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7304215431213379, + "num_tokens": 286719768.0, + "step": 11486 + }, + { + "epoch": 1.2614759499231276, + "grad_norm": 2.2867379188537598, + "learning_rate": 1e-06, + "loss": 0.8062, + "mean_token_accuracy": 0.7430779337882996, + "num_tokens": 286744239.0, + "step": 11487 + }, + { + "epoch": 1.2615857676257414, + "grad_norm": 2.0138673782348633, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7187291383743286, + "num_tokens": 286776215.0, + "step": 11488 + }, + { + "epoch": 1.261695585328355, + "grad_norm": 2.3375394344329834, + "learning_rate": 1e-06, + "loss": 0.7859, + "mean_token_accuracy": 0.7479981184005737, + "num_tokens": 286796080.0, + "step": 11489 + }, + { + "epoch": 1.2618054030309687, + "grad_norm": 2.345701217651367, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7169696688652039, + 
"num_tokens": 286819383.0, + "step": 11490 + }, + { + "epoch": 1.2619152207335822, + "grad_norm": 2.5981597900390625, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7314307689666748, + "num_tokens": 286838617.0, + "step": 11491 + }, + { + "epoch": 1.262025038436196, + "grad_norm": 2.372415781021118, + "learning_rate": 1e-06, + "loss": 0.7284, + "mean_token_accuracy": 0.762048602104187, + "num_tokens": 286859377.0, + "step": 11492 + }, + { + "epoch": 1.2621348561388097, + "grad_norm": 2.235792636871338, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7091925144195557, + "num_tokens": 286886100.0, + "step": 11493 + }, + { + "epoch": 1.2622446738414232, + "grad_norm": 2.248457908630371, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7028408050537109, + "num_tokens": 286914806.0, + "step": 11494 + }, + { + "epoch": 1.2623544915440368, + "grad_norm": 2.312126398086548, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7096244096755981, + "num_tokens": 286940952.0, + "step": 11495 + }, + { + "epoch": 1.2624643092466505, + "grad_norm": 2.4078755378723145, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7080682516098022, + "num_tokens": 286966200.0, + "step": 11496 + }, + { + "epoch": 1.2625741269492643, + "grad_norm": 2.2650930881500244, + "learning_rate": 1e-06, + "loss": 1.0025, + "mean_token_accuracy": 0.689227819442749, + "num_tokens": 286991774.0, + "step": 11497 + }, + { + "epoch": 1.2626839446518778, + "grad_norm": 2.5547850131988525, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7269759178161621, + "num_tokens": 287011923.0, + "step": 11498 + }, + { + "epoch": 1.2627937623544916, + "grad_norm": 2.348283052444458, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7195261716842651, + "num_tokens": 287034659.0, + "step": 11499 + }, + { + "epoch": 1.2629035800571051, + "grad_norm": 2.376408100128174, + 
"learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7063659429550171, + "num_tokens": 287057876.0, + "step": 11500 + }, + { + "epoch": 1.2630133977597189, + "grad_norm": 2.3430912494659424, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7268367409706116, + "num_tokens": 287082563.0, + "step": 11501 + }, + { + "epoch": 1.2631232154623326, + "grad_norm": 2.323204517364502, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7305861115455627, + "num_tokens": 287107978.0, + "step": 11502 + }, + { + "epoch": 1.2632330331649462, + "grad_norm": 2.515868902206421, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7203818559646606, + "num_tokens": 287130233.0, + "step": 11503 + }, + { + "epoch": 1.26334285086756, + "grad_norm": 2.069060802459717, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7383565306663513, + "num_tokens": 287158264.0, + "step": 11504 + }, + { + "epoch": 1.2634526685701735, + "grad_norm": 2.2851271629333496, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7168557643890381, + "num_tokens": 287183602.0, + "step": 11505 + }, + { + "epoch": 1.2635624862727872, + "grad_norm": 2.4386603832244873, + "learning_rate": 1e-06, + "loss": 0.7489, + "mean_token_accuracy": 0.7570533752441406, + "num_tokens": 287205797.0, + "step": 11506 + }, + { + "epoch": 1.2636723039754008, + "grad_norm": 2.427159309387207, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7323172688484192, + "num_tokens": 287227143.0, + "step": 11507 + }, + { + "epoch": 1.2637821216780145, + "grad_norm": 2.1749427318573, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7106324434280396, + "num_tokens": 287254449.0, + "step": 11508 + }, + { + "epoch": 1.263891939380628, + "grad_norm": 2.4038617610931396, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7126273512840271, + "num_tokens": 287276862.0, + "step": 11509 
+ }, + { + "epoch": 1.2640017570832418, + "grad_norm": 2.2424702644348145, + "learning_rate": 1e-06, + "loss": 0.9586, + "mean_token_accuracy": 0.7051374912261963, + "num_tokens": 287303723.0, + "step": 11510 + }, + { + "epoch": 1.2641115747858556, + "grad_norm": 2.777820587158203, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.739185094833374, + "num_tokens": 287321934.0, + "step": 11511 + }, + { + "epoch": 1.264221392488469, + "grad_norm": 2.821380615234375, + "learning_rate": 1e-06, + "loss": 0.7939, + "mean_token_accuracy": 0.7516235113143921, + "num_tokens": 287337612.0, + "step": 11512 + }, + { + "epoch": 1.2643312101910829, + "grad_norm": 1.9377222061157227, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6917850971221924, + "num_tokens": 287372535.0, + "step": 11513 + }, + { + "epoch": 1.2644410278936964, + "grad_norm": 2.3918635845184326, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7232913374900818, + "num_tokens": 287397241.0, + "step": 11514 + }, + { + "epoch": 1.2645508455963101, + "grad_norm": 2.8741586208343506, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.726465106010437, + "num_tokens": 287414547.0, + "step": 11515 + }, + { + "epoch": 1.264660663298924, + "grad_norm": 2.468257427215576, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7087275981903076, + "num_tokens": 287438837.0, + "step": 11516 + }, + { + "epoch": 1.2647704810015374, + "grad_norm": 2.0849666595458984, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7332806587219238, + "num_tokens": 287465832.0, + "step": 11517 + }, + { + "epoch": 1.264880298704151, + "grad_norm": 2.539731502532959, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.700324535369873, + "num_tokens": 287487488.0, + "step": 11518 + }, + { + "epoch": 1.2649901164067647, + "grad_norm": 2.325003147125244, + "learning_rate": 1e-06, + "loss": 0.9054, + 
"mean_token_accuracy": 0.7219151258468628, + "num_tokens": 287512294.0, + "step": 11519 + }, + { + "epoch": 1.2650999341093785, + "grad_norm": 2.3286166191101074, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.6952844858169556, + "num_tokens": 287538017.0, + "step": 11520 + }, + { + "epoch": 1.265209751811992, + "grad_norm": 2.5281660556793213, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7329889535903931, + "num_tokens": 287559581.0, + "step": 11521 + }, + { + "epoch": 1.2653195695146058, + "grad_norm": 2.236731767654419, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.708798348903656, + "num_tokens": 287585262.0, + "step": 11522 + }, + { + "epoch": 1.2654293872172193, + "grad_norm": 2.151407480239868, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7175703644752502, + "num_tokens": 287613184.0, + "step": 11523 + }, + { + "epoch": 1.265539204919833, + "grad_norm": 2.4015188217163086, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.73771733045578, + "num_tokens": 287633131.0, + "step": 11524 + }, + { + "epoch": 1.2656490226224468, + "grad_norm": 2.232748508453369, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7099484205245972, + "num_tokens": 287660295.0, + "step": 11525 + }, + { + "epoch": 1.2657588403250604, + "grad_norm": 2.249155044555664, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7366502285003662, + "num_tokens": 287683430.0, + "step": 11526 + }, + { + "epoch": 1.2658686580276741, + "grad_norm": 2.2093682289123535, + "learning_rate": 1e-06, + "loss": 0.9809, + "mean_token_accuracy": 0.6925145387649536, + "num_tokens": 287709890.0, + "step": 11527 + }, + { + "epoch": 1.2659784757302877, + "grad_norm": 2.4791665077209473, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7373637557029724, + "num_tokens": 287731027.0, + "step": 11528 + }, + { + "epoch": 1.2660882934329014, 
+ "grad_norm": 2.6173973083496094, + "learning_rate": 1e-06, + "loss": 0.7667, + "mean_token_accuracy": 0.7515122890472412, + "num_tokens": 287749398.0, + "step": 11529 + }, + { + "epoch": 1.2661981111355152, + "grad_norm": 2.538733959197998, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7372721433639526, + "num_tokens": 287770508.0, + "step": 11530 + }, + { + "epoch": 1.2663079288381287, + "grad_norm": 2.510465383529663, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7135807871818542, + "num_tokens": 287792565.0, + "step": 11531 + }, + { + "epoch": 1.2664177465407422, + "grad_norm": 2.123861312866211, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7112916707992554, + "num_tokens": 287822803.0, + "step": 11532 + }, + { + "epoch": 1.266527564243356, + "grad_norm": 2.4983932971954346, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7191604375839233, + "num_tokens": 287845451.0, + "step": 11533 + }, + { + "epoch": 1.2666373819459698, + "grad_norm": 2.202768087387085, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7333779335021973, + "num_tokens": 287870750.0, + "step": 11534 + }, + { + "epoch": 1.2667471996485833, + "grad_norm": 2.2088005542755127, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.714806079864502, + "num_tokens": 287896700.0, + "step": 11535 + }, + { + "epoch": 1.266857017351197, + "grad_norm": 2.1102488040924072, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7376216650009155, + "num_tokens": 287927687.0, + "step": 11536 + }, + { + "epoch": 1.2669668350538106, + "grad_norm": 2.0743045806884766, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.7015188336372375, + "num_tokens": 287957729.0, + "step": 11537 + }, + { + "epoch": 1.2670766527564243, + "grad_norm": 1.9865710735321045, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7173030376434326, + 
"num_tokens": 287989677.0, + "step": 11538 + }, + { + "epoch": 1.267186470459038, + "grad_norm": 2.251814842224121, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.712108850479126, + "num_tokens": 288014217.0, + "step": 11539 + }, + { + "epoch": 1.2672962881616516, + "grad_norm": 2.439669370651245, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7241522669792175, + "num_tokens": 288035533.0, + "step": 11540 + }, + { + "epoch": 1.2674061058642654, + "grad_norm": 2.4259374141693115, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7266390323638916, + "num_tokens": 288059097.0, + "step": 11541 + }, + { + "epoch": 1.267515923566879, + "grad_norm": 2.5330896377563477, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7363948822021484, + "num_tokens": 288080472.0, + "step": 11542 + }, + { + "epoch": 1.2676257412694927, + "grad_norm": 2.1817820072174072, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7253572344779968, + "num_tokens": 288106837.0, + "step": 11543 + }, + { + "epoch": 1.2677355589721064, + "grad_norm": 2.311643600463867, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7232829332351685, + "num_tokens": 288131502.0, + "step": 11544 + }, + { + "epoch": 1.26784537667472, + "grad_norm": 2.1239490509033203, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7215996384620667, + "num_tokens": 288157543.0, + "step": 11545 + }, + { + "epoch": 1.2679551943773335, + "grad_norm": 2.1009254455566406, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7111737728118896, + "num_tokens": 288187017.0, + "step": 11546 + }, + { + "epoch": 1.2680650120799473, + "grad_norm": 2.137674331665039, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7131020426750183, + "num_tokens": 288215251.0, + "step": 11547 + }, + { + "epoch": 1.268174829782561, + "grad_norm": 2.12648344039917, + 
"learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7056237459182739, + "num_tokens": 288247577.0, + "step": 11548 + }, + { + "epoch": 1.2682846474851746, + "grad_norm": 2.222949981689453, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7203118801116943, + "num_tokens": 288272601.0, + "step": 11549 + }, + { + "epoch": 1.2683944651877883, + "grad_norm": 2.3248233795166016, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7071268558502197, + "num_tokens": 288299108.0, + "step": 11550 + }, + { + "epoch": 1.2685042828904018, + "grad_norm": 2.1817517280578613, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7317914366722107, + "num_tokens": 288324939.0, + "step": 11551 + }, + { + "epoch": 1.2686141005930156, + "grad_norm": 2.434602975845337, + "learning_rate": 1e-06, + "loss": 0.7349, + "mean_token_accuracy": 0.7574512362480164, + "num_tokens": 288345438.0, + "step": 11552 + }, + { + "epoch": 1.2687239182956294, + "grad_norm": 2.345590353012085, + "learning_rate": 1e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7391959428787231, + "num_tokens": 288367461.0, + "step": 11553 + }, + { + "epoch": 1.268833735998243, + "grad_norm": 2.2353906631469727, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7264991998672485, + "num_tokens": 288393573.0, + "step": 11554 + }, + { + "epoch": 1.2689435537008567, + "grad_norm": 2.4960148334503174, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7117365598678589, + "num_tokens": 288415919.0, + "step": 11555 + }, + { + "epoch": 1.2690533714034702, + "grad_norm": 2.1022024154663086, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7161026000976562, + "num_tokens": 288443093.0, + "step": 11556 + }, + { + "epoch": 1.269163189106084, + "grad_norm": 2.2440500259399414, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7099844813346863, + "num_tokens": 288469050.0, + "step": 
11557 + }, + { + "epoch": 1.2692730068086975, + "grad_norm": 2.455066680908203, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7278921604156494, + "num_tokens": 288490257.0, + "step": 11558 + }, + { + "epoch": 1.2693828245113112, + "grad_norm": 2.349087715148926, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7158421277999878, + "num_tokens": 288516805.0, + "step": 11559 + }, + { + "epoch": 1.2694926422139248, + "grad_norm": 2.2275619506835938, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7433567047119141, + "num_tokens": 288541989.0, + "step": 11560 + }, + { + "epoch": 1.2696024599165385, + "grad_norm": 2.4981541633605957, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7037115097045898, + "num_tokens": 288563332.0, + "step": 11561 + }, + { + "epoch": 1.2697122776191523, + "grad_norm": 2.2880938053131104, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7143966555595398, + "num_tokens": 288588458.0, + "step": 11562 + }, + { + "epoch": 1.2698220953217658, + "grad_norm": 2.2856297492980957, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7256869077682495, + "num_tokens": 288614134.0, + "step": 11563 + }, + { + "epoch": 1.2699319130243796, + "grad_norm": 2.222484588623047, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7305852174758911, + "num_tokens": 288639306.0, + "step": 11564 + }, + { + "epoch": 1.2700417307269931, + "grad_norm": 2.4228858947753906, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7263562679290771, + "num_tokens": 288660101.0, + "step": 11565 + }, + { + "epoch": 1.2701515484296069, + "grad_norm": 2.2331087589263916, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7150993943214417, + "num_tokens": 288686957.0, + "step": 11566 + }, + { + "epoch": 1.2702613661322206, + "grad_norm": 2.4694790840148926, + "learning_rate": 1e-06, + "loss": 0.9109, 
+ "mean_token_accuracy": 0.7234847545623779, + "num_tokens": 288710134.0, + "step": 11567 + }, + { + "epoch": 1.2703711838348342, + "grad_norm": 2.5242934226989746, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7377233505249023, + "num_tokens": 288729928.0, + "step": 11568 + }, + { + "epoch": 1.270481001537448, + "grad_norm": 2.3661158084869385, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7170640230178833, + "num_tokens": 288752273.0, + "step": 11569 + }, + { + "epoch": 1.2705908192400615, + "grad_norm": 2.105397939682007, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7073764801025391, + "num_tokens": 288780172.0, + "step": 11570 + }, + { + "epoch": 1.2707006369426752, + "grad_norm": 2.156437635421753, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.732650637626648, + "num_tokens": 288805833.0, + "step": 11571 + }, + { + "epoch": 1.2708104546452887, + "grad_norm": 2.6622049808502197, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7200760841369629, + "num_tokens": 288825517.0, + "step": 11572 + }, + { + "epoch": 1.2709202723479025, + "grad_norm": 2.3123111724853516, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7373814582824707, + "num_tokens": 288850687.0, + "step": 11573 + }, + { + "epoch": 1.271030090050516, + "grad_norm": 2.4135334491729736, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7197628021240234, + "num_tokens": 288872777.0, + "step": 11574 + }, + { + "epoch": 1.2711399077531298, + "grad_norm": 2.3894686698913574, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7137110829353333, + "num_tokens": 288895109.0, + "step": 11575 + }, + { + "epoch": 1.2712497254557436, + "grad_norm": 1.96454918384552, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.6998482942581177, + "num_tokens": 288925392.0, + "step": 11576 + }, + { + "epoch": 
1.271359543158357, + "grad_norm": 2.347874641418457, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7094122171401978, + "num_tokens": 288949547.0, + "step": 11577 + }, + { + "epoch": 1.2714693608609708, + "grad_norm": 2.1289868354797363, + "learning_rate": 1e-06, + "loss": 0.8409, + "mean_token_accuracy": 0.7435505390167236, + "num_tokens": 288976380.0, + "step": 11578 + }, + { + "epoch": 1.2715791785635844, + "grad_norm": 2.4044792652130127, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7204487919807434, + "num_tokens": 288999764.0, + "step": 11579 + }, + { + "epoch": 1.2716889962661981, + "grad_norm": 2.3786046504974365, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7311346530914307, + "num_tokens": 289021877.0, + "step": 11580 + }, + { + "epoch": 1.271798813968812, + "grad_norm": 2.2818164825439453, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 0.7396439909934998, + "num_tokens": 289044525.0, + "step": 11581 + }, + { + "epoch": 1.2719086316714254, + "grad_norm": 2.0940005779266357, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7200133204460144, + "num_tokens": 289071428.0, + "step": 11582 + }, + { + "epoch": 1.272018449374039, + "grad_norm": 2.216155767440796, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7225731611251831, + "num_tokens": 289098767.0, + "step": 11583 + }, + { + "epoch": 1.2721282670766527, + "grad_norm": 2.319362163543701, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7340076565742493, + "num_tokens": 289123094.0, + "step": 11584 + }, + { + "epoch": 1.2722380847792665, + "grad_norm": 1.9858399629592896, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7177773714065552, + "num_tokens": 289152047.0, + "step": 11585 + }, + { + "epoch": 1.27234790248188, + "grad_norm": 2.0205111503601074, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 
0.7149744629859924, + "num_tokens": 289182498.0, + "step": 11586 + }, + { + "epoch": 1.2724577201844938, + "grad_norm": 2.1538567543029785, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7146066427230835, + "num_tokens": 289210607.0, + "step": 11587 + }, + { + "epoch": 1.2725675378871073, + "grad_norm": 2.203714609146118, + "learning_rate": 1e-06, + "loss": 1.0324, + "mean_token_accuracy": 0.6885209679603577, + "num_tokens": 289236960.0, + "step": 11588 + }, + { + "epoch": 1.272677355589721, + "grad_norm": 2.3201584815979004, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7107191681861877, + "num_tokens": 289261997.0, + "step": 11589 + }, + { + "epoch": 1.2727871732923348, + "grad_norm": 2.1277618408203125, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.735519289970398, + "num_tokens": 289289147.0, + "step": 11590 + }, + { + "epoch": 1.2728969909949484, + "grad_norm": 2.239226818084717, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7075727581977844, + "num_tokens": 289317588.0, + "step": 11591 + }, + { + "epoch": 1.273006808697562, + "grad_norm": 2.1041059494018555, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7166196703910828, + "num_tokens": 289345869.0, + "step": 11592 + }, + { + "epoch": 1.2731166264001756, + "grad_norm": 2.405121326446533, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7150141596794128, + "num_tokens": 289369445.0, + "step": 11593 + }, + { + "epoch": 1.2732264441027894, + "grad_norm": 2.6288304328918457, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7235516309738159, + "num_tokens": 289389755.0, + "step": 11594 + }, + { + "epoch": 1.2733362618054032, + "grad_norm": 2.340257406234741, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7127169370651245, + "num_tokens": 289412608.0, + "step": 11595 + }, + { + "epoch": 1.2734460795080167, + "grad_norm": 
2.428046703338623, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7153496742248535, + "num_tokens": 289435144.0, + "step": 11596 + }, + { + "epoch": 1.2735558972106302, + "grad_norm": 2.493595600128174, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7138201594352722, + "num_tokens": 289457458.0, + "step": 11597 + }, + { + "epoch": 1.273665714913244, + "grad_norm": 2.0657026767730713, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7052454352378845, + "num_tokens": 289487061.0, + "step": 11598 + }, + { + "epoch": 1.2737755326158577, + "grad_norm": 3.0853681564331055, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7208298444747925, + "num_tokens": 289501614.0, + "step": 11599 + }, + { + "epoch": 1.2738853503184713, + "grad_norm": 2.4459123611450195, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7126794457435608, + "num_tokens": 289523925.0, + "step": 11600 + }, + { + "epoch": 1.273995168021085, + "grad_norm": 2.3488781452178955, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.723331868648529, + "num_tokens": 289547497.0, + "step": 11601 + }, + { + "epoch": 1.2741049857236986, + "grad_norm": 2.6457600593566895, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7370994091033936, + "num_tokens": 289565813.0, + "step": 11602 + }, + { + "epoch": 1.2742148034263123, + "grad_norm": 2.292969226837158, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.733561635017395, + "num_tokens": 289590933.0, + "step": 11603 + }, + { + "epoch": 1.274324621128926, + "grad_norm": 2.4450180530548096, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7125245332717896, + "num_tokens": 289613698.0, + "step": 11604 + }, + { + "epoch": 1.2744344388315396, + "grad_norm": 2.1338443756103516, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7133040428161621, + "num_tokens": 
289641953.0, + "step": 11605 + }, + { + "epoch": 1.2745442565341534, + "grad_norm": 2.775386095046997, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7183579802513123, + "num_tokens": 289660385.0, + "step": 11606 + }, + { + "epoch": 1.274654074236767, + "grad_norm": 2.2874696254730225, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7232592105865479, + "num_tokens": 289685620.0, + "step": 11607 + }, + { + "epoch": 1.2747638919393807, + "grad_norm": 2.2336559295654297, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7062007188796997, + "num_tokens": 289712151.0, + "step": 11608 + }, + { + "epoch": 1.2748737096419944, + "grad_norm": 2.208176851272583, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7148624658584595, + "num_tokens": 289737859.0, + "step": 11609 + }, + { + "epoch": 1.274983527344608, + "grad_norm": 2.1148290634155273, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7104018926620483, + "num_tokens": 289767329.0, + "step": 11610 + }, + { + "epoch": 1.2750933450472215, + "grad_norm": 2.2495675086975098, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.729688823223114, + "num_tokens": 289792149.0, + "step": 11611 + }, + { + "epoch": 1.2752031627498353, + "grad_norm": 1.9423003196716309, + "learning_rate": 1e-06, + "loss": 0.9212, + "mean_token_accuracy": 0.7166347503662109, + "num_tokens": 289823237.0, + "step": 11612 + }, + { + "epoch": 1.275312980452449, + "grad_norm": 2.177445411682129, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6927170157432556, + "num_tokens": 289848785.0, + "step": 11613 + }, + { + "epoch": 1.2754227981550625, + "grad_norm": 2.3415212631225586, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7049764394760132, + "num_tokens": 289872799.0, + "step": 11614 + }, + { + "epoch": 1.2755326158576763, + "grad_norm": 2.180187702178955, + "learning_rate": 
1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7183901071548462, + "num_tokens": 289899980.0, + "step": 11615 + }, + { + "epoch": 1.2756424335602898, + "grad_norm": 2.367605686187744, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.7079293727874756, + "num_tokens": 289924237.0, + "step": 11616 + }, + { + "epoch": 1.2757522512629036, + "grad_norm": 2.4752211570739746, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7238905429840088, + "num_tokens": 289945202.0, + "step": 11617 + }, + { + "epoch": 1.2758620689655173, + "grad_norm": 2.329099416732788, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7111397981643677, + "num_tokens": 289967865.0, + "step": 11618 + }, + { + "epoch": 1.2759718866681309, + "grad_norm": 2.234602689743042, + "learning_rate": 1e-06, + "loss": 0.9932, + "mean_token_accuracy": 0.6943098306655884, + "num_tokens": 289994173.0, + "step": 11619 + }, + { + "epoch": 1.2760817043707446, + "grad_norm": 2.745925188064575, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7536429762840271, + "num_tokens": 290011558.0, + "step": 11620 + }, + { + "epoch": 1.2761915220733582, + "grad_norm": 2.1880195140838623, + "learning_rate": 1e-06, + "loss": 0.7695, + "mean_token_accuracy": 0.7546711564064026, + "num_tokens": 290037102.0, + "step": 11621 + }, + { + "epoch": 1.276301339775972, + "grad_norm": 2.3684935569763184, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7333687543869019, + "num_tokens": 290060299.0, + "step": 11622 + }, + { + "epoch": 1.2764111574785855, + "grad_norm": 2.1847116947174072, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7141681909561157, + "num_tokens": 290087247.0, + "step": 11623 + }, + { + "epoch": 1.2765209751811992, + "grad_norm": 2.286867618560791, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7224687933921814, + "num_tokens": 290112753.0, + "step": 11624 + }, + { + 
"epoch": 1.2766307928838128, + "grad_norm": 2.6066479682922363, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7318204641342163, + "num_tokens": 290133537.0, + "step": 11625 + }, + { + "epoch": 1.2767406105864265, + "grad_norm": 2.187767505645752, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7107234001159668, + "num_tokens": 290162304.0, + "step": 11626 + }, + { + "epoch": 1.2768504282890403, + "grad_norm": 2.3377139568328857, + "learning_rate": 1e-06, + "loss": 0.9509, + "mean_token_accuracy": 0.6992040872573853, + "num_tokens": 290184995.0, + "step": 11627 + }, + { + "epoch": 1.2769602459916538, + "grad_norm": 2.3760101795196533, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.6907837390899658, + "num_tokens": 290206776.0, + "step": 11628 + }, + { + "epoch": 1.2770700636942676, + "grad_norm": 2.272047996520996, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7181003093719482, + "num_tokens": 290232572.0, + "step": 11629 + }, + { + "epoch": 1.277179881396881, + "grad_norm": 2.3011515140533447, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7169394493103027, + "num_tokens": 290257765.0, + "step": 11630 + }, + { + "epoch": 1.2772896990994949, + "grad_norm": 2.1882095336914062, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7264172434806824, + "num_tokens": 290283226.0, + "step": 11631 + }, + { + "epoch": 1.2773995168021086, + "grad_norm": 2.654604196548462, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7319139838218689, + "num_tokens": 290301411.0, + "step": 11632 + }, + { + "epoch": 1.2775093345047221, + "grad_norm": 2.240053653717041, + "learning_rate": 1e-06, + "loss": 1.0366, + "mean_token_accuracy": 0.6856675744056702, + "num_tokens": 290329320.0, + "step": 11633 + }, + { + "epoch": 1.2776191522073357, + "grad_norm": 2.8570637702941895, + "learning_rate": 1e-06, + "loss": 0.8065, + 
"mean_token_accuracy": 0.7449700832366943, + "num_tokens": 290345750.0, + "step": 11634 + }, + { + "epoch": 1.2777289699099494, + "grad_norm": 2.1301052570343018, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7234625816345215, + "num_tokens": 290370934.0, + "step": 11635 + }, + { + "epoch": 1.2778387876125632, + "grad_norm": 2.180691957473755, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7109283208847046, + "num_tokens": 290397480.0, + "step": 11636 + }, + { + "epoch": 1.2779486053151767, + "grad_norm": 2.353736400604248, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7202518582344055, + "num_tokens": 290419739.0, + "step": 11637 + }, + { + "epoch": 1.2780584230177905, + "grad_norm": 2.249941110610962, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.6964644193649292, + "num_tokens": 290446630.0, + "step": 11638 + }, + { + "epoch": 1.278168240720404, + "grad_norm": 2.215471029281616, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7204635143280029, + "num_tokens": 290472610.0, + "step": 11639 + }, + { + "epoch": 1.2782780584230178, + "grad_norm": 2.10905122756958, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7150403261184692, + "num_tokens": 290498630.0, + "step": 11640 + }, + { + "epoch": 1.2783878761256315, + "grad_norm": 1.982702374458313, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7227165699005127, + "num_tokens": 290529247.0, + "step": 11641 + }, + { + "epoch": 1.278497693828245, + "grad_norm": 2.256070375442505, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7038311958312988, + "num_tokens": 290554674.0, + "step": 11642 + }, + { + "epoch": 1.2786075115308588, + "grad_norm": 2.2006304264068604, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7194509506225586, + "num_tokens": 290580190.0, + "step": 11643 + }, + { + "epoch": 1.2787173292334724, + 
"grad_norm": 2.307185411453247, + "learning_rate": 1e-06, + "loss": 0.9593, + "mean_token_accuracy": 0.7059646248817444, + "num_tokens": 290605497.0, + "step": 11644 + }, + { + "epoch": 1.2788271469360861, + "grad_norm": 2.1803336143493652, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.7334156632423401, + "num_tokens": 290629832.0, + "step": 11645 + }, + { + "epoch": 1.2789369646386999, + "grad_norm": 2.7199621200561523, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.733326256275177, + "num_tokens": 290649031.0, + "step": 11646 + }, + { + "epoch": 1.2790467823413134, + "grad_norm": 2.147855281829834, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7261433601379395, + "num_tokens": 290675351.0, + "step": 11647 + }, + { + "epoch": 1.279156600043927, + "grad_norm": 2.1836795806884766, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7228928804397583, + "num_tokens": 290702144.0, + "step": 11648 + }, + { + "epoch": 1.2792664177465407, + "grad_norm": 2.7373385429382324, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7248363494873047, + "num_tokens": 290720385.0, + "step": 11649 + }, + { + "epoch": 1.2793762354491545, + "grad_norm": 2.4972376823425293, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7263720631599426, + "num_tokens": 290741137.0, + "step": 11650 + }, + { + "epoch": 1.279486053151768, + "grad_norm": 2.3402304649353027, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7248351573944092, + "num_tokens": 290764270.0, + "step": 11651 + }, + { + "epoch": 1.2795958708543818, + "grad_norm": 2.4214906692504883, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.730877161026001, + "num_tokens": 290790815.0, + "step": 11652 + }, + { + "epoch": 1.2797056885569953, + "grad_norm": 2.5184755325317383, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7101666331291199, + 
"num_tokens": 290813186.0, + "step": 11653 + }, + { + "epoch": 1.279815506259609, + "grad_norm": 2.205610990524292, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.722535252571106, + "num_tokens": 290837027.0, + "step": 11654 + }, + { + "epoch": 1.2799253239622228, + "grad_norm": 2.235280990600586, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6980605125427246, + "num_tokens": 290863683.0, + "step": 11655 + }, + { + "epoch": 1.2800351416648363, + "grad_norm": 2.0516645908355713, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.7017860412597656, + "num_tokens": 290892256.0, + "step": 11656 + }, + { + "epoch": 1.28014495936745, + "grad_norm": 2.5731360912323, + "learning_rate": 1e-06, + "loss": 0.8133, + "mean_token_accuracy": 0.7425826787948608, + "num_tokens": 290911533.0, + "step": 11657 + }, + { + "epoch": 1.2802547770700636, + "grad_norm": 2.144127130508423, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7093881964683533, + "num_tokens": 290939873.0, + "step": 11658 + }, + { + "epoch": 1.2803645947726774, + "grad_norm": 2.246415615081787, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7236047387123108, + "num_tokens": 290965599.0, + "step": 11659 + }, + { + "epoch": 1.2804744124752911, + "grad_norm": 2.480098247528076, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7226648330688477, + "num_tokens": 290985861.0, + "step": 11660 + }, + { + "epoch": 1.2805842301779047, + "grad_norm": 2.484654188156128, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7218798995018005, + "num_tokens": 291008815.0, + "step": 11661 + }, + { + "epoch": 1.2806940478805182, + "grad_norm": 2.5288467407226562, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7471015453338623, + "num_tokens": 291028478.0, + "step": 11662 + }, + { + "epoch": 1.280803865583132, + "grad_norm": 2.194472074508667, + 
"learning_rate": 1e-06, + "loss": 0.9622, + "mean_token_accuracy": 0.7020300626754761, + "num_tokens": 291054991.0, + "step": 11663 + }, + { + "epoch": 1.2809136832857457, + "grad_norm": 2.1177213191986084, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.6970816254615784, + "num_tokens": 291083571.0, + "step": 11664 + }, + { + "epoch": 1.2810235009883593, + "grad_norm": 2.367870807647705, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6909045577049255, + "num_tokens": 291108993.0, + "step": 11665 + }, + { + "epoch": 1.281133318690973, + "grad_norm": 2.0160136222839355, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7244991064071655, + "num_tokens": 291139247.0, + "step": 11666 + }, + { + "epoch": 1.2812431363935866, + "grad_norm": 2.286929130554199, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7288768291473389, + "num_tokens": 291163426.0, + "step": 11667 + }, + { + "epoch": 1.2813529540962003, + "grad_norm": 2.06778883934021, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7037621140480042, + "num_tokens": 291193815.0, + "step": 11668 + }, + { + "epoch": 1.281462771798814, + "grad_norm": 2.248190402984619, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7220813035964966, + "num_tokens": 291219802.0, + "step": 11669 + }, + { + "epoch": 1.2815725895014276, + "grad_norm": 2.478360891342163, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7319499850273132, + "num_tokens": 291242601.0, + "step": 11670 + }, + { + "epoch": 1.2816824072040414, + "grad_norm": 2.278303861618042, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7258020639419556, + "num_tokens": 291268183.0, + "step": 11671 + }, + { + "epoch": 1.281792224906655, + "grad_norm": 2.1428534984588623, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7225996851921082, + "num_tokens": 291294991.0, + "step": 
11672 + }, + { + "epoch": 1.2819020426092687, + "grad_norm": 2.7010021209716797, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7282662391662598, + "num_tokens": 291313223.0, + "step": 11673 + }, + { + "epoch": 1.2820118603118824, + "grad_norm": 2.953789472579956, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7599393725395203, + "num_tokens": 291329288.0, + "step": 11674 + }, + { + "epoch": 1.282121678014496, + "grad_norm": 2.429123878479004, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7413905262947083, + "num_tokens": 291350600.0, + "step": 11675 + }, + { + "epoch": 1.2822314957171095, + "grad_norm": 2.2210800647735596, + "learning_rate": 1e-06, + "loss": 0.763, + "mean_token_accuracy": 0.7576168775558472, + "num_tokens": 291373080.0, + "step": 11676 + }, + { + "epoch": 1.2823413134197232, + "grad_norm": 2.309022903442383, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7450636029243469, + "num_tokens": 291396517.0, + "step": 11677 + }, + { + "epoch": 1.282451131122337, + "grad_norm": 2.2779057025909424, + "learning_rate": 1e-06, + "loss": 0.9931, + "mean_token_accuracy": 0.7025357484817505, + "num_tokens": 291425954.0, + "step": 11678 + }, + { + "epoch": 1.2825609488249505, + "grad_norm": 2.362872362136841, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7187775373458862, + "num_tokens": 291450715.0, + "step": 11679 + }, + { + "epoch": 1.2826707665275643, + "grad_norm": 2.1275758743286133, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7173281908035278, + "num_tokens": 291478650.0, + "step": 11680 + }, + { + "epoch": 1.2827805842301778, + "grad_norm": 2.549898147583008, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7351488471031189, + "num_tokens": 291500147.0, + "step": 11681 + }, + { + "epoch": 1.2828904019327916, + "grad_norm": 2.14909029006958, + "learning_rate": 1e-06, + "loss": 0.8849, + 
"mean_token_accuracy": 0.7198441028594971, + "num_tokens": 291528527.0, + "step": 11682 + }, + { + "epoch": 1.2830002196354053, + "grad_norm": 2.047902822494507, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7046619057655334, + "num_tokens": 291557503.0, + "step": 11683 + }, + { + "epoch": 1.2831100373380189, + "grad_norm": 2.417907238006592, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7272523641586304, + "num_tokens": 291580436.0, + "step": 11684 + }, + { + "epoch": 1.2832198550406326, + "grad_norm": 2.502455472946167, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.742734432220459, + "num_tokens": 291600657.0, + "step": 11685 + }, + { + "epoch": 1.2833296727432462, + "grad_norm": 2.4625144004821777, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7139352560043335, + "num_tokens": 291625759.0, + "step": 11686 + }, + { + "epoch": 1.28343949044586, + "grad_norm": 2.368469715118408, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7107622623443604, + "num_tokens": 291649309.0, + "step": 11687 + }, + { + "epoch": 1.2835493081484735, + "grad_norm": 2.419914722442627, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7151064276695251, + "num_tokens": 291672493.0, + "step": 11688 + }, + { + "epoch": 1.2836591258510872, + "grad_norm": 2.1700494289398193, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7096040844917297, + "num_tokens": 291698378.0, + "step": 11689 + }, + { + "epoch": 1.2837689435537007, + "grad_norm": 2.421372175216675, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7097328901290894, + "num_tokens": 291719988.0, + "step": 11690 + }, + { + "epoch": 1.2838787612563145, + "grad_norm": 2.1659581661224365, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7442579865455627, + "num_tokens": 291747759.0, + "step": 11691 + }, + { + "epoch": 1.2839885789589283, 
+ "grad_norm": 2.621731996536255, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7309777140617371, + "num_tokens": 291768014.0, + "step": 11692 + }, + { + "epoch": 1.2840983966615418, + "grad_norm": 1.9924111366271973, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7283922433853149, + "num_tokens": 291795633.0, + "step": 11693 + }, + { + "epoch": 1.2842082143641556, + "grad_norm": 2.2895455360412598, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7121390700340271, + "num_tokens": 291820651.0, + "step": 11694 + }, + { + "epoch": 1.284318032066769, + "grad_norm": 2.675837516784668, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.734354555606842, + "num_tokens": 291838422.0, + "step": 11695 + }, + { + "epoch": 1.2844278497693828, + "grad_norm": 2.386399030685425, + "learning_rate": 1e-06, + "loss": 0.8561, + "mean_token_accuracy": 0.7296236753463745, + "num_tokens": 291863697.0, + "step": 11696 + }, + { + "epoch": 1.2845376674719966, + "grad_norm": 2.162626266479492, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.712610125541687, + "num_tokens": 291891070.0, + "step": 11697 + }, + { + "epoch": 1.2846474851746101, + "grad_norm": 2.4643845558166504, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7180898189544678, + "num_tokens": 291914227.0, + "step": 11698 + }, + { + "epoch": 1.2847573028772237, + "grad_norm": 2.248137950897217, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7106971144676208, + "num_tokens": 291938766.0, + "step": 11699 + }, + { + "epoch": 1.2848671205798374, + "grad_norm": 2.1534340381622314, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7038156986236572, + "num_tokens": 291969780.0, + "step": 11700 + }, + { + "epoch": 1.2849769382824512, + "grad_norm": 2.3859472274780273, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7271865606307983, + 
"num_tokens": 291993954.0, + "step": 11701 + }, + { + "epoch": 1.2850867559850647, + "grad_norm": 2.20206356048584, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7153434753417969, + "num_tokens": 292020935.0, + "step": 11702 + }, + { + "epoch": 1.2851965736876785, + "grad_norm": 2.0441243648529053, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7290452122688293, + "num_tokens": 292051232.0, + "step": 11703 + }, + { + "epoch": 1.285306391390292, + "grad_norm": 2.0765883922576904, + "learning_rate": 1e-06, + "loss": 0.9722, + "mean_token_accuracy": 0.7010636329650879, + "num_tokens": 292079625.0, + "step": 11704 + }, + { + "epoch": 1.2854162090929058, + "grad_norm": 2.2674667835235596, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7376527786254883, + "num_tokens": 292101993.0, + "step": 11705 + }, + { + "epoch": 1.2855260267955195, + "grad_norm": 2.004331350326538, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7366176843643188, + "num_tokens": 292131025.0, + "step": 11706 + }, + { + "epoch": 1.285635844498133, + "grad_norm": 2.4982731342315674, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7324559092521667, + "num_tokens": 292151875.0, + "step": 11707 + }, + { + "epoch": 1.2857456622007468, + "grad_norm": 2.283191680908203, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7354638576507568, + "num_tokens": 292177914.0, + "step": 11708 + }, + { + "epoch": 1.2858554799033604, + "grad_norm": 2.1257925033569336, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.727188229560852, + "num_tokens": 292204898.0, + "step": 11709 + }, + { + "epoch": 1.2859652976059741, + "grad_norm": 1.9769160747528076, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7179942727088928, + "num_tokens": 292239013.0, + "step": 11710 + }, + { + "epoch": 1.2860751153085879, + "grad_norm": 2.427063226699829, + 
"learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7176928520202637, + "num_tokens": 292263408.0, + "step": 11711 + }, + { + "epoch": 1.2861849330112014, + "grad_norm": 2.3602840900421143, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.7148489356040955, + "num_tokens": 292288024.0, + "step": 11712 + }, + { + "epoch": 1.286294750713815, + "grad_norm": 2.3667349815368652, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7146466970443726, + "num_tokens": 292311805.0, + "step": 11713 + }, + { + "epoch": 1.2864045684164287, + "grad_norm": 2.2462847232818604, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7304387092590332, + "num_tokens": 292336958.0, + "step": 11714 + }, + { + "epoch": 1.2865143861190425, + "grad_norm": 2.1400487422943115, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7022101879119873, + "num_tokens": 292365791.0, + "step": 11715 + }, + { + "epoch": 1.286624203821656, + "grad_norm": 2.5149471759796143, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.747062087059021, + "num_tokens": 292386733.0, + "step": 11716 + }, + { + "epoch": 1.2867340215242697, + "grad_norm": 2.4320425987243652, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7275141477584839, + "num_tokens": 292407792.0, + "step": 11717 + }, + { + "epoch": 1.2868438392268833, + "grad_norm": 2.2708516120910645, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7023122906684875, + "num_tokens": 292433187.0, + "step": 11718 + }, + { + "epoch": 1.286953656929497, + "grad_norm": 2.0396711826324463, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7081767320632935, + "num_tokens": 292462623.0, + "step": 11719 + }, + { + "epoch": 1.2870634746321108, + "grad_norm": 2.3335494995117188, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.7413685917854309, + "num_tokens": 292485736.0, + 
"step": 11720 + }, + { + "epoch": 1.2871732923347243, + "grad_norm": 2.0938706398010254, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7180213332176208, + "num_tokens": 292515224.0, + "step": 11721 + }, + { + "epoch": 1.287283110037338, + "grad_norm": 2.3631370067596436, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7140092253684998, + "num_tokens": 292539406.0, + "step": 11722 + }, + { + "epoch": 1.2873929277399516, + "grad_norm": 2.2183518409729004, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7424294352531433, + "num_tokens": 292565358.0, + "step": 11723 + }, + { + "epoch": 1.2875027454425654, + "grad_norm": 2.3664026260375977, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7097769975662231, + "num_tokens": 292588422.0, + "step": 11724 + }, + { + "epoch": 1.2876125631451791, + "grad_norm": 2.106748104095459, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7043521404266357, + "num_tokens": 292617414.0, + "step": 11725 + }, + { + "epoch": 1.2877223808477927, + "grad_norm": 2.2248432636260986, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7238953709602356, + "num_tokens": 292644231.0, + "step": 11726 + }, + { + "epoch": 1.2878321985504062, + "grad_norm": 2.5643203258514404, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7260340452194214, + "num_tokens": 292664046.0, + "step": 11727 + }, + { + "epoch": 1.28794201625302, + "grad_norm": 2.217017889022827, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6998059153556824, + "num_tokens": 292692664.0, + "step": 11728 + }, + { + "epoch": 1.2880518339556337, + "grad_norm": 2.3479130268096924, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7168775796890259, + "num_tokens": 292718389.0, + "step": 11729 + }, + { + "epoch": 1.2881616516582473, + "grad_norm": 2.6080739498138428, + "learning_rate": 1e-06, + "loss": 
0.9111, + "mean_token_accuracy": 0.7216919660568237, + "num_tokens": 292738595.0, + "step": 11730 + }, + { + "epoch": 1.288271469360861, + "grad_norm": 2.3473775386810303, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7060070037841797, + "num_tokens": 292761480.0, + "step": 11731 + }, + { + "epoch": 1.2883812870634745, + "grad_norm": 2.17423677444458, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7094725370407104, + "num_tokens": 292789938.0, + "step": 11732 + }, + { + "epoch": 1.2884911047660883, + "grad_norm": 2.2435121536254883, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.6985466480255127, + "num_tokens": 292816759.0, + "step": 11733 + }, + { + "epoch": 1.288600922468702, + "grad_norm": 2.340471029281616, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.721636176109314, + "num_tokens": 292841079.0, + "step": 11734 + }, + { + "epoch": 1.2887107401713156, + "grad_norm": 2.3501296043395996, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7343131303787231, + "num_tokens": 292864996.0, + "step": 11735 + }, + { + "epoch": 1.2888205578739294, + "grad_norm": 1.9222493171691895, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6955149173736572, + "num_tokens": 292900903.0, + "step": 11736 + }, + { + "epoch": 1.2889303755765429, + "grad_norm": 1.9954625368118286, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7390960454940796, + "num_tokens": 292931037.0, + "step": 11737 + }, + { + "epoch": 1.2890401932791566, + "grad_norm": 2.5095345973968506, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7415881752967834, + "num_tokens": 292951296.0, + "step": 11738 + }, + { + "epoch": 1.2891500109817704, + "grad_norm": 2.222123861312866, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7269153594970703, + "num_tokens": 292977416.0, + "step": 11739 + }, + { + "epoch": 
1.289259828684384, + "grad_norm": 2.204371452331543, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6941989660263062, + "num_tokens": 293005193.0, + "step": 11740 + }, + { + "epoch": 1.2893696463869975, + "grad_norm": 2.0725629329681396, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7237271070480347, + "num_tokens": 293036794.0, + "step": 11741 + }, + { + "epoch": 1.2894794640896112, + "grad_norm": 2.263625144958496, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7034862041473389, + "num_tokens": 293063378.0, + "step": 11742 + }, + { + "epoch": 1.289589281792225, + "grad_norm": 2.0528571605682373, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7184576988220215, + "num_tokens": 293093063.0, + "step": 11743 + }, + { + "epoch": 1.2896990994948385, + "grad_norm": 2.3886590003967285, + "learning_rate": 1e-06, + "loss": 0.7874, + "mean_token_accuracy": 0.748528242111206, + "num_tokens": 293115504.0, + "step": 11744 + }, + { + "epoch": 1.2898089171974523, + "grad_norm": 2.2437655925750732, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7278788089752197, + "num_tokens": 293140854.0, + "step": 11745 + }, + { + "epoch": 1.2899187349000658, + "grad_norm": 2.4449892044067383, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7112460136413574, + "num_tokens": 293165881.0, + "step": 11746 + }, + { + "epoch": 1.2900285526026796, + "grad_norm": 2.3949475288391113, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.725212574005127, + "num_tokens": 293188888.0, + "step": 11747 + }, + { + "epoch": 1.2901383703052933, + "grad_norm": 2.091923952102661, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7138469219207764, + "num_tokens": 293220144.0, + "step": 11748 + }, + { + "epoch": 1.2902481880079069, + "grad_norm": 2.4375698566436768, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 
0.7142737507820129, + "num_tokens": 293242398.0, + "step": 11749 + }, + { + "epoch": 1.2903580057105206, + "grad_norm": 2.173645496368408, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.736275315284729, + "num_tokens": 293268737.0, + "step": 11750 + }, + { + "epoch": 1.2904678234131342, + "grad_norm": 2.3115646839141846, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7085952162742615, + "num_tokens": 293295346.0, + "step": 11751 + }, + { + "epoch": 1.290577641115748, + "grad_norm": 2.125331401824951, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7180660367012024, + "num_tokens": 293321870.0, + "step": 11752 + }, + { + "epoch": 1.2906874588183614, + "grad_norm": 2.3752341270446777, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.709264874458313, + "num_tokens": 293345043.0, + "step": 11753 + }, + { + "epoch": 1.2907972765209752, + "grad_norm": 2.0445749759674072, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7012922763824463, + "num_tokens": 293374337.0, + "step": 11754 + }, + { + "epoch": 1.2909070942235887, + "grad_norm": 2.632335901260376, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7366037964820862, + "num_tokens": 293393342.0, + "step": 11755 + }, + { + "epoch": 1.2910169119262025, + "grad_norm": 2.6393229961395264, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7320383787155151, + "num_tokens": 293414631.0, + "step": 11756 + }, + { + "epoch": 1.2911267296288162, + "grad_norm": 2.2896556854248047, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7125938534736633, + "num_tokens": 293441304.0, + "step": 11757 + }, + { + "epoch": 1.2912365473314298, + "grad_norm": 2.434704303741455, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.697019100189209, + "num_tokens": 293462916.0, + "step": 11758 + }, + { + "epoch": 1.2913463650340435, + "grad_norm": 
2.5686872005462646, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7368890047073364, + "num_tokens": 293481855.0, + "step": 11759 + }, + { + "epoch": 1.291456182736657, + "grad_norm": 2.2621312141418457, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7204673290252686, + "num_tokens": 293506418.0, + "step": 11760 + }, + { + "epoch": 1.2915660004392708, + "grad_norm": 2.283637523651123, + "learning_rate": 1e-06, + "loss": 0.7983, + "mean_token_accuracy": 0.7447782754898071, + "num_tokens": 293531370.0, + "step": 11761 + }, + { + "epoch": 1.2916758181418846, + "grad_norm": 2.53686785697937, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7372556924819946, + "num_tokens": 293551476.0, + "step": 11762 + }, + { + "epoch": 1.2917856358444981, + "grad_norm": 2.4601998329162598, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7267493009567261, + "num_tokens": 293572272.0, + "step": 11763 + }, + { + "epoch": 1.2918954535471117, + "grad_norm": 2.1746394634246826, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7258594036102295, + "num_tokens": 293598577.0, + "step": 11764 + }, + { + "epoch": 1.2920052712497254, + "grad_norm": 2.210446357727051, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7130753397941589, + "num_tokens": 293625462.0, + "step": 11765 + }, + { + "epoch": 1.2921150889523392, + "grad_norm": 2.2821645736694336, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.6955163478851318, + "num_tokens": 293651636.0, + "step": 11766 + }, + { + "epoch": 1.2922249066549527, + "grad_norm": 2.0362799167633057, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.6977683901786804, + "num_tokens": 293683763.0, + "step": 11767 + }, + { + "epoch": 1.2923347243575665, + "grad_norm": 2.5857174396514893, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7330927848815918, + "num_tokens": 
293704869.0, + "step": 11768 + }, + { + "epoch": 1.29244454206018, + "grad_norm": 2.6616756916046143, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.716102659702301, + "num_tokens": 293725812.0, + "step": 11769 + }, + { + "epoch": 1.2925543597627938, + "grad_norm": 2.4867281913757324, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7302001118659973, + "num_tokens": 293747214.0, + "step": 11770 + }, + { + "epoch": 1.2926641774654075, + "grad_norm": 2.1501717567443848, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.7197211980819702, + "num_tokens": 293776286.0, + "step": 11771 + }, + { + "epoch": 1.292773995168021, + "grad_norm": 2.161583662033081, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.731635332107544, + "num_tokens": 293801711.0, + "step": 11772 + }, + { + "epoch": 1.2928838128706348, + "grad_norm": 2.3618438243865967, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.738595724105835, + "num_tokens": 293824811.0, + "step": 11773 + }, + { + "epoch": 1.2929936305732483, + "grad_norm": 2.2906301021575928, + "learning_rate": 1e-06, + "loss": 0.9712, + "mean_token_accuracy": 0.7009083032608032, + "num_tokens": 293849992.0, + "step": 11774 + }, + { + "epoch": 1.293103448275862, + "grad_norm": 2.4792449474334717, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7052263021469116, + "num_tokens": 293873693.0, + "step": 11775 + }, + { + "epoch": 1.2932132659784759, + "grad_norm": 2.2153642177581787, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7183119058609009, + "num_tokens": 293899738.0, + "step": 11776 + }, + { + "epoch": 1.2933230836810894, + "grad_norm": 2.4313693046569824, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7410619854927063, + "num_tokens": 293920503.0, + "step": 11777 + }, + { + "epoch": 1.293432901383703, + "grad_norm": 1.9484071731567383, + "learning_rate": 
1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7317371368408203, + "num_tokens": 293952254.0, + "step": 11778 + }, + { + "epoch": 1.2935427190863167, + "grad_norm": 2.088153600692749, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7472771406173706, + "num_tokens": 293979576.0, + "step": 11779 + }, + { + "epoch": 1.2936525367889304, + "grad_norm": 2.3411636352539062, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7210144400596619, + "num_tokens": 294001148.0, + "step": 11780 + }, + { + "epoch": 1.293762354491544, + "grad_norm": 2.3274428844451904, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.734661340713501, + "num_tokens": 294025684.0, + "step": 11781 + }, + { + "epoch": 1.2938721721941577, + "grad_norm": 2.4255599975585938, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7068870067596436, + "num_tokens": 294047661.0, + "step": 11782 + }, + { + "epoch": 1.2939819898967713, + "grad_norm": 2.602694034576416, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7476693987846375, + "num_tokens": 294066067.0, + "step": 11783 + }, + { + "epoch": 1.294091807599385, + "grad_norm": 2.2688262462615967, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7159795165061951, + "num_tokens": 294091600.0, + "step": 11784 + }, + { + "epoch": 1.2942016253019988, + "grad_norm": 2.398655891418457, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7027150392532349, + "num_tokens": 294115225.0, + "step": 11785 + }, + { + "epoch": 1.2943114430046123, + "grad_norm": 2.3409311771392822, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7349028587341309, + "num_tokens": 294138695.0, + "step": 11786 + }, + { + "epoch": 1.294421260707226, + "grad_norm": 2.0357611179351807, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.711283802986145, + "num_tokens": 294168076.0, + "step": 11787 + }, + { + 
"epoch": 1.2945310784098396, + "grad_norm": 2.768242835998535, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7373754978179932, + "num_tokens": 294186673.0, + "step": 11788 + }, + { + "epoch": 1.2946408961124534, + "grad_norm": 2.423516035079956, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7414095401763916, + "num_tokens": 294208374.0, + "step": 11789 + }, + { + "epoch": 1.2947507138150671, + "grad_norm": 2.2890844345092773, + "learning_rate": 1e-06, + "loss": 0.7552, + "mean_token_accuracy": 0.7619147300720215, + "num_tokens": 294231219.0, + "step": 11790 + }, + { + "epoch": 1.2948605315176807, + "grad_norm": 2.2981045246124268, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7262760996818542, + "num_tokens": 294257505.0, + "step": 11791 + }, + { + "epoch": 1.2949703492202942, + "grad_norm": 2.477229356765747, + "learning_rate": 1e-06, + "loss": 0.813, + "mean_token_accuracy": 0.7437743544578552, + "num_tokens": 294278399.0, + "step": 11792 + }, + { + "epoch": 1.295080166922908, + "grad_norm": 2.1576087474823, + "learning_rate": 1e-06, + "loss": 0.995, + "mean_token_accuracy": 0.6902843713760376, + "num_tokens": 294306389.0, + "step": 11793 + }, + { + "epoch": 1.2951899846255217, + "grad_norm": 2.3974366188049316, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7467942237854004, + "num_tokens": 294327144.0, + "step": 11794 + }, + { + "epoch": 1.2952998023281352, + "grad_norm": 2.1308982372283936, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7117668986320496, + "num_tokens": 294354688.0, + "step": 11795 + }, + { + "epoch": 1.295409620030749, + "grad_norm": 2.324078321456909, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.704192578792572, + "num_tokens": 294378581.0, + "step": 11796 + }, + { + "epoch": 1.2955194377333625, + "grad_norm": 2.404141426086426, + "learning_rate": 1e-06, + "loss": 0.8871, + 
"mean_token_accuracy": 0.7205787897109985, + "num_tokens": 294400794.0, + "step": 11797 + }, + { + "epoch": 1.2956292554359763, + "grad_norm": 2.262017011642456, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7168675661087036, + "num_tokens": 294424478.0, + "step": 11798 + }, + { + "epoch": 1.29573907313859, + "grad_norm": 2.4626762866973877, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7426831722259521, + "num_tokens": 294444942.0, + "step": 11799 + }, + { + "epoch": 1.2958488908412036, + "grad_norm": 2.456402540206909, + "learning_rate": 1e-06, + "loss": 0.8039, + "mean_token_accuracy": 0.7381126880645752, + "num_tokens": 294465078.0, + "step": 11800 + }, + { + "epoch": 1.2959587085438173, + "grad_norm": 2.6084814071655273, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7338281869888306, + "num_tokens": 294484022.0, + "step": 11801 + }, + { + "epoch": 1.2960685262464309, + "grad_norm": 2.213059663772583, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7117419838905334, + "num_tokens": 294511100.0, + "step": 11802 + }, + { + "epoch": 1.2961783439490446, + "grad_norm": 2.3501040935516357, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6904861330986023, + "num_tokens": 294534333.0, + "step": 11803 + }, + { + "epoch": 1.2962881616516582, + "grad_norm": 2.940412998199463, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7216116189956665, + "num_tokens": 294550413.0, + "step": 11804 + }, + { + "epoch": 1.296397979354272, + "grad_norm": 2.546051025390625, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7275698781013489, + "num_tokens": 294572201.0, + "step": 11805 + }, + { + "epoch": 1.2965077970568855, + "grad_norm": 2.173643112182617, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7110053300857544, + "num_tokens": 294598339.0, + "step": 11806 + }, + { + "epoch": 1.2966176147594992, 
+ "grad_norm": 1.9972461462020874, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7115427851676941, + "num_tokens": 294629031.0, + "step": 11807 + }, + { + "epoch": 1.296727432462113, + "grad_norm": 2.0091521739959717, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7131671905517578, + "num_tokens": 294659770.0, + "step": 11808 + }, + { + "epoch": 1.2968372501647265, + "grad_norm": 2.311361312866211, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7223425507545471, + "num_tokens": 294683061.0, + "step": 11809 + }, + { + "epoch": 1.2969470678673403, + "grad_norm": 2.495358467102051, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7297431230545044, + "num_tokens": 294702702.0, + "step": 11810 + }, + { + "epoch": 1.2970568855699538, + "grad_norm": 2.261336088180542, + "learning_rate": 1e-06, + "loss": 0.8045, + "mean_token_accuracy": 0.7429410219192505, + "num_tokens": 294725552.0, + "step": 11811 + }, + { + "epoch": 1.2971667032725676, + "grad_norm": 2.4056198596954346, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7251003980636597, + "num_tokens": 294746449.0, + "step": 11812 + }, + { + "epoch": 1.2972765209751813, + "grad_norm": 2.219796895980835, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7220686674118042, + "num_tokens": 294771659.0, + "step": 11813 + }, + { + "epoch": 1.2973863386777948, + "grad_norm": 2.4611806869506836, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.69912189245224, + "num_tokens": 294796148.0, + "step": 11814 + }, + { + "epoch": 1.2974961563804086, + "grad_norm": 2.1918396949768066, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7347310185432434, + "num_tokens": 294821584.0, + "step": 11815 + }, + { + "epoch": 1.2976059740830221, + "grad_norm": 2.2701504230499268, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7079333066940308, + 
"num_tokens": 294847790.0, + "step": 11816 + }, + { + "epoch": 1.297715791785636, + "grad_norm": 2.5520987510681152, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7105672955513, + "num_tokens": 294869305.0, + "step": 11817 + }, + { + "epoch": 1.2978256094882494, + "grad_norm": 2.436718463897705, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7130998373031616, + "num_tokens": 294890794.0, + "step": 11818 + }, + { + "epoch": 1.2979354271908632, + "grad_norm": 2.2122256755828857, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.710612416267395, + "num_tokens": 294917925.0, + "step": 11819 + }, + { + "epoch": 1.2980452448934767, + "grad_norm": 2.143862724304199, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7124758958816528, + "num_tokens": 294947892.0, + "step": 11820 + }, + { + "epoch": 1.2981550625960905, + "grad_norm": 2.148817539215088, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7457650899887085, + "num_tokens": 294976795.0, + "step": 11821 + }, + { + "epoch": 1.2982648802987042, + "grad_norm": 2.222609043121338, + "learning_rate": 1e-06, + "loss": 0.7714, + "mean_token_accuracy": 0.7502117156982422, + "num_tokens": 295000521.0, + "step": 11822 + }, + { + "epoch": 1.2983746980013178, + "grad_norm": 2.481840133666992, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7245162725448608, + "num_tokens": 295024457.0, + "step": 11823 + }, + { + "epoch": 1.2984845157039315, + "grad_norm": 2.360668897628784, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7367571592330933, + "num_tokens": 295046562.0, + "step": 11824 + }, + { + "epoch": 1.298594333406545, + "grad_norm": 2.267056465148926, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7085894346237183, + "num_tokens": 295072493.0, + "step": 11825 + }, + { + "epoch": 1.2987041511091588, + "grad_norm": 2.0218729972839355, + 
"learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.710422158241272, + "num_tokens": 295102686.0, + "step": 11826 + }, + { + "epoch": 1.2988139688117726, + "grad_norm": 2.300280809402466, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.727816641330719, + "num_tokens": 295128294.0, + "step": 11827 + }, + { + "epoch": 1.2989237865143861, + "grad_norm": 2.834719657897949, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7327542304992676, + "num_tokens": 295145537.0, + "step": 11828 + }, + { + "epoch": 1.2990336042169996, + "grad_norm": 2.378789186477661, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7234027981758118, + "num_tokens": 295168445.0, + "step": 11829 + }, + { + "epoch": 1.2991434219196134, + "grad_norm": 2.166738271713257, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7015982270240784, + "num_tokens": 295198472.0, + "step": 11830 + }, + { + "epoch": 1.2992532396222272, + "grad_norm": 2.2678933143615723, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7307405471801758, + "num_tokens": 295224622.0, + "step": 11831 + }, + { + "epoch": 1.2993630573248407, + "grad_norm": 2.700958728790283, + "learning_rate": 1e-06, + "loss": 1.0047, + "mean_token_accuracy": 0.6875723600387573, + "num_tokens": 295242515.0, + "step": 11832 + }, + { + "epoch": 1.2994728750274545, + "grad_norm": 2.119117498397827, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7045295238494873, + "num_tokens": 295272838.0, + "step": 11833 + }, + { + "epoch": 1.299582692730068, + "grad_norm": 2.2496721744537354, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7282025814056396, + "num_tokens": 295298764.0, + "step": 11834 + }, + { + "epoch": 1.2996925104326817, + "grad_norm": 2.3479366302490234, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7012144327163696, + "num_tokens": 295325571.0, + "step": 
11835 + }, + { + "epoch": 1.2998023281352955, + "grad_norm": 2.7956573963165283, + "learning_rate": 1e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7400171756744385, + "num_tokens": 295341919.0, + "step": 11836 + }, + { + "epoch": 1.299912145837909, + "grad_norm": 2.555293560028076, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7116944789886475, + "num_tokens": 295363052.0, + "step": 11837 + }, + { + "epoch": 1.3000219635405228, + "grad_norm": 2.7273409366607666, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7324978113174438, + "num_tokens": 295382315.0, + "step": 11838 + }, + { + "epoch": 1.3001317812431363, + "grad_norm": 2.1494507789611816, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7041721343994141, + "num_tokens": 295410385.0, + "step": 11839 + }, + { + "epoch": 1.30024159894575, + "grad_norm": 2.157393217086792, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7022337913513184, + "num_tokens": 295437084.0, + "step": 11840 + }, + { + "epoch": 1.3003514166483638, + "grad_norm": 2.2407546043395996, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7131980061531067, + "num_tokens": 295462304.0, + "step": 11841 + }, + { + "epoch": 1.3004612343509774, + "grad_norm": 2.205430746078491, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7368018627166748, + "num_tokens": 295488583.0, + "step": 11842 + }, + { + "epoch": 1.300571052053591, + "grad_norm": 2.2762668132781982, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.708564043045044, + "num_tokens": 295513822.0, + "step": 11843 + }, + { + "epoch": 1.3006808697562047, + "grad_norm": 2.3627288341522217, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7363949418067932, + "num_tokens": 295536817.0, + "step": 11844 + }, + { + "epoch": 1.3007906874588184, + "grad_norm": 2.381143569946289, + "learning_rate": 1e-06, + "loss": 0.8309, + 
"mean_token_accuracy": 0.7376922369003296, + "num_tokens": 295560628.0, + "step": 11845 + }, + { + "epoch": 1.300900505161432, + "grad_norm": 2.756495475769043, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7217197418212891, + "num_tokens": 295579688.0, + "step": 11846 + }, + { + "epoch": 1.3010103228640457, + "grad_norm": 2.0413053035736084, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7078971266746521, + "num_tokens": 295610109.0, + "step": 11847 + }, + { + "epoch": 1.3011201405666593, + "grad_norm": 2.279784917831421, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7325443029403687, + "num_tokens": 295634118.0, + "step": 11848 + }, + { + "epoch": 1.301229958269273, + "grad_norm": 2.4713916778564453, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7167128920555115, + "num_tokens": 295657523.0, + "step": 11849 + }, + { + "epoch": 1.3013397759718868, + "grad_norm": 2.214491605758667, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7280921339988708, + "num_tokens": 295684508.0, + "step": 11850 + }, + { + "epoch": 1.3014495936745003, + "grad_norm": 2.519970178604126, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7210650444030762, + "num_tokens": 295709902.0, + "step": 11851 + }, + { + "epoch": 1.301559411377114, + "grad_norm": 2.4642510414123535, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.7089143991470337, + "num_tokens": 295731826.0, + "step": 11852 + }, + { + "epoch": 1.3016692290797276, + "grad_norm": 2.086810350418091, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7320007085800171, + "num_tokens": 295758911.0, + "step": 11853 + }, + { + "epoch": 1.3017790467823414, + "grad_norm": 2.379488229751587, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.710639476776123, + "num_tokens": 295783183.0, + "step": 11854 + }, + { + "epoch": 1.301888864484955, + 
"grad_norm": 2.3362679481506348, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7292102575302124, + "num_tokens": 295806635.0, + "step": 11855 + }, + { + "epoch": 1.3019986821875686, + "grad_norm": 2.7737467288970947, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.7476822137832642, + "num_tokens": 295822965.0, + "step": 11856 + }, + { + "epoch": 1.3021084998901822, + "grad_norm": 2.300100326538086, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7154486179351807, + "num_tokens": 295847769.0, + "step": 11857 + }, + { + "epoch": 1.302218317592796, + "grad_norm": 2.8157992362976074, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7242101430892944, + "num_tokens": 295866423.0, + "step": 11858 + }, + { + "epoch": 1.3023281352954097, + "grad_norm": 2.246044158935547, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7170352339744568, + "num_tokens": 295891394.0, + "step": 11859 + }, + { + "epoch": 1.3024379529980232, + "grad_norm": 2.655155658721924, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7231694459915161, + "num_tokens": 295912607.0, + "step": 11860 + }, + { + "epoch": 1.302547770700637, + "grad_norm": 2.5933892726898193, + "learning_rate": 1e-06, + "loss": 0.7748, + "mean_token_accuracy": 0.7540200352668762, + "num_tokens": 295930760.0, + "step": 11861 + }, + { + "epoch": 1.3026575884032505, + "grad_norm": 2.2968909740448, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.6872433423995972, + "num_tokens": 295958220.0, + "step": 11862 + }, + { + "epoch": 1.3027674061058643, + "grad_norm": 2.3492422103881836, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7196263074874878, + "num_tokens": 295982635.0, + "step": 11863 + }, + { + "epoch": 1.302877223808478, + "grad_norm": 2.3052823543548584, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7122179269790649, + 
"num_tokens": 296008330.0, + "step": 11864 + }, + { + "epoch": 1.3029870415110916, + "grad_norm": 2.320747137069702, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7176249623298645, + "num_tokens": 296031972.0, + "step": 11865 + }, + { + "epoch": 1.3030968592137053, + "grad_norm": 2.0193212032318115, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7464030981063843, + "num_tokens": 296059693.0, + "step": 11866 + }, + { + "epoch": 1.3032066769163189, + "grad_norm": 2.1614108085632324, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7226566076278687, + "num_tokens": 296088134.0, + "step": 11867 + }, + { + "epoch": 1.3033164946189326, + "grad_norm": 2.3050026893615723, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7324485778808594, + "num_tokens": 296109984.0, + "step": 11868 + }, + { + "epoch": 1.3034263123215462, + "grad_norm": 2.4113271236419678, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7374559640884399, + "num_tokens": 296133075.0, + "step": 11869 + }, + { + "epoch": 1.30353613002416, + "grad_norm": 2.2838308811187744, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7240504622459412, + "num_tokens": 296157324.0, + "step": 11870 + }, + { + "epoch": 1.3036459477267734, + "grad_norm": 1.9808248281478882, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7214007377624512, + "num_tokens": 296187354.0, + "step": 11871 + }, + { + "epoch": 1.3037557654293872, + "grad_norm": 2.2821381092071533, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7218968868255615, + "num_tokens": 296212503.0, + "step": 11872 + }, + { + "epoch": 1.303865583132001, + "grad_norm": 2.1190431118011475, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7252872586250305, + "num_tokens": 296241896.0, + "step": 11873 + }, + { + "epoch": 1.3039754008346145, + "grad_norm": 2.2070536613464355, + 
"learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7238446474075317, + "num_tokens": 296267429.0, + "step": 11874 + }, + { + "epoch": 1.3040852185372283, + "grad_norm": 2.1685235500335693, + "learning_rate": 1e-06, + "loss": 0.8434, + "mean_token_accuracy": 0.7333817481994629, + "num_tokens": 296293575.0, + "step": 11875 + }, + { + "epoch": 1.3041950362398418, + "grad_norm": 2.182461738586426, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7011003494262695, + "num_tokens": 296320630.0, + "step": 11876 + }, + { + "epoch": 1.3043048539424555, + "grad_norm": 2.313283681869507, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7179833650588989, + "num_tokens": 296345380.0, + "step": 11877 + }, + { + "epoch": 1.3044146716450693, + "grad_norm": 2.4167640209198, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7258336544036865, + "num_tokens": 296368488.0, + "step": 11878 + }, + { + "epoch": 1.3045244893476828, + "grad_norm": 2.6568679809570312, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7136169075965881, + "num_tokens": 296389825.0, + "step": 11879 + }, + { + "epoch": 1.3046343070502964, + "grad_norm": 2.1853127479553223, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7007485032081604, + "num_tokens": 296418748.0, + "step": 11880 + }, + { + "epoch": 1.3047441247529101, + "grad_norm": 2.319706678390503, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7369921207427979, + "num_tokens": 296443307.0, + "step": 11881 + }, + { + "epoch": 1.3048539424555239, + "grad_norm": 2.3518803119659424, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7115709781646729, + "num_tokens": 296466101.0, + "step": 11882 + }, + { + "epoch": 1.3049637601581374, + "grad_norm": 2.551107406616211, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7429484724998474, + "num_tokens": 296485957.0, + "step": 
11883 + }, + { + "epoch": 1.3050735778607512, + "grad_norm": 2.3912672996520996, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7205573320388794, + "num_tokens": 296509443.0, + "step": 11884 + }, + { + "epoch": 1.3051833955633647, + "grad_norm": 2.391939401626587, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.711209774017334, + "num_tokens": 296532401.0, + "step": 11885 + }, + { + "epoch": 1.3052932132659785, + "grad_norm": 2.4327614307403564, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7354505658149719, + "num_tokens": 296554026.0, + "step": 11886 + }, + { + "epoch": 1.3054030309685922, + "grad_norm": 2.16855788230896, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7014107704162598, + "num_tokens": 296583338.0, + "step": 11887 + }, + { + "epoch": 1.3055128486712058, + "grad_norm": 2.281190872192383, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7237534523010254, + "num_tokens": 296608740.0, + "step": 11888 + }, + { + "epoch": 1.3056226663738195, + "grad_norm": 2.042576551437378, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.726414680480957, + "num_tokens": 296637347.0, + "step": 11889 + }, + { + "epoch": 1.305732484076433, + "grad_norm": 1.9432669878005981, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.7047593593597412, + "num_tokens": 296670547.0, + "step": 11890 + }, + { + "epoch": 1.3058423017790468, + "grad_norm": 2.9028475284576416, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7469812631607056, + "num_tokens": 296688535.0, + "step": 11891 + }, + { + "epoch": 1.3059521194816606, + "grad_norm": 2.1507606506347656, + "learning_rate": 1e-06, + "loss": 0.7945, + "mean_token_accuracy": 0.7455757856369019, + "num_tokens": 296715655.0, + "step": 11892 + }, + { + "epoch": 1.306061937184274, + "grad_norm": 2.2029941082000732, + "learning_rate": 1e-06, + "loss": 0.9183, + 
"mean_token_accuracy": 0.7178651094436646, + "num_tokens": 296743274.0, + "step": 11893 + }, + { + "epoch": 1.3061717548868876, + "grad_norm": 2.1449785232543945, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7423135042190552, + "num_tokens": 296769958.0, + "step": 11894 + }, + { + "epoch": 1.3062815725895014, + "grad_norm": 2.1861610412597656, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7282769680023193, + "num_tokens": 296796750.0, + "step": 11895 + }, + { + "epoch": 1.3063913902921152, + "grad_norm": 2.4806432723999023, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7232091426849365, + "num_tokens": 296817070.0, + "step": 11896 + }, + { + "epoch": 1.3065012079947287, + "grad_norm": 2.472688913345337, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7381435632705688, + "num_tokens": 296838955.0, + "step": 11897 + }, + { + "epoch": 1.3066110256973424, + "grad_norm": 2.3771588802337646, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7348782420158386, + "num_tokens": 296865309.0, + "step": 11898 + }, + { + "epoch": 1.306720843399956, + "grad_norm": 2.247912883758545, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6985244750976562, + "num_tokens": 296889308.0, + "step": 11899 + }, + { + "epoch": 1.3068306611025697, + "grad_norm": 2.103227376937866, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.6936275959014893, + "num_tokens": 296917557.0, + "step": 11900 + }, + { + "epoch": 1.3069404788051835, + "grad_norm": 2.252913475036621, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7279211282730103, + "num_tokens": 296940576.0, + "step": 11901 + }, + { + "epoch": 1.307050296507797, + "grad_norm": 2.120372772216797, + "learning_rate": 1e-06, + "loss": 0.9965, + "mean_token_accuracy": 0.7014291286468506, + "num_tokens": 296968461.0, + "step": 11902 + }, + { + "epoch": 1.3071601142104108, 
+ "grad_norm": 2.3099751472473145, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7244902849197388, + "num_tokens": 296993360.0, + "step": 11903 + }, + { + "epoch": 1.3072699319130243, + "grad_norm": 2.384840726852417, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7244126796722412, + "num_tokens": 297016199.0, + "step": 11904 + }, + { + "epoch": 1.307379749615638, + "grad_norm": 2.2025203704833984, + "learning_rate": 1e-06, + "loss": 0.9849, + "mean_token_accuracy": 0.7017747163772583, + "num_tokens": 297046047.0, + "step": 11905 + }, + { + "epoch": 1.3074895673182518, + "grad_norm": 2.447112560272217, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7269576787948608, + "num_tokens": 297066362.0, + "step": 11906 + }, + { + "epoch": 1.3075993850208654, + "grad_norm": 2.215212821960449, + "learning_rate": 1e-06, + "loss": 0.9707, + "mean_token_accuracy": 0.6887285709381104, + "num_tokens": 297093431.0, + "step": 11907 + }, + { + "epoch": 1.307709202723479, + "grad_norm": 2.017335891723633, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7069982886314392, + "num_tokens": 297123325.0, + "step": 11908 + }, + { + "epoch": 1.3078190204260927, + "grad_norm": 2.126101493835449, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7145161628723145, + "num_tokens": 297151617.0, + "step": 11909 + }, + { + "epoch": 1.3079288381287064, + "grad_norm": 2.337031364440918, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7087531089782715, + "num_tokens": 297175256.0, + "step": 11910 + }, + { + "epoch": 1.30803865583132, + "grad_norm": 2.4658491611480713, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7280199527740479, + "num_tokens": 297197482.0, + "step": 11911 + }, + { + "epoch": 1.3081484735339337, + "grad_norm": 2.6879518032073975, + "learning_rate": 1e-06, + "loss": 0.8193, + "mean_token_accuracy": 0.7413235902786255, + 
"num_tokens": 297216629.0, + "step": 11912 + }, + { + "epoch": 1.3082582912365472, + "grad_norm": 2.174257516860962, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7075865268707275, + "num_tokens": 297245264.0, + "step": 11913 + }, + { + "epoch": 1.308368108939161, + "grad_norm": 2.5274417400360107, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.7508123517036438, + "num_tokens": 297264304.0, + "step": 11914 + }, + { + "epoch": 1.3084779266417748, + "grad_norm": 1.9751389026641846, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7093697786331177, + "num_tokens": 297295505.0, + "step": 11915 + }, + { + "epoch": 1.3085877443443883, + "grad_norm": 2.304422616958618, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7054133415222168, + "num_tokens": 297320081.0, + "step": 11916 + }, + { + "epoch": 1.308697562047002, + "grad_norm": 2.3919410705566406, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.746953010559082, + "num_tokens": 297343453.0, + "step": 11917 + }, + { + "epoch": 1.3088073797496156, + "grad_norm": 1.9494270086288452, + "learning_rate": 1e-06, + "loss": 0.9756, + "mean_token_accuracy": 0.6978317499160767, + "num_tokens": 297375875.0, + "step": 11918 + }, + { + "epoch": 1.3089171974522293, + "grad_norm": 2.2825398445129395, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7229276895523071, + "num_tokens": 297401213.0, + "step": 11919 + }, + { + "epoch": 1.309027015154843, + "grad_norm": 2.299800157546997, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7386306524276733, + "num_tokens": 297426463.0, + "step": 11920 + }, + { + "epoch": 1.3091368328574566, + "grad_norm": 2.1136255264282227, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7062534689903259, + "num_tokens": 297454317.0, + "step": 11921 + }, + { + "epoch": 1.3092466505600702, + "grad_norm": 2.29135799407959, + 
"learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7229548692703247, + "num_tokens": 297480329.0, + "step": 11922 + }, + { + "epoch": 1.309356468262684, + "grad_norm": 2.578408718109131, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.7371820211410522, + "num_tokens": 297500027.0, + "step": 11923 + }, + { + "epoch": 1.3094662859652977, + "grad_norm": 2.382777690887451, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7475640773773193, + "num_tokens": 297525770.0, + "step": 11924 + }, + { + "epoch": 1.3095761036679112, + "grad_norm": 2.409583330154419, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7290459871292114, + "num_tokens": 297549471.0, + "step": 11925 + }, + { + "epoch": 1.309685921370525, + "grad_norm": 2.21026873588562, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7117495536804199, + "num_tokens": 297574385.0, + "step": 11926 + }, + { + "epoch": 1.3097957390731385, + "grad_norm": 2.497192144393921, + "learning_rate": 1e-06, + "loss": 0.8146, + "mean_token_accuracy": 0.7393798232078552, + "num_tokens": 297595762.0, + "step": 11927 + }, + { + "epoch": 1.3099055567757523, + "grad_norm": 2.1997218132019043, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.726449728012085, + "num_tokens": 297622706.0, + "step": 11928 + }, + { + "epoch": 1.310015374478366, + "grad_norm": 1.9625204801559448, + "learning_rate": 1e-06, + "loss": 1.0341, + "mean_token_accuracy": 0.6869616508483887, + "num_tokens": 297655622.0, + "step": 11929 + }, + { + "epoch": 1.3101251921809796, + "grad_norm": 2.159553289413452, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.6994304656982422, + "num_tokens": 297681774.0, + "step": 11930 + }, + { + "epoch": 1.3102350098835933, + "grad_norm": 2.1470179557800293, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7160811424255371, + "num_tokens": 297712792.0, + "step": 
11931 + }, + { + "epoch": 1.3103448275862069, + "grad_norm": 2.0097532272338867, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7250413298606873, + "num_tokens": 297743487.0, + "step": 11932 + }, + { + "epoch": 1.3104546452888206, + "grad_norm": 2.213926076889038, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7059758901596069, + "num_tokens": 297770610.0, + "step": 11933 + }, + { + "epoch": 1.3105644629914341, + "grad_norm": 2.36612606048584, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.6992320418357849, + "num_tokens": 297795144.0, + "step": 11934 + }, + { + "epoch": 1.310674280694048, + "grad_norm": 2.233222484588623, + "learning_rate": 1e-06, + "loss": 1.0454, + "mean_token_accuracy": 0.6877607703208923, + "num_tokens": 297825227.0, + "step": 11935 + }, + { + "epoch": 1.3107840983966614, + "grad_norm": 2.4446210861206055, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7132065296173096, + "num_tokens": 297848411.0, + "step": 11936 + }, + { + "epoch": 1.3108939160992752, + "grad_norm": 2.3282408714294434, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.699425458908081, + "num_tokens": 297872535.0, + "step": 11937 + }, + { + "epoch": 1.311003733801889, + "grad_norm": 2.307307004928589, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7275777459144592, + "num_tokens": 297895958.0, + "step": 11938 + }, + { + "epoch": 1.3111135515045025, + "grad_norm": 2.4267890453338623, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7220253348350525, + "num_tokens": 297918110.0, + "step": 11939 + }, + { + "epoch": 1.3112233692071162, + "grad_norm": 2.281888961791992, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7377403378486633, + "num_tokens": 297944177.0, + "step": 11940 + }, + { + "epoch": 1.3113331869097298, + "grad_norm": 2.090832471847534, + "learning_rate": 1e-06, + "loss": 0.993, + 
"mean_token_accuracy": 0.6946477890014648, + "num_tokens": 297972785.0, + "step": 11941 + }, + { + "epoch": 1.3114430046123435, + "grad_norm": 2.3312935829162598, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7073156237602234, + "num_tokens": 297998322.0, + "step": 11942 + }, + { + "epoch": 1.3115528223149573, + "grad_norm": 2.464294672012329, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7175483703613281, + "num_tokens": 298021711.0, + "step": 11943 + }, + { + "epoch": 1.3116626400175708, + "grad_norm": 2.2239863872528076, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.696755051612854, + "num_tokens": 298047771.0, + "step": 11944 + }, + { + "epoch": 1.3117724577201844, + "grad_norm": 2.3382487297058105, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.7003809809684753, + "num_tokens": 298072003.0, + "step": 11945 + }, + { + "epoch": 1.3118822754227981, + "grad_norm": 2.254556894302368, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7011294960975647, + "num_tokens": 298097565.0, + "step": 11946 + }, + { + "epoch": 1.3119920931254119, + "grad_norm": 2.105483293533325, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7129428386688232, + "num_tokens": 298123677.0, + "step": 11947 + }, + { + "epoch": 1.3121019108280254, + "grad_norm": 2.085498094558716, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7035335302352905, + "num_tokens": 298151199.0, + "step": 11948 + }, + { + "epoch": 1.3122117285306392, + "grad_norm": 2.197911500930786, + "learning_rate": 1e-06, + "loss": 1.013, + "mean_token_accuracy": 0.6965271234512329, + "num_tokens": 298176763.0, + "step": 11949 + }, + { + "epoch": 1.3123215462332527, + "grad_norm": 2.203714370727539, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.703048586845398, + "num_tokens": 298204261.0, + "step": 11950 + }, + { + "epoch": 1.3124313639358665, 
+ "grad_norm": 2.272094964981079, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7270936965942383, + "num_tokens": 298228324.0, + "step": 11951 + }, + { + "epoch": 1.3125411816384802, + "grad_norm": 2.675804853439331, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7253410816192627, + "num_tokens": 298246994.0, + "step": 11952 + }, + { + "epoch": 1.3126509993410937, + "grad_norm": 2.158093214035034, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7089251279830933, + "num_tokens": 298272399.0, + "step": 11953 + }, + { + "epoch": 1.3127608170437075, + "grad_norm": 2.2560014724731445, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6989012956619263, + "num_tokens": 298300337.0, + "step": 11954 + }, + { + "epoch": 1.312870634746321, + "grad_norm": 2.355396270751953, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7120751142501831, + "num_tokens": 298324037.0, + "step": 11955 + }, + { + "epoch": 1.3129804524489348, + "grad_norm": 2.6754753589630127, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.728508472442627, + "num_tokens": 298343009.0, + "step": 11956 + }, + { + "epoch": 1.3130902701515486, + "grad_norm": 2.4447083473205566, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7262061834335327, + "num_tokens": 298364755.0, + "step": 11957 + }, + { + "epoch": 1.313200087854162, + "grad_norm": 2.426069974899292, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7257950901985168, + "num_tokens": 298384260.0, + "step": 11958 + }, + { + "epoch": 1.3133099055567756, + "grad_norm": 2.233236074447632, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.6953902840614319, + "num_tokens": 298410974.0, + "step": 11959 + }, + { + "epoch": 1.3134197232593894, + "grad_norm": 2.6186230182647705, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7199326753616333, + 
"num_tokens": 298430886.0, + "step": 11960 + }, + { + "epoch": 1.3135295409620031, + "grad_norm": 2.069807767868042, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7058652639389038, + "num_tokens": 298460840.0, + "step": 11961 + }, + { + "epoch": 1.3136393586646167, + "grad_norm": 2.127941608428955, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7084759473800659, + "num_tokens": 298491479.0, + "step": 11962 + }, + { + "epoch": 1.3137491763672304, + "grad_norm": 2.381927728652954, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7174255847930908, + "num_tokens": 298515380.0, + "step": 11963 + }, + { + "epoch": 1.313858994069844, + "grad_norm": 2.321319580078125, + "learning_rate": 1e-06, + "loss": 0.8214, + "mean_token_accuracy": 0.7498628497123718, + "num_tokens": 298538432.0, + "step": 11964 + }, + { + "epoch": 1.3139688117724577, + "grad_norm": 2.381455421447754, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6982764005661011, + "num_tokens": 298563239.0, + "step": 11965 + }, + { + "epoch": 1.3140786294750715, + "grad_norm": 2.17295503616333, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.705345630645752, + "num_tokens": 298588493.0, + "step": 11966 + }, + { + "epoch": 1.314188447177685, + "grad_norm": 2.219858407974243, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7343332767486572, + "num_tokens": 298614103.0, + "step": 11967 + }, + { + "epoch": 1.3142982648802988, + "grad_norm": 2.0815985202789307, + "learning_rate": 1e-06, + "loss": 1.0442, + "mean_token_accuracy": 0.6881532669067383, + "num_tokens": 298643935.0, + "step": 11968 + }, + { + "epoch": 1.3144080825829123, + "grad_norm": 2.4836254119873047, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7316551208496094, + "num_tokens": 298665182.0, + "step": 11969 + }, + { + "epoch": 1.314517900285526, + "grad_norm": 2.1321122646331787, + 
"learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7136359214782715, + "num_tokens": 298694222.0, + "step": 11970 + }, + { + "epoch": 1.3146277179881398, + "grad_norm": 2.418962001800537, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7078486680984497, + "num_tokens": 298716920.0, + "step": 11971 + }, + { + "epoch": 1.3147375356907534, + "grad_norm": 2.4494171142578125, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7264183759689331, + "num_tokens": 298738473.0, + "step": 11972 + }, + { + "epoch": 1.314847353393367, + "grad_norm": 2.161695957183838, + "learning_rate": 1e-06, + "loss": 0.9933, + "mean_token_accuracy": 0.6951360702514648, + "num_tokens": 298766738.0, + "step": 11973 + }, + { + "epoch": 1.3149571710959806, + "grad_norm": 2.514400005340576, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7166434526443481, + "num_tokens": 298787861.0, + "step": 11974 + }, + { + "epoch": 1.3150669887985944, + "grad_norm": 2.2218167781829834, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7143257856369019, + "num_tokens": 298812174.0, + "step": 11975 + }, + { + "epoch": 1.315176806501208, + "grad_norm": 2.155280828475952, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7077255249023438, + "num_tokens": 298839257.0, + "step": 11976 + }, + { + "epoch": 1.3152866242038217, + "grad_norm": 2.2614667415618896, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7272458076477051, + "num_tokens": 298862682.0, + "step": 11977 + }, + { + "epoch": 1.3153964419064352, + "grad_norm": 2.021754503250122, + "learning_rate": 1e-06, + "loss": 0.9694, + "mean_token_accuracy": 0.7022892236709595, + "num_tokens": 298894002.0, + "step": 11978 + }, + { + "epoch": 1.315506259609049, + "grad_norm": 2.5412936210632324, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7113889455795288, + "num_tokens": 298914979.0, + "step": 
11979 + }, + { + "epoch": 1.3156160773116627, + "grad_norm": 2.1847598552703857, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7098830938339233, + "num_tokens": 298939428.0, + "step": 11980 + }, + { + "epoch": 1.3157258950142763, + "grad_norm": 1.8835570812225342, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7269514799118042, + "num_tokens": 298973164.0, + "step": 11981 + }, + { + "epoch": 1.31583571271689, + "grad_norm": 2.529402494430542, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7417225241661072, + "num_tokens": 298992372.0, + "step": 11982 + }, + { + "epoch": 1.3159455304195036, + "grad_norm": 2.2301337718963623, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7247155904769897, + "num_tokens": 299018775.0, + "step": 11983 + }, + { + "epoch": 1.3160553481221173, + "grad_norm": 2.286912202835083, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7295182943344116, + "num_tokens": 299044783.0, + "step": 11984 + }, + { + "epoch": 1.3161651658247309, + "grad_norm": 2.098147392272949, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7254579067230225, + "num_tokens": 299072576.0, + "step": 11985 + }, + { + "epoch": 1.3162749835273446, + "grad_norm": 2.3142051696777344, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7047663927078247, + "num_tokens": 299098706.0, + "step": 11986 + }, + { + "epoch": 1.3163848012299582, + "grad_norm": 2.4928293228149414, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7315185070037842, + "num_tokens": 299119330.0, + "step": 11987 + }, + { + "epoch": 1.316494618932572, + "grad_norm": 2.381979465484619, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7069189548492432, + "num_tokens": 299141695.0, + "step": 11988 + }, + { + "epoch": 1.3166044366351857, + "grad_norm": 2.1336581707000732, + "learning_rate": 1e-06, + "loss": 0.8916, 
+ "mean_token_accuracy": 0.7421727180480957, + "num_tokens": 299170162.0, + "step": 11989 + }, + { + "epoch": 1.3167142543377992, + "grad_norm": 2.0176539421081543, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.6986963152885437, + "num_tokens": 299200454.0, + "step": 11990 + }, + { + "epoch": 1.316824072040413, + "grad_norm": 2.6198601722717285, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7360244989395142, + "num_tokens": 299220993.0, + "step": 11991 + }, + { + "epoch": 1.3169338897430265, + "grad_norm": 2.2850987911224365, + "learning_rate": 1e-06, + "loss": 0.9733, + "mean_token_accuracy": 0.7036465406417847, + "num_tokens": 299247354.0, + "step": 11992 + }, + { + "epoch": 1.3170437074456403, + "grad_norm": 2.2371551990509033, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7081266045570374, + "num_tokens": 299272798.0, + "step": 11993 + }, + { + "epoch": 1.317153525148254, + "grad_norm": 2.398508310317993, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7323829531669617, + "num_tokens": 299296366.0, + "step": 11994 + }, + { + "epoch": 1.3172633428508675, + "grad_norm": 2.116105079650879, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7256249189376831, + "num_tokens": 299324404.0, + "step": 11995 + }, + { + "epoch": 1.3173731605534813, + "grad_norm": 2.4370505809783936, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7204494476318359, + "num_tokens": 299346511.0, + "step": 11996 + }, + { + "epoch": 1.3174829782560948, + "grad_norm": 2.208484411239624, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.752577543258667, + "num_tokens": 299371660.0, + "step": 11997 + }, + { + "epoch": 1.3175927959587086, + "grad_norm": 2.178110361099243, + "learning_rate": 1e-06, + "loss": 1.0077, + "mean_token_accuracy": 0.690376877784729, + "num_tokens": 299400307.0, + "step": 11998 + }, + { + "epoch": 
1.3177026136613221, + "grad_norm": 2.1294026374816895, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.707568883895874, + "num_tokens": 299428733.0, + "step": 11999 + }, + { + "epoch": 1.3178124313639359, + "grad_norm": 2.345505714416504, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7116439938545227, + "num_tokens": 299454047.0, + "step": 12000 + }, + { + "epoch": 1.3179222490665494, + "grad_norm": 2.611844778060913, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7288386821746826, + "num_tokens": 299473188.0, + "step": 12001 + }, + { + "epoch": 1.3180320667691632, + "grad_norm": 2.2968101501464844, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7124388217926025, + "num_tokens": 299499072.0, + "step": 12002 + }, + { + "epoch": 1.318141884471777, + "grad_norm": 2.238553285598755, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.720665454864502, + "num_tokens": 299522381.0, + "step": 12003 + }, + { + "epoch": 1.3182517021743905, + "grad_norm": 2.2590372562408447, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7017049789428711, + "num_tokens": 299547805.0, + "step": 12004 + }, + { + "epoch": 1.3183615198770042, + "grad_norm": 2.4284205436706543, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7228757739067078, + "num_tokens": 299571355.0, + "step": 12005 + }, + { + "epoch": 1.3184713375796178, + "grad_norm": 2.426966905593872, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7088936567306519, + "num_tokens": 299596346.0, + "step": 12006 + }, + { + "epoch": 1.3185811552822315, + "grad_norm": 2.2446391582489014, + "learning_rate": 1e-06, + "loss": 0.9884, + "mean_token_accuracy": 0.703620195388794, + "num_tokens": 299623645.0, + "step": 12007 + }, + { + "epoch": 1.3186909729848453, + "grad_norm": 2.335442304611206, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 
0.7270432710647583, + "num_tokens": 299647188.0, + "step": 12008 + }, + { + "epoch": 1.3188007906874588, + "grad_norm": 2.358914375305176, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7294214367866516, + "num_tokens": 299671216.0, + "step": 12009 + }, + { + "epoch": 1.3189106083900723, + "grad_norm": 2.3969874382019043, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7264857888221741, + "num_tokens": 299695343.0, + "step": 12010 + }, + { + "epoch": 1.319020426092686, + "grad_norm": 2.5255014896392822, + "learning_rate": 1e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7393361926078796, + "num_tokens": 299715327.0, + "step": 12011 + }, + { + "epoch": 1.3191302437952999, + "grad_norm": 2.4963138103485107, + "learning_rate": 1e-06, + "loss": 0.9256, + "mean_token_accuracy": 0.7233179807662964, + "num_tokens": 299738528.0, + "step": 12012 + }, + { + "epoch": 1.3192400614979134, + "grad_norm": 2.1005706787109375, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7151989936828613, + "num_tokens": 299764983.0, + "step": 12013 + }, + { + "epoch": 1.3193498792005272, + "grad_norm": 2.3873417377471924, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7281028032302856, + "num_tokens": 299787399.0, + "step": 12014 + }, + { + "epoch": 1.3194596969031407, + "grad_norm": 2.4562439918518066, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7360301613807678, + "num_tokens": 299809108.0, + "step": 12015 + }, + { + "epoch": 1.3195695146057544, + "grad_norm": 2.4849092960357666, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7363401651382446, + "num_tokens": 299830936.0, + "step": 12016 + }, + { + "epoch": 1.3196793323083682, + "grad_norm": 2.426158905029297, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.734130322933197, + "num_tokens": 299853638.0, + "step": 12017 + }, + { + "epoch": 1.3197891500109817, + "grad_norm": 
2.3777172565460205, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7258714437484741, + "num_tokens": 299876696.0, + "step": 12018 + }, + { + "epoch": 1.3198989677135955, + "grad_norm": 2.6466636657714844, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.737006664276123, + "num_tokens": 299896796.0, + "step": 12019 + }, + { + "epoch": 1.320008785416209, + "grad_norm": 2.1143391132354736, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7400869727134705, + "num_tokens": 299924779.0, + "step": 12020 + }, + { + "epoch": 1.3201186031188228, + "grad_norm": 2.4688167572021484, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7115768194198608, + "num_tokens": 299948153.0, + "step": 12021 + }, + { + "epoch": 1.3202284208214365, + "grad_norm": 2.185662269592285, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7338994741439819, + "num_tokens": 299974697.0, + "step": 12022 + }, + { + "epoch": 1.32033823852405, + "grad_norm": 2.4064433574676514, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7334702014923096, + "num_tokens": 299996261.0, + "step": 12023 + }, + { + "epoch": 1.3204480562266636, + "grad_norm": 2.2533740997314453, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7059035301208496, + "num_tokens": 300022996.0, + "step": 12024 + }, + { + "epoch": 1.3205578739292774, + "grad_norm": 2.194741725921631, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.7033178806304932, + "num_tokens": 300050673.0, + "step": 12025 + }, + { + "epoch": 1.3206676916318911, + "grad_norm": 2.164888858795166, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7274563312530518, + "num_tokens": 300077288.0, + "step": 12026 + }, + { + "epoch": 1.3207775093345047, + "grad_norm": 2.1315197944641113, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 0.7052156925201416, + "num_tokens": 
300105932.0, + "step": 12027 + }, + { + "epoch": 1.3208873270371184, + "grad_norm": 2.178023338317871, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7311569452285767, + "num_tokens": 300132238.0, + "step": 12028 + }, + { + "epoch": 1.320997144739732, + "grad_norm": 2.506016492843628, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.736496090888977, + "num_tokens": 300152972.0, + "step": 12029 + }, + { + "epoch": 1.3211069624423457, + "grad_norm": 2.5146446228027344, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.707794189453125, + "num_tokens": 300174858.0, + "step": 12030 + }, + { + "epoch": 1.3212167801449595, + "grad_norm": 2.1881003379821777, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7190929651260376, + "num_tokens": 300200477.0, + "step": 12031 + }, + { + "epoch": 1.321326597847573, + "grad_norm": 2.5684847831726074, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7243543863296509, + "num_tokens": 300221088.0, + "step": 12032 + }, + { + "epoch": 1.3214364155501868, + "grad_norm": 2.23046612739563, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7326384782791138, + "num_tokens": 300245428.0, + "step": 12033 + }, + { + "epoch": 1.3215462332528003, + "grad_norm": 2.213289499282837, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7131614685058594, + "num_tokens": 300272788.0, + "step": 12034 + }, + { + "epoch": 1.321656050955414, + "grad_norm": 2.3459293842315674, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7359400987625122, + "num_tokens": 300297095.0, + "step": 12035 + }, + { + "epoch": 1.3217658686580278, + "grad_norm": 2.2305588722229004, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7207956910133362, + "num_tokens": 300322033.0, + "step": 12036 + }, + { + "epoch": 1.3218756863606413, + "grad_norm": 2.292217493057251, + "learning_rate": 1e-06, 
+ "loss": 0.9077, + "mean_token_accuracy": 0.716139554977417, + "num_tokens": 300346102.0, + "step": 12037 + }, + { + "epoch": 1.3219855040632549, + "grad_norm": 2.264934778213501, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7372984886169434, + "num_tokens": 300371315.0, + "step": 12038 + }, + { + "epoch": 1.3220953217658686, + "grad_norm": 2.2239270210266113, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7128979563713074, + "num_tokens": 300398759.0, + "step": 12039 + }, + { + "epoch": 1.3222051394684824, + "grad_norm": 2.272270441055298, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7089574337005615, + "num_tokens": 300422841.0, + "step": 12040 + }, + { + "epoch": 1.322314957171096, + "grad_norm": 2.1010756492614746, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7081825733184814, + "num_tokens": 300452667.0, + "step": 12041 + }, + { + "epoch": 1.3224247748737097, + "grad_norm": 2.0392913818359375, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7035334706306458, + "num_tokens": 300482370.0, + "step": 12042 + }, + { + "epoch": 1.3225345925763232, + "grad_norm": 2.3192453384399414, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7179758548736572, + "num_tokens": 300506713.0, + "step": 12043 + }, + { + "epoch": 1.322644410278937, + "grad_norm": 2.4139790534973145, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7330278754234314, + "num_tokens": 300529326.0, + "step": 12044 + }, + { + "epoch": 1.3227542279815507, + "grad_norm": 2.5146870613098145, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7308922410011292, + "num_tokens": 300549634.0, + "step": 12045 + }, + { + "epoch": 1.3228640456841643, + "grad_norm": 2.1984102725982666, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7154029607772827, + "num_tokens": 300577640.0, + "step": 12046 + }, + { + 
"epoch": 1.322973863386778, + "grad_norm": 2.305793046951294, + "learning_rate": 1e-06, + "loss": 1.0073, + "mean_token_accuracy": 0.6976293325424194, + "num_tokens": 300605409.0, + "step": 12047 + }, + { + "epoch": 1.3230836810893916, + "grad_norm": 2.244148015975952, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7342760562896729, + "num_tokens": 300629845.0, + "step": 12048 + }, + { + "epoch": 1.3231934987920053, + "grad_norm": 2.1994543075561523, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7192158102989197, + "num_tokens": 300657055.0, + "step": 12049 + }, + { + "epoch": 1.3233033164946189, + "grad_norm": 2.2119739055633545, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7303142547607422, + "num_tokens": 300683192.0, + "step": 12050 + }, + { + "epoch": 1.3234131341972326, + "grad_norm": 2.2537498474121094, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7233301401138306, + "num_tokens": 300709727.0, + "step": 12051 + }, + { + "epoch": 1.3235229518998461, + "grad_norm": 2.225292205810547, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7308646440505981, + "num_tokens": 300734261.0, + "step": 12052 + }, + { + "epoch": 1.32363276960246, + "grad_norm": 2.07995867729187, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7233135104179382, + "num_tokens": 300764124.0, + "step": 12053 + }, + { + "epoch": 1.3237425873050737, + "grad_norm": 2.2456777095794678, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.717170000076294, + "num_tokens": 300791210.0, + "step": 12054 + }, + { + "epoch": 1.3238524050076872, + "grad_norm": 1.9972468614578247, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7193198204040527, + "num_tokens": 300820240.0, + "step": 12055 + }, + { + "epoch": 1.323962222710301, + "grad_norm": 2.3377926349639893, + "learning_rate": 1e-06, + "loss": 0.8634, + 
"mean_token_accuracy": 0.7275804281234741, + "num_tokens": 300842411.0, + "step": 12056 + }, + { + "epoch": 1.3240720404129145, + "grad_norm": 2.238213300704956, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7329199314117432, + "num_tokens": 300867494.0, + "step": 12057 + }, + { + "epoch": 1.3241818581155282, + "grad_norm": 2.5350890159606934, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7310396432876587, + "num_tokens": 300888480.0, + "step": 12058 + }, + { + "epoch": 1.324291675818142, + "grad_norm": 2.5351874828338623, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7247365713119507, + "num_tokens": 300909978.0, + "step": 12059 + }, + { + "epoch": 1.3244014935207555, + "grad_norm": 2.2707157135009766, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7247708439826965, + "num_tokens": 300934388.0, + "step": 12060 + }, + { + "epoch": 1.324511311223369, + "grad_norm": 2.4141132831573486, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7166354656219482, + "num_tokens": 300957821.0, + "step": 12061 + }, + { + "epoch": 1.3246211289259828, + "grad_norm": 2.5760021209716797, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7294164299964905, + "num_tokens": 300978710.0, + "step": 12062 + }, + { + "epoch": 1.3247309466285966, + "grad_norm": 2.2064249515533447, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6973318457603455, + "num_tokens": 301008796.0, + "step": 12063 + }, + { + "epoch": 1.3248407643312101, + "grad_norm": 2.466320276260376, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7064396739006042, + "num_tokens": 301031638.0, + "step": 12064 + }, + { + "epoch": 1.3249505820338239, + "grad_norm": 1.8857245445251465, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7024509906768799, + "num_tokens": 301068069.0, + "step": 12065 + }, + { + "epoch": 
1.3250603997364374, + "grad_norm": 2.2376151084899902, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7346546649932861, + "num_tokens": 301091544.0, + "step": 12066 + }, + { + "epoch": 1.3251702174390512, + "grad_norm": 2.5287187099456787, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7202165126800537, + "num_tokens": 301112551.0, + "step": 12067 + }, + { + "epoch": 1.325280035141665, + "grad_norm": 2.274916887283325, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7210437655448914, + "num_tokens": 301137690.0, + "step": 12068 + }, + { + "epoch": 1.3253898528442785, + "grad_norm": 2.362030506134033, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.7033882737159729, + "num_tokens": 301162473.0, + "step": 12069 + }, + { + "epoch": 1.3254996705468922, + "grad_norm": 2.0465095043182373, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7118557691574097, + "num_tokens": 301190576.0, + "step": 12070 + }, + { + "epoch": 1.3256094882495058, + "grad_norm": 2.3693413734436035, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7186648845672607, + "num_tokens": 301213336.0, + "step": 12071 + }, + { + "epoch": 1.3257193059521195, + "grad_norm": 2.259887218475342, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.726864755153656, + "num_tokens": 301239040.0, + "step": 12072 + }, + { + "epoch": 1.3258291236547333, + "grad_norm": 2.087157964706421, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7130465507507324, + "num_tokens": 301269349.0, + "step": 12073 + }, + { + "epoch": 1.3259389413573468, + "grad_norm": 2.499955415725708, + "learning_rate": 1e-06, + "loss": 0.7665, + "mean_token_accuracy": 0.754365086555481, + "num_tokens": 301289112.0, + "step": 12074 + }, + { + "epoch": 1.3260487590599603, + "grad_norm": 2.4542932510375977, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 
0.7097557783126831, + "num_tokens": 301311873.0, + "step": 12075 + }, + { + "epoch": 1.326158576762574, + "grad_norm": 2.064436674118042, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7395156621932983, + "num_tokens": 301340814.0, + "step": 12076 + }, + { + "epoch": 1.3262683944651878, + "grad_norm": 2.15952467918396, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7063000202178955, + "num_tokens": 301370435.0, + "step": 12077 + }, + { + "epoch": 1.3263782121678014, + "grad_norm": 2.0735080242156982, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.727826714515686, + "num_tokens": 301399428.0, + "step": 12078 + }, + { + "epoch": 1.3264880298704151, + "grad_norm": 2.197111129760742, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.7068332433700562, + "num_tokens": 301427367.0, + "step": 12079 + }, + { + "epoch": 1.3265978475730287, + "grad_norm": 2.4591121673583984, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7113141417503357, + "num_tokens": 301449267.0, + "step": 12080 + }, + { + "epoch": 1.3267076652756424, + "grad_norm": 2.3352057933807373, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7291733026504517, + "num_tokens": 301474125.0, + "step": 12081 + }, + { + "epoch": 1.3268174829782562, + "grad_norm": 2.2270708084106445, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7438729405403137, + "num_tokens": 301498394.0, + "step": 12082 + }, + { + "epoch": 1.3269273006808697, + "grad_norm": 2.4229109287261963, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7241337895393372, + "num_tokens": 301520876.0, + "step": 12083 + }, + { + "epoch": 1.3270371183834835, + "grad_norm": 2.5142159461975098, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.708145797252655, + "num_tokens": 301543093.0, + "step": 12084 + }, + { + "epoch": 1.327146936086097, + "grad_norm": 
2.1969354152679443, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7160484790802002, + "num_tokens": 301570745.0, + "step": 12085 + }, + { + "epoch": 1.3272567537887108, + "grad_norm": 2.331613540649414, + "learning_rate": 1e-06, + "loss": 0.8735, + "mean_token_accuracy": 0.7326130867004395, + "num_tokens": 301593778.0, + "step": 12086 + }, + { + "epoch": 1.3273665714913245, + "grad_norm": 2.4284780025482178, + "learning_rate": 1e-06, + "loss": 0.8196, + "mean_token_accuracy": 0.745553195476532, + "num_tokens": 301613979.0, + "step": 12087 + }, + { + "epoch": 1.327476389193938, + "grad_norm": 2.351328134536743, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.720859706401825, + "num_tokens": 301639092.0, + "step": 12088 + }, + { + "epoch": 1.3275862068965516, + "grad_norm": 2.2294604778289795, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7206926941871643, + "num_tokens": 301664629.0, + "step": 12089 + }, + { + "epoch": 1.3276960245991654, + "grad_norm": 2.366960287094116, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7288612127304077, + "num_tokens": 301686405.0, + "step": 12090 + }, + { + "epoch": 1.3278058423017791, + "grad_norm": 2.139352798461914, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7226543426513672, + "num_tokens": 301713161.0, + "step": 12091 + }, + { + "epoch": 1.3279156600043927, + "grad_norm": 2.39233136177063, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7305201888084412, + "num_tokens": 301735948.0, + "step": 12092 + }, + { + "epoch": 1.3280254777070064, + "grad_norm": 2.056455135345459, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7240231037139893, + "num_tokens": 301766223.0, + "step": 12093 + }, + { + "epoch": 1.32813529540962, + "grad_norm": 2.1472620964050293, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7208996415138245, + "num_tokens": 
301792477.0, + "step": 12094 + }, + { + "epoch": 1.3282451131122337, + "grad_norm": 2.308947801589966, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.707513689994812, + "num_tokens": 301819818.0, + "step": 12095 + }, + { + "epoch": 1.3283549308148475, + "grad_norm": 2.2594850063323975, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7106631994247437, + "num_tokens": 301847924.0, + "step": 12096 + }, + { + "epoch": 1.328464748517461, + "grad_norm": 2.1101040840148926, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6908618807792664, + "num_tokens": 301877837.0, + "step": 12097 + }, + { + "epoch": 1.3285745662200747, + "grad_norm": 2.4606640338897705, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7231823205947876, + "num_tokens": 301900580.0, + "step": 12098 + }, + { + "epoch": 1.3286843839226883, + "grad_norm": 2.2962582111358643, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7371321320533752, + "num_tokens": 301924401.0, + "step": 12099 + }, + { + "epoch": 1.328794201625302, + "grad_norm": 2.6276721954345703, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7157808542251587, + "num_tokens": 301943133.0, + "step": 12100 + }, + { + "epoch": 1.3289040193279158, + "grad_norm": 2.50565505027771, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7126349806785583, + "num_tokens": 301964081.0, + "step": 12101 + }, + { + "epoch": 1.3290138370305293, + "grad_norm": 2.2166621685028076, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7154122591018677, + "num_tokens": 301989064.0, + "step": 12102 + }, + { + "epoch": 1.3291236547331429, + "grad_norm": 2.065173625946045, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7079716324806213, + "num_tokens": 302018595.0, + "step": 12103 + }, + { + "epoch": 1.3292334724357566, + "grad_norm": 2.5709421634674072, + "learning_rate": 
1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7038840055465698, + "num_tokens": 302040403.0, + "step": 12104 + }, + { + "epoch": 1.3293432901383704, + "grad_norm": 2.1925294399261475, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7251887321472168, + "num_tokens": 302067632.0, + "step": 12105 + }, + { + "epoch": 1.329453107840984, + "grad_norm": 2.3413798809051514, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7179070115089417, + "num_tokens": 302091951.0, + "step": 12106 + }, + { + "epoch": 1.3295629255435977, + "grad_norm": 2.0688512325286865, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7173876762390137, + "num_tokens": 302121468.0, + "step": 12107 + }, + { + "epoch": 1.3296727432462112, + "grad_norm": 2.391731023788452, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.716680645942688, + "num_tokens": 302145380.0, + "step": 12108 + }, + { + "epoch": 1.329782560948825, + "grad_norm": 2.3051035404205322, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7166078090667725, + "num_tokens": 302168568.0, + "step": 12109 + }, + { + "epoch": 1.3298923786514387, + "grad_norm": 2.6000888347625732, + "learning_rate": 1e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7282906174659729, + "num_tokens": 302187681.0, + "step": 12110 + }, + { + "epoch": 1.3300021963540523, + "grad_norm": 2.546623468399048, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7231399416923523, + "num_tokens": 302208463.0, + "step": 12111 + }, + { + "epoch": 1.330112014056666, + "grad_norm": 2.335265874862671, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7260172367095947, + "num_tokens": 302231743.0, + "step": 12112 + }, + { + "epoch": 1.3302218317592795, + "grad_norm": 2.423313856124878, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7115265727043152, + "num_tokens": 302255819.0, + "step": 12113 + }, + { + 
"epoch": 1.3303316494618933, + "grad_norm": 2.5539703369140625, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7024213075637817, + "num_tokens": 302278928.0, + "step": 12114 + }, + { + "epoch": 1.3304414671645068, + "grad_norm": 2.3035058975219727, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7244119644165039, + "num_tokens": 302302983.0, + "step": 12115 + }, + { + "epoch": 1.3305512848671206, + "grad_norm": 2.3765411376953125, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.7052812576293945, + "num_tokens": 302328737.0, + "step": 12116 + }, + { + "epoch": 1.3306611025697341, + "grad_norm": 2.229567289352417, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7019972205162048, + "num_tokens": 302353731.0, + "step": 12117 + }, + { + "epoch": 1.330770920272348, + "grad_norm": 2.2572402954101562, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7144780158996582, + "num_tokens": 302380698.0, + "step": 12118 + }, + { + "epoch": 1.3308807379749616, + "grad_norm": 2.5866475105285645, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7376075387001038, + "num_tokens": 302399694.0, + "step": 12119 + }, + { + "epoch": 1.3309905556775752, + "grad_norm": 2.371126174926758, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6969250440597534, + "num_tokens": 302425256.0, + "step": 12120 + }, + { + "epoch": 1.331100373380189, + "grad_norm": 2.0366368293762207, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7076493501663208, + "num_tokens": 302455395.0, + "step": 12121 + }, + { + "epoch": 1.3312101910828025, + "grad_norm": 2.3624107837677, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7189415693283081, + "num_tokens": 302478205.0, + "step": 12122 + }, + { + "epoch": 1.3313200087854162, + "grad_norm": 2.2427303791046143, + "learning_rate": 1e-06, + "loss": 0.8738, + 
"mean_token_accuracy": 0.7253050804138184, + "num_tokens": 302502401.0, + "step": 12123 + }, + { + "epoch": 1.33142982648803, + "grad_norm": 2.058946132659912, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7157851457595825, + "num_tokens": 302530518.0, + "step": 12124 + }, + { + "epoch": 1.3315396441906435, + "grad_norm": 2.191434144973755, + "learning_rate": 1e-06, + "loss": 0.9818, + "mean_token_accuracy": 0.698218584060669, + "num_tokens": 302558916.0, + "step": 12125 + }, + { + "epoch": 1.331649461893257, + "grad_norm": 2.5167322158813477, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7192074060440063, + "num_tokens": 302580430.0, + "step": 12126 + }, + { + "epoch": 1.3317592795958708, + "grad_norm": 1.837445855140686, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.7059147357940674, + "num_tokens": 302613519.0, + "step": 12127 + }, + { + "epoch": 1.3318690972984846, + "grad_norm": 1.9651515483856201, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7213389873504639, + "num_tokens": 302644966.0, + "step": 12128 + }, + { + "epoch": 1.331978915001098, + "grad_norm": 2.145231246948242, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7149760723114014, + "num_tokens": 302672361.0, + "step": 12129 + }, + { + "epoch": 1.3320887327037119, + "grad_norm": 2.2402591705322266, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7142527103424072, + "num_tokens": 302699532.0, + "step": 12130 + }, + { + "epoch": 1.3321985504063254, + "grad_norm": 2.2495298385620117, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.708796501159668, + "num_tokens": 302726292.0, + "step": 12131 + }, + { + "epoch": 1.3323083681089392, + "grad_norm": 1.9676878452301025, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7119274139404297, + "num_tokens": 302760068.0, + "step": 12132 + }, + { + "epoch": 1.332418185811553, + 
"grad_norm": 2.2258801460266113, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7085393071174622, + "num_tokens": 302787568.0, + "step": 12133 + }, + { + "epoch": 1.3325280035141664, + "grad_norm": 2.0801286697387695, + "learning_rate": 1e-06, + "loss": 0.9847, + "mean_token_accuracy": 0.7047009468078613, + "num_tokens": 302818540.0, + "step": 12134 + }, + { + "epoch": 1.3326378212167802, + "grad_norm": 2.347834348678589, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6931867599487305, + "num_tokens": 302843791.0, + "step": 12135 + }, + { + "epoch": 1.3327476389193937, + "grad_norm": 2.1705162525177, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7058206796646118, + "num_tokens": 302872607.0, + "step": 12136 + }, + { + "epoch": 1.3328574566220075, + "grad_norm": 2.1668124198913574, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.712548553943634, + "num_tokens": 302901241.0, + "step": 12137 + }, + { + "epoch": 1.3329672743246213, + "grad_norm": 2.4026267528533936, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7330443859100342, + "num_tokens": 302923917.0, + "step": 12138 + }, + { + "epoch": 1.3330770920272348, + "grad_norm": 2.404808521270752, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7224555015563965, + "num_tokens": 302946298.0, + "step": 12139 + }, + { + "epoch": 1.3331869097298483, + "grad_norm": 2.007469415664673, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7369792461395264, + "num_tokens": 302977024.0, + "step": 12140 + }, + { + "epoch": 1.333296727432462, + "grad_norm": 2.031614303588867, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7101380228996277, + "num_tokens": 303006627.0, + "step": 12141 + }, + { + "epoch": 1.3334065451350758, + "grad_norm": 2.122236728668213, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7249158024787903, + 
"num_tokens": 303034405.0, + "step": 12142 + }, + { + "epoch": 1.3335163628376894, + "grad_norm": 2.4770259857177734, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.6998820900917053, + "num_tokens": 303056056.0, + "step": 12143 + }, + { + "epoch": 1.3336261805403031, + "grad_norm": 2.3100523948669434, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7425209283828735, + "num_tokens": 303078807.0, + "step": 12144 + }, + { + "epoch": 1.3337359982429167, + "grad_norm": 2.1445884704589844, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7230129837989807, + "num_tokens": 303106333.0, + "step": 12145 + }, + { + "epoch": 1.3338458159455304, + "grad_norm": 2.2105212211608887, + "learning_rate": 1e-06, + "loss": 0.9907, + "mean_token_accuracy": 0.7046568393707275, + "num_tokens": 303133629.0, + "step": 12146 + }, + { + "epoch": 1.3339556336481442, + "grad_norm": 2.3516318798065186, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7245608568191528, + "num_tokens": 303157216.0, + "step": 12147 + }, + { + "epoch": 1.3340654513507577, + "grad_norm": 2.422292470932007, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7365664839744568, + "num_tokens": 303179695.0, + "step": 12148 + }, + { + "epoch": 1.3341752690533715, + "grad_norm": 2.075131416320801, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.6896249055862427, + "num_tokens": 303207806.0, + "step": 12149 + }, + { + "epoch": 1.334285086755985, + "grad_norm": 2.103504180908203, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.692263662815094, + "num_tokens": 303236580.0, + "step": 12150 + }, + { + "epoch": 1.3343949044585988, + "grad_norm": 2.263286590576172, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.726183295249939, + "num_tokens": 303261454.0, + "step": 12151 + }, + { + "epoch": 1.3345047221612125, + "grad_norm": 2.238419532775879, + 
"learning_rate": 1e-06, + "loss": 0.9992, + "mean_token_accuracy": 0.7056659460067749, + "num_tokens": 303286103.0, + "step": 12152 + }, + { + "epoch": 1.334614539863826, + "grad_norm": 2.56884765625, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.739385724067688, + "num_tokens": 303306160.0, + "step": 12153 + }, + { + "epoch": 1.3347243575664396, + "grad_norm": 2.374546527862549, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7173404693603516, + "num_tokens": 303331191.0, + "step": 12154 + }, + { + "epoch": 1.3348341752690533, + "grad_norm": 2.829648017883301, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7294148802757263, + "num_tokens": 303348791.0, + "step": 12155 + }, + { + "epoch": 1.334943992971667, + "grad_norm": 2.5692734718322754, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7118862867355347, + "num_tokens": 303369514.0, + "step": 12156 + }, + { + "epoch": 1.3350538106742806, + "grad_norm": 2.1648619174957275, + "learning_rate": 1e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7371737957000732, + "num_tokens": 303395181.0, + "step": 12157 + }, + { + "epoch": 1.3351636283768944, + "grad_norm": 2.374013662338257, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7309762239456177, + "num_tokens": 303416551.0, + "step": 12158 + }, + { + "epoch": 1.335273446079508, + "grad_norm": 2.2707231044769287, + "learning_rate": 1e-06, + "loss": 0.9982, + "mean_token_accuracy": 0.6921296715736389, + "num_tokens": 303441595.0, + "step": 12159 + }, + { + "epoch": 1.3353832637821217, + "grad_norm": 2.6351828575134277, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.708776593208313, + "num_tokens": 303461478.0, + "step": 12160 + }, + { + "epoch": 1.3354930814847354, + "grad_norm": 2.2205700874328613, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7200905680656433, + "num_tokens": 303486746.0, + "step": 12161 
+ }, + { + "epoch": 1.335602899187349, + "grad_norm": 2.138578414916992, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7084598541259766, + "num_tokens": 303516519.0, + "step": 12162 + }, + { + "epoch": 1.3357127168899627, + "grad_norm": 2.2470169067382812, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7182422876358032, + "num_tokens": 303542747.0, + "step": 12163 + }, + { + "epoch": 1.3358225345925763, + "grad_norm": 2.2587814331054688, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7126529812812805, + "num_tokens": 303569144.0, + "step": 12164 + }, + { + "epoch": 1.33593235229519, + "grad_norm": 2.259465217590332, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.714923083782196, + "num_tokens": 303594939.0, + "step": 12165 + }, + { + "epoch": 1.3360421699978036, + "grad_norm": 1.9568021297454834, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7206299304962158, + "num_tokens": 303627362.0, + "step": 12166 + }, + { + "epoch": 1.3361519877004173, + "grad_norm": 1.9723485708236694, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.6968607902526855, + "num_tokens": 303658886.0, + "step": 12167 + }, + { + "epoch": 1.3362618054030309, + "grad_norm": 2.0133934020996094, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7131228446960449, + "num_tokens": 303690030.0, + "step": 12168 + }, + { + "epoch": 1.3363716231056446, + "grad_norm": 2.5799543857574463, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.7263745665550232, + "num_tokens": 303711018.0, + "step": 12169 + }, + { + "epoch": 1.3364814408082584, + "grad_norm": 2.086416721343994, + "learning_rate": 1e-06, + "loss": 1.0146, + "mean_token_accuracy": 0.6897844672203064, + "num_tokens": 303742956.0, + "step": 12170 + }, + { + "epoch": 1.336591258510872, + "grad_norm": 1.995218276977539, + "learning_rate": 1e-06, + "loss": 0.9892, + 
"mean_token_accuracy": 0.6973669528961182, + "num_tokens": 303774948.0, + "step": 12171 + }, + { + "epoch": 1.3367010762134857, + "grad_norm": 2.8560662269592285, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7287659049034119, + "num_tokens": 303793057.0, + "step": 12172 + }, + { + "epoch": 1.3368108939160992, + "grad_norm": 1.9467555284500122, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.698212742805481, + "num_tokens": 303825834.0, + "step": 12173 + }, + { + "epoch": 1.336920711618713, + "grad_norm": 2.2146921157836914, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7100464105606079, + "num_tokens": 303852048.0, + "step": 12174 + }, + { + "epoch": 1.3370305293213267, + "grad_norm": 2.618957757949829, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7326330542564392, + "num_tokens": 303870775.0, + "step": 12175 + }, + { + "epoch": 1.3371403470239402, + "grad_norm": 2.259800672531128, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7246534824371338, + "num_tokens": 303894661.0, + "step": 12176 + }, + { + "epoch": 1.337250164726554, + "grad_norm": 2.0707664489746094, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.737520158290863, + "num_tokens": 303919873.0, + "step": 12177 + }, + { + "epoch": 1.3373599824291675, + "grad_norm": 2.331205368041992, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7288810610771179, + "num_tokens": 303944444.0, + "step": 12178 + }, + { + "epoch": 1.3374698001317813, + "grad_norm": 2.2254786491394043, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7122602462768555, + "num_tokens": 303968549.0, + "step": 12179 + }, + { + "epoch": 1.3375796178343948, + "grad_norm": 1.9672504663467407, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7075046300888062, + "num_tokens": 304000358.0, + "step": 12180 + }, + { + "epoch": 1.3376894355370086, 
+ "grad_norm": 2.1025278568267822, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7340491414070129, + "num_tokens": 304027955.0, + "step": 12181 + }, + { + "epoch": 1.3377992532396221, + "grad_norm": 2.3108489513397217, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7074716091156006, + "num_tokens": 304052064.0, + "step": 12182 + }, + { + "epoch": 1.3379090709422359, + "grad_norm": 2.4406349658966064, + "learning_rate": 1e-06, + "loss": 0.8736, + "mean_token_accuracy": 0.732332706451416, + "num_tokens": 304074305.0, + "step": 12183 + }, + { + "epoch": 1.3380188886448496, + "grad_norm": 2.2609493732452393, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7148795127868652, + "num_tokens": 304097473.0, + "step": 12184 + }, + { + "epoch": 1.3381287063474632, + "grad_norm": 2.1284382343292236, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7074628472328186, + "num_tokens": 304125682.0, + "step": 12185 + }, + { + "epoch": 1.338238524050077, + "grad_norm": 2.5267701148986816, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7189747095108032, + "num_tokens": 304146052.0, + "step": 12186 + }, + { + "epoch": 1.3383483417526905, + "grad_norm": 2.230048418045044, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7285257577896118, + "num_tokens": 304172277.0, + "step": 12187 + }, + { + "epoch": 1.3384581594553042, + "grad_norm": 2.062091588973999, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7222052812576294, + "num_tokens": 304199288.0, + "step": 12188 + }, + { + "epoch": 1.338567977157918, + "grad_norm": 2.2657716274261475, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7546694278717041, + "num_tokens": 304224328.0, + "step": 12189 + }, + { + "epoch": 1.3386777948605315, + "grad_norm": 2.2223379611968994, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7193975448608398, 
+ "num_tokens": 304250074.0, + "step": 12190 + }, + { + "epoch": 1.338787612563145, + "grad_norm": 2.5327048301696777, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7195471525192261, + "num_tokens": 304270075.0, + "step": 12191 + }, + { + "epoch": 1.3388974302657588, + "grad_norm": 2.189960479736328, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7204324007034302, + "num_tokens": 304296700.0, + "step": 12192 + }, + { + "epoch": 1.3390072479683726, + "grad_norm": 2.0030503273010254, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7271466851234436, + "num_tokens": 304328385.0, + "step": 12193 + }, + { + "epoch": 1.339117065670986, + "grad_norm": 2.511573314666748, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7234522700309753, + "num_tokens": 304349531.0, + "step": 12194 + }, + { + "epoch": 1.3392268833735999, + "grad_norm": 2.2647457122802734, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7217515707015991, + "num_tokens": 304375336.0, + "step": 12195 + }, + { + "epoch": 1.3393367010762134, + "grad_norm": 2.3583178520202637, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7058721780776978, + "num_tokens": 304399034.0, + "step": 12196 + }, + { + "epoch": 1.3394465187788271, + "grad_norm": 2.366349935531616, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7325925230979919, + "num_tokens": 304423229.0, + "step": 12197 + }, + { + "epoch": 1.339556336481441, + "grad_norm": 2.672574758529663, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7366952300071716, + "num_tokens": 304441618.0, + "step": 12198 + }, + { + "epoch": 1.3396661541840544, + "grad_norm": 1.9480540752410889, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6873161196708679, + "num_tokens": 304474682.0, + "step": 12199 + }, + { + "epoch": 1.3397759718866682, + "grad_norm": 2.292457103729248, + 
"learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7117454409599304, + "num_tokens": 304499870.0, + "step": 12200 + }, + { + "epoch": 1.3398857895892817, + "grad_norm": 2.1578850746154785, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7159047722816467, + "num_tokens": 304526978.0, + "step": 12201 + }, + { + "epoch": 1.3399956072918955, + "grad_norm": 2.049344062805176, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7075594067573547, + "num_tokens": 304559138.0, + "step": 12202 + }, + { + "epoch": 1.3401054249945092, + "grad_norm": 2.6505942344665527, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7529374957084656, + "num_tokens": 304578872.0, + "step": 12203 + }, + { + "epoch": 1.3402152426971228, + "grad_norm": 2.319531202316284, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7187708616256714, + "num_tokens": 304604232.0, + "step": 12204 + }, + { + "epoch": 1.3403250603997363, + "grad_norm": 2.297126293182373, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.714253306388855, + "num_tokens": 304629150.0, + "step": 12205 + }, + { + "epoch": 1.34043487810235, + "grad_norm": 2.496335506439209, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7190960645675659, + "num_tokens": 304650102.0, + "step": 12206 + }, + { + "epoch": 1.3405446958049638, + "grad_norm": 2.257756233215332, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.7075480222702026, + "num_tokens": 304676409.0, + "step": 12207 + }, + { + "epoch": 1.3406545135075774, + "grad_norm": 2.225506067276001, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7428089380264282, + "num_tokens": 304703321.0, + "step": 12208 + }, + { + "epoch": 1.3407643312101911, + "grad_norm": 2.4272429943084717, + "learning_rate": 1e-06, + "loss": 0.7938, + "mean_token_accuracy": 0.7513453960418701, + "num_tokens": 304725499.0, + "step": 
12209 + }, + { + "epoch": 1.3408741489128047, + "grad_norm": 2.451686382293701, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7227616310119629, + "num_tokens": 304747905.0, + "step": 12210 + }, + { + "epoch": 1.3409839666154184, + "grad_norm": 2.056934118270874, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7158815264701843, + "num_tokens": 304775435.0, + "step": 12211 + }, + { + "epoch": 1.3410937843180322, + "grad_norm": 2.379047155380249, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.732322633266449, + "num_tokens": 304798637.0, + "step": 12212 + }, + { + "epoch": 1.3412036020206457, + "grad_norm": 2.331751823425293, + "learning_rate": 1e-06, + "loss": 0.8384, + "mean_token_accuracy": 0.7352993488311768, + "num_tokens": 304821644.0, + "step": 12213 + }, + { + "epoch": 1.3413134197232595, + "grad_norm": 2.29099178314209, + "learning_rate": 1e-06, + "loss": 0.8273, + "mean_token_accuracy": 0.7459686994552612, + "num_tokens": 304845836.0, + "step": 12214 + }, + { + "epoch": 1.341423237425873, + "grad_norm": 1.9731160402297974, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6974168419837952, + "num_tokens": 304876047.0, + "step": 12215 + }, + { + "epoch": 1.3415330551284868, + "grad_norm": 2.223900079727173, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7160637378692627, + "num_tokens": 304900842.0, + "step": 12216 + }, + { + "epoch": 1.3416428728311005, + "grad_norm": 2.0998494625091553, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7047027349472046, + "num_tokens": 304929787.0, + "step": 12217 + }, + { + "epoch": 1.341752690533714, + "grad_norm": 2.8610806465148926, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7197673320770264, + "num_tokens": 304946388.0, + "step": 12218 + }, + { + "epoch": 1.3418625082363276, + "grad_norm": 2.105095863342285, + "learning_rate": 1e-06, + "loss": 0.8811, + 
"mean_token_accuracy": 0.7232950329780579, + "num_tokens": 304973846.0, + "step": 12219 + }, + { + "epoch": 1.3419723259389413, + "grad_norm": 2.3873515129089355, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7089198231697083, + "num_tokens": 304996998.0, + "step": 12220 + }, + { + "epoch": 1.342082143641555, + "grad_norm": 2.5828702449798584, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7185500860214233, + "num_tokens": 305017216.0, + "step": 12221 + }, + { + "epoch": 1.3421919613441686, + "grad_norm": 1.9293314218521118, + "learning_rate": 1e-06, + "loss": 1.0265, + "mean_token_accuracy": 0.6921991109848022, + "num_tokens": 305050026.0, + "step": 12222 + }, + { + "epoch": 1.3423017790467824, + "grad_norm": 2.2398879528045654, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7427181601524353, + "num_tokens": 305074283.0, + "step": 12223 + }, + { + "epoch": 1.342411596749396, + "grad_norm": 2.118368625640869, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7306990027427673, + "num_tokens": 305099621.0, + "step": 12224 + }, + { + "epoch": 1.3425214144520097, + "grad_norm": 2.236212968826294, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7405163049697876, + "num_tokens": 305124348.0, + "step": 12225 + }, + { + "epoch": 1.3426312321546234, + "grad_norm": 2.52313232421875, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7156323194503784, + "num_tokens": 305147477.0, + "step": 12226 + }, + { + "epoch": 1.342741049857237, + "grad_norm": 2.354919910430908, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7173075675964355, + "num_tokens": 305172574.0, + "step": 12227 + }, + { + "epoch": 1.3428508675598507, + "grad_norm": 2.715329885482788, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7162038087844849, + "num_tokens": 305191004.0, + "step": 12228 + }, + { + "epoch": 1.3429606852624643, + 
"grad_norm": 2.2163190841674805, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7228655815124512, + "num_tokens": 305220244.0, + "step": 12229 + }, + { + "epoch": 1.343070502965078, + "grad_norm": 2.435523271560669, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7247490882873535, + "num_tokens": 305243422.0, + "step": 12230 + }, + { + "epoch": 1.3431803206676916, + "grad_norm": 2.4284873008728027, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7267951965332031, + "num_tokens": 305265840.0, + "step": 12231 + }, + { + "epoch": 1.3432901383703053, + "grad_norm": 2.41191029548645, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7319489121437073, + "num_tokens": 305288067.0, + "step": 12232 + }, + { + "epoch": 1.3433999560729188, + "grad_norm": 2.1060941219329834, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7120237350463867, + "num_tokens": 305318095.0, + "step": 12233 + }, + { + "epoch": 1.3435097737755326, + "grad_norm": 2.0749573707580566, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.70132976770401, + "num_tokens": 305346397.0, + "step": 12234 + }, + { + "epoch": 1.3436195914781464, + "grad_norm": 1.9676661491394043, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7127884030342102, + "num_tokens": 305376836.0, + "step": 12235 + }, + { + "epoch": 1.34372940918076, + "grad_norm": 2.2267587184906006, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7121914029121399, + "num_tokens": 305403177.0, + "step": 12236 + }, + { + "epoch": 1.3438392268833736, + "grad_norm": 2.3802928924560547, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7299586534500122, + "num_tokens": 305426470.0, + "step": 12237 + }, + { + "epoch": 1.3439490445859872, + "grad_norm": 2.273350238800049, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.6979105472564697, + 
"num_tokens": 305454128.0, + "step": 12238 + }, + { + "epoch": 1.344058862288601, + "grad_norm": 1.953856110572815, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.7110958099365234, + "num_tokens": 305488019.0, + "step": 12239 + }, + { + "epoch": 1.3441686799912147, + "grad_norm": 2.1547141075134277, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7183898687362671, + "num_tokens": 305515859.0, + "step": 12240 + }, + { + "epoch": 1.3442784976938282, + "grad_norm": 2.3041133880615234, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7006183862686157, + "num_tokens": 305539488.0, + "step": 12241 + }, + { + "epoch": 1.3443883153964418, + "grad_norm": 2.2589523792266846, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7105390429496765, + "num_tokens": 305564079.0, + "step": 12242 + }, + { + "epoch": 1.3444981330990555, + "grad_norm": 2.667707681655884, + "learning_rate": 1e-06, + "loss": 0.7981, + "mean_token_accuracy": 0.7482736110687256, + "num_tokens": 305582855.0, + "step": 12243 + }, + { + "epoch": 1.3446079508016693, + "grad_norm": 2.001984119415283, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.7009590268135071, + "num_tokens": 305615826.0, + "step": 12244 + }, + { + "epoch": 1.3447177685042828, + "grad_norm": 2.2565841674804688, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7145622372627258, + "num_tokens": 305641931.0, + "step": 12245 + }, + { + "epoch": 1.3448275862068966, + "grad_norm": 2.4335057735443115, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7375872135162354, + "num_tokens": 305662406.0, + "step": 12246 + }, + { + "epoch": 1.34493740390951, + "grad_norm": 2.4064910411834717, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7100130915641785, + "num_tokens": 305686116.0, + "step": 12247 + }, + { + "epoch": 1.3450472216121239, + "grad_norm": 2.368224859237671, + 
"learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.6995707750320435, + "num_tokens": 305711841.0, + "step": 12248 + }, + { + "epoch": 1.3451570393147376, + "grad_norm": 2.099590301513672, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6868155598640442, + "num_tokens": 305742507.0, + "step": 12249 + }, + { + "epoch": 1.3452668570173512, + "grad_norm": 2.125005006790161, + "learning_rate": 1e-06, + "loss": 0.975, + "mean_token_accuracy": 0.7049615979194641, + "num_tokens": 305773462.0, + "step": 12250 + }, + { + "epoch": 1.345376674719965, + "grad_norm": 2.7001705169677734, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7372831106185913, + "num_tokens": 305793742.0, + "step": 12251 + }, + { + "epoch": 1.3454864924225785, + "grad_norm": 2.3877947330474854, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7187406420707703, + "num_tokens": 305818563.0, + "step": 12252 + }, + { + "epoch": 1.3455963101251922, + "grad_norm": 2.1561057567596436, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7220994830131531, + "num_tokens": 305845231.0, + "step": 12253 + }, + { + "epoch": 1.345706127827806, + "grad_norm": 2.3768646717071533, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7140166759490967, + "num_tokens": 305867409.0, + "step": 12254 + }, + { + "epoch": 1.3458159455304195, + "grad_norm": 2.4403674602508545, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7429718971252441, + "num_tokens": 305888347.0, + "step": 12255 + }, + { + "epoch": 1.345925763233033, + "grad_norm": 2.3478572368621826, + "learning_rate": 1e-06, + "loss": 0.8551, + "mean_token_accuracy": 0.7311015129089355, + "num_tokens": 305911596.0, + "step": 12256 + }, + { + "epoch": 1.3460355809356468, + "grad_norm": 2.335573434829712, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7172036170959473, + "num_tokens": 305935109.0, + "step": 
12257 + }, + { + "epoch": 1.3461453986382605, + "grad_norm": 2.2132809162139893, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7143831253051758, + "num_tokens": 305963761.0, + "step": 12258 + }, + { + "epoch": 1.346255216340874, + "grad_norm": 2.4203033447265625, + "learning_rate": 1e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7447136044502258, + "num_tokens": 305984889.0, + "step": 12259 + }, + { + "epoch": 1.3463650340434878, + "grad_norm": 2.3542065620422363, + "learning_rate": 1e-06, + "loss": 0.8141, + "mean_token_accuracy": 0.7379723787307739, + "num_tokens": 306008697.0, + "step": 12260 + }, + { + "epoch": 1.3464748517461014, + "grad_norm": 2.604865550994873, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7278566360473633, + "num_tokens": 306029501.0, + "step": 12261 + }, + { + "epoch": 1.3465846694487151, + "grad_norm": 2.3330721855163574, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7193572521209717, + "num_tokens": 306053516.0, + "step": 12262 + }, + { + "epoch": 1.346694487151329, + "grad_norm": 2.0287888050079346, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.707107424736023, + "num_tokens": 306083556.0, + "step": 12263 + }, + { + "epoch": 1.3468043048539424, + "grad_norm": 2.3224239349365234, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7046989798545837, + "num_tokens": 306109796.0, + "step": 12264 + }, + { + "epoch": 1.3469141225565562, + "grad_norm": 2.138779401779175, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7270444631576538, + "num_tokens": 306138192.0, + "step": 12265 + }, + { + "epoch": 1.3470239402591697, + "grad_norm": 2.3468070030212402, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7160332798957825, + "num_tokens": 306160847.0, + "step": 12266 + }, + { + "epoch": 1.3471337579617835, + "grad_norm": 2.053551197052002, + "learning_rate": 1e-06, + "loss": 0.9978, 
+ "mean_token_accuracy": 0.6938712000846863, + "num_tokens": 306190461.0, + "step": 12267 + }, + { + "epoch": 1.3472435756643972, + "grad_norm": 2.235645055770874, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6964266300201416, + "num_tokens": 306216979.0, + "step": 12268 + }, + { + "epoch": 1.3473533933670108, + "grad_norm": 2.6268696784973145, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6907901763916016, + "num_tokens": 306236332.0, + "step": 12269 + }, + { + "epoch": 1.3474632110696243, + "grad_norm": 2.215473175048828, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7022035121917725, + "num_tokens": 306262550.0, + "step": 12270 + }, + { + "epoch": 1.347573028772238, + "grad_norm": 1.9319844245910645, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7348848581314087, + "num_tokens": 306293674.0, + "step": 12271 + }, + { + "epoch": 1.3476828464748518, + "grad_norm": 2.3299806118011475, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7202746868133545, + "num_tokens": 306319287.0, + "step": 12272 + }, + { + "epoch": 1.3477926641774653, + "grad_norm": 2.2370166778564453, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7004376649856567, + "num_tokens": 306345059.0, + "step": 12273 + }, + { + "epoch": 1.347902481880079, + "grad_norm": 2.3319036960601807, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7024928331375122, + "num_tokens": 306369102.0, + "step": 12274 + }, + { + "epoch": 1.3480122995826926, + "grad_norm": 2.1754064559936523, + "learning_rate": 1e-06, + "loss": 0.9414, + "mean_token_accuracy": 0.7130377292633057, + "num_tokens": 306396795.0, + "step": 12275 + }, + { + "epoch": 1.3481221172853064, + "grad_norm": 2.002262830734253, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7282476425170898, + "num_tokens": 306425624.0, + "step": 12276 + }, + { + "epoch": 
1.3482319349879202, + "grad_norm": 2.2972280979156494, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7200229167938232, + "num_tokens": 306450167.0, + "step": 12277 + }, + { + "epoch": 1.3483417526905337, + "grad_norm": 2.1530683040618896, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.7018994092941284, + "num_tokens": 306480080.0, + "step": 12278 + }, + { + "epoch": 1.3484515703931474, + "grad_norm": 2.0922322273254395, + "learning_rate": 1e-06, + "loss": 1.0653, + "mean_token_accuracy": 0.6908434629440308, + "num_tokens": 306511558.0, + "step": 12279 + }, + { + "epoch": 1.348561388095761, + "grad_norm": 2.060647487640381, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6986280679702759, + "num_tokens": 306542396.0, + "step": 12280 + }, + { + "epoch": 1.3486712057983747, + "grad_norm": 2.4619996547698975, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7276999950408936, + "num_tokens": 306563059.0, + "step": 12281 + }, + { + "epoch": 1.3487810235009885, + "grad_norm": 2.1795835494995117, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7189019322395325, + "num_tokens": 306588338.0, + "step": 12282 + }, + { + "epoch": 1.348890841203602, + "grad_norm": 2.036771535873413, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7361091375350952, + "num_tokens": 306617276.0, + "step": 12283 + }, + { + "epoch": 1.3490006589062156, + "grad_norm": 2.290825605392456, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7216707468032837, + "num_tokens": 306640489.0, + "step": 12284 + }, + { + "epoch": 1.3491104766088293, + "grad_norm": 2.1089928150177, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7018846273422241, + "num_tokens": 306668718.0, + "step": 12285 + }, + { + "epoch": 1.349220294311443, + "grad_norm": 2.3496100902557373, + "learning_rate": 1e-06, + "loss": 0.9517, + "mean_token_accuracy": 
0.704054057598114, + "num_tokens": 306693452.0, + "step": 12286 + }, + { + "epoch": 1.3493301120140566, + "grad_norm": 2.405134439468384, + "learning_rate": 1e-06, + "loss": 0.8282, + "mean_token_accuracy": 0.7352195978164673, + "num_tokens": 306715589.0, + "step": 12287 + }, + { + "epoch": 1.3494399297166704, + "grad_norm": 2.4832370281219482, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7382802963256836, + "num_tokens": 306735588.0, + "step": 12288 + }, + { + "epoch": 1.349549747419284, + "grad_norm": 2.330047845840454, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7344506978988647, + "num_tokens": 306757493.0, + "step": 12289 + }, + { + "epoch": 1.3496595651218977, + "grad_norm": 2.1964070796966553, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7091739773750305, + "num_tokens": 306783357.0, + "step": 12290 + }, + { + "epoch": 1.3497693828245114, + "grad_norm": 2.350473642349243, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7228008508682251, + "num_tokens": 306807127.0, + "step": 12291 + }, + { + "epoch": 1.349879200527125, + "grad_norm": 2.057220458984375, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7215749621391296, + "num_tokens": 306835201.0, + "step": 12292 + }, + { + "epoch": 1.3499890182297387, + "grad_norm": 2.392427682876587, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.712762713432312, + "num_tokens": 306858894.0, + "step": 12293 + }, + { + "epoch": 1.3500988359323522, + "grad_norm": 2.4713549613952637, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7109220027923584, + "num_tokens": 306880359.0, + "step": 12294 + }, + { + "epoch": 1.350208653634966, + "grad_norm": 2.5558927059173584, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7269200086593628, + "num_tokens": 306899984.0, + "step": 12295 + }, + { + "epoch": 1.3503184713375795, + "grad_norm": 
2.2897207736968994, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7313907146453857, + "num_tokens": 306925594.0, + "step": 12296 + }, + { + "epoch": 1.3504282890401933, + "grad_norm": 2.306410312652588, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7183699011802673, + "num_tokens": 306952818.0, + "step": 12297 + }, + { + "epoch": 1.3505381067428068, + "grad_norm": 1.950003743171692, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7223470211029053, + "num_tokens": 306985404.0, + "step": 12298 + }, + { + "epoch": 1.3506479244454206, + "grad_norm": 2.1307406425476074, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7041172981262207, + "num_tokens": 307011739.0, + "step": 12299 + }, + { + "epoch": 1.3507577421480343, + "grad_norm": 2.3401894569396973, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.736319899559021, + "num_tokens": 307034375.0, + "step": 12300 + }, + { + "epoch": 1.3508675598506479, + "grad_norm": 2.139087677001953, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.6917864084243774, + "num_tokens": 307062965.0, + "step": 12301 + }, + { + "epoch": 1.3509773775532616, + "grad_norm": 2.562269687652588, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7260000705718994, + "num_tokens": 307082556.0, + "step": 12302 + }, + { + "epoch": 1.3510871952558752, + "grad_norm": 2.204568862915039, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7161360383033752, + "num_tokens": 307108569.0, + "step": 12303 + }, + { + "epoch": 1.351197012958489, + "grad_norm": 2.4620370864868164, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7208243608474731, + "num_tokens": 307130559.0, + "step": 12304 + }, + { + "epoch": 1.3513068306611027, + "grad_norm": 2.449404001235962, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7055907249450684, + "num_tokens": 
307152258.0, + "step": 12305 + }, + { + "epoch": 1.3514166483637162, + "grad_norm": 2.7809088230133057, + "learning_rate": 1e-06, + "loss": 0.756, + "mean_token_accuracy": 0.7533053159713745, + "num_tokens": 307169167.0, + "step": 12306 + }, + { + "epoch": 1.3515264660663298, + "grad_norm": 2.1489176750183105, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6876017451286316, + "num_tokens": 307195916.0, + "step": 12307 + }, + { + "epoch": 1.3516362837689435, + "grad_norm": 2.5235729217529297, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7333617210388184, + "num_tokens": 307216089.0, + "step": 12308 + }, + { + "epoch": 1.3517461014715573, + "grad_norm": 2.348727226257324, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7082356214523315, + "num_tokens": 307240553.0, + "step": 12309 + }, + { + "epoch": 1.3518559191741708, + "grad_norm": 2.508378267288208, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7073936462402344, + "num_tokens": 307262333.0, + "step": 12310 + }, + { + "epoch": 1.3519657368767846, + "grad_norm": 2.349874496459961, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7243553400039673, + "num_tokens": 307286997.0, + "step": 12311 + }, + { + "epoch": 1.352075554579398, + "grad_norm": 2.361806869506836, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7216382622718811, + "num_tokens": 307309756.0, + "step": 12312 + }, + { + "epoch": 1.3521853722820119, + "grad_norm": 2.2186944484710693, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7180202007293701, + "num_tokens": 307335057.0, + "step": 12313 + }, + { + "epoch": 1.3522951899846256, + "grad_norm": 2.0670483112335205, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7295478582382202, + "num_tokens": 307363454.0, + "step": 12314 + }, + { + "epoch": 1.3524050076872391, + "grad_norm": 2.6257331371307373, + "learning_rate": 
1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7095271348953247, + "num_tokens": 307384806.0, + "step": 12315 + }, + { + "epoch": 1.352514825389853, + "grad_norm": 2.2578487396240234, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7371039986610413, + "num_tokens": 307412412.0, + "step": 12316 + }, + { + "epoch": 1.3526246430924664, + "grad_norm": 2.2502589225769043, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7212828397750854, + "num_tokens": 307438454.0, + "step": 12317 + }, + { + "epoch": 1.3527344607950802, + "grad_norm": 2.434213638305664, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.724708080291748, + "num_tokens": 307460667.0, + "step": 12318 + }, + { + "epoch": 1.352844278497694, + "grad_norm": 2.2694170475006104, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7112348079681396, + "num_tokens": 307485405.0, + "step": 12319 + }, + { + "epoch": 1.3529540962003075, + "grad_norm": 2.1695005893707275, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7063295841217041, + "num_tokens": 307513179.0, + "step": 12320 + }, + { + "epoch": 1.353063913902921, + "grad_norm": 2.288947582244873, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7244859933853149, + "num_tokens": 307537104.0, + "step": 12321 + }, + { + "epoch": 1.3531737316055348, + "grad_norm": 2.084318161010742, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7216347455978394, + "num_tokens": 307567213.0, + "step": 12322 + }, + { + "epoch": 1.3532835493081485, + "grad_norm": 2.362658977508545, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7380176186561584, + "num_tokens": 307589998.0, + "step": 12323 + }, + { + "epoch": 1.353393367010762, + "grad_norm": 2.3117995262145996, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7310802936553955, + "num_tokens": 307612952.0, + "step": 12324 + }, + { + 
"epoch": 1.3535031847133758, + "grad_norm": 2.1935348510742188, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7108360528945923, + "num_tokens": 307638059.0, + "step": 12325 + }, + { + "epoch": 1.3536130024159894, + "grad_norm": 2.5546441078186035, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7171482443809509, + "num_tokens": 307659985.0, + "step": 12326 + }, + { + "epoch": 1.3537228201186031, + "grad_norm": 2.289949655532837, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.724175751209259, + "num_tokens": 307683190.0, + "step": 12327 + }, + { + "epoch": 1.3538326378212169, + "grad_norm": 2.286550283432007, + "learning_rate": 1e-06, + "loss": 0.9686, + "mean_token_accuracy": 0.7151904106140137, + "num_tokens": 307707415.0, + "step": 12328 + }, + { + "epoch": 1.3539424555238304, + "grad_norm": 2.235233783721924, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7211714386940002, + "num_tokens": 307733554.0, + "step": 12329 + }, + { + "epoch": 1.3540522732264442, + "grad_norm": 2.20692777633667, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7300927639007568, + "num_tokens": 307759079.0, + "step": 12330 + }, + { + "epoch": 1.3541620909290577, + "grad_norm": 2.1881067752838135, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.715348482131958, + "num_tokens": 307787184.0, + "step": 12331 + }, + { + "epoch": 1.3542719086316715, + "grad_norm": 2.414738655090332, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7194626331329346, + "num_tokens": 307811201.0, + "step": 12332 + }, + { + "epoch": 1.3543817263342852, + "grad_norm": 2.350982904434204, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.738365888595581, + "num_tokens": 307833101.0, + "step": 12333 + }, + { + "epoch": 1.3544915440368988, + "grad_norm": 2.2211809158325195, + "learning_rate": 1e-06, + "loss": 0.9593, + 
"mean_token_accuracy": 0.719353437423706, + "num_tokens": 307861739.0, + "step": 12334 + }, + { + "epoch": 1.3546013617395123, + "grad_norm": 2.076657772064209, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.6975334882736206, + "num_tokens": 307892054.0, + "step": 12335 + }, + { + "epoch": 1.354711179442126, + "grad_norm": 2.749177932739258, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7357112765312195, + "num_tokens": 307909934.0, + "step": 12336 + }, + { + "epoch": 1.3548209971447398, + "grad_norm": 2.2818729877471924, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7094879150390625, + "num_tokens": 307936363.0, + "step": 12337 + }, + { + "epoch": 1.3549308148473533, + "grad_norm": 2.544276237487793, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7196570634841919, + "num_tokens": 307956804.0, + "step": 12338 + }, + { + "epoch": 1.355040632549967, + "grad_norm": 2.078803539276123, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7261819839477539, + "num_tokens": 307986142.0, + "step": 12339 + }, + { + "epoch": 1.3551504502525806, + "grad_norm": 2.4087867736816406, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7136523127555847, + "num_tokens": 308010826.0, + "step": 12340 + }, + { + "epoch": 1.3552602679551944, + "grad_norm": 2.163466215133667, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7027447819709778, + "num_tokens": 308036570.0, + "step": 12341 + }, + { + "epoch": 1.3553700856578081, + "grad_norm": 2.5585134029388428, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 0.7320572137832642, + "num_tokens": 308055282.0, + "step": 12342 + }, + { + "epoch": 1.3554799033604217, + "grad_norm": 2.194052219390869, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7320238351821899, + "num_tokens": 308081370.0, + "step": 12343 + }, + { + "epoch": 1.3555897210630354, 
+ "grad_norm": 2.054349660873413, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.715319812297821, + "num_tokens": 308111434.0, + "step": 12344 + }, + { + "epoch": 1.355699538765649, + "grad_norm": 2.4125475883483887, + "learning_rate": 1e-06, + "loss": 0.8542, + "mean_token_accuracy": 0.7327395081520081, + "num_tokens": 308133512.0, + "step": 12345 + }, + { + "epoch": 1.3558093564682627, + "grad_norm": 2.4044125080108643, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7247956991195679, + "num_tokens": 308155940.0, + "step": 12346 + }, + { + "epoch": 1.3559191741708763, + "grad_norm": 2.68094539642334, + "learning_rate": 1e-06, + "loss": 0.814, + "mean_token_accuracy": 0.739081859588623, + "num_tokens": 308173781.0, + "step": 12347 + }, + { + "epoch": 1.35602899187349, + "grad_norm": 2.29386043548584, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7270113825798035, + "num_tokens": 308197606.0, + "step": 12348 + }, + { + "epoch": 1.3561388095761036, + "grad_norm": 2.363654613494873, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7155594229698181, + "num_tokens": 308221239.0, + "step": 12349 + }, + { + "epoch": 1.3562486272787173, + "grad_norm": 2.2628962993621826, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7338754534721375, + "num_tokens": 308247773.0, + "step": 12350 + }, + { + "epoch": 1.356358444981331, + "grad_norm": 2.1356542110443115, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7129752039909363, + "num_tokens": 308275100.0, + "step": 12351 + }, + { + "epoch": 1.3564682626839446, + "grad_norm": 2.407362699508667, + "learning_rate": 1e-06, + "loss": 0.747, + "mean_token_accuracy": 0.766924262046814, + "num_tokens": 308295606.0, + "step": 12352 + }, + { + "epoch": 1.3565780803865584, + "grad_norm": 2.1158862113952637, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.6926336288452148, + 
"num_tokens": 308324286.0, + "step": 12353 + }, + { + "epoch": 1.356687898089172, + "grad_norm": 2.0869009494781494, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.718042254447937, + "num_tokens": 308353025.0, + "step": 12354 + }, + { + "epoch": 1.3567977157917857, + "grad_norm": 2.1870534420013428, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7322639226913452, + "num_tokens": 308379420.0, + "step": 12355 + }, + { + "epoch": 1.3569075334943994, + "grad_norm": 2.569430351257324, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7195433974266052, + "num_tokens": 308399657.0, + "step": 12356 + }, + { + "epoch": 1.357017351197013, + "grad_norm": 2.4469687938690186, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7229660749435425, + "num_tokens": 308423500.0, + "step": 12357 + }, + { + "epoch": 1.3571271688996267, + "grad_norm": 2.2494373321533203, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7275698184967041, + "num_tokens": 308449474.0, + "step": 12358 + }, + { + "epoch": 1.3572369866022402, + "grad_norm": 2.5863821506500244, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7386449575424194, + "num_tokens": 308470198.0, + "step": 12359 + }, + { + "epoch": 1.357346804304854, + "grad_norm": 2.0305144786834717, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.6981304883956909, + "num_tokens": 308498688.0, + "step": 12360 + }, + { + "epoch": 1.3574566220074675, + "grad_norm": 2.336181163787842, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7170776128768921, + "num_tokens": 308522383.0, + "step": 12361 + }, + { + "epoch": 1.3575664397100813, + "grad_norm": 2.3785176277160645, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7053962349891663, + "num_tokens": 308548010.0, + "step": 12362 + }, + { + "epoch": 1.3576762574126948, + "grad_norm": 2.456637144088745, + 
"learning_rate": 1e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.7462895512580872, + "num_tokens": 308569578.0, + "step": 12363 + }, + { + "epoch": 1.3577860751153086, + "grad_norm": 2.436552047729492, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7251738905906677, + "num_tokens": 308590484.0, + "step": 12364 + }, + { + "epoch": 1.3578958928179223, + "grad_norm": 2.2045319080352783, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7534784078598022, + "num_tokens": 308615600.0, + "step": 12365 + }, + { + "epoch": 1.3580057105205359, + "grad_norm": 2.688566207885742, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7103840112686157, + "num_tokens": 308634969.0, + "step": 12366 + }, + { + "epoch": 1.3581155282231496, + "grad_norm": 2.254504680633545, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7214471101760864, + "num_tokens": 308661927.0, + "step": 12367 + }, + { + "epoch": 1.3582253459257632, + "grad_norm": 2.453939437866211, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7442655563354492, + "num_tokens": 308682341.0, + "step": 12368 + }, + { + "epoch": 1.358335163628377, + "grad_norm": 2.5902626514434814, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7076659202575684, + "num_tokens": 308704196.0, + "step": 12369 + }, + { + "epoch": 1.3584449813309907, + "grad_norm": 2.051503896713257, + "learning_rate": 1e-06, + "loss": 1.0051, + "mean_token_accuracy": 0.694446861743927, + "num_tokens": 308735280.0, + "step": 12370 + }, + { + "epoch": 1.3585547990336042, + "grad_norm": 2.107409954071045, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7137527465820312, + "num_tokens": 308764116.0, + "step": 12371 + }, + { + "epoch": 1.3586646167362177, + "grad_norm": 2.85121488571167, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.715890645980835, + "num_tokens": 308784254.0, + "step": 
12372 + }, + { + "epoch": 1.3587744344388315, + "grad_norm": 2.2730438709259033, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6961283087730408, + "num_tokens": 308810497.0, + "step": 12373 + }, + { + "epoch": 1.3588842521414453, + "grad_norm": 2.205777883529663, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7014697790145874, + "num_tokens": 308835185.0, + "step": 12374 + }, + { + "epoch": 1.3589940698440588, + "grad_norm": 2.3258779048919678, + "learning_rate": 1e-06, + "loss": 0.8299, + "mean_token_accuracy": 0.7354413270950317, + "num_tokens": 308858385.0, + "step": 12375 + }, + { + "epoch": 1.3591038875466726, + "grad_norm": 2.292665719985962, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.727968156337738, + "num_tokens": 308883781.0, + "step": 12376 + }, + { + "epoch": 1.359213705249286, + "grad_norm": 2.229377508163452, + "learning_rate": 1e-06, + "loss": 0.8505, + "mean_token_accuracy": 0.7275097370147705, + "num_tokens": 308907259.0, + "step": 12377 + }, + { + "epoch": 1.3593235229518998, + "grad_norm": 2.4322328567504883, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7294970750808716, + "num_tokens": 308930774.0, + "step": 12378 + }, + { + "epoch": 1.3594333406545136, + "grad_norm": 2.313931703567505, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7410135269165039, + "num_tokens": 308953529.0, + "step": 12379 + }, + { + "epoch": 1.3595431583571271, + "grad_norm": 2.5470004081726074, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7142270803451538, + "num_tokens": 308974884.0, + "step": 12380 + }, + { + "epoch": 1.359652976059741, + "grad_norm": 2.1822211742401123, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7231318950653076, + "num_tokens": 309002567.0, + "step": 12381 + }, + { + "epoch": 1.3597627937623544, + "grad_norm": 2.726017951965332, + "learning_rate": 1e-06, + "loss": 0.9177, + 
"mean_token_accuracy": 0.7151213884353638, + "num_tokens": 309020948.0, + "step": 12382 + }, + { + "epoch": 1.3598726114649682, + "grad_norm": 2.5282814502716064, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7006239891052246, + "num_tokens": 309044680.0, + "step": 12383 + }, + { + "epoch": 1.359982429167582, + "grad_norm": 2.4592976570129395, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.709709644317627, + "num_tokens": 309068185.0, + "step": 12384 + }, + { + "epoch": 1.3600922468701955, + "grad_norm": 2.362086296081543, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.705554187297821, + "num_tokens": 309093158.0, + "step": 12385 + }, + { + "epoch": 1.360202064572809, + "grad_norm": 2.2410666942596436, + "learning_rate": 1e-06, + "loss": 1.0252, + "mean_token_accuracy": 0.692975640296936, + "num_tokens": 309121247.0, + "step": 12386 + }, + { + "epoch": 1.3603118822754228, + "grad_norm": 2.427156448364258, + "learning_rate": 1e-06, + "loss": 0.8762, + "mean_token_accuracy": 0.731819748878479, + "num_tokens": 309143931.0, + "step": 12387 + }, + { + "epoch": 1.3604216999780365, + "grad_norm": 2.08656644821167, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7116326689720154, + "num_tokens": 309173538.0, + "step": 12388 + }, + { + "epoch": 1.36053151768065, + "grad_norm": 2.460437774658203, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7497711777687073, + "num_tokens": 309194501.0, + "step": 12389 + }, + { + "epoch": 1.3606413353832638, + "grad_norm": 1.9948232173919678, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7460618019104004, + "num_tokens": 309225725.0, + "step": 12390 + }, + { + "epoch": 1.3607511530858774, + "grad_norm": 2.3615169525146484, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7173052430152893, + "num_tokens": 309248133.0, + "step": 12391 + }, + { + "epoch": 1.360860970788491, + 
"grad_norm": 2.199319362640381, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6954960227012634, + "num_tokens": 309274527.0, + "step": 12392 + }, + { + "epoch": 1.3609707884911049, + "grad_norm": 2.2786056995391846, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7003504037857056, + "num_tokens": 309300337.0, + "step": 12393 + }, + { + "epoch": 1.3610806061937184, + "grad_norm": 2.525913953781128, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7282618284225464, + "num_tokens": 309321318.0, + "step": 12394 + }, + { + "epoch": 1.3611904238963322, + "grad_norm": 2.250509738922119, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7410323023796082, + "num_tokens": 309346384.0, + "step": 12395 + }, + { + "epoch": 1.3613002415989457, + "grad_norm": 2.4302637577056885, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7191218137741089, + "num_tokens": 309369521.0, + "step": 12396 + }, + { + "epoch": 1.3614100593015594, + "grad_norm": 2.1425065994262695, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.696727454662323, + "num_tokens": 309398314.0, + "step": 12397 + }, + { + "epoch": 1.3615198770041732, + "grad_norm": 2.044675588607788, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7085727453231812, + "num_tokens": 309428936.0, + "step": 12398 + }, + { + "epoch": 1.3616296947067867, + "grad_norm": 2.1429572105407715, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6944568157196045, + "num_tokens": 309458499.0, + "step": 12399 + }, + { + "epoch": 1.3617395124094003, + "grad_norm": 2.1898765563964844, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7133315801620483, + "num_tokens": 309486068.0, + "step": 12400 + }, + { + "epoch": 1.361849330112014, + "grad_norm": 2.5932087898254395, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.706212043762207, + 
"num_tokens": 309506367.0, + "step": 12401 + }, + { + "epoch": 1.3619591478146278, + "grad_norm": 2.2999625205993652, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7249851226806641, + "num_tokens": 309529676.0, + "step": 12402 + }, + { + "epoch": 1.3620689655172413, + "grad_norm": 2.43776273727417, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7054216861724854, + "num_tokens": 309552343.0, + "step": 12403 + }, + { + "epoch": 1.362178783219855, + "grad_norm": 2.241478443145752, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7214325666427612, + "num_tokens": 309579361.0, + "step": 12404 + }, + { + "epoch": 1.3622886009224686, + "grad_norm": 2.2280163764953613, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6950567364692688, + "num_tokens": 309608009.0, + "step": 12405 + }, + { + "epoch": 1.3623984186250824, + "grad_norm": 2.3644065856933594, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7419244647026062, + "num_tokens": 309630099.0, + "step": 12406 + }, + { + "epoch": 1.3625082363276961, + "grad_norm": 2.541347026824951, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7178231477737427, + "num_tokens": 309650788.0, + "step": 12407 + }, + { + "epoch": 1.3626180540303097, + "grad_norm": 2.514545202255249, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7200161218643188, + "num_tokens": 309673564.0, + "step": 12408 + }, + { + "epoch": 1.3627278717329234, + "grad_norm": 2.1337080001831055, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7259179353713989, + "num_tokens": 309700862.0, + "step": 12409 + }, + { + "epoch": 1.362837689435537, + "grad_norm": 2.274897575378418, + "learning_rate": 1e-06, + "loss": 0.9107, + "mean_token_accuracy": 0.7232427597045898, + "num_tokens": 309728835.0, + "step": 12410 + }, + { + "epoch": 1.3629475071381507, + "grad_norm": 2.368840217590332, + 
"learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7314237356185913, + "num_tokens": 309752125.0, + "step": 12411 + }, + { + "epoch": 1.3630573248407643, + "grad_norm": 2.31528639793396, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7086588144302368, + "num_tokens": 309775673.0, + "step": 12412 + }, + { + "epoch": 1.363167142543378, + "grad_norm": 2.401658535003662, + "learning_rate": 1e-06, + "loss": 0.8588, + "mean_token_accuracy": 0.7261468172073364, + "num_tokens": 309797096.0, + "step": 12413 + }, + { + "epoch": 1.3632769602459915, + "grad_norm": 2.3635623455047607, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7333145141601562, + "num_tokens": 309819229.0, + "step": 12414 + }, + { + "epoch": 1.3633867779486053, + "grad_norm": 2.2644360065460205, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7110480666160583, + "num_tokens": 309844276.0, + "step": 12415 + }, + { + "epoch": 1.363496595651219, + "grad_norm": 2.2425878047943115, + "learning_rate": 1e-06, + "loss": 0.9984, + "mean_token_accuracy": 0.7016642093658447, + "num_tokens": 309872061.0, + "step": 12416 + }, + { + "epoch": 1.3636064133538326, + "grad_norm": 2.2755091190338135, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7162700891494751, + "num_tokens": 309895788.0, + "step": 12417 + }, + { + "epoch": 1.3637162310564463, + "grad_norm": 1.9609061479568481, + "learning_rate": 1e-06, + "loss": 1.0168, + "mean_token_accuracy": 0.6821067929267883, + "num_tokens": 309929767.0, + "step": 12418 + }, + { + "epoch": 1.3638260487590599, + "grad_norm": 2.013242483139038, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7119701504707336, + "num_tokens": 309960601.0, + "step": 12419 + }, + { + "epoch": 1.3639358664616736, + "grad_norm": 2.2494921684265137, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.727959394454956, + "num_tokens": 309986966.0, + "step": 
12420 + }, + { + "epoch": 1.3640456841642874, + "grad_norm": 2.1769967079162598, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7238820791244507, + "num_tokens": 310014297.0, + "step": 12421 + }, + { + "epoch": 1.364155501866901, + "grad_norm": 2.0361831188201904, + "learning_rate": 1e-06, + "loss": 0.9762, + "mean_token_accuracy": 0.7025250792503357, + "num_tokens": 310044842.0, + "step": 12422 + }, + { + "epoch": 1.3642653195695145, + "grad_norm": 2.4267539978027344, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.720323920249939, + "num_tokens": 310067092.0, + "step": 12423 + }, + { + "epoch": 1.3643751372721282, + "grad_norm": 2.154348373413086, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.7155708074569702, + "num_tokens": 310094155.0, + "step": 12424 + }, + { + "epoch": 1.364484954974742, + "grad_norm": 3.0270352363586426, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7247657775878906, + "num_tokens": 310112466.0, + "step": 12425 + }, + { + "epoch": 1.3645947726773555, + "grad_norm": 2.3627171516418457, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.6955569386482239, + "num_tokens": 310138746.0, + "step": 12426 + }, + { + "epoch": 1.3647045903799693, + "grad_norm": 2.537768602371216, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7111221551895142, + "num_tokens": 310161929.0, + "step": 12427 + }, + { + "epoch": 1.3648144080825828, + "grad_norm": 2.18593692779541, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6949220895767212, + "num_tokens": 310192108.0, + "step": 12428 + }, + { + "epoch": 1.3649242257851966, + "grad_norm": 2.4881322383880615, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7273978590965271, + "num_tokens": 310214197.0, + "step": 12429 + }, + { + "epoch": 1.3650340434878103, + "grad_norm": 2.024904727935791, + "learning_rate": 1e-06, + "loss": 0.8126, + 
"mean_token_accuracy": 0.74444580078125, + "num_tokens": 310241565.0, + "step": 12430 + }, + { + "epoch": 1.3651438611904239, + "grad_norm": 2.0002670288085938, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7083057165145874, + "num_tokens": 310271988.0, + "step": 12431 + }, + { + "epoch": 1.3652536788930376, + "grad_norm": 2.3227782249450684, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7135480642318726, + "num_tokens": 310294704.0, + "step": 12432 + }, + { + "epoch": 1.3653634965956511, + "grad_norm": 2.2788279056549072, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7292121648788452, + "num_tokens": 310317549.0, + "step": 12433 + }, + { + "epoch": 1.365473314298265, + "grad_norm": 2.3434693813323975, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7229580879211426, + "num_tokens": 310340099.0, + "step": 12434 + }, + { + "epoch": 1.3655831320008787, + "grad_norm": 2.257295608520508, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7059465646743774, + "num_tokens": 310365578.0, + "step": 12435 + }, + { + "epoch": 1.3656929497034922, + "grad_norm": 2.242887258529663, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7169453501701355, + "num_tokens": 310389740.0, + "step": 12436 + }, + { + "epoch": 1.3658027674061057, + "grad_norm": 2.354518413543701, + "learning_rate": 1e-06, + "loss": 0.9926, + "mean_token_accuracy": 0.6898199319839478, + "num_tokens": 310414187.0, + "step": 12437 + }, + { + "epoch": 1.3659125851087195, + "grad_norm": 2.4218311309814453, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.72235107421875, + "num_tokens": 310436118.0, + "step": 12438 + }, + { + "epoch": 1.3660224028113332, + "grad_norm": 2.5083296298980713, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7225335240364075, + "num_tokens": 310457713.0, + "step": 12439 + }, + { + "epoch": 1.3661322205139468, 
+ "grad_norm": 2.3473825454711914, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7078951597213745, + "num_tokens": 310483076.0, + "step": 12440 + }, + { + "epoch": 1.3662420382165605, + "grad_norm": 2.4171760082244873, + "learning_rate": 1e-06, + "loss": 0.953, + "mean_token_accuracy": 0.7135859727859497, + "num_tokens": 310507367.0, + "step": 12441 + }, + { + "epoch": 1.366351855919174, + "grad_norm": 2.218357801437378, + "learning_rate": 1e-06, + "loss": 0.9938, + "mean_token_accuracy": 0.6977536678314209, + "num_tokens": 310533892.0, + "step": 12442 + }, + { + "epoch": 1.3664616736217878, + "grad_norm": 2.4815011024475098, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.738402247428894, + "num_tokens": 310554188.0, + "step": 12443 + }, + { + "epoch": 1.3665714913244016, + "grad_norm": 2.5849363803863525, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7211122512817383, + "num_tokens": 310573660.0, + "step": 12444 + }, + { + "epoch": 1.3666813090270151, + "grad_norm": 2.3504281044006348, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.7361508011817932, + "num_tokens": 310595694.0, + "step": 12445 + }, + { + "epoch": 1.3667911267296289, + "grad_norm": 2.2755837440490723, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7209277153015137, + "num_tokens": 310620560.0, + "step": 12446 + }, + { + "epoch": 1.3669009444322424, + "grad_norm": 2.7455813884735107, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7076307535171509, + "num_tokens": 310639583.0, + "step": 12447 + }, + { + "epoch": 1.3670107621348562, + "grad_norm": 2.669452667236328, + "learning_rate": 1e-06, + "loss": 0.7716, + "mean_token_accuracy": 0.7476706504821777, + "num_tokens": 310658001.0, + "step": 12448 + }, + { + "epoch": 1.36712057983747, + "grad_norm": 2.293956756591797, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7313883304595947, + 
"num_tokens": 310681124.0, + "step": 12449 + }, + { + "epoch": 1.3672303975400835, + "grad_norm": 2.338397741317749, + "learning_rate": 1e-06, + "loss": 0.842, + "mean_token_accuracy": 0.7316080331802368, + "num_tokens": 310702536.0, + "step": 12450 + }, + { + "epoch": 1.367340215242697, + "grad_norm": 2.4166815280914307, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7228143811225891, + "num_tokens": 310724996.0, + "step": 12451 + }, + { + "epoch": 1.3674500329453108, + "grad_norm": 2.4642343521118164, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7261719703674316, + "num_tokens": 310747176.0, + "step": 12452 + }, + { + "epoch": 1.3675598506479245, + "grad_norm": 2.171189069747925, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7058752179145813, + "num_tokens": 310774786.0, + "step": 12453 + }, + { + "epoch": 1.367669668350538, + "grad_norm": 2.080657720565796, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.7242183089256287, + "num_tokens": 310803907.0, + "step": 12454 + }, + { + "epoch": 1.3677794860531518, + "grad_norm": 2.396294116973877, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7125637531280518, + "num_tokens": 310826577.0, + "step": 12455 + }, + { + "epoch": 1.3678893037557653, + "grad_norm": 2.0712339878082275, + "learning_rate": 1e-06, + "loss": 0.959, + "mean_token_accuracy": 0.6961114406585693, + "num_tokens": 310859159.0, + "step": 12456 + }, + { + "epoch": 1.367999121458379, + "grad_norm": 2.0250353813171387, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.713893473148346, + "num_tokens": 310889389.0, + "step": 12457 + }, + { + "epoch": 1.3681089391609929, + "grad_norm": 2.037677764892578, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7247200608253479, + "num_tokens": 310918003.0, + "step": 12458 + }, + { + "epoch": 1.3682187568636064, + "grad_norm": 2.3074393272399902, + 
"learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7070420980453491, + "num_tokens": 310941360.0, + "step": 12459 + }, + { + "epoch": 1.3683285745662201, + "grad_norm": 2.37103271484375, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7323228120803833, + "num_tokens": 310964924.0, + "step": 12460 + }, + { + "epoch": 1.3684383922688337, + "grad_norm": 2.1670467853546143, + "learning_rate": 1e-06, + "loss": 1.0543, + "mean_token_accuracy": 0.6798837184906006, + "num_tokens": 310992851.0, + "step": 12461 + }, + { + "epoch": 1.3685482099714474, + "grad_norm": 2.3717293739318848, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7149130702018738, + "num_tokens": 311016172.0, + "step": 12462 + }, + { + "epoch": 1.3686580276740612, + "grad_norm": 2.123725414276123, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7237606048583984, + "num_tokens": 311042832.0, + "step": 12463 + }, + { + "epoch": 1.3687678453766747, + "grad_norm": 2.1531805992126465, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7208186388015747, + "num_tokens": 311069784.0, + "step": 12464 + }, + { + "epoch": 1.3688776630792883, + "grad_norm": 2.3517110347747803, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7050249576568604, + "num_tokens": 311094567.0, + "step": 12465 + }, + { + "epoch": 1.368987480781902, + "grad_norm": 2.719067335128784, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7290725708007812, + "num_tokens": 311113284.0, + "step": 12466 + }, + { + "epoch": 1.3690972984845158, + "grad_norm": 2.3108887672424316, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7263597249984741, + "num_tokens": 311135396.0, + "step": 12467 + }, + { + "epoch": 1.3692071161871293, + "grad_norm": 2.3715405464172363, + "learning_rate": 1e-06, + "loss": 0.9871, + "mean_token_accuracy": 0.6955172419548035, + "num_tokens": 311160485.0, + 
"step": 12468 + }, + { + "epoch": 1.369316933889743, + "grad_norm": 2.3510255813598633, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7340160608291626, + "num_tokens": 311183957.0, + "step": 12469 + }, + { + "epoch": 1.3694267515923566, + "grad_norm": 2.0325324535369873, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7145713567733765, + "num_tokens": 311218022.0, + "step": 12470 + }, + { + "epoch": 1.3695365692949704, + "grad_norm": 2.1491198539733887, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7139163017272949, + "num_tokens": 311245498.0, + "step": 12471 + }, + { + "epoch": 1.3696463869975841, + "grad_norm": 2.1613669395446777, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7200760841369629, + "num_tokens": 311270861.0, + "step": 12472 + }, + { + "epoch": 1.3697562047001977, + "grad_norm": 2.1638667583465576, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7223706245422363, + "num_tokens": 311297838.0, + "step": 12473 + }, + { + "epoch": 1.3698660224028114, + "grad_norm": 2.394590139389038, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.701987624168396, + "num_tokens": 311322034.0, + "step": 12474 + }, + { + "epoch": 1.369975840105425, + "grad_norm": 2.089728593826294, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.717267632484436, + "num_tokens": 311348869.0, + "step": 12475 + }, + { + "epoch": 1.3700856578080387, + "grad_norm": 2.384402275085449, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7298955321311951, + "num_tokens": 311371899.0, + "step": 12476 + }, + { + "epoch": 1.3701954755106522, + "grad_norm": 2.3126485347747803, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7172887921333313, + "num_tokens": 311396610.0, + "step": 12477 + }, + { + "epoch": 1.370305293213266, + "grad_norm": 2.0365076065063477, + "learning_rate": 1e-06, + "loss": 
0.8068, + "mean_token_accuracy": 0.7367488145828247, + "num_tokens": 311425181.0, + "step": 12478 + }, + { + "epoch": 1.3704151109158795, + "grad_norm": 2.2755253314971924, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7173833847045898, + "num_tokens": 311449438.0, + "step": 12479 + }, + { + "epoch": 1.3705249286184933, + "grad_norm": 2.318265438079834, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7368243932723999, + "num_tokens": 311472575.0, + "step": 12480 + }, + { + "epoch": 1.370634746321107, + "grad_norm": 2.314401388168335, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7224876880645752, + "num_tokens": 311495331.0, + "step": 12481 + }, + { + "epoch": 1.3707445640237206, + "grad_norm": 2.161435604095459, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7127971053123474, + "num_tokens": 311522594.0, + "step": 12482 + }, + { + "epoch": 1.3708543817263343, + "grad_norm": 2.2425730228424072, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7169284820556641, + "num_tokens": 311549848.0, + "step": 12483 + }, + { + "epoch": 1.3709641994289479, + "grad_norm": 2.091356039047241, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7314194440841675, + "num_tokens": 311578847.0, + "step": 12484 + }, + { + "epoch": 1.3710740171315616, + "grad_norm": 2.3239476680755615, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7108224034309387, + "num_tokens": 311602947.0, + "step": 12485 + }, + { + "epoch": 1.3711838348341754, + "grad_norm": 2.246448516845703, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7419214844703674, + "num_tokens": 311627775.0, + "step": 12486 + }, + { + "epoch": 1.371293652536789, + "grad_norm": 2.1208081245422363, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7281566262245178, + "num_tokens": 311654632.0, + "step": 12487 + }, + { + "epoch": 
1.3714034702394025, + "grad_norm": 2.2126052379608154, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7128781676292419, + "num_tokens": 311681771.0, + "step": 12488 + }, + { + "epoch": 1.3715132879420162, + "grad_norm": 2.2750208377838135, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7326797246932983, + "num_tokens": 311706025.0, + "step": 12489 + }, + { + "epoch": 1.37162310564463, + "grad_norm": 2.5031282901763916, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7194638252258301, + "num_tokens": 311727841.0, + "step": 12490 + }, + { + "epoch": 1.3717329233472435, + "grad_norm": 2.3978798389434814, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7260746359825134, + "num_tokens": 311750361.0, + "step": 12491 + }, + { + "epoch": 1.3718427410498573, + "grad_norm": 2.2076549530029297, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7405933141708374, + "num_tokens": 311775368.0, + "step": 12492 + }, + { + "epoch": 1.3719525587524708, + "grad_norm": 2.493560791015625, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7262872457504272, + "num_tokens": 311797895.0, + "step": 12493 + }, + { + "epoch": 1.3720623764550846, + "grad_norm": 2.4580626487731934, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.704832911491394, + "num_tokens": 311821275.0, + "step": 12494 + }, + { + "epoch": 1.3721721941576983, + "grad_norm": 2.437410593032837, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7396143078804016, + "num_tokens": 311843060.0, + "step": 12495 + }, + { + "epoch": 1.3722820118603118, + "grad_norm": 2.318413019180298, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7191650867462158, + "num_tokens": 311866796.0, + "step": 12496 + }, + { + "epoch": 1.3723918295629256, + "grad_norm": 2.089622974395752, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 
0.7146745324134827, + "num_tokens": 311896572.0, + "step": 12497 + }, + { + "epoch": 1.3725016472655391, + "grad_norm": 2.2104477882385254, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7317221164703369, + "num_tokens": 311920579.0, + "step": 12498 + }, + { + "epoch": 1.372611464968153, + "grad_norm": 2.3517017364501953, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7401909828186035, + "num_tokens": 311942761.0, + "step": 12499 + }, + { + "epoch": 1.3727212826707667, + "grad_norm": 2.182142496109009, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7188946008682251, + "num_tokens": 311971824.0, + "step": 12500 + }, + { + "epoch": 1.3728311003733802, + "grad_norm": 2.846928834915161, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7245587706565857, + "num_tokens": 311988616.0, + "step": 12501 + }, + { + "epoch": 1.3729409180759937, + "grad_norm": 2.4450037479400635, + "learning_rate": 1e-06, + "loss": 0.9034, + "mean_token_accuracy": 0.7172482013702393, + "num_tokens": 312009263.0, + "step": 12502 + }, + { + "epoch": 1.3730507357786075, + "grad_norm": 2.412445068359375, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7198370099067688, + "num_tokens": 312031179.0, + "step": 12503 + }, + { + "epoch": 1.3731605534812212, + "grad_norm": 2.084853410720825, + "learning_rate": 1e-06, + "loss": 0.9374, + "mean_token_accuracy": 0.7120205163955688, + "num_tokens": 312061419.0, + "step": 12504 + }, + { + "epoch": 1.3732703711838348, + "grad_norm": 2.232865571975708, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7072681188583374, + "num_tokens": 312088834.0, + "step": 12505 + }, + { + "epoch": 1.3733801888864485, + "grad_norm": 1.9789271354675293, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7225912809371948, + "num_tokens": 312120411.0, + "step": 12506 + }, + { + "epoch": 1.373490006589062, + "grad_norm": 
2.465984344482422, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.716380774974823, + "num_tokens": 312142718.0, + "step": 12507 + }, + { + "epoch": 1.3735998242916758, + "grad_norm": 2.15016770362854, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7149111032485962, + "num_tokens": 312168348.0, + "step": 12508 + }, + { + "epoch": 1.3737096419942896, + "grad_norm": 2.1861159801483154, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7418795824050903, + "num_tokens": 312194528.0, + "step": 12509 + }, + { + "epoch": 1.3738194596969031, + "grad_norm": 2.3887996673583984, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7100154757499695, + "num_tokens": 312217846.0, + "step": 12510 + }, + { + "epoch": 1.3739292773995169, + "grad_norm": 2.4950649738311768, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7174621820449829, + "num_tokens": 312239593.0, + "step": 12511 + }, + { + "epoch": 1.3740390951021304, + "grad_norm": 2.086320161819458, + "learning_rate": 1e-06, + "loss": 0.9977, + "mean_token_accuracy": 0.7039101719856262, + "num_tokens": 312269345.0, + "step": 12512 + }, + { + "epoch": 1.3741489128047442, + "grad_norm": 2.262160539627075, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.6835432052612305, + "num_tokens": 312296606.0, + "step": 12513 + }, + { + "epoch": 1.374258730507358, + "grad_norm": 2.2031402587890625, + "learning_rate": 1e-06, + "loss": 1.0115, + "mean_token_accuracy": 0.6861450672149658, + "num_tokens": 312324194.0, + "step": 12514 + }, + { + "epoch": 1.3743685482099715, + "grad_norm": 2.2984936237335205, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7209494113922119, + "num_tokens": 312346745.0, + "step": 12515 + }, + { + "epoch": 1.374478365912585, + "grad_norm": 2.382993459701538, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7190349102020264, + "num_tokens": 
312370585.0, + "step": 12516 + }, + { + "epoch": 1.3745881836151987, + "grad_norm": 2.100752353668213, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.6981699466705322, + "num_tokens": 312400038.0, + "step": 12517 + }, + { + "epoch": 1.3746980013178125, + "grad_norm": 2.515690565109253, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7208919525146484, + "num_tokens": 312420738.0, + "step": 12518 + }, + { + "epoch": 1.374807819020426, + "grad_norm": 2.164041757583618, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7244716882705688, + "num_tokens": 312449976.0, + "step": 12519 + }, + { + "epoch": 1.3749176367230398, + "grad_norm": 2.018441915512085, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7179365158081055, + "num_tokens": 312480289.0, + "step": 12520 + }, + { + "epoch": 1.3750274544256533, + "grad_norm": 2.3191745281219482, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7345132827758789, + "num_tokens": 312503040.0, + "step": 12521 + }, + { + "epoch": 1.375137272128267, + "grad_norm": 2.3152830600738525, + "learning_rate": 1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7253783941268921, + "num_tokens": 312525617.0, + "step": 12522 + }, + { + "epoch": 1.3752470898308808, + "grad_norm": 2.186314105987549, + "learning_rate": 1e-06, + "loss": 1.0247, + "mean_token_accuracy": 0.7018457055091858, + "num_tokens": 312551073.0, + "step": 12523 + }, + { + "epoch": 1.3753569075334944, + "grad_norm": 2.4479708671569824, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7214242815971375, + "num_tokens": 312573949.0, + "step": 12524 + }, + { + "epoch": 1.3754667252361081, + "grad_norm": 2.21649169921875, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7449905872344971, + "num_tokens": 312599185.0, + "step": 12525 + }, + { + "epoch": 1.3755765429387217, + "grad_norm": 2.2145447731018066, + "learning_rate": 
1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7042717933654785, + "num_tokens": 312624652.0, + "step": 12526 + }, + { + "epoch": 1.3756863606413354, + "grad_norm": 1.97627854347229, + "learning_rate": 1e-06, + "loss": 0.7825, + "mean_token_accuracy": 0.7537691593170166, + "num_tokens": 312653594.0, + "step": 12527 + }, + { + "epoch": 1.3757961783439492, + "grad_norm": 2.1742360591888428, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7027696967124939, + "num_tokens": 312680521.0, + "step": 12528 + }, + { + "epoch": 1.3759059960465627, + "grad_norm": 2.5707905292510986, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7179141044616699, + "num_tokens": 312703942.0, + "step": 12529 + }, + { + "epoch": 1.3760158137491763, + "grad_norm": 2.3378279209136963, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.735703706741333, + "num_tokens": 312726428.0, + "step": 12530 + }, + { + "epoch": 1.37612563145179, + "grad_norm": 2.290940046310425, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7293041944503784, + "num_tokens": 312752063.0, + "step": 12531 + }, + { + "epoch": 1.3762354491544038, + "grad_norm": 2.34158992767334, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7409287691116333, + "num_tokens": 312774940.0, + "step": 12532 + }, + { + "epoch": 1.3763452668570173, + "grad_norm": 2.0567073822021484, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7019597887992859, + "num_tokens": 312806108.0, + "step": 12533 + }, + { + "epoch": 1.376455084559631, + "grad_norm": 2.0599892139434814, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7273460626602173, + "num_tokens": 312834391.0, + "step": 12534 + }, + { + "epoch": 1.3765649022622446, + "grad_norm": 2.351266384124756, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7348308563232422, + "num_tokens": 312857208.0, + "step": 12535 + }, + { + 
"epoch": 1.3766747199648584, + "grad_norm": 2.419922113418579, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7254370450973511, + "num_tokens": 312879236.0, + "step": 12536 + }, + { + "epoch": 1.376784537667472, + "grad_norm": 2.2435460090637207, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7104897499084473, + "num_tokens": 312905434.0, + "step": 12537 + }, + { + "epoch": 1.3768943553700856, + "grad_norm": 2.0682291984558105, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.708175539970398, + "num_tokens": 312933715.0, + "step": 12538 + }, + { + "epoch": 1.3770041730726994, + "grad_norm": 2.1889379024505615, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7050284147262573, + "num_tokens": 312960752.0, + "step": 12539 + }, + { + "epoch": 1.377113990775313, + "grad_norm": 2.430464744567871, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.7035816311836243, + "num_tokens": 312985315.0, + "step": 12540 + }, + { + "epoch": 1.3772238084779267, + "grad_norm": 2.358059883117676, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7419033050537109, + "num_tokens": 313008698.0, + "step": 12541 + }, + { + "epoch": 1.3773336261805402, + "grad_norm": 2.196042776107788, + "learning_rate": 1e-06, + "loss": 0.7468, + "mean_token_accuracy": 0.7649924159049988, + "num_tokens": 313032263.0, + "step": 12542 + }, + { + "epoch": 1.377443443883154, + "grad_norm": 2.2177093029022217, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7190951108932495, + "num_tokens": 313060299.0, + "step": 12543 + }, + { + "epoch": 1.3775532615857675, + "grad_norm": 2.481335401535034, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7369788289070129, + "num_tokens": 313081701.0, + "step": 12544 + }, + { + "epoch": 1.3776630792883813, + "grad_norm": 2.4746127128601074, + "learning_rate": 1e-06, + "loss": 0.9353, + 
"mean_token_accuracy": 0.7077682018280029, + "num_tokens": 313103845.0, + "step": 12545 + }, + { + "epoch": 1.377772896990995, + "grad_norm": 2.1467065811157227, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7265269160270691, + "num_tokens": 313130849.0, + "step": 12546 + }, + { + "epoch": 1.3778827146936086, + "grad_norm": 2.047797918319702, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.6974449753761292, + "num_tokens": 313162454.0, + "step": 12547 + }, + { + "epoch": 1.3779925323962223, + "grad_norm": 2.2979838848114014, + "learning_rate": 1e-06, + "loss": 0.7959, + "mean_token_accuracy": 0.7478598952293396, + "num_tokens": 313185553.0, + "step": 12548 + }, + { + "epoch": 1.3781023500988359, + "grad_norm": 2.419222116470337, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7237902879714966, + "num_tokens": 313205994.0, + "step": 12549 + }, + { + "epoch": 1.3782121678014496, + "grad_norm": 2.327472686767578, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7321279048919678, + "num_tokens": 313229695.0, + "step": 12550 + }, + { + "epoch": 1.3783219855040634, + "grad_norm": 2.4060120582580566, + "learning_rate": 1e-06, + "loss": 0.9663, + "mean_token_accuracy": 0.7088882327079773, + "num_tokens": 313255049.0, + "step": 12551 + }, + { + "epoch": 1.378431803206677, + "grad_norm": 2.7141005992889404, + "learning_rate": 1e-06, + "loss": 0.7601, + "mean_token_accuracy": 0.7642104625701904, + "num_tokens": 313273632.0, + "step": 12552 + }, + { + "epoch": 1.3785416209092904, + "grad_norm": 2.4928178787231445, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7095441818237305, + "num_tokens": 313297901.0, + "step": 12553 + }, + { + "epoch": 1.3786514386119042, + "grad_norm": 2.284419536590576, + "learning_rate": 1e-06, + "loss": 0.8211, + "mean_token_accuracy": 0.7505691051483154, + "num_tokens": 313321122.0, + "step": 12554 + }, + { + "epoch": 
1.378761256314518, + "grad_norm": 2.8600995540618896, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7377313375473022, + "num_tokens": 313338843.0, + "step": 12555 + }, + { + "epoch": 1.3788710740171315, + "grad_norm": 2.447932004928589, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7178305983543396, + "num_tokens": 313361859.0, + "step": 12556 + }, + { + "epoch": 1.3789808917197452, + "grad_norm": 2.346132278442383, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7296304702758789, + "num_tokens": 313384856.0, + "step": 12557 + }, + { + "epoch": 1.3790907094223588, + "grad_norm": 2.384746789932251, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7092068791389465, + "num_tokens": 313409105.0, + "step": 12558 + }, + { + "epoch": 1.3792005271249725, + "grad_norm": 2.3436968326568604, + "learning_rate": 1e-06, + "loss": 0.8026, + "mean_token_accuracy": 0.7408517599105835, + "num_tokens": 313432131.0, + "step": 12559 + }, + { + "epoch": 1.3793103448275863, + "grad_norm": 2.361663579940796, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7237992286682129, + "num_tokens": 313456934.0, + "step": 12560 + }, + { + "epoch": 1.3794201625301998, + "grad_norm": 2.4644112586975098, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7317774295806885, + "num_tokens": 313478157.0, + "step": 12561 + }, + { + "epoch": 1.3795299802328136, + "grad_norm": 2.2934670448303223, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7232183218002319, + "num_tokens": 313502591.0, + "step": 12562 + }, + { + "epoch": 1.3796397979354271, + "grad_norm": 2.1664621829986572, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6998163461685181, + "num_tokens": 313532191.0, + "step": 12563 + }, + { + "epoch": 1.3797496156380409, + "grad_norm": 2.0930018424987793, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 
0.7009629607200623, + "num_tokens": 313561910.0, + "step": 12564 + }, + { + "epoch": 1.3798594333406546, + "grad_norm": 2.1655218601226807, + "learning_rate": 1e-06, + "loss": 0.7727, + "mean_token_accuracy": 0.7526647448539734, + "num_tokens": 313587302.0, + "step": 12565 + }, + { + "epoch": 1.3799692510432682, + "grad_norm": 2.138939142227173, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7091673612594604, + "num_tokens": 313616037.0, + "step": 12566 + }, + { + "epoch": 1.3800790687458817, + "grad_norm": 2.5305583477020264, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.7328576445579529, + "num_tokens": 313636413.0, + "step": 12567 + }, + { + "epoch": 1.3801888864484955, + "grad_norm": 2.2189621925354004, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.6965206265449524, + "num_tokens": 313663630.0, + "step": 12568 + }, + { + "epoch": 1.3802987041511092, + "grad_norm": 2.241077184677124, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.7046091556549072, + "num_tokens": 313688878.0, + "step": 12569 + }, + { + "epoch": 1.3804085218537228, + "grad_norm": 1.9571115970611572, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7384964823722839, + "num_tokens": 313721582.0, + "step": 12570 + }, + { + "epoch": 1.3805183395563365, + "grad_norm": 2.290578603744507, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7349368929862976, + "num_tokens": 313745180.0, + "step": 12571 + }, + { + "epoch": 1.38062815725895, + "grad_norm": 2.074005365371704, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7262497544288635, + "num_tokens": 313773715.0, + "step": 12572 + }, + { + "epoch": 1.3807379749615638, + "grad_norm": 2.2127277851104736, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.716001570224762, + "num_tokens": 313801079.0, + "step": 12573 + }, + { + "epoch": 1.3808477926641776, + "grad_norm": 
2.3089866638183594, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.716690719127655, + "num_tokens": 313827080.0, + "step": 12574 + }, + { + "epoch": 1.380957610366791, + "grad_norm": 2.380089521408081, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7249890565872192, + "num_tokens": 313849512.0, + "step": 12575 + }, + { + "epoch": 1.3810674280694049, + "grad_norm": 2.2601771354675293, + "learning_rate": 1e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7225310802459717, + "num_tokens": 313875557.0, + "step": 12576 + }, + { + "epoch": 1.3811772457720184, + "grad_norm": 2.1577136516571045, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7315571308135986, + "num_tokens": 313903554.0, + "step": 12577 + }, + { + "epoch": 1.3812870634746321, + "grad_norm": 2.2984538078308105, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7178627848625183, + "num_tokens": 313929106.0, + "step": 12578 + }, + { + "epoch": 1.381396881177246, + "grad_norm": 2.325471878051758, + "learning_rate": 1e-06, + "loss": 0.9693, + "mean_token_accuracy": 0.7072179317474365, + "num_tokens": 313953498.0, + "step": 12579 + }, + { + "epoch": 1.3815066988798594, + "grad_norm": 2.142714500427246, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7064363360404968, + "num_tokens": 313982809.0, + "step": 12580 + }, + { + "epoch": 1.381616516582473, + "grad_norm": 2.5668811798095703, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7351186871528625, + "num_tokens": 314003158.0, + "step": 12581 + }, + { + "epoch": 1.3817263342850867, + "grad_norm": 2.129424810409546, + "learning_rate": 1e-06, + "loss": 0.9695, + "mean_token_accuracy": 0.700999915599823, + "num_tokens": 314034030.0, + "step": 12582 + }, + { + "epoch": 1.3818361519877005, + "grad_norm": 2.149876117706299, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7353253364562988, + "num_tokens": 
314062875.0, + "step": 12583 + }, + { + "epoch": 1.381945969690314, + "grad_norm": 2.2054920196533203, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7335253953933716, + "num_tokens": 314087972.0, + "step": 12584 + }, + { + "epoch": 1.3820557873929278, + "grad_norm": 2.443302631378174, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7165548801422119, + "num_tokens": 314110065.0, + "step": 12585 + }, + { + "epoch": 1.3821656050955413, + "grad_norm": 2.4024088382720947, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7238823175430298, + "num_tokens": 314134950.0, + "step": 12586 + }, + { + "epoch": 1.382275422798155, + "grad_norm": 2.175248146057129, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7077102065086365, + "num_tokens": 314163232.0, + "step": 12587 + }, + { + "epoch": 1.3823852405007688, + "grad_norm": 2.510749578475952, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7120606303215027, + "num_tokens": 314187315.0, + "step": 12588 + }, + { + "epoch": 1.3824950582033824, + "grad_norm": 2.435079574584961, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7355976104736328, + "num_tokens": 314209035.0, + "step": 12589 + }, + { + "epoch": 1.3826048759059961, + "grad_norm": 2.1134934425354004, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7167136669158936, + "num_tokens": 314236144.0, + "step": 12590 + }, + { + "epoch": 1.3827146936086097, + "grad_norm": 2.3268864154815674, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7060457468032837, + "num_tokens": 314260752.0, + "step": 12591 + }, + { + "epoch": 1.3828245113112234, + "grad_norm": 1.9014256000518799, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7208251357078552, + "num_tokens": 314290193.0, + "step": 12592 + }, + { + "epoch": 1.382934329013837, + "grad_norm": 2.5273900032043457, + "learning_rate": 
1e-06, + "loss": 0.84, + "mean_token_accuracy": 0.7345272302627563, + "num_tokens": 314309694.0, + "step": 12593 + }, + { + "epoch": 1.3830441467164507, + "grad_norm": 2.6847925186157227, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7168592214584351, + "num_tokens": 314329559.0, + "step": 12594 + }, + { + "epoch": 1.3831539644190642, + "grad_norm": 2.6916909217834473, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7209130525588989, + "num_tokens": 314349610.0, + "step": 12595 + }, + { + "epoch": 1.383263782121678, + "grad_norm": 2.6052372455596924, + "learning_rate": 1e-06, + "loss": 0.8703, + "mean_token_accuracy": 0.7349731922149658, + "num_tokens": 314368468.0, + "step": 12596 + }, + { + "epoch": 1.3833735998242918, + "grad_norm": 2.1999921798706055, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.7015810012817383, + "num_tokens": 314397928.0, + "step": 12597 + }, + { + "epoch": 1.3834834175269053, + "grad_norm": 2.205042839050293, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7243982553482056, + "num_tokens": 314422992.0, + "step": 12598 + }, + { + "epoch": 1.383593235229519, + "grad_norm": 2.313835620880127, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7090690732002258, + "num_tokens": 314446342.0, + "step": 12599 + }, + { + "epoch": 1.3837030529321326, + "grad_norm": 2.3597400188446045, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7189004421234131, + "num_tokens": 314469862.0, + "step": 12600 + }, + { + "epoch": 1.3838128706347463, + "grad_norm": 1.9900779724121094, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7354605197906494, + "num_tokens": 314500346.0, + "step": 12601 + }, + { + "epoch": 1.38392268833736, + "grad_norm": 2.4243907928466797, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6989415287971497, + "num_tokens": 314523171.0, + "step": 12602 + }, + { + 
"epoch": 1.3840325060399736, + "grad_norm": 2.1960911750793457, + "learning_rate": 1e-06, + "loss": 0.9791, + "mean_token_accuracy": 0.7022800445556641, + "num_tokens": 314549457.0, + "step": 12603 + }, + { + "epoch": 1.3841423237425874, + "grad_norm": 2.1866583824157715, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7285190224647522, + "num_tokens": 314575163.0, + "step": 12604 + }, + { + "epoch": 1.384252141445201, + "grad_norm": 2.25814151763916, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7459166049957275, + "num_tokens": 314598113.0, + "step": 12605 + }, + { + "epoch": 1.3843619591478147, + "grad_norm": 2.4732682704925537, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.759257435798645, + "num_tokens": 314617815.0, + "step": 12606 + }, + { + "epoch": 1.3844717768504282, + "grad_norm": 2.050443649291992, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.722411572933197, + "num_tokens": 314647138.0, + "step": 12607 + }, + { + "epoch": 1.384581594553042, + "grad_norm": 2.2893319129943848, + "learning_rate": 1e-06, + "loss": 0.991, + "mean_token_accuracy": 0.6936402320861816, + "num_tokens": 314673859.0, + "step": 12608 + }, + { + "epoch": 1.3846914122556555, + "grad_norm": 1.9952359199523926, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.7065869569778442, + "num_tokens": 314706165.0, + "step": 12609 + }, + { + "epoch": 1.3848012299582693, + "grad_norm": 2.0978846549987793, + "learning_rate": 1e-06, + "loss": 1.0226, + "mean_token_accuracy": 0.6861497163772583, + "num_tokens": 314735836.0, + "step": 12610 + }, + { + "epoch": 1.384911047660883, + "grad_norm": 2.0202395915985107, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7158507108688354, + "num_tokens": 314764620.0, + "step": 12611 + }, + { + "epoch": 1.3850208653634966, + "grad_norm": 2.565138339996338, + "learning_rate": 1e-06, + "loss": 0.811, + 
"mean_token_accuracy": 0.7455604076385498, + "num_tokens": 314783888.0, + "step": 12612 + }, + { + "epoch": 1.3851306830661103, + "grad_norm": 2.2058000564575195, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7065428495407104, + "num_tokens": 314813032.0, + "step": 12613 + }, + { + "epoch": 1.3852405007687238, + "grad_norm": 2.385739326477051, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7181957960128784, + "num_tokens": 314836569.0, + "step": 12614 + }, + { + "epoch": 1.3853503184713376, + "grad_norm": 2.0516130924224854, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.6978588700294495, + "num_tokens": 314868571.0, + "step": 12615 + }, + { + "epoch": 1.3854601361739514, + "grad_norm": 2.3363289833068848, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7162875533103943, + "num_tokens": 314894179.0, + "step": 12616 + }, + { + "epoch": 1.385569953876565, + "grad_norm": 2.5543994903564453, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.6989269256591797, + "num_tokens": 314916036.0, + "step": 12617 + }, + { + "epoch": 1.3856797715791784, + "grad_norm": 2.290274143218994, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7106269598007202, + "num_tokens": 314944576.0, + "step": 12618 + }, + { + "epoch": 1.3857895892817922, + "grad_norm": 2.365189552307129, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7241487503051758, + "num_tokens": 314966962.0, + "step": 12619 + }, + { + "epoch": 1.385899406984406, + "grad_norm": 2.233647346496582, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7144320607185364, + "num_tokens": 314992077.0, + "step": 12620 + }, + { + "epoch": 1.3860092246870195, + "grad_norm": 1.9940887689590454, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.706845760345459, + "num_tokens": 315023347.0, + "step": 12621 + }, + { + "epoch": 
1.3861190423896332, + "grad_norm": 2.4121530055999756, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7078859806060791, + "num_tokens": 315047035.0, + "step": 12622 + }, + { + "epoch": 1.3862288600922468, + "grad_norm": 2.197969913482666, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7301016449928284, + "num_tokens": 315074145.0, + "step": 12623 + }, + { + "epoch": 1.3863386777948605, + "grad_norm": 2.343425989151001, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.711072564125061, + "num_tokens": 315099090.0, + "step": 12624 + }, + { + "epoch": 1.3864484954974743, + "grad_norm": 2.167282819747925, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7113094925880432, + "num_tokens": 315126677.0, + "step": 12625 + }, + { + "epoch": 1.3865583132000878, + "grad_norm": 2.432668447494507, + "learning_rate": 1e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7362527847290039, + "num_tokens": 315147006.0, + "step": 12626 + }, + { + "epoch": 1.3866681309027016, + "grad_norm": 2.185039520263672, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6966898441314697, + "num_tokens": 315177070.0, + "step": 12627 + }, + { + "epoch": 1.3867779486053151, + "grad_norm": 2.5768015384674072, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7410505414009094, + "num_tokens": 315197873.0, + "step": 12628 + }, + { + "epoch": 1.3868877663079289, + "grad_norm": 2.3559486865997314, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7378737926483154, + "num_tokens": 315220109.0, + "step": 12629 + }, + { + "epoch": 1.3869975840105426, + "grad_norm": 2.078146457672119, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7321004867553711, + "num_tokens": 315246505.0, + "step": 12630 + }, + { + "epoch": 1.3871074017131562, + "grad_norm": 2.2641544342041016, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 
0.723948061466217, + "num_tokens": 315271193.0, + "step": 12631 + }, + { + "epoch": 1.3872172194157697, + "grad_norm": 2.0429601669311523, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7184362411499023, + "num_tokens": 315300421.0, + "step": 12632 + }, + { + "epoch": 1.3873270371183835, + "grad_norm": 2.0785679817199707, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.6998763680458069, + "num_tokens": 315331494.0, + "step": 12633 + }, + { + "epoch": 1.3874368548209972, + "grad_norm": 2.2665107250213623, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7213267087936401, + "num_tokens": 315355848.0, + "step": 12634 + }, + { + "epoch": 1.3875466725236107, + "grad_norm": 2.608112335205078, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7219488024711609, + "num_tokens": 315375936.0, + "step": 12635 + }, + { + "epoch": 1.3876564902262245, + "grad_norm": 2.2517812252044678, + "learning_rate": 1e-06, + "loss": 1.0266, + "mean_token_accuracy": 0.6808316707611084, + "num_tokens": 315403390.0, + "step": 12636 + }, + { + "epoch": 1.387766307928838, + "grad_norm": 2.6367759704589844, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7251076102256775, + "num_tokens": 315424291.0, + "step": 12637 + }, + { + "epoch": 1.3878761256314518, + "grad_norm": 2.1769230365753174, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7108151912689209, + "num_tokens": 315454052.0, + "step": 12638 + }, + { + "epoch": 1.3879859433340656, + "grad_norm": 2.2458646297454834, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7294752597808838, + "num_tokens": 315478011.0, + "step": 12639 + }, + { + "epoch": 1.388095761036679, + "grad_norm": 2.138237476348877, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7167601585388184, + "num_tokens": 315506284.0, + "step": 12640 + }, + { + "epoch": 1.3882055787392928, + "grad_norm": 
2.631012439727783, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7394317984580994, + "num_tokens": 315524840.0, + "step": 12641 + }, + { + "epoch": 1.3883153964419064, + "grad_norm": 2.4744598865509033, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7429611682891846, + "num_tokens": 315546796.0, + "step": 12642 + }, + { + "epoch": 1.3884252141445201, + "grad_norm": 2.4549245834350586, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7140327095985413, + "num_tokens": 315568524.0, + "step": 12643 + }, + { + "epoch": 1.388535031847134, + "grad_norm": 2.681053638458252, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7239048480987549, + "num_tokens": 315588747.0, + "step": 12644 + }, + { + "epoch": 1.3886448495497474, + "grad_norm": 2.2068381309509277, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7368987798690796, + "num_tokens": 315616799.0, + "step": 12645 + }, + { + "epoch": 1.388754667252361, + "grad_norm": 2.3000054359436035, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7119088172912598, + "num_tokens": 315643036.0, + "step": 12646 + }, + { + "epoch": 1.3888644849549747, + "grad_norm": 2.435861825942993, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.7237237691879272, + "num_tokens": 315665145.0, + "step": 12647 + }, + { + "epoch": 1.3889743026575885, + "grad_norm": 2.3496711254119873, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.724902331829071, + "num_tokens": 315689148.0, + "step": 12648 + }, + { + "epoch": 1.389084120360202, + "grad_norm": 2.786996603012085, + "learning_rate": 1e-06, + "loss": 0.81, + "mean_token_accuracy": 0.7426917552947998, + "num_tokens": 315708793.0, + "step": 12649 + }, + { + "epoch": 1.3891939380628158, + "grad_norm": 2.3146607875823975, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.693062961101532, + "num_tokens": 
315734423.0, + "step": 12650 + }, + { + "epoch": 1.3893037557654293, + "grad_norm": 2.0964643955230713, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7225150465965271, + "num_tokens": 315762993.0, + "step": 12651 + }, + { + "epoch": 1.389413573468043, + "grad_norm": 2.2472407817840576, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7165641188621521, + "num_tokens": 315791222.0, + "step": 12652 + }, + { + "epoch": 1.3895233911706568, + "grad_norm": 2.046109199523926, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7070953845977783, + "num_tokens": 315819877.0, + "step": 12653 + }, + { + "epoch": 1.3896332088732704, + "grad_norm": 2.3771965503692627, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.721237301826477, + "num_tokens": 315844601.0, + "step": 12654 + }, + { + "epoch": 1.389743026575884, + "grad_norm": 2.0001964569091797, + "learning_rate": 1e-06, + "loss": 0.9746, + "mean_token_accuracy": 0.7043192386627197, + "num_tokens": 315876782.0, + "step": 12655 + }, + { + "epoch": 1.3898528442784976, + "grad_norm": 2.6504125595092773, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7168220281600952, + "num_tokens": 315896436.0, + "step": 12656 + }, + { + "epoch": 1.3899626619811114, + "grad_norm": 2.072345018386841, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7243356704711914, + "num_tokens": 315923554.0, + "step": 12657 + }, + { + "epoch": 1.390072479683725, + "grad_norm": 2.1393136978149414, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.710177481174469, + "num_tokens": 315954139.0, + "step": 12658 + }, + { + "epoch": 1.3901822973863387, + "grad_norm": 2.348112106323242, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7216733694076538, + "num_tokens": 315976537.0, + "step": 12659 + }, + { + "epoch": 1.3902921150889522, + "grad_norm": 2.366109609603882, + "learning_rate": 1e-06, 
+ "loss": 0.7985, + "mean_token_accuracy": 0.7459294199943542, + "num_tokens": 315999635.0, + "step": 12660 + }, + { + "epoch": 1.390401932791566, + "grad_norm": 2.4592442512512207, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7324831485748291, + "num_tokens": 316023044.0, + "step": 12661 + }, + { + "epoch": 1.3905117504941797, + "grad_norm": 2.2531819343566895, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7422395348548889, + "num_tokens": 316046061.0, + "step": 12662 + }, + { + "epoch": 1.3906215681967933, + "grad_norm": 2.296715259552002, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7334230542182922, + "num_tokens": 316070275.0, + "step": 12663 + }, + { + "epoch": 1.390731385899407, + "grad_norm": 2.1010019779205322, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7097575664520264, + "num_tokens": 316099929.0, + "step": 12664 + }, + { + "epoch": 1.3908412036020206, + "grad_norm": 2.5586819648742676, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7125716209411621, + "num_tokens": 316121832.0, + "step": 12665 + }, + { + "epoch": 1.3909510213046343, + "grad_norm": 2.3490052223205566, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7251359224319458, + "num_tokens": 316144210.0, + "step": 12666 + }, + { + "epoch": 1.391060839007248, + "grad_norm": 2.1843087673187256, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7243878841400146, + "num_tokens": 316171644.0, + "step": 12667 + }, + { + "epoch": 1.3911706567098616, + "grad_norm": 2.1603522300720215, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7411401867866516, + "num_tokens": 316197000.0, + "step": 12668 + }, + { + "epoch": 1.3912804744124752, + "grad_norm": 2.3725972175598145, + "learning_rate": 1e-06, + "loss": 1.0125, + "mean_token_accuracy": 0.6933649778366089, + "num_tokens": 316220309.0, + "step": 12669 + }, + { + 
"epoch": 1.391390292115089, + "grad_norm": 1.8446913957595825, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7022229433059692, + "num_tokens": 316253083.0, + "step": 12670 + }, + { + "epoch": 1.3915001098177027, + "grad_norm": 2.170356273651123, + "learning_rate": 1e-06, + "loss": 0.8043, + "mean_token_accuracy": 0.7388688921928406, + "num_tokens": 316279018.0, + "step": 12671 + }, + { + "epoch": 1.3916099275203162, + "grad_norm": 2.415553331375122, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7218888998031616, + "num_tokens": 316303533.0, + "step": 12672 + }, + { + "epoch": 1.39171974522293, + "grad_norm": 2.081519603729248, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7145967483520508, + "num_tokens": 316330126.0, + "step": 12673 + }, + { + "epoch": 1.3918295629255435, + "grad_norm": 2.3981032371520996, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7432559728622437, + "num_tokens": 316350400.0, + "step": 12674 + }, + { + "epoch": 1.3919393806281573, + "grad_norm": 2.172659158706665, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7350524663925171, + "num_tokens": 316375450.0, + "step": 12675 + }, + { + "epoch": 1.392049198330771, + "grad_norm": 2.1879708766937256, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7377896904945374, + "num_tokens": 316401420.0, + "step": 12676 + }, + { + "epoch": 1.3921590160333845, + "grad_norm": 2.2808074951171875, + "learning_rate": 1e-06, + "loss": 0.9136, + "mean_token_accuracy": 0.7209030985832214, + "num_tokens": 316426696.0, + "step": 12677 + }, + { + "epoch": 1.3922688337359983, + "grad_norm": 2.542282819747925, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7295266389846802, + "num_tokens": 316447390.0, + "step": 12678 + }, + { + "epoch": 1.3923786514386118, + "grad_norm": 2.4003264904022217, + "learning_rate": 1e-06, + "loss": 0.9581, + 
"mean_token_accuracy": 0.7142333984375, + "num_tokens": 316470722.0, + "step": 12679 + }, + { + "epoch": 1.3924884691412256, + "grad_norm": 2.255145788192749, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7402173280715942, + "num_tokens": 316493395.0, + "step": 12680 + }, + { + "epoch": 1.3925982868438394, + "grad_norm": 2.3802542686462402, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7286756038665771, + "num_tokens": 316516340.0, + "step": 12681 + }, + { + "epoch": 1.3927081045464529, + "grad_norm": 2.352583885192871, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7381744384765625, + "num_tokens": 316539778.0, + "step": 12682 + }, + { + "epoch": 1.3928179222490664, + "grad_norm": 2.450617551803589, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7360782623291016, + "num_tokens": 316560589.0, + "step": 12683 + }, + { + "epoch": 1.3929277399516802, + "grad_norm": 2.3178820610046387, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7223286628723145, + "num_tokens": 316583752.0, + "step": 12684 + }, + { + "epoch": 1.393037557654294, + "grad_norm": 2.299683094024658, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7514997720718384, + "num_tokens": 316605737.0, + "step": 12685 + }, + { + "epoch": 1.3931473753569075, + "grad_norm": 2.0851686000823975, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7068420648574829, + "num_tokens": 316636536.0, + "step": 12686 + }, + { + "epoch": 1.3932571930595212, + "grad_norm": 2.467841148376465, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7168079614639282, + "num_tokens": 316660372.0, + "step": 12687 + }, + { + "epoch": 1.3933670107621348, + "grad_norm": 2.7676329612731934, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7379411458969116, + "num_tokens": 316678966.0, + "step": 12688 + }, + { + "epoch": 1.3934768284647485, + 
"grad_norm": 2.1670589447021484, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7117803692817688, + "num_tokens": 316707764.0, + "step": 12689 + }, + { + "epoch": 1.3935866461673623, + "grad_norm": 2.509716749191284, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7329339981079102, + "num_tokens": 316729172.0, + "step": 12690 + }, + { + "epoch": 1.3936964638699758, + "grad_norm": 2.2820165157318115, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7127087712287903, + "num_tokens": 316755578.0, + "step": 12691 + }, + { + "epoch": 1.3938062815725896, + "grad_norm": 2.351922035217285, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7266047596931458, + "num_tokens": 316777775.0, + "step": 12692 + }, + { + "epoch": 1.393916099275203, + "grad_norm": 2.246195077896118, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7394089698791504, + "num_tokens": 316802984.0, + "step": 12693 + }, + { + "epoch": 1.3940259169778169, + "grad_norm": 2.1130828857421875, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7317295074462891, + "num_tokens": 316830041.0, + "step": 12694 + }, + { + "epoch": 1.3941357346804306, + "grad_norm": 2.3478493690490723, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7253445386886597, + "num_tokens": 316854449.0, + "step": 12695 + }, + { + "epoch": 1.3942455523830442, + "grad_norm": 2.2486166954040527, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7163369655609131, + "num_tokens": 316879369.0, + "step": 12696 + }, + { + "epoch": 1.3943553700856577, + "grad_norm": 2.624600410461426, + "learning_rate": 1e-06, + "loss": 0.819, + "mean_token_accuracy": 0.7446301579475403, + "num_tokens": 316899778.0, + "step": 12697 + }, + { + "epoch": 1.3944651877882714, + "grad_norm": 2.300703525543213, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7219744920730591, + 
"num_tokens": 316924164.0, + "step": 12698 + }, + { + "epoch": 1.3945750054908852, + "grad_norm": 2.7973713874816895, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7236883640289307, + "num_tokens": 316941820.0, + "step": 12699 + }, + { + "epoch": 1.3946848231934987, + "grad_norm": 2.3782432079315186, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7237004041671753, + "num_tokens": 316964386.0, + "step": 12700 + }, + { + "epoch": 1.3947946408961125, + "grad_norm": 2.717505931854248, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7351436614990234, + "num_tokens": 316984012.0, + "step": 12701 + }, + { + "epoch": 1.394904458598726, + "grad_norm": 2.786799669265747, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7346552610397339, + "num_tokens": 317002768.0, + "step": 12702 + }, + { + "epoch": 1.3950142763013398, + "grad_norm": 2.0079140663146973, + "learning_rate": 1e-06, + "loss": 0.9915, + "mean_token_accuracy": 0.6838884353637695, + "num_tokens": 317034942.0, + "step": 12703 + }, + { + "epoch": 1.3951240940039535, + "grad_norm": 2.1325228214263916, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.7063056826591492, + "num_tokens": 317064694.0, + "step": 12704 + }, + { + "epoch": 1.395233911706567, + "grad_norm": 2.287858247756958, + "learning_rate": 1e-06, + "loss": 0.8033, + "mean_token_accuracy": 0.7450189590454102, + "num_tokens": 317088858.0, + "step": 12705 + }, + { + "epoch": 1.3953437294091808, + "grad_norm": 2.0826568603515625, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7293720245361328, + "num_tokens": 317114280.0, + "step": 12706 + }, + { + "epoch": 1.3954535471117944, + "grad_norm": 2.093761920928955, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.6994107961654663, + "num_tokens": 317142955.0, + "step": 12707 + }, + { + "epoch": 1.3955633648144081, + "grad_norm": 2.413038969039917, + 
"learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7187559008598328, + "num_tokens": 317168362.0, + "step": 12708 + }, + { + "epoch": 1.3956731825170219, + "grad_norm": 2.1314280033111572, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7035608887672424, + "num_tokens": 317196848.0, + "step": 12709 + }, + { + "epoch": 1.3957830002196354, + "grad_norm": 2.1689538955688477, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7318847179412842, + "num_tokens": 317223824.0, + "step": 12710 + }, + { + "epoch": 1.395892817922249, + "grad_norm": 2.339369773864746, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7191174626350403, + "num_tokens": 317248644.0, + "step": 12711 + }, + { + "epoch": 1.3960026356248627, + "grad_norm": 2.3707542419433594, + "learning_rate": 1e-06, + "loss": 0.9804, + "mean_token_accuracy": 0.6985399723052979, + "num_tokens": 317273680.0, + "step": 12712 + }, + { + "epoch": 1.3961124533274765, + "grad_norm": 2.0229995250701904, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7035688161849976, + "num_tokens": 317303273.0, + "step": 12713 + }, + { + "epoch": 1.39622227103009, + "grad_norm": 2.0553719997406006, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7159154415130615, + "num_tokens": 317333874.0, + "step": 12714 + }, + { + "epoch": 1.3963320887327038, + "grad_norm": 2.112508773803711, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7034014463424683, + "num_tokens": 317363679.0, + "step": 12715 + }, + { + "epoch": 1.3964419064353173, + "grad_norm": 2.6961708068847656, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7461093664169312, + "num_tokens": 317381687.0, + "step": 12716 + }, + { + "epoch": 1.396551724137931, + "grad_norm": 2.3227391242980957, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7164381742477417, + "num_tokens": 317407517.0, + "step": 
12717 + }, + { + "epoch": 1.3966615418405448, + "grad_norm": 2.165830373764038, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.699317216873169, + "num_tokens": 317436484.0, + "step": 12718 + }, + { + "epoch": 1.3967713595431583, + "grad_norm": 2.1204259395599365, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7032041549682617, + "num_tokens": 317464493.0, + "step": 12719 + }, + { + "epoch": 1.396881177245772, + "grad_norm": 2.212231397628784, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7171309590339661, + "num_tokens": 317490448.0, + "step": 12720 + }, + { + "epoch": 1.3969909949483856, + "grad_norm": 1.9367289543151855, + "learning_rate": 1e-06, + "loss": 0.8616, + "mean_token_accuracy": 0.724967360496521, + "num_tokens": 317521788.0, + "step": 12721 + }, + { + "epoch": 1.3971008126509994, + "grad_norm": 2.3033084869384766, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7359501719474792, + "num_tokens": 317547467.0, + "step": 12722 + }, + { + "epoch": 1.397210630353613, + "grad_norm": 2.255445957183838, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7235473394393921, + "num_tokens": 317572785.0, + "step": 12723 + }, + { + "epoch": 1.3973204480562267, + "grad_norm": 2.1196112632751465, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7273050546646118, + "num_tokens": 317599478.0, + "step": 12724 + }, + { + "epoch": 1.3974302657588402, + "grad_norm": 2.1234920024871826, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7153830528259277, + "num_tokens": 317627584.0, + "step": 12725 + }, + { + "epoch": 1.397540083461454, + "grad_norm": 2.113237142562866, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7108133435249329, + "num_tokens": 317658004.0, + "step": 12726 + }, + { + "epoch": 1.3976499011640677, + "grad_norm": 2.276221990585327, + "learning_rate": 1e-06, + "loss": 0.9137, + 
"mean_token_accuracy": 0.7129025459289551, + "num_tokens": 317684336.0, + "step": 12727 + }, + { + "epoch": 1.3977597188666813, + "grad_norm": 2.2613415718078613, + "learning_rate": 1e-06, + "loss": 1.0333, + "mean_token_accuracy": 0.6861689686775208, + "num_tokens": 317709241.0, + "step": 12728 + }, + { + "epoch": 1.397869536569295, + "grad_norm": 2.0802035331726074, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6985182762145996, + "num_tokens": 317738288.0, + "step": 12729 + }, + { + "epoch": 1.3979793542719086, + "grad_norm": 2.0700910091400146, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.7062896490097046, + "num_tokens": 317769229.0, + "step": 12730 + }, + { + "epoch": 1.3980891719745223, + "grad_norm": 2.7487266063690186, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7291848063468933, + "num_tokens": 317787779.0, + "step": 12731 + }, + { + "epoch": 1.398198989677136, + "grad_norm": 2.4643378257751465, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7558331489562988, + "num_tokens": 317807984.0, + "step": 12732 + }, + { + "epoch": 1.3983088073797496, + "grad_norm": 2.2406883239746094, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7155382633209229, + "num_tokens": 317834034.0, + "step": 12733 + }, + { + "epoch": 1.3984186250823631, + "grad_norm": 2.1639060974121094, + "learning_rate": 1e-06, + "loss": 0.8388, + "mean_token_accuracy": 0.7401477694511414, + "num_tokens": 317859584.0, + "step": 12734 + }, + { + "epoch": 1.398528442784977, + "grad_norm": 2.5148885250091553, + "learning_rate": 1e-06, + "loss": 0.9582, + "mean_token_accuracy": 0.7056576013565063, + "num_tokens": 317879892.0, + "step": 12735 + }, + { + "epoch": 1.3986382604875907, + "grad_norm": 2.1557421684265137, + "learning_rate": 1e-06, + "loss": 1.0, + "mean_token_accuracy": 0.6903284788131714, + "num_tokens": 317907720.0, + "step": 12736 + }, + { + "epoch": 
1.3987480781902042, + "grad_norm": 2.4043755531311035, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7168837785720825, + "num_tokens": 317930655.0, + "step": 12737 + }, + { + "epoch": 1.398857895892818, + "grad_norm": 2.274937868118286, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7182223796844482, + "num_tokens": 317955166.0, + "step": 12738 + }, + { + "epoch": 1.3989677135954315, + "grad_norm": 2.1296393871307373, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7085260152816772, + "num_tokens": 317982840.0, + "step": 12739 + }, + { + "epoch": 1.3990775312980452, + "grad_norm": 2.6062402725219727, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7187855243682861, + "num_tokens": 318003682.0, + "step": 12740 + }, + { + "epoch": 1.399187349000659, + "grad_norm": 2.35917592048645, + "learning_rate": 1e-06, + "loss": 0.9441, + "mean_token_accuracy": 0.7085745334625244, + "num_tokens": 318027715.0, + "step": 12741 + }, + { + "epoch": 1.3992971667032725, + "grad_norm": 2.2265682220458984, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.705414354801178, + "num_tokens": 318052863.0, + "step": 12742 + }, + { + "epoch": 1.3994069844058863, + "grad_norm": 2.363396644592285, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7211389541625977, + "num_tokens": 318076919.0, + "step": 12743 + }, + { + "epoch": 1.3995168021084998, + "grad_norm": 2.3215579986572266, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7118028402328491, + "num_tokens": 318101944.0, + "step": 12744 + }, + { + "epoch": 1.3996266198111136, + "grad_norm": 2.3611459732055664, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7219451069831848, + "num_tokens": 318127235.0, + "step": 12745 + }, + { + "epoch": 1.3997364375137273, + "grad_norm": 2.4115891456604004, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 
0.7077159285545349, + "num_tokens": 318149937.0, + "step": 12746 + }, + { + "epoch": 1.3998462552163409, + "grad_norm": 2.0823681354522705, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7148493528366089, + "num_tokens": 318178669.0, + "step": 12747 + }, + { + "epoch": 1.3999560729189544, + "grad_norm": 2.1446070671081543, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6856653094291687, + "num_tokens": 318206422.0, + "step": 12748 + }, + { + "epoch": 1.4000658906215682, + "grad_norm": 2.3183212280273438, + "learning_rate": 1e-06, + "loss": 0.9043, + "mean_token_accuracy": 0.7239605188369751, + "num_tokens": 318230770.0, + "step": 12749 + }, + { + "epoch": 1.400175708324182, + "grad_norm": 2.3442227840423584, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7223516702651978, + "num_tokens": 318253317.0, + "step": 12750 + }, + { + "epoch": 1.4002855260267955, + "grad_norm": 2.3002350330352783, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7153266668319702, + "num_tokens": 318276906.0, + "step": 12751 + }, + { + "epoch": 1.4003953437294092, + "grad_norm": 2.2713029384613037, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7389006614685059, + "num_tokens": 318300406.0, + "step": 12752 + }, + { + "epoch": 1.4005051614320227, + "grad_norm": 2.1429996490478516, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7348645925521851, + "num_tokens": 318326862.0, + "step": 12753 + }, + { + "epoch": 1.4006149791346365, + "grad_norm": 2.200294256210327, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7110964059829712, + "num_tokens": 318351291.0, + "step": 12754 + }, + { + "epoch": 1.4007247968372503, + "grad_norm": 2.299445867538452, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7445232272148132, + "num_tokens": 318374260.0, + "step": 12755 + }, + { + "epoch": 1.4008346145398638, + "grad_norm": 
2.1962647438049316, + "learning_rate": 1e-06, + "loss": 0.993, + "mean_token_accuracy": 0.6977603435516357, + "num_tokens": 318402395.0, + "step": 12756 + }, + { + "epoch": 1.4009444322424776, + "grad_norm": 2.695357322692871, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.714810848236084, + "num_tokens": 318421689.0, + "step": 12757 + }, + { + "epoch": 1.401054249945091, + "grad_norm": 2.2308619022369385, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.6973521709442139, + "num_tokens": 318448676.0, + "step": 12758 + }, + { + "epoch": 1.4011640676477048, + "grad_norm": 2.317949056625366, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.722740888595581, + "num_tokens": 318473411.0, + "step": 12759 + }, + { + "epoch": 1.4012738853503186, + "grad_norm": 2.1844680309295654, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.747637927532196, + "num_tokens": 318498597.0, + "step": 12760 + }, + { + "epoch": 1.4013837030529321, + "grad_norm": 2.1736698150634766, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7056430578231812, + "num_tokens": 318527060.0, + "step": 12761 + }, + { + "epoch": 1.4014935207555457, + "grad_norm": 2.3628485202789307, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7171151041984558, + "num_tokens": 318548838.0, + "step": 12762 + }, + { + "epoch": 1.4016033384581594, + "grad_norm": 2.254559278488159, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6996631026268005, + "num_tokens": 318574636.0, + "step": 12763 + }, + { + "epoch": 1.4017131561607732, + "grad_norm": 2.4952962398529053, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7135934829711914, + "num_tokens": 318596082.0, + "step": 12764 + }, + { + "epoch": 1.4018229738633867, + "grad_norm": 2.148289918899536, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.721695065498352, + "num_tokens": 
318624797.0, + "step": 12765 + }, + { + "epoch": 1.4019327915660005, + "grad_norm": 2.4999048709869385, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7265867590904236, + "num_tokens": 318645301.0, + "step": 12766 + }, + { + "epoch": 1.402042609268614, + "grad_norm": 2.4112446308135986, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7523584961891174, + "num_tokens": 318665393.0, + "step": 12767 + }, + { + "epoch": 1.4021524269712278, + "grad_norm": 2.3244361877441406, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7232933044433594, + "num_tokens": 318688714.0, + "step": 12768 + }, + { + "epoch": 1.4022622446738415, + "grad_norm": 2.3408539295196533, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7316490411758423, + "num_tokens": 318710064.0, + "step": 12769 + }, + { + "epoch": 1.402372062376455, + "grad_norm": 2.2490954399108887, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.723504364490509, + "num_tokens": 318734312.0, + "step": 12770 + }, + { + "epoch": 1.4024818800790688, + "grad_norm": 2.2128915786743164, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.702583909034729, + "num_tokens": 318760420.0, + "step": 12771 + }, + { + "epoch": 1.4025916977816824, + "grad_norm": 2.45758318901062, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7322601079940796, + "num_tokens": 318782680.0, + "step": 12772 + }, + { + "epoch": 1.4027015154842961, + "grad_norm": 2.477795124053955, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.726943850517273, + "num_tokens": 318805332.0, + "step": 12773 + }, + { + "epoch": 1.4028113331869096, + "grad_norm": 2.0017082691192627, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7340373992919922, + "num_tokens": 318834372.0, + "step": 12774 + }, + { + "epoch": 1.4029211508895234, + "grad_norm": 2.224452257156372, + "learning_rate": 
1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7305277585983276, + "num_tokens": 318859080.0, + "step": 12775 + }, + { + "epoch": 1.403030968592137, + "grad_norm": 2.298888683319092, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7104744911193848, + "num_tokens": 318884098.0, + "step": 12776 + }, + { + "epoch": 1.4031407862947507, + "grad_norm": 2.5155186653137207, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7451985478401184, + "num_tokens": 318903780.0, + "step": 12777 + }, + { + "epoch": 1.4032506039973645, + "grad_norm": 2.178478479385376, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7172607779502869, + "num_tokens": 318930372.0, + "step": 12778 + }, + { + "epoch": 1.403360421699978, + "grad_norm": 2.065453052520752, + "learning_rate": 1e-06, + "loss": 0.9905, + "mean_token_accuracy": 0.7104202508926392, + "num_tokens": 318959271.0, + "step": 12779 + }, + { + "epoch": 1.4034702394025917, + "grad_norm": 1.92951500415802, + "learning_rate": 1e-06, + "loss": 1.0085, + "mean_token_accuracy": 0.6879561543464661, + "num_tokens": 318992604.0, + "step": 12780 + }, + { + "epoch": 1.4035800571052053, + "grad_norm": 1.9756532907485962, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7204506397247314, + "num_tokens": 319023191.0, + "step": 12781 + }, + { + "epoch": 1.403689874807819, + "grad_norm": 1.9976614713668823, + "learning_rate": 1e-06, + "loss": 0.9829, + "mean_token_accuracy": 0.6962286829948425, + "num_tokens": 319053068.0, + "step": 12782 + }, + { + "epoch": 1.4037996925104328, + "grad_norm": 2.4412245750427246, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7156099081039429, + "num_tokens": 319075493.0, + "step": 12783 + }, + { + "epoch": 1.4039095102130463, + "grad_norm": 2.186256170272827, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6992442607879639, + "num_tokens": 319103779.0, + "step": 12784 + }, + { + 
"epoch": 1.40401932791566, + "grad_norm": 2.1004080772399902, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.69923996925354, + "num_tokens": 319131337.0, + "step": 12785 + }, + { + "epoch": 1.4041291456182736, + "grad_norm": 2.0115647315979004, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.6986122131347656, + "num_tokens": 319162569.0, + "step": 12786 + }, + { + "epoch": 1.4042389633208874, + "grad_norm": 2.5265161991119385, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7322437763214111, + "num_tokens": 319182944.0, + "step": 12787 + }, + { + "epoch": 1.404348781023501, + "grad_norm": 2.413785457611084, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7384895086288452, + "num_tokens": 319204956.0, + "step": 12788 + }, + { + "epoch": 1.4044585987261147, + "grad_norm": 2.079698085784912, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7196300029754639, + "num_tokens": 319232251.0, + "step": 12789 + }, + { + "epoch": 1.4045684164287282, + "grad_norm": 2.3216893672943115, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7256218194961548, + "num_tokens": 319258222.0, + "step": 12790 + }, + { + "epoch": 1.404678234131342, + "grad_norm": 2.4978535175323486, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7182992696762085, + "num_tokens": 319278409.0, + "step": 12791 + }, + { + "epoch": 1.4047880518339557, + "grad_norm": 2.1135451793670654, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7139475345611572, + "num_tokens": 319306814.0, + "step": 12792 + }, + { + "epoch": 1.4048978695365693, + "grad_norm": 2.499624729156494, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7302436232566833, + "num_tokens": 319328345.0, + "step": 12793 + }, + { + "epoch": 1.405007687239183, + "grad_norm": 2.5816211700439453, + "learning_rate": 1e-06, + "loss": 0.8817, + 
"mean_token_accuracy": 0.7252964377403259, + "num_tokens": 319348466.0, + "step": 12794 + }, + { + "epoch": 1.4051175049417965, + "grad_norm": 2.2639472484588623, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7125928997993469, + "num_tokens": 319374236.0, + "step": 12795 + }, + { + "epoch": 1.4052273226444103, + "grad_norm": 2.2676589488983154, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7229201197624207, + "num_tokens": 319399756.0, + "step": 12796 + }, + { + "epoch": 1.405337140347024, + "grad_norm": 2.544311046600342, + "learning_rate": 1e-06, + "loss": 0.8448, + "mean_token_accuracy": 0.7284489870071411, + "num_tokens": 319419886.0, + "step": 12797 + }, + { + "epoch": 1.4054469580496376, + "grad_norm": 2.6456992626190186, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7237814664840698, + "num_tokens": 319439472.0, + "step": 12798 + }, + { + "epoch": 1.4055567757522511, + "grad_norm": 2.2585673332214355, + "learning_rate": 1e-06, + "loss": 0.9859, + "mean_token_accuracy": 0.698853075504303, + "num_tokens": 319468123.0, + "step": 12799 + }, + { + "epoch": 1.4056665934548649, + "grad_norm": 2.576925754547119, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7314846515655518, + "num_tokens": 319487625.0, + "step": 12800 + }, + { + "epoch": 1.4057764111574786, + "grad_norm": 2.401880979537964, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7117438316345215, + "num_tokens": 319512291.0, + "step": 12801 + }, + { + "epoch": 1.4058862288600922, + "grad_norm": 2.186446189880371, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7054263353347778, + "num_tokens": 319539939.0, + "step": 12802 + }, + { + "epoch": 1.405996046562706, + "grad_norm": 2.4720213413238525, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7126885652542114, + "num_tokens": 319562182.0, + "step": 12803 + }, + { + "epoch": 
1.4061058642653195, + "grad_norm": 2.1613268852233887, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7152372598648071, + "num_tokens": 319590108.0, + "step": 12804 + }, + { + "epoch": 1.4062156819679332, + "grad_norm": 1.988542079925537, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7236481308937073, + "num_tokens": 319618371.0, + "step": 12805 + }, + { + "epoch": 1.406325499670547, + "grad_norm": 2.49139142036438, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7304367423057556, + "num_tokens": 319639446.0, + "step": 12806 + }, + { + "epoch": 1.4064353173731605, + "grad_norm": 2.7452054023742676, + "learning_rate": 1e-06, + "loss": 0.8639, + "mean_token_accuracy": 0.7332515120506287, + "num_tokens": 319658540.0, + "step": 12807 + }, + { + "epoch": 1.4065451350757743, + "grad_norm": 2.171722888946533, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7347638010978699, + "num_tokens": 319682648.0, + "step": 12808 + }, + { + "epoch": 1.4066549527783878, + "grad_norm": 2.264878749847412, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7214394211769104, + "num_tokens": 319707057.0, + "step": 12809 + }, + { + "epoch": 1.4067647704810016, + "grad_norm": 2.6416115760803223, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7362955212593079, + "num_tokens": 319726983.0, + "step": 12810 + }, + { + "epoch": 1.4068745881836153, + "grad_norm": 2.2329211235046387, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.6936537027359009, + "num_tokens": 319754433.0, + "step": 12811 + }, + { + "epoch": 1.4069844058862289, + "grad_norm": 2.5074479579925537, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7239810228347778, + "num_tokens": 319777169.0, + "step": 12812 + }, + { + "epoch": 1.4070942235888424, + "grad_norm": 2.4416282176971436, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 
0.7274875640869141, + "num_tokens": 319798644.0, + "step": 12813 + }, + { + "epoch": 1.4072040412914562, + "grad_norm": 2.354403257369995, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.7091860771179199, + "num_tokens": 319825747.0, + "step": 12814 + }, + { + "epoch": 1.40731385899407, + "grad_norm": 2.4144535064697266, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.723658561706543, + "num_tokens": 319849470.0, + "step": 12815 + }, + { + "epoch": 1.4074236766966834, + "grad_norm": 2.1592020988464355, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7168242931365967, + "num_tokens": 319877580.0, + "step": 12816 + }, + { + "epoch": 1.4075334943992972, + "grad_norm": 2.4001271724700928, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7172970771789551, + "num_tokens": 319901337.0, + "step": 12817 + }, + { + "epoch": 1.4076433121019107, + "grad_norm": 2.1147429943084717, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7226095795631409, + "num_tokens": 319928975.0, + "step": 12818 + }, + { + "epoch": 1.4077531298045245, + "grad_norm": 2.0762624740600586, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7044411897659302, + "num_tokens": 319960023.0, + "step": 12819 + }, + { + "epoch": 1.4078629475071383, + "grad_norm": 2.4635579586029053, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7217444181442261, + "num_tokens": 319980495.0, + "step": 12820 + }, + { + "epoch": 1.4079727652097518, + "grad_norm": 2.550529956817627, + "learning_rate": 1e-06, + "loss": 0.7906, + "mean_token_accuracy": 0.7474413514137268, + "num_tokens": 319998822.0, + "step": 12821 + }, + { + "epoch": 1.4080825829123655, + "grad_norm": 2.194751739501953, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7307363748550415, + "num_tokens": 320023246.0, + "step": 12822 + }, + { + "epoch": 1.408192400614979, + "grad_norm": 
2.238978147506714, + "learning_rate": 1e-06, + "loss": 0.8021, + "mean_token_accuracy": 0.7444081902503967, + "num_tokens": 320046310.0, + "step": 12823 + }, + { + "epoch": 1.4083022183175928, + "grad_norm": 2.544133424758911, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7420445680618286, + "num_tokens": 320067774.0, + "step": 12824 + }, + { + "epoch": 1.4084120360202066, + "grad_norm": 2.737179756164551, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7327655553817749, + "num_tokens": 320085696.0, + "step": 12825 + }, + { + "epoch": 1.4085218537228201, + "grad_norm": 2.2309391498565674, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7170623540878296, + "num_tokens": 320110676.0, + "step": 12826 + }, + { + "epoch": 1.4086316714254337, + "grad_norm": 2.461580753326416, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7109208106994629, + "num_tokens": 320133326.0, + "step": 12827 + }, + { + "epoch": 1.4087414891280474, + "grad_norm": 2.214952230453491, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.718106210231781, + "num_tokens": 320159686.0, + "step": 12828 + }, + { + "epoch": 1.4088513068306612, + "grad_norm": 1.9410001039505005, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.708519458770752, + "num_tokens": 320193692.0, + "step": 12829 + }, + { + "epoch": 1.4089611245332747, + "grad_norm": 2.3497226238250732, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7385544776916504, + "num_tokens": 320216997.0, + "step": 12830 + }, + { + "epoch": 1.4090709422358885, + "grad_norm": 2.2357559204101562, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.706411600112915, + "num_tokens": 320243096.0, + "step": 12831 + }, + { + "epoch": 1.409180759938502, + "grad_norm": 2.3341948986053467, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7170385718345642, + "num_tokens": 
320266210.0, + "step": 12832 + }, + { + "epoch": 1.4092905776411158, + "grad_norm": 2.0270979404449463, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7127453088760376, + "num_tokens": 320295607.0, + "step": 12833 + }, + { + "epoch": 1.4094003953437295, + "grad_norm": 2.559143543243408, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7233720421791077, + "num_tokens": 320314922.0, + "step": 12834 + }, + { + "epoch": 1.409510213046343, + "grad_norm": 2.311459541320801, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7329421639442444, + "num_tokens": 320338529.0, + "step": 12835 + }, + { + "epoch": 1.4096200307489568, + "grad_norm": 2.147228717803955, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7314128279685974, + "num_tokens": 320364482.0, + "step": 12836 + }, + { + "epoch": 1.4097298484515703, + "grad_norm": 2.4598286151885986, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7205277681350708, + "num_tokens": 320386390.0, + "step": 12837 + }, + { + "epoch": 1.409839666154184, + "grad_norm": 2.033536195755005, + "learning_rate": 1e-06, + "loss": 0.9182, + "mean_token_accuracy": 0.7196747660636902, + "num_tokens": 320416392.0, + "step": 12838 + }, + { + "epoch": 1.4099494838567976, + "grad_norm": 2.250464916229248, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7457914352416992, + "num_tokens": 320440616.0, + "step": 12839 + }, + { + "epoch": 1.4100593015594114, + "grad_norm": 2.064887523651123, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.7005219459533691, + "num_tokens": 320471057.0, + "step": 12840 + }, + { + "epoch": 1.410169119262025, + "grad_norm": 2.3137032985687256, + "learning_rate": 1e-06, + "loss": 0.864, + "mean_token_accuracy": 0.7307882308959961, + "num_tokens": 320494622.0, + "step": 12841 + }, + { + "epoch": 1.4102789369646387, + "grad_norm": 2.048158884048462, + "learning_rate": 1e-06, 
+ "loss": 0.9405, + "mean_token_accuracy": 0.703040361404419, + "num_tokens": 320526517.0, + "step": 12842 + }, + { + "epoch": 1.4103887546672524, + "grad_norm": 2.1681880950927734, + "learning_rate": 1e-06, + "loss": 1.0165, + "mean_token_accuracy": 0.6955121755599976, + "num_tokens": 320553232.0, + "step": 12843 + }, + { + "epoch": 1.410498572369866, + "grad_norm": 2.2280945777893066, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7041796445846558, + "num_tokens": 320579837.0, + "step": 12844 + }, + { + "epoch": 1.4106083900724797, + "grad_norm": 2.0328264236450195, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7027590274810791, + "num_tokens": 320609302.0, + "step": 12845 + }, + { + "epoch": 1.4107182077750933, + "grad_norm": 2.2832016944885254, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7083926200866699, + "num_tokens": 320635856.0, + "step": 12846 + }, + { + "epoch": 1.410828025477707, + "grad_norm": 2.0210421085357666, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7109901309013367, + "num_tokens": 320665842.0, + "step": 12847 + }, + { + "epoch": 1.4109378431803208, + "grad_norm": 2.3003990650177, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7042593955993652, + "num_tokens": 320691501.0, + "step": 12848 + }, + { + "epoch": 1.4110476608829343, + "grad_norm": 2.1544413566589355, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7140508890151978, + "num_tokens": 320720724.0, + "step": 12849 + }, + { + "epoch": 1.4111574785855479, + "grad_norm": 2.1137115955352783, + "learning_rate": 1e-06, + "loss": 1.012, + "mean_token_accuracy": 0.6968076229095459, + "num_tokens": 320750208.0, + "step": 12850 + }, + { + "epoch": 1.4112672962881616, + "grad_norm": 2.320991277694702, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.722620964050293, + "num_tokens": 320774668.0, + "step": 12851 + }, + { + "epoch": 
1.4113771139907754, + "grad_norm": 2.5257439613342285, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7328276634216309, + "num_tokens": 320798367.0, + "step": 12852 + }, + { + "epoch": 1.411486931693389, + "grad_norm": 2.243011474609375, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7127707004547119, + "num_tokens": 320826461.0, + "step": 12853 + }, + { + "epoch": 1.4115967493960027, + "grad_norm": 2.5572140216827393, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.7394532561302185, + "num_tokens": 320846292.0, + "step": 12854 + }, + { + "epoch": 1.4117065670986162, + "grad_norm": 2.122687578201294, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7319552898406982, + "num_tokens": 320874411.0, + "step": 12855 + }, + { + "epoch": 1.41181638480123, + "grad_norm": 1.9834504127502441, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7162505388259888, + "num_tokens": 320907438.0, + "step": 12856 + }, + { + "epoch": 1.4119262025038437, + "grad_norm": 1.9022260904312134, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.731626570224762, + "num_tokens": 320941072.0, + "step": 12857 + }, + { + "epoch": 1.4120360202064572, + "grad_norm": 2.461225748062134, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7333105802536011, + "num_tokens": 320961857.0, + "step": 12858 + }, + { + "epoch": 1.412145837909071, + "grad_norm": 2.3226914405822754, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7209374904632568, + "num_tokens": 320986838.0, + "step": 12859 + }, + { + "epoch": 1.4122556556116845, + "grad_norm": 2.0790014266967773, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6884591579437256, + "num_tokens": 321018749.0, + "step": 12860 + }, + { + "epoch": 1.4123654733142983, + "grad_norm": 2.3007748126983643, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 
0.7113872766494751, + "num_tokens": 321044151.0, + "step": 12861 + }, + { + "epoch": 1.412475291016912, + "grad_norm": 2.2883386611938477, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.709692656993866, + "num_tokens": 321068960.0, + "step": 12862 + }, + { + "epoch": 1.4125851087195256, + "grad_norm": 2.341834306716919, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7143359780311584, + "num_tokens": 321092203.0, + "step": 12863 + }, + { + "epoch": 1.4126949264221391, + "grad_norm": 2.184882879257202, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7153947353363037, + "num_tokens": 321119190.0, + "step": 12864 + }, + { + "epoch": 1.4128047441247529, + "grad_norm": 2.5608346462249756, + "learning_rate": 1e-06, + "loss": 0.9097, + "mean_token_accuracy": 0.7193021774291992, + "num_tokens": 321139766.0, + "step": 12865 + }, + { + "epoch": 1.4129145618273666, + "grad_norm": 2.256380081176758, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7215999960899353, + "num_tokens": 321163978.0, + "step": 12866 + }, + { + "epoch": 1.4130243795299802, + "grad_norm": 2.3566489219665527, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7316116094589233, + "num_tokens": 321188697.0, + "step": 12867 + }, + { + "epoch": 1.413134197232594, + "grad_norm": 2.198507308959961, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7311599254608154, + "num_tokens": 321216578.0, + "step": 12868 + }, + { + "epoch": 1.4132440149352075, + "grad_norm": 1.8136749267578125, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7161523103713989, + "num_tokens": 321255511.0, + "step": 12869 + }, + { + "epoch": 1.4133538326378212, + "grad_norm": 1.980795979499817, + "learning_rate": 1e-06, + "loss": 1.0189, + "mean_token_accuracy": 0.6877540349960327, + "num_tokens": 321290116.0, + "step": 12870 + }, + { + "epoch": 1.413463650340435, + "grad_norm": 
2.2911667823791504, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7083699107170105, + "num_tokens": 321315331.0, + "step": 12871 + }, + { + "epoch": 1.4135734680430485, + "grad_norm": 2.551827907562256, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7168810367584229, + "num_tokens": 321335583.0, + "step": 12872 + }, + { + "epoch": 1.4136832857456623, + "grad_norm": 2.3028910160064697, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.725193440914154, + "num_tokens": 321359722.0, + "step": 12873 + }, + { + "epoch": 1.4137931034482758, + "grad_norm": 2.39029598236084, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7506229877471924, + "num_tokens": 321382398.0, + "step": 12874 + }, + { + "epoch": 1.4139029211508896, + "grad_norm": 2.421400785446167, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7255300283432007, + "num_tokens": 321403795.0, + "step": 12875 + }, + { + "epoch": 1.4140127388535033, + "grad_norm": 2.3992369174957275, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7291321754455566, + "num_tokens": 321425173.0, + "step": 12876 + }, + { + "epoch": 1.4141225565561168, + "grad_norm": 1.968420386314392, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7307526469230652, + "num_tokens": 321454706.0, + "step": 12877 + }, + { + "epoch": 1.4142323742587304, + "grad_norm": 2.447972536087036, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7246884703636169, + "num_tokens": 321477913.0, + "step": 12878 + }, + { + "epoch": 1.4143421919613441, + "grad_norm": 2.304250717163086, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7110942602157593, + "num_tokens": 321503980.0, + "step": 12879 + }, + { + "epoch": 1.414452009663958, + "grad_norm": 2.1811363697052, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7308177351951599, + "num_tokens": 
321529280.0, + "step": 12880 + }, + { + "epoch": 1.4145618273665714, + "grad_norm": 1.8824379444122314, + "learning_rate": 1e-06, + "loss": 0.9758, + "mean_token_accuracy": 0.7007924914360046, + "num_tokens": 321563077.0, + "step": 12881 + }, + { + "epoch": 1.4146716450691852, + "grad_norm": 2.1523830890655518, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7134429812431335, + "num_tokens": 321589819.0, + "step": 12882 + }, + { + "epoch": 1.4147814627717987, + "grad_norm": 2.584930181503296, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7168644666671753, + "num_tokens": 321610361.0, + "step": 12883 + }, + { + "epoch": 1.4148912804744125, + "grad_norm": 1.9775418043136597, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.7023894786834717, + "num_tokens": 321642664.0, + "step": 12884 + }, + { + "epoch": 1.4150010981770262, + "grad_norm": 2.369154691696167, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7348744869232178, + "num_tokens": 321666269.0, + "step": 12885 + }, + { + "epoch": 1.4151109158796398, + "grad_norm": 2.153531551361084, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.703040361404419, + "num_tokens": 321695302.0, + "step": 12886 + }, + { + "epoch": 1.4152207335822535, + "grad_norm": 2.3476834297180176, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7157936096191406, + "num_tokens": 321719931.0, + "step": 12887 + }, + { + "epoch": 1.415330551284867, + "grad_norm": 2.1236166954040527, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7159639000892639, + "num_tokens": 321747116.0, + "step": 12888 + }, + { + "epoch": 1.4154403689874808, + "grad_norm": 2.085517644882202, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7566134929656982, + "num_tokens": 321774990.0, + "step": 12889 + }, + { + "epoch": 1.4155501866900946, + "grad_norm": 2.2446117401123047, + "learning_rate": 
1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7321603298187256, + "num_tokens": 321799902.0, + "step": 12890 + }, + { + "epoch": 1.4156600043927081, + "grad_norm": 2.4496121406555176, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7343352437019348, + "num_tokens": 321822725.0, + "step": 12891 + }, + { + "epoch": 1.4157698220953217, + "grad_norm": 1.8610472679138184, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7261091470718384, + "num_tokens": 321856911.0, + "step": 12892 + }, + { + "epoch": 1.4158796397979354, + "grad_norm": 2.3036391735076904, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7144209146499634, + "num_tokens": 321880486.0, + "step": 12893 + }, + { + "epoch": 1.4159894575005492, + "grad_norm": 2.154723644256592, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7132503986358643, + "num_tokens": 321907959.0, + "step": 12894 + }, + { + "epoch": 1.4160992752031627, + "grad_norm": 2.5609028339385986, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7059428691864014, + "num_tokens": 321931039.0, + "step": 12895 + }, + { + "epoch": 1.4162090929057765, + "grad_norm": 2.4496164321899414, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7143768072128296, + "num_tokens": 321952775.0, + "step": 12896 + }, + { + "epoch": 1.41631891060839, + "grad_norm": 2.0959842205047607, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7087920308113098, + "num_tokens": 321981665.0, + "step": 12897 + }, + { + "epoch": 1.4164287283110037, + "grad_norm": 2.5673985481262207, + "learning_rate": 1e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7392631769180298, + "num_tokens": 322000505.0, + "step": 12898 + }, + { + "epoch": 1.4165385460136175, + "grad_norm": 2.011678695678711, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7072155475616455, + "num_tokens": 322033853.0, + "step": 12899 + }, + { 
+ "epoch": 1.416648363716231, + "grad_norm": 2.2663862705230713, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.7024234533309937, + "num_tokens": 322058723.0, + "step": 12900 + }, + { + "epoch": 1.4167581814188448, + "grad_norm": 2.187056064605713, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7207039594650269, + "num_tokens": 322085512.0, + "step": 12901 + }, + { + "epoch": 1.4168679991214583, + "grad_norm": 2.1999897956848145, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7064388990402222, + "num_tokens": 322113064.0, + "step": 12902 + }, + { + "epoch": 1.416977816824072, + "grad_norm": 2.172513484954834, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7050235271453857, + "num_tokens": 322140885.0, + "step": 12903 + }, + { + "epoch": 1.4170876345266856, + "grad_norm": 2.4138526916503906, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7312633991241455, + "num_tokens": 322161881.0, + "step": 12904 + }, + { + "epoch": 1.4171974522292994, + "grad_norm": 2.0792744159698486, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7216488718986511, + "num_tokens": 322188722.0, + "step": 12905 + }, + { + "epoch": 1.417307269931913, + "grad_norm": 2.3288798332214355, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7281633615493774, + "num_tokens": 322211578.0, + "step": 12906 + }, + { + "epoch": 1.4174170876345267, + "grad_norm": 2.2271101474761963, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7507985830307007, + "num_tokens": 322237149.0, + "step": 12907 + }, + { + "epoch": 1.4175269053371404, + "grad_norm": 2.3769805431365967, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7383222579956055, + "num_tokens": 322257206.0, + "step": 12908 + }, + { + "epoch": 1.417636723039754, + "grad_norm": 2.3223114013671875, + "learning_rate": 1e-06, + "loss": 0.9154, + 
"mean_token_accuracy": 0.7133809328079224, + "num_tokens": 322281079.0, + "step": 12909 + }, + { + "epoch": 1.4177465407423677, + "grad_norm": 1.966080904006958, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7171872854232788, + "num_tokens": 322315266.0, + "step": 12910 + }, + { + "epoch": 1.4178563584449813, + "grad_norm": 2.1382639408111572, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7182422876358032, + "num_tokens": 322346231.0, + "step": 12911 + }, + { + "epoch": 1.417966176147595, + "grad_norm": 2.2594923973083496, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7101206183433533, + "num_tokens": 322370871.0, + "step": 12912 + }, + { + "epoch": 1.4180759938502088, + "grad_norm": 2.618140935897827, + "learning_rate": 1e-06, + "loss": 0.9976, + "mean_token_accuracy": 0.7095445990562439, + "num_tokens": 322389818.0, + "step": 12913 + }, + { + "epoch": 1.4181858115528223, + "grad_norm": 2.19246506690979, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7123867273330688, + "num_tokens": 322419234.0, + "step": 12914 + }, + { + "epoch": 1.4182956292554358, + "grad_norm": 2.1189160346984863, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.709386944770813, + "num_tokens": 322446829.0, + "step": 12915 + }, + { + "epoch": 1.4184054469580496, + "grad_norm": 2.2817435264587402, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7232666611671448, + "num_tokens": 322470979.0, + "step": 12916 + }, + { + "epoch": 1.4185152646606634, + "grad_norm": 2.3020410537719727, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7324717044830322, + "num_tokens": 322495551.0, + "step": 12917 + }, + { + "epoch": 1.418625082363277, + "grad_norm": 2.292773485183716, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7169127464294434, + "num_tokens": 322521783.0, + "step": 12918 + }, + { + "epoch": 1.4187349000658906, 
+ "grad_norm": 2.3702492713928223, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7198483943939209, + "num_tokens": 322544717.0, + "step": 12919 + }, + { + "epoch": 1.4188447177685042, + "grad_norm": 2.394669532775879, + "learning_rate": 1e-06, + "loss": 0.802, + "mean_token_accuracy": 0.740943431854248, + "num_tokens": 322567826.0, + "step": 12920 + }, + { + "epoch": 1.418954535471118, + "grad_norm": 2.092341899871826, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7099672555923462, + "num_tokens": 322595833.0, + "step": 12921 + }, + { + "epoch": 1.4190643531737317, + "grad_norm": 2.0641965866088867, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7339212894439697, + "num_tokens": 322622834.0, + "step": 12922 + }, + { + "epoch": 1.4191741708763452, + "grad_norm": 2.346317768096924, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7244069576263428, + "num_tokens": 322645613.0, + "step": 12923 + }, + { + "epoch": 1.419283988578959, + "grad_norm": 2.339770793914795, + "learning_rate": 1e-06, + "loss": 1.0295, + "mean_token_accuracy": 0.6821354627609253, + "num_tokens": 322671335.0, + "step": 12924 + }, + { + "epoch": 1.4193938062815725, + "grad_norm": 2.1329944133758545, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7368507385253906, + "num_tokens": 322697423.0, + "step": 12925 + }, + { + "epoch": 1.4195036239841863, + "grad_norm": 2.4204320907592773, + "learning_rate": 1e-06, + "loss": 0.9046, + "mean_token_accuracy": 0.7223851084709167, + "num_tokens": 322720360.0, + "step": 12926 + }, + { + "epoch": 1.4196134416868, + "grad_norm": 2.1024281978607178, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7171889543533325, + "num_tokens": 322748836.0, + "step": 12927 + }, + { + "epoch": 1.4197232593894136, + "grad_norm": 2.1855034828186035, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7205306887626648, + 
"num_tokens": 322774795.0, + "step": 12928 + }, + { + "epoch": 1.419833077092027, + "grad_norm": 2.175123453140259, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7145614624023438, + "num_tokens": 322800317.0, + "step": 12929 + }, + { + "epoch": 1.4199428947946409, + "grad_norm": 2.0745487213134766, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7127802968025208, + "num_tokens": 322828429.0, + "step": 12930 + }, + { + "epoch": 1.4200527124972546, + "grad_norm": 2.1628847122192383, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7309346199035645, + "num_tokens": 322853344.0, + "step": 12931 + }, + { + "epoch": 1.4201625301998682, + "grad_norm": 2.511902332305908, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7223575711250305, + "num_tokens": 322873555.0, + "step": 12932 + }, + { + "epoch": 1.420272347902482, + "grad_norm": 2.059964895248413, + "learning_rate": 1e-06, + "loss": 1.0337, + "mean_token_accuracy": 0.6818890571594238, + "num_tokens": 322904488.0, + "step": 12933 + }, + { + "epoch": 1.4203821656050954, + "grad_norm": 2.234536647796631, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7153615951538086, + "num_tokens": 322931654.0, + "step": 12934 + }, + { + "epoch": 1.4204919833077092, + "grad_norm": 2.475635528564453, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7195465564727783, + "num_tokens": 322952790.0, + "step": 12935 + }, + { + "epoch": 1.420601801010323, + "grad_norm": 2.7387683391571045, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7331492900848389, + "num_tokens": 322970859.0, + "step": 12936 + }, + { + "epoch": 1.4207116187129365, + "grad_norm": 2.220264434814453, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7126863598823547, + "num_tokens": 323000635.0, + "step": 12937 + }, + { + "epoch": 1.4208214364155503, + "grad_norm": 3.1012253761291504, + 
"learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.708135724067688, + "num_tokens": 323017071.0, + "step": 12938 + }, + { + "epoch": 1.4209312541181638, + "grad_norm": 2.5979256629943848, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7123613357543945, + "num_tokens": 323038702.0, + "step": 12939 + }, + { + "epoch": 1.4210410718207775, + "grad_norm": 2.151625394821167, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7049958109855652, + "num_tokens": 323065145.0, + "step": 12940 + }, + { + "epoch": 1.4211508895233913, + "grad_norm": 2.166417360305786, + "learning_rate": 1e-06, + "loss": 0.9588, + "mean_token_accuracy": 0.7003507018089294, + "num_tokens": 323094481.0, + "step": 12941 + }, + { + "epoch": 1.4212607072260048, + "grad_norm": 2.3461201190948486, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7156456708908081, + "num_tokens": 323118354.0, + "step": 12942 + }, + { + "epoch": 1.4213705249286184, + "grad_norm": 2.2193682193756104, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7272516489028931, + "num_tokens": 323145285.0, + "step": 12943 + }, + { + "epoch": 1.4214803426312321, + "grad_norm": 2.1038384437561035, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.739766001701355, + "num_tokens": 323172683.0, + "step": 12944 + }, + { + "epoch": 1.4215901603338459, + "grad_norm": 2.2920098304748535, + "learning_rate": 1e-06, + "loss": 0.7342, + "mean_token_accuracy": 0.757443368434906, + "num_tokens": 323194793.0, + "step": 12945 + }, + { + "epoch": 1.4216999780364594, + "grad_norm": 2.438441276550293, + "learning_rate": 1e-06, + "loss": 1.0095, + "mean_token_accuracy": 0.6933395862579346, + "num_tokens": 323218376.0, + "step": 12946 + }, + { + "epoch": 1.4218097957390732, + "grad_norm": 2.3893425464630127, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7128586173057556, + "num_tokens": 323242639.0, + "step": 
12947 + }, + { + "epoch": 1.4219196134416867, + "grad_norm": 2.4309794902801514, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7359869480133057, + "num_tokens": 323264773.0, + "step": 12948 + }, + { + "epoch": 1.4220294311443005, + "grad_norm": 2.3044943809509277, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7051748037338257, + "num_tokens": 323291975.0, + "step": 12949 + }, + { + "epoch": 1.4221392488469142, + "grad_norm": 2.4920296669006348, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7040852308273315, + "num_tokens": 323314690.0, + "step": 12950 + }, + { + "epoch": 1.4222490665495278, + "grad_norm": 2.185659646987915, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7238165140151978, + "num_tokens": 323339340.0, + "step": 12951 + }, + { + "epoch": 1.4223588842521415, + "grad_norm": 2.475519895553589, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7255440950393677, + "num_tokens": 323359634.0, + "step": 12952 + }, + { + "epoch": 1.422468701954755, + "grad_norm": 2.4248712062835693, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7171223163604736, + "num_tokens": 323382864.0, + "step": 12953 + }, + { + "epoch": 1.4225785196573688, + "grad_norm": 2.645864248275757, + "learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7404009103775024, + "num_tokens": 323400744.0, + "step": 12954 + }, + { + "epoch": 1.4226883373599823, + "grad_norm": 2.484297513961792, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7065391540527344, + "num_tokens": 323423218.0, + "step": 12955 + }, + { + "epoch": 1.422798155062596, + "grad_norm": 2.2976346015930176, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7126969695091248, + "num_tokens": 323451107.0, + "step": 12956 + }, + { + "epoch": 1.4229079727652096, + "grad_norm": 2.3319690227508545, + "learning_rate": 1e-06, + "loss": 0.8663, 
+ "mean_token_accuracy": 0.7277458906173706, + "num_tokens": 323475583.0, + "step": 12957 + }, + { + "epoch": 1.4230177904678234, + "grad_norm": 2.3018856048583984, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.704753041267395, + "num_tokens": 323500500.0, + "step": 12958 + }, + { + "epoch": 1.4231276081704372, + "grad_norm": 2.4369401931762695, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7233311533927917, + "num_tokens": 323523268.0, + "step": 12959 + }, + { + "epoch": 1.4232374258730507, + "grad_norm": 2.472181558609009, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7109951972961426, + "num_tokens": 323544510.0, + "step": 12960 + }, + { + "epoch": 1.4233472435756644, + "grad_norm": 2.20414137840271, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7166250944137573, + "num_tokens": 323571214.0, + "step": 12961 + }, + { + "epoch": 1.423457061278278, + "grad_norm": 2.3185696601867676, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7279821634292603, + "num_tokens": 323595777.0, + "step": 12962 + }, + { + "epoch": 1.4235668789808917, + "grad_norm": 2.388047456741333, + "learning_rate": 1e-06, + "loss": 0.8605, + "mean_token_accuracy": 0.7325469255447388, + "num_tokens": 323618149.0, + "step": 12963 + }, + { + "epoch": 1.4236766966835055, + "grad_norm": 2.4320178031921387, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7255023121833801, + "num_tokens": 323639168.0, + "step": 12964 + }, + { + "epoch": 1.423786514386119, + "grad_norm": 2.4408257007598877, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.711539626121521, + "num_tokens": 323663171.0, + "step": 12965 + }, + { + "epoch": 1.4238963320887328, + "grad_norm": 2.506687879562378, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7425072193145752, + "num_tokens": 323684142.0, + "step": 12966 + }, + { + "epoch": 1.4240061497913463, 
+ "grad_norm": 2.4302492141723633, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7251285910606384, + "num_tokens": 323707330.0, + "step": 12967 + }, + { + "epoch": 1.42411596749396, + "grad_norm": 2.181138753890991, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7123752236366272, + "num_tokens": 323734932.0, + "step": 12968 + }, + { + "epoch": 1.4242257851965736, + "grad_norm": 2.2989752292633057, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7078579664230347, + "num_tokens": 323759393.0, + "step": 12969 + }, + { + "epoch": 1.4243356028991874, + "grad_norm": 2.145038604736328, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.7016489505767822, + "num_tokens": 323785682.0, + "step": 12970 + }, + { + "epoch": 1.424445420601801, + "grad_norm": 2.4712741374969482, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7025750875473022, + "num_tokens": 323808488.0, + "step": 12971 + }, + { + "epoch": 1.4245552383044147, + "grad_norm": 2.2971529960632324, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7191203832626343, + "num_tokens": 323831729.0, + "step": 12972 + }, + { + "epoch": 1.4246650560070284, + "grad_norm": 2.615830183029175, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7248990535736084, + "num_tokens": 323851669.0, + "step": 12973 + }, + { + "epoch": 1.424774873709642, + "grad_norm": 1.9331388473510742, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7181687355041504, + "num_tokens": 323881916.0, + "step": 12974 + }, + { + "epoch": 1.4248846914122557, + "grad_norm": 2.4905855655670166, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7206870317459106, + "num_tokens": 323906223.0, + "step": 12975 + }, + { + "epoch": 1.4249945091148692, + "grad_norm": 2.5995118618011475, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.703682541847229, + 
"num_tokens": 323929872.0, + "step": 12976 + }, + { + "epoch": 1.425104326817483, + "grad_norm": 2.4301419258117676, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7077590823173523, + "num_tokens": 323952767.0, + "step": 12977 + }, + { + "epoch": 1.4252141445200968, + "grad_norm": 2.3437955379486084, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7118667960166931, + "num_tokens": 323976801.0, + "step": 12978 + }, + { + "epoch": 1.4253239622227103, + "grad_norm": 2.5831570625305176, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7163728475570679, + "num_tokens": 323996906.0, + "step": 12979 + }, + { + "epoch": 1.4254337799253238, + "grad_norm": 2.5036890506744385, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7294411659240723, + "num_tokens": 324018418.0, + "step": 12980 + }, + { + "epoch": 1.4255435976279376, + "grad_norm": 2.4936041831970215, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7141755819320679, + "num_tokens": 324039783.0, + "step": 12981 + }, + { + "epoch": 1.4256534153305513, + "grad_norm": 2.5945589542388916, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7328617572784424, + "num_tokens": 324060452.0, + "step": 12982 + }, + { + "epoch": 1.4257632330331649, + "grad_norm": 2.1894476413726807, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7262485027313232, + "num_tokens": 324087195.0, + "step": 12983 + }, + { + "epoch": 1.4258730507357786, + "grad_norm": 2.2342896461486816, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.723689079284668, + "num_tokens": 324114198.0, + "step": 12984 + }, + { + "epoch": 1.4259828684383922, + "grad_norm": 2.217738628387451, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6963573098182678, + "num_tokens": 324141320.0, + "step": 12985 + }, + { + "epoch": 1.426092686141006, + "grad_norm": 2.0697906017303467, + 
"learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7095840573310852, + "num_tokens": 324172338.0, + "step": 12986 + }, + { + "epoch": 1.4262025038436197, + "grad_norm": 2.3127074241638184, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7365105152130127, + "num_tokens": 324196972.0, + "step": 12987 + }, + { + "epoch": 1.4263123215462332, + "grad_norm": 2.3067424297332764, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7287814617156982, + "num_tokens": 324221387.0, + "step": 12988 + }, + { + "epoch": 1.426422139248847, + "grad_norm": 2.5613625049591064, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7289990186691284, + "num_tokens": 324242010.0, + "step": 12989 + }, + { + "epoch": 1.4265319569514605, + "grad_norm": 2.0803449153900146, + "learning_rate": 1e-06, + "loss": 0.9946, + "mean_token_accuracy": 0.7030867338180542, + "num_tokens": 324271470.0, + "step": 12990 + }, + { + "epoch": 1.4266417746540743, + "grad_norm": 2.4725844860076904, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.6908597946166992, + "num_tokens": 324295637.0, + "step": 12991 + }, + { + "epoch": 1.426751592356688, + "grad_norm": 2.397230625152588, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7294244766235352, + "num_tokens": 324318771.0, + "step": 12992 + }, + { + "epoch": 1.4268614100593016, + "grad_norm": 2.3298308849334717, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7270222902297974, + "num_tokens": 324343191.0, + "step": 12993 + }, + { + "epoch": 1.426971227761915, + "grad_norm": 2.2193984985351562, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.712775468826294, + "num_tokens": 324368607.0, + "step": 12994 + }, + { + "epoch": 1.4270810454645289, + "grad_norm": 2.677802085876465, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7405750155448914, + "num_tokens": 324386046.0, + "step": 
12995 + }, + { + "epoch": 1.4271908631671426, + "grad_norm": 2.4580655097961426, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7248247861862183, + "num_tokens": 324408382.0, + "step": 12996 + }, + { + "epoch": 1.4273006808697561, + "grad_norm": 2.3097784519195557, + "learning_rate": 1e-06, + "loss": 1.0391, + "mean_token_accuracy": 0.6774613261222839, + "num_tokens": 324435965.0, + "step": 12997 + }, + { + "epoch": 1.42741049857237, + "grad_norm": 2.165562629699707, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7190364599227905, + "num_tokens": 324465607.0, + "step": 12998 + }, + { + "epoch": 1.4275203162749834, + "grad_norm": 2.4363343715667725, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7048158645629883, + "num_tokens": 324489091.0, + "step": 12999 + }, + { + "epoch": 1.4276301339775972, + "grad_norm": 2.267171859741211, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7297331094741821, + "num_tokens": 324513088.0, + "step": 13000 + }, + { + "epoch": 1.427739951680211, + "grad_norm": 2.122302293777466, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7156386375427246, + "num_tokens": 324540611.0, + "step": 13001 + }, + { + "epoch": 1.4278497693828245, + "grad_norm": 2.2879464626312256, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7321159243583679, + "num_tokens": 324563857.0, + "step": 13002 + }, + { + "epoch": 1.4279595870854382, + "grad_norm": 2.2673707008361816, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7298412919044495, + "num_tokens": 324587298.0, + "step": 13003 + }, + { + "epoch": 1.4280694047880518, + "grad_norm": 2.5371286869049072, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6983503699302673, + "num_tokens": 324609509.0, + "step": 13004 + }, + { + "epoch": 1.4281792224906655, + "grad_norm": 2.1372737884521484, + "learning_rate": 1e-06, + "loss": 0.9273, 
+ "mean_token_accuracy": 0.7087829113006592, + "num_tokens": 324636106.0, + "step": 13005 + }, + { + "epoch": 1.4282890401932793, + "grad_norm": 2.3749594688415527, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.6995854377746582, + "num_tokens": 324660308.0, + "step": 13006 + }, + { + "epoch": 1.4283988578958928, + "grad_norm": 2.0044143199920654, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7126146554946899, + "num_tokens": 324690611.0, + "step": 13007 + }, + { + "epoch": 1.4285086755985064, + "grad_norm": 2.414231061935425, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7209753394126892, + "num_tokens": 324712918.0, + "step": 13008 + }, + { + "epoch": 1.4286184933011201, + "grad_norm": 2.1469333171844482, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7135941982269287, + "num_tokens": 324740951.0, + "step": 13009 + }, + { + "epoch": 1.4287283110037339, + "grad_norm": 2.2428207397460938, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.6998236775398254, + "num_tokens": 324767069.0, + "step": 13010 + }, + { + "epoch": 1.4288381287063474, + "grad_norm": 2.2165427207946777, + "learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7269812226295471, + "num_tokens": 324792990.0, + "step": 13011 + }, + { + "epoch": 1.4289479464089612, + "grad_norm": 2.217679977416992, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7118781805038452, + "num_tokens": 324818704.0, + "step": 13012 + }, + { + "epoch": 1.4290577641115747, + "grad_norm": 2.1321322917938232, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.696771502494812, + "num_tokens": 324848665.0, + "step": 13013 + }, + { + "epoch": 1.4291675818141885, + "grad_norm": 2.3751747608184814, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7006083726882935, + "num_tokens": 324873152.0, + "step": 13014 + }, + { + "epoch": 
1.4292773995168022, + "grad_norm": 2.339773654937744, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7147662043571472, + "num_tokens": 324895937.0, + "step": 13015 + }, + { + "epoch": 1.4293872172194158, + "grad_norm": 2.340660572052002, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7172836065292358, + "num_tokens": 324921244.0, + "step": 13016 + }, + { + "epoch": 1.4294970349220295, + "grad_norm": 2.201608180999756, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7078733444213867, + "num_tokens": 324949687.0, + "step": 13017 + }, + { + "epoch": 1.429606852624643, + "grad_norm": 2.535700798034668, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.7338865399360657, + "num_tokens": 324970280.0, + "step": 13018 + }, + { + "epoch": 1.4297166703272568, + "grad_norm": 2.452733039855957, + "learning_rate": 1e-06, + "loss": 0.7949, + "mean_token_accuracy": 0.7423326969146729, + "num_tokens": 324991048.0, + "step": 13019 + }, + { + "epoch": 1.4298264880298703, + "grad_norm": 2.0681354999542236, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.717536449432373, + "num_tokens": 325020039.0, + "step": 13020 + }, + { + "epoch": 1.429936305732484, + "grad_norm": 2.539628267288208, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7167949080467224, + "num_tokens": 325042256.0, + "step": 13021 + }, + { + "epoch": 1.4300461234350976, + "grad_norm": 2.113070249557495, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7191964387893677, + "num_tokens": 325071378.0, + "step": 13022 + }, + { + "epoch": 1.4301559411377114, + "grad_norm": 2.3435895442962646, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.7039928436279297, + "num_tokens": 325097289.0, + "step": 13023 + }, + { + "epoch": 1.4302657588403251, + "grad_norm": 2.4345128536224365, + "learning_rate": 1e-06, + "loss": 0.7659, + "mean_token_accuracy": 
0.7569422721862793, + "num_tokens": 325116769.0, + "step": 13024 + }, + { + "epoch": 1.4303755765429387, + "grad_norm": 2.4329659938812256, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7227842211723328, + "num_tokens": 325140262.0, + "step": 13025 + }, + { + "epoch": 1.4304853942455524, + "grad_norm": 2.1535582542419434, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7096680402755737, + "num_tokens": 325167986.0, + "step": 13026 + }, + { + "epoch": 1.430595211948166, + "grad_norm": 2.306800127029419, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7356995344161987, + "num_tokens": 325191907.0, + "step": 13027 + }, + { + "epoch": 1.4307050296507797, + "grad_norm": 2.169633388519287, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7260273694992065, + "num_tokens": 325219148.0, + "step": 13028 + }, + { + "epoch": 1.4308148473533935, + "grad_norm": 2.0276739597320557, + "learning_rate": 1e-06, + "loss": 0.9962, + "mean_token_accuracy": 0.6961795091629028, + "num_tokens": 325251746.0, + "step": 13029 + }, + { + "epoch": 1.430924665056007, + "grad_norm": 2.1695570945739746, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7150567770004272, + "num_tokens": 325278832.0, + "step": 13030 + }, + { + "epoch": 1.4310344827586206, + "grad_norm": 2.4058990478515625, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7185236215591431, + "num_tokens": 325301357.0, + "step": 13031 + }, + { + "epoch": 1.4311443004612343, + "grad_norm": 2.0881035327911377, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7337061166763306, + "num_tokens": 325328750.0, + "step": 13032 + }, + { + "epoch": 1.431254118163848, + "grad_norm": 2.152946949005127, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7063522338867188, + "num_tokens": 325355837.0, + "step": 13033 + }, + { + "epoch": 1.4313639358664616, + "grad_norm": 
2.264646291732788, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7083625197410583, + "num_tokens": 325379701.0, + "step": 13034 + }, + { + "epoch": 1.4314737535690754, + "grad_norm": 2.4753763675689697, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7225548028945923, + "num_tokens": 325402861.0, + "step": 13035 + }, + { + "epoch": 1.431583571271689, + "grad_norm": 2.4116029739379883, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7292148470878601, + "num_tokens": 325426285.0, + "step": 13036 + }, + { + "epoch": 1.4316933889743026, + "grad_norm": 2.078946352005005, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7176109552383423, + "num_tokens": 325455042.0, + "step": 13037 + }, + { + "epoch": 1.4318032066769164, + "grad_norm": 2.1167023181915283, + "learning_rate": 1e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7378405332565308, + "num_tokens": 325483388.0, + "step": 13038 + }, + { + "epoch": 1.43191302437953, + "grad_norm": 2.2286922931671143, + "learning_rate": 1e-06, + "loss": 0.9991, + "mean_token_accuracy": 0.6981562376022339, + "num_tokens": 325509685.0, + "step": 13039 + }, + { + "epoch": 1.4320228420821437, + "grad_norm": 2.1112165451049805, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7301801443099976, + "num_tokens": 325537541.0, + "step": 13040 + }, + { + "epoch": 1.4321326597847572, + "grad_norm": 2.210097551345825, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.695106029510498, + "num_tokens": 325566211.0, + "step": 13041 + }, + { + "epoch": 1.432242477487371, + "grad_norm": 2.205685615539551, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.718747615814209, + "num_tokens": 325593245.0, + "step": 13042 + }, + { + "epoch": 1.4323522951899847, + "grad_norm": 2.115241050720215, + "learning_rate": 1e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7368341684341431, + "num_tokens": 
325619009.0, + "step": 13043 + }, + { + "epoch": 1.4324621128925983, + "grad_norm": 2.3292276859283447, + "learning_rate": 1e-06, + "loss": 0.9196, + "mean_token_accuracy": 0.7157421112060547, + "num_tokens": 325642751.0, + "step": 13044 + }, + { + "epoch": 1.4325719305952118, + "grad_norm": 2.361177682876587, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7207837104797363, + "num_tokens": 325664580.0, + "step": 13045 + }, + { + "epoch": 1.4326817482978256, + "grad_norm": 1.7788853645324707, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7192180156707764, + "num_tokens": 325699609.0, + "step": 13046 + }, + { + "epoch": 1.4327915660004393, + "grad_norm": 2.1289641857147217, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7033408880233765, + "num_tokens": 325726049.0, + "step": 13047 + }, + { + "epoch": 1.4329013837030529, + "grad_norm": 2.3674352169036865, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7345319986343384, + "num_tokens": 325749751.0, + "step": 13048 + }, + { + "epoch": 1.4330112014056666, + "grad_norm": 2.5984349250793457, + "learning_rate": 1e-06, + "loss": 0.9085, + "mean_token_accuracy": 0.7164238095283508, + "num_tokens": 325770963.0, + "step": 13049 + }, + { + "epoch": 1.4331210191082802, + "grad_norm": 1.9253405332565308, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7247521877288818, + "num_tokens": 325805429.0, + "step": 13050 + }, + { + "epoch": 1.433230836810894, + "grad_norm": 2.4578967094421387, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7225963473320007, + "num_tokens": 325827747.0, + "step": 13051 + }, + { + "epoch": 1.4333406545135077, + "grad_norm": 2.295322895050049, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7075808048248291, + "num_tokens": 325852038.0, + "step": 13052 + }, + { + "epoch": 1.4334504722161212, + "grad_norm": 2.4040935039520264, + 
"learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7218878865242004, + "num_tokens": 325875344.0, + "step": 13053 + }, + { + "epoch": 1.433560289918735, + "grad_norm": 2.6367075443267822, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7390034198760986, + "num_tokens": 325893476.0, + "step": 13054 + }, + { + "epoch": 1.4336701076213485, + "grad_norm": 2.416532278060913, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.728495717048645, + "num_tokens": 325915014.0, + "step": 13055 + }, + { + "epoch": 1.4337799253239623, + "grad_norm": 2.3585801124572754, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7098057270050049, + "num_tokens": 325939311.0, + "step": 13056 + }, + { + "epoch": 1.433889743026576, + "grad_norm": 2.3222591876983643, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.723757803440094, + "num_tokens": 325962243.0, + "step": 13057 + }, + { + "epoch": 1.4339995607291895, + "grad_norm": 2.40539813041687, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7110388278961182, + "num_tokens": 325985064.0, + "step": 13058 + }, + { + "epoch": 1.434109378431803, + "grad_norm": 2.28806471824646, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7169672846794128, + "num_tokens": 326010020.0, + "step": 13059 + }, + { + "epoch": 1.4342191961344168, + "grad_norm": 2.266700029373169, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7079123258590698, + "num_tokens": 326038505.0, + "step": 13060 + }, + { + "epoch": 1.4343290138370306, + "grad_norm": 2.4666788578033447, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7168807983398438, + "num_tokens": 326062903.0, + "step": 13061 + }, + { + "epoch": 1.4344388315396441, + "grad_norm": 2.4762418270111084, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7262341380119324, + "num_tokens": 326087027.0, + "step": 13062 
+ }, + { + "epoch": 1.434548649242258, + "grad_norm": 2.6070914268493652, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 0.7017300128936768, + "num_tokens": 326108843.0, + "step": 13063 + }, + { + "epoch": 1.4346584669448714, + "grad_norm": 2.5958757400512695, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.73633873462677, + "num_tokens": 326126647.0, + "step": 13064 + }, + { + "epoch": 1.4347682846474852, + "grad_norm": 2.3309743404388428, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7179380655288696, + "num_tokens": 326151005.0, + "step": 13065 + }, + { + "epoch": 1.434878102350099, + "grad_norm": 2.331084966659546, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7225725650787354, + "num_tokens": 326175855.0, + "step": 13066 + }, + { + "epoch": 1.4349879200527125, + "grad_norm": 2.3943426609039307, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.7017045021057129, + "num_tokens": 326198228.0, + "step": 13067 + }, + { + "epoch": 1.4350977377553262, + "grad_norm": 2.325260639190674, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7186102867126465, + "num_tokens": 326221758.0, + "step": 13068 + }, + { + "epoch": 1.4352075554579398, + "grad_norm": 2.9018194675445557, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7460061311721802, + "num_tokens": 326240132.0, + "step": 13069 + }, + { + "epoch": 1.4353173731605535, + "grad_norm": 2.0532796382904053, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7312113046646118, + "num_tokens": 326269371.0, + "step": 13070 + }, + { + "epoch": 1.4354271908631673, + "grad_norm": 2.1192612648010254, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7117915153503418, + "num_tokens": 326299698.0, + "step": 13071 + }, + { + "epoch": 1.4355370085657808, + "grad_norm": 2.4441239833831787, + "learning_rate": 1e-06, + "loss": 0.9043, + 
"mean_token_accuracy": 0.7199019193649292, + "num_tokens": 326322990.0, + "step": 13072 + }, + { + "epoch": 1.4356468262683943, + "grad_norm": 2.7683560848236084, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7271937131881714, + "num_tokens": 326342159.0, + "step": 13073 + }, + { + "epoch": 1.435756643971008, + "grad_norm": 2.3992202281951904, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7335215210914612, + "num_tokens": 326362623.0, + "step": 13074 + }, + { + "epoch": 1.4358664616736219, + "grad_norm": 2.267303228378296, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7109887599945068, + "num_tokens": 326388111.0, + "step": 13075 + }, + { + "epoch": 1.4359762793762354, + "grad_norm": 2.051332950592041, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7457288503646851, + "num_tokens": 326417354.0, + "step": 13076 + }, + { + "epoch": 1.4360860970788492, + "grad_norm": 2.048295497894287, + "learning_rate": 1e-06, + "loss": 1.0234, + "mean_token_accuracy": 0.6898934245109558, + "num_tokens": 326447954.0, + "step": 13077 + }, + { + "epoch": 1.4361959147814627, + "grad_norm": 2.590841293334961, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7309173345565796, + "num_tokens": 326469084.0, + "step": 13078 + }, + { + "epoch": 1.4363057324840764, + "grad_norm": 2.246910572052002, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7065111398696899, + "num_tokens": 326495421.0, + "step": 13079 + }, + { + "epoch": 1.4364155501866902, + "grad_norm": 2.678584098815918, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7321994304656982, + "num_tokens": 326514968.0, + "step": 13080 + }, + { + "epoch": 1.4365253678893037, + "grad_norm": 2.767038106918335, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7310645580291748, + "num_tokens": 326533815.0, + "step": 13081 + }, + { + "epoch": 
1.4366351855919175, + "grad_norm": 2.656696319580078, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7218021750450134, + "num_tokens": 326551901.0, + "step": 13082 + }, + { + "epoch": 1.436745003294531, + "grad_norm": 2.3750600814819336, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7145729064941406, + "num_tokens": 326577043.0, + "step": 13083 + }, + { + "epoch": 1.4368548209971448, + "grad_norm": 2.2533421516418457, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7074293494224548, + "num_tokens": 326604816.0, + "step": 13084 + }, + { + "epoch": 1.4369646386997583, + "grad_norm": 2.2101802825927734, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7242532968521118, + "num_tokens": 326630882.0, + "step": 13085 + }, + { + "epoch": 1.437074456402372, + "grad_norm": 2.3810195922851562, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7185392379760742, + "num_tokens": 326654678.0, + "step": 13086 + }, + { + "epoch": 1.4371842741049856, + "grad_norm": 2.5257747173309326, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7111423015594482, + "num_tokens": 326675180.0, + "step": 13087 + }, + { + "epoch": 1.4372940918075994, + "grad_norm": 2.6103127002716064, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7041845321655273, + "num_tokens": 326695055.0, + "step": 13088 + }, + { + "epoch": 1.4374039095102131, + "grad_norm": 2.2065460681915283, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7306488156318665, + "num_tokens": 326720512.0, + "step": 13089 + }, + { + "epoch": 1.4375137272128267, + "grad_norm": 2.2621850967407227, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7265657782554626, + "num_tokens": 326746341.0, + "step": 13090 + }, + { + "epoch": 1.4376235449154404, + "grad_norm": 2.4303414821624756, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 
0.7389261722564697, + "num_tokens": 326768889.0, + "step": 13091 + }, + { + "epoch": 1.437733362618054, + "grad_norm": 2.62915301322937, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7129088640213013, + "num_tokens": 326789458.0, + "step": 13092 + }, + { + "epoch": 1.4378431803206677, + "grad_norm": 2.411524772644043, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7008154392242432, + "num_tokens": 326811444.0, + "step": 13093 + }, + { + "epoch": 1.4379529980232815, + "grad_norm": 2.223745346069336, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7184172868728638, + "num_tokens": 326836217.0, + "step": 13094 + }, + { + "epoch": 1.438062815725895, + "grad_norm": 2.1556155681610107, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7328490018844604, + "num_tokens": 326863358.0, + "step": 13095 + }, + { + "epoch": 1.4381726334285085, + "grad_norm": 2.299102544784546, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7246918678283691, + "num_tokens": 326887040.0, + "step": 13096 + }, + { + "epoch": 1.4382824511311223, + "grad_norm": 2.1369125843048096, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7046868801116943, + "num_tokens": 326916293.0, + "step": 13097 + }, + { + "epoch": 1.438392268833736, + "grad_norm": 2.4819910526275635, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7177546620368958, + "num_tokens": 326937689.0, + "step": 13098 + }, + { + "epoch": 1.4385020865363496, + "grad_norm": 2.764450788497925, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7262401580810547, + "num_tokens": 326957241.0, + "step": 13099 + }, + { + "epoch": 1.4386119042389633, + "grad_norm": 2.3817648887634277, + "learning_rate": 1e-06, + "loss": 0.8612, + "mean_token_accuracy": 0.7294588088989258, + "num_tokens": 326980128.0, + "step": 13100 + }, + { + "epoch": 1.4387217219415769, + "grad_norm": 
2.349911689758301, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7232052087783813, + "num_tokens": 327002854.0, + "step": 13101 + }, + { + "epoch": 1.4388315396441906, + "grad_norm": 2.161400079727173, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7182830572128296, + "num_tokens": 327031521.0, + "step": 13102 + }, + { + "epoch": 1.4389413573468044, + "grad_norm": 2.3123698234558105, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7141433358192444, + "num_tokens": 327055761.0, + "step": 13103 + }, + { + "epoch": 1.439051175049418, + "grad_norm": 1.9278388023376465, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7414714097976685, + "num_tokens": 327086778.0, + "step": 13104 + }, + { + "epoch": 1.4391609927520317, + "grad_norm": 1.9122473001480103, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7402440309524536, + "num_tokens": 327118514.0, + "step": 13105 + }, + { + "epoch": 1.4392708104546452, + "grad_norm": 1.9414445161819458, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7120380401611328, + "num_tokens": 327151604.0, + "step": 13106 + }, + { + "epoch": 1.439380628157259, + "grad_norm": 2.138608932495117, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7235199213027954, + "num_tokens": 327180145.0, + "step": 13107 + }, + { + "epoch": 1.4394904458598727, + "grad_norm": 2.045389413833618, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7025511860847473, + "num_tokens": 327209398.0, + "step": 13108 + }, + { + "epoch": 1.4396002635624863, + "grad_norm": 2.1335537433624268, + "learning_rate": 1e-06, + "loss": 1.0809, + "mean_token_accuracy": 0.6797308325767517, + "num_tokens": 327237589.0, + "step": 13109 + }, + { + "epoch": 1.4397100812650998, + "grad_norm": 2.2518703937530518, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.708161473274231, + "num_tokens": 
327263450.0, + "step": 13110 + }, + { + "epoch": 1.4398198989677136, + "grad_norm": 2.190840721130371, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7377263307571411, + "num_tokens": 327287132.0, + "step": 13111 + }, + { + "epoch": 1.4399297166703273, + "grad_norm": 2.1886680126190186, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7140620946884155, + "num_tokens": 327314683.0, + "step": 13112 + }, + { + "epoch": 1.4400395343729409, + "grad_norm": 2.319335699081421, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7095577716827393, + "num_tokens": 327338484.0, + "step": 13113 + }, + { + "epoch": 1.4401493520755546, + "grad_norm": 2.2990975379943848, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7113382816314697, + "num_tokens": 327364655.0, + "step": 13114 + }, + { + "epoch": 1.4402591697781681, + "grad_norm": 2.1164581775665283, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7101433277130127, + "num_tokens": 327391891.0, + "step": 13115 + }, + { + "epoch": 1.440368987480782, + "grad_norm": 2.3708572387695312, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7245696783065796, + "num_tokens": 327415796.0, + "step": 13116 + }, + { + "epoch": 1.4404788051833957, + "grad_norm": 2.4238381385803223, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7390373945236206, + "num_tokens": 327437605.0, + "step": 13117 + }, + { + "epoch": 1.4405886228860092, + "grad_norm": 2.2298147678375244, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7200696468353271, + "num_tokens": 327462401.0, + "step": 13118 + }, + { + "epoch": 1.440698440588623, + "grad_norm": 2.2252650260925293, + "learning_rate": 1e-06, + "loss": 1.0203, + "mean_token_accuracy": 0.6972665786743164, + "num_tokens": 327491177.0, + "step": 13119 + }, + { + "epoch": 1.4408082582912365, + "grad_norm": 2.5690231323242188, + "learning_rate": 
1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7223137021064758, + "num_tokens": 327512254.0, + "step": 13120 + }, + { + "epoch": 1.4409180759938502, + "grad_norm": 2.3962090015411377, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7131279706954956, + "num_tokens": 327535961.0, + "step": 13121 + }, + { + "epoch": 1.441027893696464, + "grad_norm": 2.2651164531707764, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7128170132637024, + "num_tokens": 327560624.0, + "step": 13122 + }, + { + "epoch": 1.4411377113990775, + "grad_norm": 2.515871524810791, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7286853194236755, + "num_tokens": 327581082.0, + "step": 13123 + }, + { + "epoch": 1.441247529101691, + "grad_norm": 2.1666152477264404, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7118312120437622, + "num_tokens": 327607712.0, + "step": 13124 + }, + { + "epoch": 1.4413573468043048, + "grad_norm": 2.3162426948547363, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.7017591595649719, + "num_tokens": 327632099.0, + "step": 13125 + }, + { + "epoch": 1.4414671645069186, + "grad_norm": 2.0757076740264893, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7143164873123169, + "num_tokens": 327660228.0, + "step": 13126 + }, + { + "epoch": 1.4415769822095321, + "grad_norm": 2.615095615386963, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7210005521774292, + "num_tokens": 327681948.0, + "step": 13127 + }, + { + "epoch": 1.4416867999121459, + "grad_norm": 2.165239095687866, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7071062326431274, + "num_tokens": 327707858.0, + "step": 13128 + }, + { + "epoch": 1.4417966176147594, + "grad_norm": 2.14458966255188, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.7034529447555542, + "num_tokens": 327738412.0, + "step": 13129 + }, + { + 
"epoch": 1.4419064353173732, + "grad_norm": 2.525247097015381, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7367055416107178, + "num_tokens": 327760015.0, + "step": 13130 + }, + { + "epoch": 1.442016253019987, + "grad_norm": 2.248373508453369, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7249177098274231, + "num_tokens": 327785036.0, + "step": 13131 + }, + { + "epoch": 1.4421260707226005, + "grad_norm": 2.2723751068115234, + "learning_rate": 1e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7233320474624634, + "num_tokens": 327807971.0, + "step": 13132 + }, + { + "epoch": 1.4422358884252142, + "grad_norm": 2.356827735900879, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7268321514129639, + "num_tokens": 327831149.0, + "step": 13133 + }, + { + "epoch": 1.4423457061278278, + "grad_norm": 2.2254371643066406, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7078152894973755, + "num_tokens": 327858147.0, + "step": 13134 + }, + { + "epoch": 1.4424555238304415, + "grad_norm": 2.216137170791626, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7289069294929504, + "num_tokens": 327883187.0, + "step": 13135 + }, + { + "epoch": 1.442565341533055, + "grad_norm": 2.149941921234131, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7185804843902588, + "num_tokens": 327909936.0, + "step": 13136 + }, + { + "epoch": 1.4426751592356688, + "grad_norm": 2.236189842224121, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7018230557441711, + "num_tokens": 327935378.0, + "step": 13137 + }, + { + "epoch": 1.4427849769382823, + "grad_norm": 2.4869191646575928, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7228525876998901, + "num_tokens": 327956037.0, + "step": 13138 + }, + { + "epoch": 1.442894794640896, + "grad_norm": 2.2087082862854004, + "learning_rate": 1e-06, + "loss": 0.8532, + 
"mean_token_accuracy": 0.730505645275116, + "num_tokens": 327981890.0, + "step": 13139 + }, + { + "epoch": 1.4430046123435099, + "grad_norm": 2.419923782348633, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7203378677368164, + "num_tokens": 328005334.0, + "step": 13140 + }, + { + "epoch": 1.4431144300461234, + "grad_norm": 2.717654228210449, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7308486700057983, + "num_tokens": 328023842.0, + "step": 13141 + }, + { + "epoch": 1.4432242477487371, + "grad_norm": 2.026317834854126, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6990303993225098, + "num_tokens": 328057671.0, + "step": 13142 + }, + { + "epoch": 1.4433340654513507, + "grad_norm": 2.5473408699035645, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7228959798812866, + "num_tokens": 328078964.0, + "step": 13143 + }, + { + "epoch": 1.4434438831539644, + "grad_norm": 2.404660940170288, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7178462147712708, + "num_tokens": 328103590.0, + "step": 13144 + }, + { + "epoch": 1.4435537008565782, + "grad_norm": 1.9495919942855835, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7061982154846191, + "num_tokens": 328140081.0, + "step": 13145 + }, + { + "epoch": 1.4436635185591917, + "grad_norm": 2.3866496086120605, + "learning_rate": 1e-06, + "loss": 0.9587, + "mean_token_accuracy": 0.7089747190475464, + "num_tokens": 328165647.0, + "step": 13146 + }, + { + "epoch": 1.4437733362618055, + "grad_norm": 2.669945478439331, + "learning_rate": 1e-06, + "loss": 0.7745, + "mean_token_accuracy": 0.7489630579948425, + "num_tokens": 328183240.0, + "step": 13147 + }, + { + "epoch": 1.443883153964419, + "grad_norm": 2.064612627029419, + "learning_rate": 1e-06, + "loss": 1.0011, + "mean_token_accuracy": 0.6912694573402405, + "num_tokens": 328211273.0, + "step": 13148 + }, + { + "epoch": 
1.4439929716670328, + "grad_norm": 2.0811734199523926, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7321251630783081, + "num_tokens": 328239581.0, + "step": 13149 + }, + { + "epoch": 1.4441027893696463, + "grad_norm": 2.4706292152404785, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7046867609024048, + "num_tokens": 328263125.0, + "step": 13150 + }, + { + "epoch": 1.44421260707226, + "grad_norm": 2.0881259441375732, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7271785140037537, + "num_tokens": 328290848.0, + "step": 13151 + }, + { + "epoch": 1.4443224247748736, + "grad_norm": 2.0883469581604004, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7226805090904236, + "num_tokens": 328319702.0, + "step": 13152 + }, + { + "epoch": 1.4444322424774874, + "grad_norm": 2.280604124069214, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7290434837341309, + "num_tokens": 328344448.0, + "step": 13153 + }, + { + "epoch": 1.4445420601801011, + "grad_norm": 2.7704720497131348, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7469582557678223, + "num_tokens": 328362581.0, + "step": 13154 + }, + { + "epoch": 1.4446518778827147, + "grad_norm": 2.5395262241363525, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7226280570030212, + "num_tokens": 328385227.0, + "step": 13155 + }, + { + "epoch": 1.4447616955853284, + "grad_norm": 2.156360149383545, + "learning_rate": 1e-06, + "loss": 1.011, + "mean_token_accuracy": 0.698490560054779, + "num_tokens": 328412199.0, + "step": 13156 + }, + { + "epoch": 1.444871513287942, + "grad_norm": 2.416121006011963, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7350116968154907, + "num_tokens": 328434255.0, + "step": 13157 + }, + { + "epoch": 1.4449813309905557, + "grad_norm": 2.300682783126831, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 
0.7436463236808777, + "num_tokens": 328458993.0, + "step": 13158 + }, + { + "epoch": 1.4450911486931695, + "grad_norm": 2.5667104721069336, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7378841638565063, + "num_tokens": 328481135.0, + "step": 13159 + }, + { + "epoch": 1.445200966395783, + "grad_norm": 2.113070249557495, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7322861552238464, + "num_tokens": 328507152.0, + "step": 13160 + }, + { + "epoch": 1.4453107840983965, + "grad_norm": 2.288658380508423, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7335219979286194, + "num_tokens": 328530287.0, + "step": 13161 + }, + { + "epoch": 1.4454206018010103, + "grad_norm": 2.544581890106201, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7272398471832275, + "num_tokens": 328552304.0, + "step": 13162 + }, + { + "epoch": 1.445530419503624, + "grad_norm": 2.2662065029144287, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.726880669593811, + "num_tokens": 328579796.0, + "step": 13163 + }, + { + "epoch": 1.4456402372062376, + "grad_norm": 2.1167709827423096, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7273658514022827, + "num_tokens": 328609875.0, + "step": 13164 + }, + { + "epoch": 1.4457500549088513, + "grad_norm": 2.366912603378296, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7314896583557129, + "num_tokens": 328633461.0, + "step": 13165 + }, + { + "epoch": 1.4458598726114649, + "grad_norm": 2.634094715118408, + "learning_rate": 1e-06, + "loss": 0.7808, + "mean_token_accuracy": 0.7429875731468201, + "num_tokens": 328652704.0, + "step": 13166 + }, + { + "epoch": 1.4459696903140786, + "grad_norm": 2.342118501663208, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7269555330276489, + "num_tokens": 328676192.0, + "step": 13167 + }, + { + "epoch": 1.4460795080166924, + "grad_norm": 
2.3270626068115234, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7127264738082886, + "num_tokens": 328700028.0, + "step": 13168 + }, + { + "epoch": 1.446189325719306, + "grad_norm": 2.1964974403381348, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7267283797264099, + "num_tokens": 328725221.0, + "step": 13169 + }, + { + "epoch": 1.4462991434219197, + "grad_norm": 2.2943382263183594, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7034745216369629, + "num_tokens": 328750865.0, + "step": 13170 + }, + { + "epoch": 1.4464089611245332, + "grad_norm": 2.6058788299560547, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7157294750213623, + "num_tokens": 328771311.0, + "step": 13171 + }, + { + "epoch": 1.446518778827147, + "grad_norm": 2.4967572689056396, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7275510430335999, + "num_tokens": 328793289.0, + "step": 13172 + }, + { + "epoch": 1.4466285965297607, + "grad_norm": 2.170645236968994, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7333729863166809, + "num_tokens": 328819402.0, + "step": 13173 + }, + { + "epoch": 1.4467384142323743, + "grad_norm": 2.7740824222564697, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7409629821777344, + "num_tokens": 328837048.0, + "step": 13174 + }, + { + "epoch": 1.4468482319349878, + "grad_norm": 2.185929298400879, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7106648087501526, + "num_tokens": 328867089.0, + "step": 13175 + }, + { + "epoch": 1.4469580496376016, + "grad_norm": 2.098329782485962, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.733690083026886, + "num_tokens": 328896899.0, + "step": 13176 + }, + { + "epoch": 1.4470678673402153, + "grad_norm": 2.6154608726501465, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7383753657341003, + "num_tokens": 
328915756.0, + "step": 13177 + }, + { + "epoch": 1.4471776850428288, + "grad_norm": 2.2838234901428223, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.723564863204956, + "num_tokens": 328939827.0, + "step": 13178 + }, + { + "epoch": 1.4472875027454426, + "grad_norm": 2.6613833904266357, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7251930236816406, + "num_tokens": 328960354.0, + "step": 13179 + }, + { + "epoch": 1.4473973204480561, + "grad_norm": 2.208181858062744, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7158121466636658, + "num_tokens": 328988835.0, + "step": 13180 + }, + { + "epoch": 1.44750713815067, + "grad_norm": 2.8579375743865967, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7415789365768433, + "num_tokens": 329006880.0, + "step": 13181 + }, + { + "epoch": 1.4476169558532836, + "grad_norm": 2.4004623889923096, + "learning_rate": 1e-06, + "loss": 0.8204, + "mean_token_accuracy": 0.7386182546615601, + "num_tokens": 329027939.0, + "step": 13182 + }, + { + "epoch": 1.4477267735558972, + "grad_norm": 2.274589776992798, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7121903896331787, + "num_tokens": 329053561.0, + "step": 13183 + }, + { + "epoch": 1.447836591258511, + "grad_norm": 2.250706434249878, + "learning_rate": 1e-06, + "loss": 0.7901, + "mean_token_accuracy": 0.7463358044624329, + "num_tokens": 329077911.0, + "step": 13184 + }, + { + "epoch": 1.4479464089611245, + "grad_norm": 2.0698752403259277, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7412651777267456, + "num_tokens": 329105349.0, + "step": 13185 + }, + { + "epoch": 1.4480562266637382, + "grad_norm": 2.2724721431732178, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7229015827178955, + "num_tokens": 329131543.0, + "step": 13186 + }, + { + "epoch": 1.448166044366352, + "grad_norm": 2.3083884716033936, + "learning_rate": 
1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7309991717338562, + "num_tokens": 329155353.0, + "step": 13187 + }, + { + "epoch": 1.4482758620689655, + "grad_norm": 2.2434563636779785, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7127361297607422, + "num_tokens": 329182440.0, + "step": 13188 + }, + { + "epoch": 1.448385679771579, + "grad_norm": 2.0246989727020264, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7096708416938782, + "num_tokens": 329210954.0, + "step": 13189 + }, + { + "epoch": 1.4484954974741928, + "grad_norm": 2.169039487838745, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7238564491271973, + "num_tokens": 329238046.0, + "step": 13190 + }, + { + "epoch": 1.4486053151768066, + "grad_norm": 2.1903181076049805, + "learning_rate": 1e-06, + "loss": 0.9697, + "mean_token_accuracy": 0.701187252998352, + "num_tokens": 329263093.0, + "step": 13191 + }, + { + "epoch": 1.44871513287942, + "grad_norm": 2.2474265098571777, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7278196811676025, + "num_tokens": 329287905.0, + "step": 13192 + }, + { + "epoch": 1.4488249505820339, + "grad_norm": 2.335860252380371, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7258740663528442, + "num_tokens": 329311399.0, + "step": 13193 + }, + { + "epoch": 1.4489347682846474, + "grad_norm": 2.156449556350708, + "learning_rate": 1e-06, + "loss": 1.0078, + "mean_token_accuracy": 0.6901077032089233, + "num_tokens": 329340392.0, + "step": 13194 + }, + { + "epoch": 1.4490445859872612, + "grad_norm": 2.0306236743927, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7033653259277344, + "num_tokens": 329371573.0, + "step": 13195 + }, + { + "epoch": 1.449154403689875, + "grad_norm": 2.7866134643554688, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.723512589931488, + "num_tokens": 329389743.0, + "step": 13196 + }, + { + 
"epoch": 1.4492642213924884, + "grad_norm": 2.1476190090179443, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7096261978149414, + "num_tokens": 329417059.0, + "step": 13197 + }, + { + "epoch": 1.4493740390951022, + "grad_norm": 2.168517589569092, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7428784966468811, + "num_tokens": 329444828.0, + "step": 13198 + }, + { + "epoch": 1.4494838567977157, + "grad_norm": 2.2648744583129883, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.713092565536499, + "num_tokens": 329470966.0, + "step": 13199 + }, + { + "epoch": 1.4495936745003295, + "grad_norm": 1.9498714208602905, + "learning_rate": 1e-06, + "loss": 0.7854, + "mean_token_accuracy": 0.7592588067054749, + "num_tokens": 329498904.0, + "step": 13200 + }, + { + "epoch": 1.449703492202943, + "grad_norm": 2.383366346359253, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7269062995910645, + "num_tokens": 329522277.0, + "step": 13201 + }, + { + "epoch": 1.4498133099055568, + "grad_norm": 2.3137617111206055, + "learning_rate": 1e-06, + "loss": 0.9749, + "mean_token_accuracy": 0.6969383955001831, + "num_tokens": 329546980.0, + "step": 13202 + }, + { + "epoch": 1.4499231276081703, + "grad_norm": 2.1704301834106445, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.6987795233726501, + "num_tokens": 329574565.0, + "step": 13203 + }, + { + "epoch": 1.450032945310784, + "grad_norm": 2.2152771949768066, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7105956673622131, + "num_tokens": 329600553.0, + "step": 13204 + }, + { + "epoch": 1.4501427630133978, + "grad_norm": 2.330425262451172, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7033122181892395, + "num_tokens": 329624865.0, + "step": 13205 + }, + { + "epoch": 1.4502525807160114, + "grad_norm": 2.4637234210968018, + "learning_rate": 1e-06, + "loss": 0.788, + 
"mean_token_accuracy": 0.7586556673049927, + "num_tokens": 329646397.0, + "step": 13206 + }, + { + "epoch": 1.4503623984186251, + "grad_norm": 2.421781539916992, + "learning_rate": 1e-06, + "loss": 0.9381, + "mean_token_accuracy": 0.7163985967636108, + "num_tokens": 329668921.0, + "step": 13207 + }, + { + "epoch": 1.4504722161212387, + "grad_norm": 2.050767183303833, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7296804189682007, + "num_tokens": 329698358.0, + "step": 13208 + }, + { + "epoch": 1.4505820338238524, + "grad_norm": 2.058137893676758, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7292964458465576, + "num_tokens": 329726587.0, + "step": 13209 + }, + { + "epoch": 1.4506918515264662, + "grad_norm": 2.334174871444702, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7304060459136963, + "num_tokens": 329749356.0, + "step": 13210 + }, + { + "epoch": 1.4508016692290797, + "grad_norm": 2.255603075027466, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7150565981864929, + "num_tokens": 329774081.0, + "step": 13211 + }, + { + "epoch": 1.4509114869316933, + "grad_norm": 2.328972816467285, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7077502012252808, + "num_tokens": 329800407.0, + "step": 13212 + }, + { + "epoch": 1.451021304634307, + "grad_norm": 2.482661247253418, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7258429527282715, + "num_tokens": 329821854.0, + "step": 13213 + }, + { + "epoch": 1.4511311223369208, + "grad_norm": 2.1870086193084717, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7216633558273315, + "num_tokens": 329849213.0, + "step": 13214 + }, + { + "epoch": 1.4512409400395343, + "grad_norm": 2.194458484649658, + "learning_rate": 1e-06, + "loss": 1.0441, + "mean_token_accuracy": 0.6807448863983154, + "num_tokens": 329876954.0, + "step": 13215 + }, + { + "epoch": 1.451350757742148, 
+ "grad_norm": 2.556094169616699, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7218658924102783, + "num_tokens": 329898221.0, + "step": 13216 + }, + { + "epoch": 1.4514605754447616, + "grad_norm": 2.050508499145508, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7206772565841675, + "num_tokens": 329927041.0, + "step": 13217 + }, + { + "epoch": 1.4515703931473753, + "grad_norm": 2.3309953212738037, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7231544256210327, + "num_tokens": 329950511.0, + "step": 13218 + }, + { + "epoch": 1.451680210849989, + "grad_norm": 2.4205009937286377, + "learning_rate": 1e-06, + "loss": 0.7409, + "mean_token_accuracy": 0.7574578523635864, + "num_tokens": 329972268.0, + "step": 13219 + }, + { + "epoch": 1.4517900285526026, + "grad_norm": 2.613523483276367, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.721808910369873, + "num_tokens": 329992958.0, + "step": 13220 + }, + { + "epoch": 1.4518998462552164, + "grad_norm": 2.167966604232788, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7169560790061951, + "num_tokens": 330019826.0, + "step": 13221 + }, + { + "epoch": 1.45200966395783, + "grad_norm": 2.5685181617736816, + "learning_rate": 1e-06, + "loss": 1.0279, + "mean_token_accuracy": 0.685957670211792, + "num_tokens": 330042108.0, + "step": 13222 + }, + { + "epoch": 1.4521194816604437, + "grad_norm": 2.3264029026031494, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7134734988212585, + "num_tokens": 330065615.0, + "step": 13223 + }, + { + "epoch": 1.4522292993630574, + "grad_norm": 2.139509916305542, + "learning_rate": 1e-06, + "loss": 1.0724, + "mean_token_accuracy": 0.6783506870269775, + "num_tokens": 330094950.0, + "step": 13224 + }, + { + "epoch": 1.452339117065671, + "grad_norm": 2.148568630218506, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6970336437225342, + 
"num_tokens": 330123738.0, + "step": 13225 + }, + { + "epoch": 1.4524489347682845, + "grad_norm": 2.6408638954162598, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7256467342376709, + "num_tokens": 330144627.0, + "step": 13226 + }, + { + "epoch": 1.4525587524708983, + "grad_norm": 2.3262031078338623, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7279942035675049, + "num_tokens": 330165692.0, + "step": 13227 + }, + { + "epoch": 1.452668570173512, + "grad_norm": 2.0269105434417725, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7253980040550232, + "num_tokens": 330195486.0, + "step": 13228 + }, + { + "epoch": 1.4527783878761256, + "grad_norm": 2.2230634689331055, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7184484004974365, + "num_tokens": 330222338.0, + "step": 13229 + }, + { + "epoch": 1.4528882055787393, + "grad_norm": 2.3517746925354004, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7158584594726562, + "num_tokens": 330247426.0, + "step": 13230 + }, + { + "epoch": 1.4529980232813529, + "grad_norm": 2.425420045852661, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7124601602554321, + "num_tokens": 330269963.0, + "step": 13231 + }, + { + "epoch": 1.4531078409839666, + "grad_norm": 2.1687729358673096, + "learning_rate": 1e-06, + "loss": 0.985, + "mean_token_accuracy": 0.7024394273757935, + "num_tokens": 330296661.0, + "step": 13232 + }, + { + "epoch": 1.4532176586865804, + "grad_norm": 2.325403928756714, + "learning_rate": 1e-06, + "loss": 0.8004, + "mean_token_accuracy": 0.7430179119110107, + "num_tokens": 330318435.0, + "step": 13233 + }, + { + "epoch": 1.453327476389194, + "grad_norm": 1.8873852491378784, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7279857397079468, + "num_tokens": 330350077.0, + "step": 13234 + }, + { + "epoch": 1.4534372940918077, + "grad_norm": 2.3748600482940674, + 
"learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.711217999458313, + "num_tokens": 330375665.0, + "step": 13235 + }, + { + "epoch": 1.4535471117944212, + "grad_norm": 2.0476765632629395, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.731112003326416, + "num_tokens": 330404365.0, + "step": 13236 + }, + { + "epoch": 1.453656929497035, + "grad_norm": 2.0804355144500732, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7027876377105713, + "num_tokens": 330435738.0, + "step": 13237 + }, + { + "epoch": 1.4537667471996487, + "grad_norm": 1.923355221748352, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7307055592536926, + "num_tokens": 330469175.0, + "step": 13238 + }, + { + "epoch": 1.4538765649022622, + "grad_norm": 2.095677137374878, + "learning_rate": 1e-06, + "loss": 1.0198, + "mean_token_accuracy": 0.6906460523605347, + "num_tokens": 330498790.0, + "step": 13239 + }, + { + "epoch": 1.4539863826048758, + "grad_norm": 2.2524287700653076, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7313954830169678, + "num_tokens": 330523506.0, + "step": 13240 + }, + { + "epoch": 1.4540962003074895, + "grad_norm": 2.2085232734680176, + "learning_rate": 1e-06, + "loss": 0.8699, + "mean_token_accuracy": 0.7227591872215271, + "num_tokens": 330550118.0, + "step": 13241 + }, + { + "epoch": 1.4542060180101033, + "grad_norm": 2.3265280723571777, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6971455812454224, + "num_tokens": 330575416.0, + "step": 13242 + }, + { + "epoch": 1.4543158357127168, + "grad_norm": 2.561429023742676, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7352417707443237, + "num_tokens": 330596191.0, + "step": 13243 + }, + { + "epoch": 1.4544256534153306, + "grad_norm": 2.2299633026123047, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7129043340682983, + "num_tokens": 330622470.0, + "step": 
13244 + }, + { + "epoch": 1.4545354711179441, + "grad_norm": 2.3220512866973877, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7347160577774048, + "num_tokens": 330646465.0, + "step": 13245 + }, + { + "epoch": 1.4546452888205579, + "grad_norm": 2.0934033393859863, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7187991142272949, + "num_tokens": 330675606.0, + "step": 13246 + }, + { + "epoch": 1.4547551065231716, + "grad_norm": 2.3209776878356934, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7301278114318848, + "num_tokens": 330700364.0, + "step": 13247 + }, + { + "epoch": 1.4548649242257852, + "grad_norm": 2.323756694793701, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7116108536720276, + "num_tokens": 330724637.0, + "step": 13248 + }, + { + "epoch": 1.454974741928399, + "grad_norm": 2.034374475479126, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7108461260795593, + "num_tokens": 330755994.0, + "step": 13249 + }, + { + "epoch": 1.4550845596310125, + "grad_norm": 2.2979702949523926, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7144292593002319, + "num_tokens": 330781056.0, + "step": 13250 + }, + { + "epoch": 1.4551943773336262, + "grad_norm": 2.0581133365631104, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.72162926197052, + "num_tokens": 330810162.0, + "step": 13251 + }, + { + "epoch": 1.45530419503624, + "grad_norm": 2.3186471462249756, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7089300155639648, + "num_tokens": 330833869.0, + "step": 13252 + }, + { + "epoch": 1.4554140127388535, + "grad_norm": 2.3116466999053955, + "learning_rate": 1e-06, + "loss": 0.7729, + "mean_token_accuracy": 0.7511849999427795, + "num_tokens": 330856184.0, + "step": 13253 + }, + { + "epoch": 1.455523830441467, + "grad_norm": 2.018496036529541, + "learning_rate": 1e-06, + "loss": 1.0207, + 
"mean_token_accuracy": 0.6959550380706787, + "num_tokens": 330887721.0, + "step": 13254 + }, + { + "epoch": 1.4556336481440808, + "grad_norm": 2.050570249557495, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7046117186546326, + "num_tokens": 330918502.0, + "step": 13255 + }, + { + "epoch": 1.4557434658466946, + "grad_norm": 2.3260021209716797, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.724887490272522, + "num_tokens": 330941166.0, + "step": 13256 + }, + { + "epoch": 1.455853283549308, + "grad_norm": 2.1016390323638916, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7408590912818909, + "num_tokens": 330967403.0, + "step": 13257 + }, + { + "epoch": 1.4559631012519219, + "grad_norm": 2.2275612354278564, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7095481157302856, + "num_tokens": 330993552.0, + "step": 13258 + }, + { + "epoch": 1.4560729189545354, + "grad_norm": 2.3581950664520264, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7320461273193359, + "num_tokens": 331017044.0, + "step": 13259 + }, + { + "epoch": 1.4561827366571491, + "grad_norm": 2.462198257446289, + "learning_rate": 1e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.7545492053031921, + "num_tokens": 331038597.0, + "step": 13260 + }, + { + "epoch": 1.456292554359763, + "grad_norm": 2.423278570175171, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7238801121711731, + "num_tokens": 331063134.0, + "step": 13261 + }, + { + "epoch": 1.4564023720623764, + "grad_norm": 2.267258644104004, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7231193780899048, + "num_tokens": 331089822.0, + "step": 13262 + }, + { + "epoch": 1.4565121897649902, + "grad_norm": 2.7590231895446777, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7284956574440002, + "num_tokens": 331108995.0, + "step": 13263 + }, + { + "epoch": 
1.4566220074676037, + "grad_norm": 2.228398561477661, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7365773916244507, + "num_tokens": 331133095.0, + "step": 13264 + }, + { + "epoch": 1.4567318251702175, + "grad_norm": 2.2897956371307373, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.6946312189102173, + "num_tokens": 331158425.0, + "step": 13265 + }, + { + "epoch": 1.456841642872831, + "grad_norm": 2.4680287837982178, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7187578678131104, + "num_tokens": 331180860.0, + "step": 13266 + }, + { + "epoch": 1.4569514605754448, + "grad_norm": 2.2090485095977783, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7348864078521729, + "num_tokens": 331205149.0, + "step": 13267 + }, + { + "epoch": 1.4570612782780583, + "grad_norm": 1.843768835067749, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.6935703158378601, + "num_tokens": 331243597.0, + "step": 13268 + }, + { + "epoch": 1.457171095980672, + "grad_norm": 2.2434940338134766, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7253599762916565, + "num_tokens": 331268748.0, + "step": 13269 + }, + { + "epoch": 1.4572809136832858, + "grad_norm": 2.142826557159424, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.719196081161499, + "num_tokens": 331296932.0, + "step": 13270 + }, + { + "epoch": 1.4573907313858994, + "grad_norm": 2.2617669105529785, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7212072014808655, + "num_tokens": 331321464.0, + "step": 13271 + }, + { + "epoch": 1.4575005490885131, + "grad_norm": 2.125749349594116, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7145614624023438, + "num_tokens": 331348907.0, + "step": 13272 + }, + { + "epoch": 1.4576103667911267, + "grad_norm": 2.4610137939453125, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 
0.7027343511581421, + "num_tokens": 331374333.0, + "step": 13273 + }, + { + "epoch": 1.4577201844937404, + "grad_norm": 2.404439687728882, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.6962465643882751, + "num_tokens": 331398489.0, + "step": 13274 + }, + { + "epoch": 1.4578300021963542, + "grad_norm": 2.1216325759887695, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7283204793930054, + "num_tokens": 331427747.0, + "step": 13275 + }, + { + "epoch": 1.4579398198989677, + "grad_norm": 2.296915054321289, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7291532754898071, + "num_tokens": 331449601.0, + "step": 13276 + }, + { + "epoch": 1.4580496376015812, + "grad_norm": 1.9539000988006592, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.71555095911026, + "num_tokens": 331482676.0, + "step": 13277 + }, + { + "epoch": 1.458159455304195, + "grad_norm": 2.0886874198913574, + "learning_rate": 1e-06, + "loss": 0.8257, + "mean_token_accuracy": 0.7406908273696899, + "num_tokens": 331511015.0, + "step": 13278 + }, + { + "epoch": 1.4582692730068088, + "grad_norm": 2.213388204574585, + "learning_rate": 1e-06, + "loss": 0.9872, + "mean_token_accuracy": 0.6935547590255737, + "num_tokens": 331538244.0, + "step": 13279 + }, + { + "epoch": 1.4583790907094223, + "grad_norm": 2.579983949661255, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7209317088127136, + "num_tokens": 331560346.0, + "step": 13280 + }, + { + "epoch": 1.458488908412036, + "grad_norm": 2.1100621223449707, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.704352855682373, + "num_tokens": 331589421.0, + "step": 13281 + }, + { + "epoch": 1.4585987261146496, + "grad_norm": 2.3675317764282227, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.7056273221969604, + "num_tokens": 331613508.0, + "step": 13282 + }, + { + "epoch": 1.4587085438172633, + "grad_norm": 
2.4201533794403076, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7174766659736633, + "num_tokens": 331637237.0, + "step": 13283 + }, + { + "epoch": 1.458818361519877, + "grad_norm": 2.3789539337158203, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7278797626495361, + "num_tokens": 331661213.0, + "step": 13284 + }, + { + "epoch": 1.4589281792224906, + "grad_norm": 2.7325305938720703, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7293801307678223, + "num_tokens": 331681616.0, + "step": 13285 + }, + { + "epoch": 1.4590379969251044, + "grad_norm": 2.563570499420166, + "learning_rate": 1e-06, + "loss": 0.896, + "mean_token_accuracy": 0.7206175327301025, + "num_tokens": 331703148.0, + "step": 13286 + }, + { + "epoch": 1.459147814627718, + "grad_norm": 2.3941709995269775, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7014251947402954, + "num_tokens": 331728270.0, + "step": 13287 + }, + { + "epoch": 1.4592576323303317, + "grad_norm": 2.219719886779785, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7210564613342285, + "num_tokens": 331753344.0, + "step": 13288 + }, + { + "epoch": 1.4593674500329454, + "grad_norm": 2.222567319869995, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7225922346115112, + "num_tokens": 331780282.0, + "step": 13289 + }, + { + "epoch": 1.459477267735559, + "grad_norm": 2.280406951904297, + "learning_rate": 1e-06, + "loss": 1.0067, + "mean_token_accuracy": 0.6987702250480652, + "num_tokens": 331806511.0, + "step": 13290 + }, + { + "epoch": 1.4595870854381725, + "grad_norm": 2.0118141174316406, + "learning_rate": 1e-06, + "loss": 0.9941, + "mean_token_accuracy": 0.696392834186554, + "num_tokens": 331836359.0, + "step": 13291 + }, + { + "epoch": 1.4596969031407863, + "grad_norm": 2.198469638824463, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7310663461685181, + "num_tokens": 
331862822.0, + "step": 13292 + }, + { + "epoch": 1.4598067208434, + "grad_norm": 2.388580083847046, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7180596590042114, + "num_tokens": 331884620.0, + "step": 13293 + }, + { + "epoch": 1.4599165385460136, + "grad_norm": 2.0058951377868652, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7362798452377319, + "num_tokens": 331912038.0, + "step": 13294 + }, + { + "epoch": 1.4600263562486273, + "grad_norm": 2.4939794540405273, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7510933876037598, + "num_tokens": 331930812.0, + "step": 13295 + }, + { + "epoch": 1.4601361739512408, + "grad_norm": 2.0575826168060303, + "learning_rate": 1e-06, + "loss": 1.0258, + "mean_token_accuracy": 0.6835973262786865, + "num_tokens": 331962720.0, + "step": 13296 + }, + { + "epoch": 1.4602459916538546, + "grad_norm": 2.145498752593994, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7091226577758789, + "num_tokens": 331990192.0, + "step": 13297 + }, + { + "epoch": 1.4603558093564684, + "grad_norm": 2.178649663925171, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7307904958724976, + "num_tokens": 332019611.0, + "step": 13298 + }, + { + "epoch": 1.460465627059082, + "grad_norm": 1.8641477823257446, + "learning_rate": 1e-06, + "loss": 0.8615, + "mean_token_accuracy": 0.7292505502700806, + "num_tokens": 332055824.0, + "step": 13299 + }, + { + "epoch": 1.4605754447616957, + "grad_norm": 2.5186233520507812, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7282100915908813, + "num_tokens": 332076457.0, + "step": 13300 + }, + { + "epoch": 1.4606852624643092, + "grad_norm": 2.0005693435668945, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7179794311523438, + "num_tokens": 332105475.0, + "step": 13301 + }, + { + "epoch": 1.460795080166923, + "grad_norm": 2.422532320022583, + "learning_rate": 
1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7244272232055664, + "num_tokens": 332129837.0, + "step": 13302 + }, + { + "epoch": 1.4609048978695367, + "grad_norm": 2.1757352352142334, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7263308167457581, + "num_tokens": 332157539.0, + "step": 13303 + }, + { + "epoch": 1.4610147155721502, + "grad_norm": 2.4925220012664795, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7291626930236816, + "num_tokens": 332178631.0, + "step": 13304 + }, + { + "epoch": 1.4611245332747638, + "grad_norm": 2.329707622528076, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7186681628227234, + "num_tokens": 332203785.0, + "step": 13305 + }, + { + "epoch": 1.4612343509773775, + "grad_norm": 2.243382215499878, + "learning_rate": 1e-06, + "loss": 1.0017, + "mean_token_accuracy": 0.6930843591690063, + "num_tokens": 332233808.0, + "step": 13306 + }, + { + "epoch": 1.4613441686799913, + "grad_norm": 2.3592426776885986, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7379129528999329, + "num_tokens": 332257218.0, + "step": 13307 + }, + { + "epoch": 1.4614539863826048, + "grad_norm": 2.0946569442749023, + "learning_rate": 1e-06, + "loss": 0.8102, + "mean_token_accuracy": 0.7399864196777344, + "num_tokens": 332284860.0, + "step": 13308 + }, + { + "epoch": 1.4615638040852186, + "grad_norm": 2.576625108718872, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7453110814094543, + "num_tokens": 332303895.0, + "step": 13309 + }, + { + "epoch": 1.4616736217878321, + "grad_norm": 2.1744327545166016, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.713975191116333, + "num_tokens": 332328666.0, + "step": 13310 + }, + { + "epoch": 1.4617834394904459, + "grad_norm": 2.0921521186828613, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7034016847610474, + "num_tokens": 332359039.0, + "step": 13311 + }, + { 
+ "epoch": 1.4618932571930596, + "grad_norm": 2.1701464653015137, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7171199321746826, + "num_tokens": 332386863.0, + "step": 13312 + }, + { + "epoch": 1.4620030748956732, + "grad_norm": 2.3712997436523438, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7246211767196655, + "num_tokens": 332411250.0, + "step": 13313 + }, + { + "epoch": 1.462112892598287, + "grad_norm": 2.419224500656128, + "learning_rate": 1e-06, + "loss": 1.0188, + "mean_token_accuracy": 0.6929036378860474, + "num_tokens": 332434950.0, + "step": 13314 + }, + { + "epoch": 1.4622227103009005, + "grad_norm": 2.292262315750122, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7161122560501099, + "num_tokens": 332459680.0, + "step": 13315 + }, + { + "epoch": 1.4623325280035142, + "grad_norm": 2.3603622913360596, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7183796167373657, + "num_tokens": 332483302.0, + "step": 13316 + }, + { + "epoch": 1.462442345706128, + "grad_norm": 2.679734230041504, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7108526825904846, + "num_tokens": 332503770.0, + "step": 13317 + }, + { + "epoch": 1.4625521634087415, + "grad_norm": 2.386807441711426, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7116420269012451, + "num_tokens": 332528199.0, + "step": 13318 + }, + { + "epoch": 1.462661981111355, + "grad_norm": 2.4284169673919678, + "learning_rate": 1e-06, + "loss": 0.9332, + "mean_token_accuracy": 0.7153360247612, + "num_tokens": 332553294.0, + "step": 13319 + }, + { + "epoch": 1.4627717988139688, + "grad_norm": 2.247307300567627, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7235469818115234, + "num_tokens": 332581138.0, + "step": 13320 + }, + { + "epoch": 1.4628816165165826, + "grad_norm": 2.4368817806243896, + "learning_rate": 1e-06, + "loss": 0.8331, + 
"mean_token_accuracy": 0.7391254901885986, + "num_tokens": 332601867.0, + "step": 13321 + }, + { + "epoch": 1.462991434219196, + "grad_norm": 2.1750435829162598, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7129083871841431, + "num_tokens": 332631471.0, + "step": 13322 + }, + { + "epoch": 1.4631012519218098, + "grad_norm": 2.328385829925537, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7194051146507263, + "num_tokens": 332656092.0, + "step": 13323 + }, + { + "epoch": 1.4632110696244234, + "grad_norm": 2.426710367202759, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7450074553489685, + "num_tokens": 332679248.0, + "step": 13324 + }, + { + "epoch": 1.4633208873270371, + "grad_norm": 2.45060396194458, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7294703722000122, + "num_tokens": 332702736.0, + "step": 13325 + }, + { + "epoch": 1.463430705029651, + "grad_norm": 2.4932496547698975, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7324069738388062, + "num_tokens": 332723813.0, + "step": 13326 + }, + { + "epoch": 1.4635405227322644, + "grad_norm": 2.2673516273498535, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7138233780860901, + "num_tokens": 332750481.0, + "step": 13327 + }, + { + "epoch": 1.4636503404348782, + "grad_norm": 2.161897897720337, + "learning_rate": 1e-06, + "loss": 0.9614, + "mean_token_accuracy": 0.7083120346069336, + "num_tokens": 332779409.0, + "step": 13328 + }, + { + "epoch": 1.4637601581374917, + "grad_norm": 2.263948917388916, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7151552438735962, + "num_tokens": 332803576.0, + "step": 13329 + }, + { + "epoch": 1.4638699758401055, + "grad_norm": 2.5034706592559814, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7457888126373291, + "num_tokens": 332824080.0, + "step": 13330 + }, + { + "epoch": 1.463979793542719, 
+ "grad_norm": 2.3121118545532227, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7165200710296631, + "num_tokens": 332851021.0, + "step": 13331 + }, + { + "epoch": 1.4640896112453328, + "grad_norm": 2.207998752593994, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7113324403762817, + "num_tokens": 332877623.0, + "step": 13332 + }, + { + "epoch": 1.4641994289479463, + "grad_norm": 2.5917916297912598, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7187573909759521, + "num_tokens": 332898922.0, + "step": 13333 + }, + { + "epoch": 1.46430924665056, + "grad_norm": 2.378833293914795, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7335371971130371, + "num_tokens": 332922325.0, + "step": 13334 + }, + { + "epoch": 1.4644190643531738, + "grad_norm": 2.3717761039733887, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.6998584866523743, + "num_tokens": 332946627.0, + "step": 13335 + }, + { + "epoch": 1.4645288820557874, + "grad_norm": 2.5041332244873047, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.7374700307846069, + "num_tokens": 332967720.0, + "step": 13336 + }, + { + "epoch": 1.464638699758401, + "grad_norm": 2.2455742359161377, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7260363698005676, + "num_tokens": 332992754.0, + "step": 13337 + }, + { + "epoch": 1.4647485174610146, + "grad_norm": 2.338865041732788, + "learning_rate": 1e-06, + "loss": 0.8108, + "mean_token_accuracy": 0.7475439310073853, + "num_tokens": 333016225.0, + "step": 13338 + }, + { + "epoch": 1.4648583351636284, + "grad_norm": 2.1864476203918457, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.728118896484375, + "num_tokens": 333043910.0, + "step": 13339 + }, + { + "epoch": 1.4649681528662422, + "grad_norm": 2.5936779975891113, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7157822847366333, + 
"num_tokens": 333064724.0, + "step": 13340 + }, + { + "epoch": 1.4650779705688557, + "grad_norm": 2.6203742027282715, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7209811806678772, + "num_tokens": 333085464.0, + "step": 13341 + }, + { + "epoch": 1.4651877882714692, + "grad_norm": 2.5537798404693604, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.7009466290473938, + "num_tokens": 333107940.0, + "step": 13342 + }, + { + "epoch": 1.465297605974083, + "grad_norm": 2.2658333778381348, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7120456695556641, + "num_tokens": 333134612.0, + "step": 13343 + }, + { + "epoch": 1.4654074236766967, + "grad_norm": 2.221796751022339, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7217419147491455, + "num_tokens": 333161668.0, + "step": 13344 + }, + { + "epoch": 1.4655172413793103, + "grad_norm": 2.0448005199432373, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6947647333145142, + "num_tokens": 333192464.0, + "step": 13345 + }, + { + "epoch": 1.465627059081924, + "grad_norm": 2.1375138759613037, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.722827672958374, + "num_tokens": 333220368.0, + "step": 13346 + }, + { + "epoch": 1.4657368767845376, + "grad_norm": 2.572030544281006, + "learning_rate": 1e-06, + "loss": 0.7926, + "mean_token_accuracy": 0.7439058423042297, + "num_tokens": 333239273.0, + "step": 13347 + }, + { + "epoch": 1.4658466944871513, + "grad_norm": 2.0889198780059814, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.714468240737915, + "num_tokens": 333267863.0, + "step": 13348 + }, + { + "epoch": 1.465956512189765, + "grad_norm": 2.280266761779785, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7372359037399292, + "num_tokens": 333291863.0, + "step": 13349 + }, + { + "epoch": 1.4660663298923786, + "grad_norm": 2.2150356769561768, + 
"learning_rate": 1e-06, + "loss": 0.7759, + "mean_token_accuracy": 0.7450065016746521, + "num_tokens": 333316179.0, + "step": 13350 + }, + { + "epoch": 1.4661761475949924, + "grad_norm": 1.9913305044174194, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7109220027923584, + "num_tokens": 333346733.0, + "step": 13351 + }, + { + "epoch": 1.466285965297606, + "grad_norm": 2.3102011680603027, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7134734392166138, + "num_tokens": 333372115.0, + "step": 13352 + }, + { + "epoch": 1.4663957830002197, + "grad_norm": 2.4970405101776123, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7263492345809937, + "num_tokens": 333393244.0, + "step": 13353 + }, + { + "epoch": 1.4665056007028334, + "grad_norm": 2.2272579669952393, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.716638445854187, + "num_tokens": 333421030.0, + "step": 13354 + }, + { + "epoch": 1.466615418405447, + "grad_norm": 2.178823471069336, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7339236736297607, + "num_tokens": 333449793.0, + "step": 13355 + }, + { + "epoch": 1.4667252361080605, + "grad_norm": 2.3211584091186523, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7216511964797974, + "num_tokens": 333474973.0, + "step": 13356 + }, + { + "epoch": 1.4668350538106742, + "grad_norm": 2.176091432571411, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.6969877481460571, + "num_tokens": 333505347.0, + "step": 13357 + }, + { + "epoch": 1.466944871513288, + "grad_norm": 2.1826961040496826, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7114063501358032, + "num_tokens": 333531917.0, + "step": 13358 + }, + { + "epoch": 1.4670546892159015, + "grad_norm": 2.4392409324645996, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7393977046012878, + "num_tokens": 333553070.0, + "step": 
13359 + }, + { + "epoch": 1.4671645069185153, + "grad_norm": 2.3082103729248047, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7084290981292725, + "num_tokens": 333577337.0, + "step": 13360 + }, + { + "epoch": 1.4672743246211288, + "grad_norm": 2.5431764125823975, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7031130790710449, + "num_tokens": 333597903.0, + "step": 13361 + }, + { + "epoch": 1.4673841423237426, + "grad_norm": 2.392117738723755, + "learning_rate": 1e-06, + "loss": 0.8441, + "mean_token_accuracy": 0.7382276058197021, + "num_tokens": 333619520.0, + "step": 13362 + }, + { + "epoch": 1.4674939600263563, + "grad_norm": 2.0529167652130127, + "learning_rate": 1e-06, + "loss": 0.9854, + "mean_token_accuracy": 0.6942139267921448, + "num_tokens": 333647120.0, + "step": 13363 + }, + { + "epoch": 1.4676037777289699, + "grad_norm": 2.3086767196655273, + "learning_rate": 1e-06, + "loss": 0.8743, + "mean_token_accuracy": 0.723612368106842, + "num_tokens": 333671680.0, + "step": 13364 + }, + { + "epoch": 1.4677135954315836, + "grad_norm": 1.9609123468399048, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7283554077148438, + "num_tokens": 333702653.0, + "step": 13365 + }, + { + "epoch": 1.4678234131341972, + "grad_norm": 2.4003472328186035, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7232937812805176, + "num_tokens": 333725361.0, + "step": 13366 + }, + { + "epoch": 1.467933230836811, + "grad_norm": 2.5321033000946045, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7373259663581848, + "num_tokens": 333746636.0, + "step": 13367 + }, + { + "epoch": 1.4680430485394247, + "grad_norm": 2.2001428604125977, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7087092399597168, + "num_tokens": 333772535.0, + "step": 13368 + }, + { + "epoch": 1.4681528662420382, + "grad_norm": 2.600672960281372, + "learning_rate": 1e-06, + "loss": 
0.9771, + "mean_token_accuracy": 0.7034701108932495, + "num_tokens": 333793495.0, + "step": 13369 + }, + { + "epoch": 1.4682626839446518, + "grad_norm": 2.331942081451416, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7217885255813599, + "num_tokens": 333817846.0, + "step": 13370 + }, + { + "epoch": 1.4683725016472655, + "grad_norm": 2.280505418777466, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7316482067108154, + "num_tokens": 333843214.0, + "step": 13371 + }, + { + "epoch": 1.4684823193498793, + "grad_norm": 2.384678363800049, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7247244715690613, + "num_tokens": 333865273.0, + "step": 13372 + }, + { + "epoch": 1.4685921370524928, + "grad_norm": 2.132084369659424, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7051166296005249, + "num_tokens": 333894100.0, + "step": 13373 + }, + { + "epoch": 1.4687019547551066, + "grad_norm": 2.505009651184082, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7147159576416016, + "num_tokens": 333915318.0, + "step": 13374 + }, + { + "epoch": 1.46881177245772, + "grad_norm": 2.3890380859375, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7146734595298767, + "num_tokens": 333939023.0, + "step": 13375 + }, + { + "epoch": 1.4689215901603339, + "grad_norm": 2.329728364944458, + "learning_rate": 1e-06, + "loss": 0.9742, + "mean_token_accuracy": 0.7063456177711487, + "num_tokens": 333965420.0, + "step": 13376 + }, + { + "epoch": 1.4690314078629476, + "grad_norm": 2.0287320613861084, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7151691913604736, + "num_tokens": 333996287.0, + "step": 13377 + }, + { + "epoch": 1.4691412255655611, + "grad_norm": 2.357656478881836, + "learning_rate": 1e-06, + "loss": 0.7771, + "mean_token_accuracy": 0.7479719519615173, + "num_tokens": 334017977.0, + "step": 13378 + }, + { + "epoch": 
1.469251043268175, + "grad_norm": 2.1860849857330322, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7189007997512817, + "num_tokens": 334043651.0, + "step": 13379 + }, + { + "epoch": 1.4693608609707884, + "grad_norm": 2.238849639892578, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7378807663917542, + "num_tokens": 334069463.0, + "step": 13380 + }, + { + "epoch": 1.4694706786734022, + "grad_norm": 2.2998135089874268, + "learning_rate": 1e-06, + "loss": 0.8923, + "mean_token_accuracy": 0.7185354232788086, + "num_tokens": 334093989.0, + "step": 13381 + }, + { + "epoch": 1.4695804963760157, + "grad_norm": 2.3760974407196045, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7277736067771912, + "num_tokens": 334118020.0, + "step": 13382 + }, + { + "epoch": 1.4696903140786295, + "grad_norm": 2.0787460803985596, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7164421081542969, + "num_tokens": 334149272.0, + "step": 13383 + }, + { + "epoch": 1.469800131781243, + "grad_norm": 1.936906099319458, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7230793237686157, + "num_tokens": 334181682.0, + "step": 13384 + }, + { + "epoch": 1.4699099494838568, + "grad_norm": 2.148902177810669, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7211762070655823, + "num_tokens": 334207523.0, + "step": 13385 + }, + { + "epoch": 1.4700197671864705, + "grad_norm": 2.025517225265503, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6967714428901672, + "num_tokens": 334239269.0, + "step": 13386 + }, + { + "epoch": 1.470129584889084, + "grad_norm": 2.2822959423065186, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7172816395759583, + "num_tokens": 334265692.0, + "step": 13387 + }, + { + "epoch": 1.4702394025916978, + "grad_norm": 2.205927610397339, + "learning_rate": 1e-06, + "loss": 0.8549, + "mean_token_accuracy": 
0.7330410480499268, + "num_tokens": 334290059.0, + "step": 13388 + }, + { + "epoch": 1.4703492202943114, + "grad_norm": 2.0453617572784424, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7063450813293457, + "num_tokens": 334323120.0, + "step": 13389 + }, + { + "epoch": 1.4704590379969251, + "grad_norm": 2.3953168392181396, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7244006395339966, + "num_tokens": 334347909.0, + "step": 13390 + }, + { + "epoch": 1.4705688556995389, + "grad_norm": 2.5219666957855225, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7124244570732117, + "num_tokens": 334371644.0, + "step": 13391 + }, + { + "epoch": 1.4706786734021524, + "grad_norm": 2.1399433612823486, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7208482027053833, + "num_tokens": 334400092.0, + "step": 13392 + }, + { + "epoch": 1.4707884911047662, + "grad_norm": 2.3234314918518066, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7124168276786804, + "num_tokens": 334423795.0, + "step": 13393 + }, + { + "epoch": 1.4708983088073797, + "grad_norm": 2.033792495727539, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6960123777389526, + "num_tokens": 334454832.0, + "step": 13394 + }, + { + "epoch": 1.4710081265099935, + "grad_norm": 2.801039934158325, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7406375408172607, + "num_tokens": 334472806.0, + "step": 13395 + }, + { + "epoch": 1.471117944212607, + "grad_norm": 2.1926019191741943, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7128825187683105, + "num_tokens": 334501232.0, + "step": 13396 + }, + { + "epoch": 1.4712277619152208, + "grad_norm": 2.563159227371216, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7185112237930298, + "num_tokens": 334522300.0, + "step": 13397 + }, + { + "epoch": 1.4713375796178343, + "grad_norm": 
2.1800193786621094, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7155336737632751, + "num_tokens": 334548634.0, + "step": 13398 + }, + { + "epoch": 1.471447397320448, + "grad_norm": 2.669689416885376, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7225990891456604, + "num_tokens": 334566810.0, + "step": 13399 + }, + { + "epoch": 1.4715572150230618, + "grad_norm": 1.9993433952331543, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7128798961639404, + "num_tokens": 334598312.0, + "step": 13400 + }, + { + "epoch": 1.4716670327256753, + "grad_norm": 2.5432093143463135, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7346705794334412, + "num_tokens": 334618515.0, + "step": 13401 + }, + { + "epoch": 1.471776850428289, + "grad_norm": 2.175175666809082, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7243098616600037, + "num_tokens": 334645319.0, + "step": 13402 + }, + { + "epoch": 1.4718866681309026, + "grad_norm": 2.394098997116089, + "learning_rate": 1e-06, + "loss": 0.7442, + "mean_token_accuracy": 0.7679732441902161, + "num_tokens": 334666472.0, + "step": 13403 + }, + { + "epoch": 1.4719964858335164, + "grad_norm": 2.4871344566345215, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7063106298446655, + "num_tokens": 334690617.0, + "step": 13404 + }, + { + "epoch": 1.4721063035361301, + "grad_norm": 2.3597984313964844, + "learning_rate": 1e-06, + "loss": 0.9536, + "mean_token_accuracy": 0.7094120979309082, + "num_tokens": 334715051.0, + "step": 13405 + }, + { + "epoch": 1.4722161212387437, + "grad_norm": 2.624500036239624, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7155778408050537, + "num_tokens": 334735858.0, + "step": 13406 + }, + { + "epoch": 1.4723259389413572, + "grad_norm": 2.3866453170776367, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.725687563419342, + "num_tokens": 
334759606.0, + "step": 13407 + }, + { + "epoch": 1.472435756643971, + "grad_norm": 2.167483329772949, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.6972886919975281, + "num_tokens": 334790426.0, + "step": 13408 + }, + { + "epoch": 1.4725455743465847, + "grad_norm": 2.0754427909851074, + "learning_rate": 1e-06, + "loss": 0.8647, + "mean_token_accuracy": 0.7293751239776611, + "num_tokens": 334817776.0, + "step": 13409 + }, + { + "epoch": 1.4726553920491983, + "grad_norm": 2.027289628982544, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7147789597511292, + "num_tokens": 334848426.0, + "step": 13410 + }, + { + "epoch": 1.472765209751812, + "grad_norm": 2.3408336639404297, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7203160524368286, + "num_tokens": 334870657.0, + "step": 13411 + }, + { + "epoch": 1.4728750274544256, + "grad_norm": 1.8474650382995605, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7122697830200195, + "num_tokens": 334903623.0, + "step": 13412 + }, + { + "epoch": 1.4729848451570393, + "grad_norm": 2.1112935543060303, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7277586460113525, + "num_tokens": 334931566.0, + "step": 13413 + }, + { + "epoch": 1.473094662859653, + "grad_norm": 2.0466175079345703, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7196379899978638, + "num_tokens": 334959384.0, + "step": 13414 + }, + { + "epoch": 1.4732044805622666, + "grad_norm": 2.0677030086517334, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7135741710662842, + "num_tokens": 334989918.0, + "step": 13415 + }, + { + "epoch": 1.4733142982648804, + "grad_norm": 2.2584125995635986, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7454361915588379, + "num_tokens": 335014945.0, + "step": 13416 + }, + { + "epoch": 1.473424115967494, + "grad_norm": 2.513296365737915, + "learning_rate": 
1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7393944263458252, + "num_tokens": 335035237.0, + "step": 13417 + }, + { + "epoch": 1.4735339336701077, + "grad_norm": 2.095899820327759, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7188971042633057, + "num_tokens": 335062122.0, + "step": 13418 + }, + { + "epoch": 1.4736437513727214, + "grad_norm": 1.9490710496902466, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.712375283241272, + "num_tokens": 335093714.0, + "step": 13419 + }, + { + "epoch": 1.473753569075335, + "grad_norm": 2.216482162475586, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7185772657394409, + "num_tokens": 335117461.0, + "step": 13420 + }, + { + "epoch": 1.4738633867779485, + "grad_norm": 2.3530287742614746, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7160619497299194, + "num_tokens": 335141329.0, + "step": 13421 + }, + { + "epoch": 1.4739732044805622, + "grad_norm": 2.089210271835327, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7081582546234131, + "num_tokens": 335171486.0, + "step": 13422 + }, + { + "epoch": 1.474083022183176, + "grad_norm": 2.821016311645508, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7374125719070435, + "num_tokens": 335187182.0, + "step": 13423 + }, + { + "epoch": 1.4741928398857895, + "grad_norm": 2.7381536960601807, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7338685989379883, + "num_tokens": 335206783.0, + "step": 13424 + }, + { + "epoch": 1.4743026575884033, + "grad_norm": 1.9834822416305542, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6825293302536011, + "num_tokens": 335239676.0, + "step": 13425 + }, + { + "epoch": 1.4744124752910168, + "grad_norm": 2.278412103652954, + "learning_rate": 1e-06, + "loss": 0.9664, + "mean_token_accuracy": 0.7018095254898071, + "num_tokens": 335266160.0, + "step": 13426 + }, + { + 
"epoch": 1.4745222929936306, + "grad_norm": 2.4989752769470215, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7126507759094238, + "num_tokens": 335288667.0, + "step": 13427 + }, + { + "epoch": 1.4746321106962443, + "grad_norm": 2.2012829780578613, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.720848798751831, + "num_tokens": 335315530.0, + "step": 13428 + }, + { + "epoch": 1.4747419283988579, + "grad_norm": 2.0556936264038086, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7229575514793396, + "num_tokens": 335343750.0, + "step": 13429 + }, + { + "epoch": 1.4748517461014716, + "grad_norm": 2.265293836593628, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7059221267700195, + "num_tokens": 335370339.0, + "step": 13430 + }, + { + "epoch": 1.4749615638040852, + "grad_norm": 2.3049795627593994, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7222785353660583, + "num_tokens": 335395031.0, + "step": 13431 + }, + { + "epoch": 1.475071381506699, + "grad_norm": 2.6593918800354004, + "learning_rate": 1e-06, + "loss": 0.9206, + "mean_token_accuracy": 0.7207075357437134, + "num_tokens": 335413305.0, + "step": 13432 + }, + { + "epoch": 1.4751811992093127, + "grad_norm": 2.270319938659668, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7127286195755005, + "num_tokens": 335438067.0, + "step": 13433 + }, + { + "epoch": 1.4752910169119262, + "grad_norm": 2.0146853923797607, + "learning_rate": 1e-06, + "loss": 0.9979, + "mean_token_accuracy": 0.6946028470993042, + "num_tokens": 335468046.0, + "step": 13434 + }, + { + "epoch": 1.4754008346145397, + "grad_norm": 2.515881061553955, + "learning_rate": 1e-06, + "loss": 0.8047, + "mean_token_accuracy": 0.7436107993125916, + "num_tokens": 335488566.0, + "step": 13435 + }, + { + "epoch": 1.4755106523171535, + "grad_norm": 2.4386112689971924, + "learning_rate": 1e-06, + "loss": 0.8425, + 
"mean_token_accuracy": 0.7328180074691772, + "num_tokens": 335509904.0, + "step": 13436 + }, + { + "epoch": 1.4756204700197673, + "grad_norm": 2.2062735557556152, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7177727818489075, + "num_tokens": 335536341.0, + "step": 13437 + }, + { + "epoch": 1.4757302877223808, + "grad_norm": 2.3892600536346436, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.744107723236084, + "num_tokens": 335557705.0, + "step": 13438 + }, + { + "epoch": 1.4758401054249946, + "grad_norm": 2.09213924407959, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7017707228660583, + "num_tokens": 335586169.0, + "step": 13439 + }, + { + "epoch": 1.475949923127608, + "grad_norm": 2.025219678878784, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7237056493759155, + "num_tokens": 335617667.0, + "step": 13440 + }, + { + "epoch": 1.4760597408302218, + "grad_norm": 2.536595582962036, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7193296551704407, + "num_tokens": 335639032.0, + "step": 13441 + }, + { + "epoch": 1.4761695585328356, + "grad_norm": 2.1140058040618896, + "learning_rate": 1e-06, + "loss": 1.0173, + "mean_token_accuracy": 0.6906803250312805, + "num_tokens": 335669761.0, + "step": 13442 + }, + { + "epoch": 1.4762793762354491, + "grad_norm": 2.4966368675231934, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7242993712425232, + "num_tokens": 335691721.0, + "step": 13443 + }, + { + "epoch": 1.476389193938063, + "grad_norm": 2.0667595863342285, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.729938268661499, + "num_tokens": 335720956.0, + "step": 13444 + }, + { + "epoch": 1.4764990116406764, + "grad_norm": 2.528193473815918, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7067453861236572, + "num_tokens": 335745209.0, + "step": 13445 + }, + { + "epoch": 1.4766088293432902, 
+ "grad_norm": 2.326127529144287, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7173062562942505, + "num_tokens": 335771754.0, + "step": 13446 + }, + { + "epoch": 1.4767186470459037, + "grad_norm": 2.542525053024292, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.732733964920044, + "num_tokens": 335791866.0, + "step": 13447 + }, + { + "epoch": 1.4768284647485175, + "grad_norm": 2.2671568393707275, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7201893329620361, + "num_tokens": 335817578.0, + "step": 13448 + }, + { + "epoch": 1.476938282451131, + "grad_norm": 2.202878713607788, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.726760983467102, + "num_tokens": 335844198.0, + "step": 13449 + }, + { + "epoch": 1.4770481001537448, + "grad_norm": 2.820298910140991, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7428271174430847, + "num_tokens": 335861372.0, + "step": 13450 + }, + { + "epoch": 1.4771579178563585, + "grad_norm": 2.3910443782806396, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7178032398223877, + "num_tokens": 335885436.0, + "step": 13451 + }, + { + "epoch": 1.477267735558972, + "grad_norm": 2.4696154594421387, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7194887399673462, + "num_tokens": 335907004.0, + "step": 13452 + }, + { + "epoch": 1.4773775532615858, + "grad_norm": 2.580143690109253, + "learning_rate": 1e-06, + "loss": 0.8757, + "mean_token_accuracy": 0.7270159721374512, + "num_tokens": 335926783.0, + "step": 13453 + }, + { + "epoch": 1.4774873709641994, + "grad_norm": 2.082047700881958, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7026482224464417, + "num_tokens": 335955349.0, + "step": 13454 + }, + { + "epoch": 1.477597188666813, + "grad_norm": 2.647592067718506, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7255538702011108, + 
"num_tokens": 335974633.0, + "step": 13455 + }, + { + "epoch": 1.4777070063694269, + "grad_norm": 2.5946598052978516, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7137326002120972, + "num_tokens": 335995254.0, + "step": 13456 + }, + { + "epoch": 1.4778168240720404, + "grad_norm": 1.9649684429168701, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.711272120475769, + "num_tokens": 336025897.0, + "step": 13457 + }, + { + "epoch": 1.477926641774654, + "grad_norm": 2.19149112701416, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.729110836982727, + "num_tokens": 336051488.0, + "step": 13458 + }, + { + "epoch": 1.4780364594772677, + "grad_norm": 2.2250328063964844, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7210848331451416, + "num_tokens": 336075374.0, + "step": 13459 + }, + { + "epoch": 1.4781462771798815, + "grad_norm": 1.8246541023254395, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7054410576820374, + "num_tokens": 336111483.0, + "step": 13460 + }, + { + "epoch": 1.478256094882495, + "grad_norm": 2.1860294342041016, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7018947005271912, + "num_tokens": 336138249.0, + "step": 13461 + }, + { + "epoch": 1.4783659125851087, + "grad_norm": 2.220679759979248, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7044217586517334, + "num_tokens": 336164606.0, + "step": 13462 + }, + { + "epoch": 1.4784757302877223, + "grad_norm": 2.104125738143921, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7173179388046265, + "num_tokens": 336192191.0, + "step": 13463 + }, + { + "epoch": 1.478585547990336, + "grad_norm": 2.3408148288726807, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.70427006483078, + "num_tokens": 336216669.0, + "step": 13464 + }, + { + "epoch": 1.4786953656929498, + "grad_norm": 2.5549473762512207, + 
"learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.72798752784729, + "num_tokens": 336237247.0, + "step": 13465 + }, + { + "epoch": 1.4788051833955633, + "grad_norm": 2.1432347297668457, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6930820941925049, + "num_tokens": 336265264.0, + "step": 13466 + }, + { + "epoch": 1.478915001098177, + "grad_norm": 2.2510080337524414, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7403972744941711, + "num_tokens": 336290100.0, + "step": 13467 + }, + { + "epoch": 1.4790248188007906, + "grad_norm": 2.3296594619750977, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7214047908782959, + "num_tokens": 336315089.0, + "step": 13468 + }, + { + "epoch": 1.4791346365034044, + "grad_norm": 2.242215871810913, + "learning_rate": 1e-06, + "loss": 0.9725, + "mean_token_accuracy": 0.701537549495697, + "num_tokens": 336341699.0, + "step": 13469 + }, + { + "epoch": 1.4792444542060181, + "grad_norm": 2.0919864177703857, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7066968679428101, + "num_tokens": 336373923.0, + "step": 13470 + }, + { + "epoch": 1.4793542719086317, + "grad_norm": 2.1913259029388428, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7202907800674438, + "num_tokens": 336399130.0, + "step": 13471 + }, + { + "epoch": 1.4794640896112452, + "grad_norm": 2.4789011478424072, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7071802020072937, + "num_tokens": 336423743.0, + "step": 13472 + }, + { + "epoch": 1.479573907313859, + "grad_norm": 2.4438223838806152, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7376390695571899, + "num_tokens": 336447146.0, + "step": 13473 + }, + { + "epoch": 1.4796837250164727, + "grad_norm": 2.495851993560791, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7388299107551575, + "num_tokens": 336467202.0, + "step": 
13474 + }, + { + "epoch": 1.4797935427190863, + "grad_norm": 2.0895378589630127, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7220150232315063, + "num_tokens": 336499111.0, + "step": 13475 + }, + { + "epoch": 1.4799033604217, + "grad_norm": 2.1637978553771973, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7313467860221863, + "num_tokens": 336524269.0, + "step": 13476 + }, + { + "epoch": 1.4800131781243135, + "grad_norm": 2.8099374771118164, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7027567625045776, + "num_tokens": 336543657.0, + "step": 13477 + }, + { + "epoch": 1.4801229958269273, + "grad_norm": 2.2168469429016113, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7203953266143799, + "num_tokens": 336572507.0, + "step": 13478 + }, + { + "epoch": 1.480232813529541, + "grad_norm": 2.1481704711914062, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7243379950523376, + "num_tokens": 336597847.0, + "step": 13479 + }, + { + "epoch": 1.4803426312321546, + "grad_norm": 2.5907955169677734, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7375857830047607, + "num_tokens": 336617422.0, + "step": 13480 + }, + { + "epoch": 1.4804524489347684, + "grad_norm": 2.661881685256958, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7311100959777832, + "num_tokens": 336637373.0, + "step": 13481 + }, + { + "epoch": 1.4805622666373819, + "grad_norm": 2.2111284732818604, + "learning_rate": 1e-06, + "loss": 1.0121, + "mean_token_accuracy": 0.6905509829521179, + "num_tokens": 336665937.0, + "step": 13482 + }, + { + "epoch": 1.4806720843399956, + "grad_norm": 2.6034090518951416, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7211596965789795, + "num_tokens": 336685815.0, + "step": 13483 + }, + { + "epoch": 1.4807819020426094, + "grad_norm": 2.1305532455444336, + "learning_rate": 1e-06, + "loss": 0.9046, 
+ "mean_token_accuracy": 0.7222439646720886, + "num_tokens": 336715666.0, + "step": 13484 + }, + { + "epoch": 1.480891719745223, + "grad_norm": 2.3342065811157227, + "learning_rate": 1e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7400708198547363, + "num_tokens": 336739969.0, + "step": 13485 + }, + { + "epoch": 1.4810015374478365, + "grad_norm": 2.399663209915161, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.730446457862854, + "num_tokens": 336762222.0, + "step": 13486 + }, + { + "epoch": 1.4811113551504502, + "grad_norm": 2.2538018226623535, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7107296586036682, + "num_tokens": 336788042.0, + "step": 13487 + }, + { + "epoch": 1.481221172853064, + "grad_norm": 2.363724708557129, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.731033980846405, + "num_tokens": 336810731.0, + "step": 13488 + }, + { + "epoch": 1.4813309905556775, + "grad_norm": 2.2280306816101074, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7224839329719543, + "num_tokens": 336833703.0, + "step": 13489 + }, + { + "epoch": 1.4814408082582913, + "grad_norm": 2.2450168132781982, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7208596467971802, + "num_tokens": 336856417.0, + "step": 13490 + }, + { + "epoch": 1.4815506259609048, + "grad_norm": 2.40433669090271, + "learning_rate": 1e-06, + "loss": 0.8696, + "mean_token_accuracy": 0.7256379127502441, + "num_tokens": 336879676.0, + "step": 13491 + }, + { + "epoch": 1.4816604436635186, + "grad_norm": 2.453146457672119, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7270343899726868, + "num_tokens": 336900968.0, + "step": 13492 + }, + { + "epoch": 1.4817702613661323, + "grad_norm": 2.216804027557373, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7158108949661255, + "num_tokens": 336926757.0, + "step": 13493 + }, + { + "epoch": 
1.4818800790687459, + "grad_norm": 2.7885711193084717, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7306626439094543, + "num_tokens": 336945537.0, + "step": 13494 + }, + { + "epoch": 1.4819898967713596, + "grad_norm": 2.3809385299682617, + "learning_rate": 1e-06, + "loss": 0.7382, + "mean_token_accuracy": 0.7589689493179321, + "num_tokens": 336964944.0, + "step": 13495 + }, + { + "epoch": 1.4820997144739732, + "grad_norm": 2.144996404647827, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7165285348892212, + "num_tokens": 336991895.0, + "step": 13496 + }, + { + "epoch": 1.482209532176587, + "grad_norm": 2.1560728549957275, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7287920713424683, + "num_tokens": 337018815.0, + "step": 13497 + }, + { + "epoch": 1.4823193498792007, + "grad_norm": 2.3432793617248535, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7390033602714539, + "num_tokens": 337041628.0, + "step": 13498 + }, + { + "epoch": 1.4824291675818142, + "grad_norm": 2.52662992477417, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7290171384811401, + "num_tokens": 337064035.0, + "step": 13499 + }, + { + "epoch": 1.4825389852844277, + "grad_norm": 2.1775386333465576, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7076026201248169, + "num_tokens": 337091669.0, + "step": 13500 + }, + { + "epoch": 1.4826488029870415, + "grad_norm": 2.3096580505371094, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.720413327217102, + "num_tokens": 337115640.0, + "step": 13501 + }, + { + "epoch": 1.4827586206896552, + "grad_norm": 2.1589345932006836, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7232997417449951, + "num_tokens": 337143365.0, + "step": 13502 + }, + { + "epoch": 1.4828684383922688, + "grad_norm": 2.440185070037842, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 
0.7194271683692932, + "num_tokens": 337166457.0, + "step": 13503 + }, + { + "epoch": 1.4829782560948825, + "grad_norm": 2.1417102813720703, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.708763062953949, + "num_tokens": 337195604.0, + "step": 13504 + }, + { + "epoch": 1.483088073797496, + "grad_norm": 2.0595455169677734, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.7157192826271057, + "num_tokens": 337225301.0, + "step": 13505 + }, + { + "epoch": 1.4831978915001098, + "grad_norm": 2.0519216060638428, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7107757329940796, + "num_tokens": 337254085.0, + "step": 13506 + }, + { + "epoch": 1.4833077092027236, + "grad_norm": 2.1576220989227295, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6937221884727478, + "num_tokens": 337283211.0, + "step": 13507 + }, + { + "epoch": 1.4834175269053371, + "grad_norm": 2.36283802986145, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7082059383392334, + "num_tokens": 337305660.0, + "step": 13508 + }, + { + "epoch": 1.4835273446079509, + "grad_norm": 2.0767738819122314, + "learning_rate": 1e-06, + "loss": 0.9777, + "mean_token_accuracy": 0.7107861042022705, + "num_tokens": 337336175.0, + "step": 13509 + }, + { + "epoch": 1.4836371623105644, + "grad_norm": 2.2349960803985596, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7350362539291382, + "num_tokens": 337362324.0, + "step": 13510 + }, + { + "epoch": 1.4837469800131782, + "grad_norm": 2.000398874282837, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7193032503128052, + "num_tokens": 337392363.0, + "step": 13511 + }, + { + "epoch": 1.4838567977157917, + "grad_norm": 2.863072633743286, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7312414646148682, + "num_tokens": 337410888.0, + "step": 13512 + }, + { + "epoch": 1.4839666154184055, + "grad_norm": 
2.586521863937378, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7251765131950378, + "num_tokens": 337431769.0, + "step": 13513 + }, + { + "epoch": 1.484076433121019, + "grad_norm": 2.232008695602417, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.732826292514801, + "num_tokens": 337458183.0, + "step": 13514 + }, + { + "epoch": 1.4841862508236328, + "grad_norm": 2.521754741668701, + "learning_rate": 1e-06, + "loss": 0.8658, + "mean_token_accuracy": 0.7254139184951782, + "num_tokens": 337479531.0, + "step": 13515 + }, + { + "epoch": 1.4842960685262465, + "grad_norm": 2.7788825035095215, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7252615690231323, + "num_tokens": 337500417.0, + "step": 13516 + }, + { + "epoch": 1.48440588622886, + "grad_norm": 2.5468578338623047, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7396036386489868, + "num_tokens": 337521770.0, + "step": 13517 + }, + { + "epoch": 1.4845157039314738, + "grad_norm": 2.416365146636963, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7292178273200989, + "num_tokens": 337544018.0, + "step": 13518 + }, + { + "epoch": 1.4846255216340873, + "grad_norm": 2.3034000396728516, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7240986227989197, + "num_tokens": 337568246.0, + "step": 13519 + }, + { + "epoch": 1.484735339336701, + "grad_norm": 2.450883150100708, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7349261045455933, + "num_tokens": 337590993.0, + "step": 13520 + }, + { + "epoch": 1.4848451570393149, + "grad_norm": 2.4576704502105713, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7302567958831787, + "num_tokens": 337612856.0, + "step": 13521 + }, + { + "epoch": 1.4849549747419284, + "grad_norm": 2.1932737827301025, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.720207691192627, + "num_tokens": 
337638856.0, + "step": 13522 + }, + { + "epoch": 1.485064792444542, + "grad_norm": 2.1207587718963623, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7147865295410156, + "num_tokens": 337666767.0, + "step": 13523 + }, + { + "epoch": 1.4851746101471557, + "grad_norm": 2.134615898132324, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7197161912918091, + "num_tokens": 337692358.0, + "step": 13524 + }, + { + "epoch": 1.4852844278497694, + "grad_norm": 2.2969658374786377, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7018939256668091, + "num_tokens": 337720025.0, + "step": 13525 + }, + { + "epoch": 1.485394245552383, + "grad_norm": 2.4275565147399902, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7060368061065674, + "num_tokens": 337742283.0, + "step": 13526 + }, + { + "epoch": 1.4855040632549967, + "grad_norm": 2.3463079929351807, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7272939682006836, + "num_tokens": 337765172.0, + "step": 13527 + }, + { + "epoch": 1.4856138809576103, + "grad_norm": 2.303485155105591, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7041903138160706, + "num_tokens": 337791110.0, + "step": 13528 + }, + { + "epoch": 1.485723698660224, + "grad_norm": 2.3405137062072754, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7057587504386902, + "num_tokens": 337816155.0, + "step": 13529 + }, + { + "epoch": 1.4858335163628378, + "grad_norm": 2.2862608432769775, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7059669494628906, + "num_tokens": 337841184.0, + "step": 13530 + }, + { + "epoch": 1.4859433340654513, + "grad_norm": 2.4446792602539062, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7160787582397461, + "num_tokens": 337864069.0, + "step": 13531 + }, + { + "epoch": 1.486053151768065, + "grad_norm": 2.5111382007598877, + "learning_rate": 
1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7198267579078674, + "num_tokens": 337884977.0, + "step": 13532 + }, + { + "epoch": 1.4861629694706786, + "grad_norm": 2.4176392555236816, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7102171182632446, + "num_tokens": 337907410.0, + "step": 13533 + }, + { + "epoch": 1.4862727871732924, + "grad_norm": 2.178887367248535, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7175388336181641, + "num_tokens": 337936446.0, + "step": 13534 + }, + { + "epoch": 1.4863826048759061, + "grad_norm": 2.3430519104003906, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.7380361557006836, + "num_tokens": 337960925.0, + "step": 13535 + }, + { + "epoch": 1.4864924225785197, + "grad_norm": 2.218832492828369, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7053219079971313, + "num_tokens": 337989688.0, + "step": 13536 + }, + { + "epoch": 1.4866022402811332, + "grad_norm": 2.286250591278076, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7300499081611633, + "num_tokens": 338011870.0, + "step": 13537 + }, + { + "epoch": 1.486712057983747, + "grad_norm": 2.3057072162628174, + "learning_rate": 1e-06, + "loss": 0.7749, + "mean_token_accuracy": 0.7620551586151123, + "num_tokens": 338037574.0, + "step": 13538 + }, + { + "epoch": 1.4868218756863607, + "grad_norm": 2.1434977054595947, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7220430374145508, + "num_tokens": 338066866.0, + "step": 13539 + }, + { + "epoch": 1.4869316933889742, + "grad_norm": 2.126974582672119, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.7006633281707764, + "num_tokens": 338096847.0, + "step": 13540 + }, + { + "epoch": 1.487041511091588, + "grad_norm": 2.0784685611724854, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.6938245892524719, + "num_tokens": 338126193.0, + "step": 13541 + }, + { + 
"epoch": 1.4871513287942015, + "grad_norm": 2.251002550125122, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7163426876068115, + "num_tokens": 338150189.0, + "step": 13542 + }, + { + "epoch": 1.4872611464968153, + "grad_norm": 2.0147769451141357, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7160146832466125, + "num_tokens": 338182213.0, + "step": 13543 + }, + { + "epoch": 1.487370964199429, + "grad_norm": 2.4975626468658447, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7512720823287964, + "num_tokens": 338202695.0, + "step": 13544 + }, + { + "epoch": 1.4874807819020426, + "grad_norm": 2.474672555923462, + "learning_rate": 1e-06, + "loss": 0.7636, + "mean_token_accuracy": 0.748705267906189, + "num_tokens": 338223932.0, + "step": 13545 + }, + { + "epoch": 1.4875905996046563, + "grad_norm": 2.0748229026794434, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7099034786224365, + "num_tokens": 338252581.0, + "step": 13546 + }, + { + "epoch": 1.4877004173072699, + "grad_norm": 2.292365550994873, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7086297273635864, + "num_tokens": 338279268.0, + "step": 13547 + }, + { + "epoch": 1.4878102350098836, + "grad_norm": 2.4961934089660645, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.716070294380188, + "num_tokens": 338300196.0, + "step": 13548 + }, + { + "epoch": 1.4879200527124974, + "grad_norm": 2.3383796215057373, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.709871232509613, + "num_tokens": 338325040.0, + "step": 13549 + }, + { + "epoch": 1.488029870415111, + "grad_norm": 2.555619716644287, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7191004157066345, + "num_tokens": 338346483.0, + "step": 13550 + }, + { + "epoch": 1.4881396881177245, + "grad_norm": 2.3258683681488037, + "learning_rate": 1e-06, + "loss": 0.8981, + 
"mean_token_accuracy": 0.7211412787437439, + "num_tokens": 338369501.0, + "step": 13551 + }, + { + "epoch": 1.4882495058203382, + "grad_norm": 2.207023859024048, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7174067497253418, + "num_tokens": 338393583.0, + "step": 13552 + }, + { + "epoch": 1.488359323522952, + "grad_norm": 2.2493834495544434, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.734205961227417, + "num_tokens": 338418531.0, + "step": 13553 + }, + { + "epoch": 1.4884691412255655, + "grad_norm": 2.470140218734741, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7244824171066284, + "num_tokens": 338440128.0, + "step": 13554 + }, + { + "epoch": 1.4885789589281793, + "grad_norm": 2.449265956878662, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7195191383361816, + "num_tokens": 338464473.0, + "step": 13555 + }, + { + "epoch": 1.4886887766307928, + "grad_norm": 2.435701847076416, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.714069128036499, + "num_tokens": 338487860.0, + "step": 13556 + }, + { + "epoch": 1.4887985943334066, + "grad_norm": 2.0431525707244873, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7077062129974365, + "num_tokens": 338519483.0, + "step": 13557 + }, + { + "epoch": 1.4889084120360203, + "grad_norm": 2.296335220336914, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7152441740036011, + "num_tokens": 338543343.0, + "step": 13558 + }, + { + "epoch": 1.4890182297386338, + "grad_norm": 2.349132537841797, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7180173397064209, + "num_tokens": 338566423.0, + "step": 13559 + }, + { + "epoch": 1.4891280474412476, + "grad_norm": 2.331988573074341, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7185483574867249, + "num_tokens": 338589799.0, + "step": 13560 + }, + { + "epoch": 1.4892378651438611, + 
"grad_norm": 2.2555434703826904, + "learning_rate": 1e-06, + "loss": 1.0099, + "mean_token_accuracy": 0.6896086931228638, + "num_tokens": 338616740.0, + "step": 13561 + }, + { + "epoch": 1.489347682846475, + "grad_norm": 2.1620099544525146, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7080916166305542, + "num_tokens": 338643401.0, + "step": 13562 + }, + { + "epoch": 1.4894575005490884, + "grad_norm": 2.4485671520233154, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.7047035098075867, + "num_tokens": 338667626.0, + "step": 13563 + }, + { + "epoch": 1.4895673182517022, + "grad_norm": 2.568437337875366, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.718274712562561, + "num_tokens": 338690019.0, + "step": 13564 + }, + { + "epoch": 1.4896771359543157, + "grad_norm": 2.18033766746521, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.6951631307601929, + "num_tokens": 338718198.0, + "step": 13565 + }, + { + "epoch": 1.4897869536569295, + "grad_norm": 2.5048089027404785, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7260055541992188, + "num_tokens": 338739733.0, + "step": 13566 + }, + { + "epoch": 1.4898967713595432, + "grad_norm": 2.2755799293518066, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7177472114562988, + "num_tokens": 338764013.0, + "step": 13567 + }, + { + "epoch": 1.4900065890621568, + "grad_norm": 2.022676944732666, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7305577397346497, + "num_tokens": 338795427.0, + "step": 13568 + }, + { + "epoch": 1.4901164067647705, + "grad_norm": 2.1868886947631836, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7275338768959045, + "num_tokens": 338822814.0, + "step": 13569 + }, + { + "epoch": 1.490226224467384, + "grad_norm": 2.2780964374542236, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7207028865814209, + 
"num_tokens": 338848602.0, + "step": 13570 + }, + { + "epoch": 1.4903360421699978, + "grad_norm": 2.2741549015045166, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.7152154445648193, + "num_tokens": 338873847.0, + "step": 13571 + }, + { + "epoch": 1.4904458598726116, + "grad_norm": 2.639802932739258, + "learning_rate": 1e-06, + "loss": 0.7553, + "mean_token_accuracy": 0.7535236477851868, + "num_tokens": 338892739.0, + "step": 13572 + }, + { + "epoch": 1.4905556775752251, + "grad_norm": 1.93485426902771, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7216897010803223, + "num_tokens": 338924051.0, + "step": 13573 + }, + { + "epoch": 1.4906654952778389, + "grad_norm": 2.4163079261779785, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7140519022941589, + "num_tokens": 338947729.0, + "step": 13574 + }, + { + "epoch": 1.4907753129804524, + "grad_norm": 2.649155378341675, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7083296179771423, + "num_tokens": 338967029.0, + "step": 13575 + }, + { + "epoch": 1.4908851306830662, + "grad_norm": 2.2415387630462646, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7146967053413391, + "num_tokens": 338991729.0, + "step": 13576 + }, + { + "epoch": 1.4909949483856797, + "grad_norm": 2.324573516845703, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6973578929901123, + "num_tokens": 339016984.0, + "step": 13577 + }, + { + "epoch": 1.4911047660882935, + "grad_norm": 2.0529603958129883, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7092722654342651, + "num_tokens": 339049489.0, + "step": 13578 + }, + { + "epoch": 1.491214583790907, + "grad_norm": 2.1138997077941895, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7162108421325684, + "num_tokens": 339077775.0, + "step": 13579 + }, + { + "epoch": 1.4913244014935207, + "grad_norm": 2.3194384574890137, + 
"learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.6978604793548584, + "num_tokens": 339101373.0, + "step": 13580 + }, + { + "epoch": 1.4914342191961345, + "grad_norm": 2.6590495109558105, + "learning_rate": 1e-06, + "loss": 0.7758, + "mean_token_accuracy": 0.7531816363334656, + "num_tokens": 339119844.0, + "step": 13581 + }, + { + "epoch": 1.491544036898748, + "grad_norm": 2.463538885116577, + "learning_rate": 1e-06, + "loss": 0.8362, + "mean_token_accuracy": 0.7350747585296631, + "num_tokens": 339141306.0, + "step": 13582 + }, + { + "epoch": 1.4916538546013618, + "grad_norm": 2.287785530090332, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7206276655197144, + "num_tokens": 339165661.0, + "step": 13583 + }, + { + "epoch": 1.4917636723039753, + "grad_norm": 2.4789557456970215, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7371233701705933, + "num_tokens": 339187630.0, + "step": 13584 + }, + { + "epoch": 1.491873490006589, + "grad_norm": 2.4314475059509277, + "learning_rate": 1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6934137344360352, + "num_tokens": 339210702.0, + "step": 13585 + }, + { + "epoch": 1.4919833077092028, + "grad_norm": 2.3872194290161133, + "learning_rate": 1e-06, + "loss": 0.8247, + "mean_token_accuracy": 0.7392780780792236, + "num_tokens": 339233016.0, + "step": 13586 + }, + { + "epoch": 1.4920931254118164, + "grad_norm": 2.437983989715576, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7209010124206543, + "num_tokens": 339256441.0, + "step": 13587 + }, + { + "epoch": 1.49220294311443, + "grad_norm": 2.3844785690307617, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7224839329719543, + "num_tokens": 339280521.0, + "step": 13588 + }, + { + "epoch": 1.4923127608170437, + "grad_norm": 2.2596962451934814, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7098364233970642, + "num_tokens": 339307308.0, + "step": 
13589 + }, + { + "epoch": 1.4924225785196574, + "grad_norm": 2.332547426223755, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7253913283348083, + "num_tokens": 339331810.0, + "step": 13590 + }, + { + "epoch": 1.492532396222271, + "grad_norm": 2.0336086750030518, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7237445116043091, + "num_tokens": 339360447.0, + "step": 13591 + }, + { + "epoch": 1.4926422139248847, + "grad_norm": 2.581264019012451, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7411816120147705, + "num_tokens": 339380332.0, + "step": 13592 + }, + { + "epoch": 1.4927520316274983, + "grad_norm": 2.5497629642486572, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7285956144332886, + "num_tokens": 339400860.0, + "step": 13593 + }, + { + "epoch": 1.492861849330112, + "grad_norm": 2.120560646057129, + "learning_rate": 1e-06, + "loss": 0.7747, + "mean_token_accuracy": 0.7545915842056274, + "num_tokens": 339426472.0, + "step": 13594 + }, + { + "epoch": 1.4929716670327258, + "grad_norm": 2.4108619689941406, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7226953506469727, + "num_tokens": 339450868.0, + "step": 13595 + }, + { + "epoch": 1.4930814847353393, + "grad_norm": 2.2931816577911377, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7158664464950562, + "num_tokens": 339476086.0, + "step": 13596 + }, + { + "epoch": 1.493191302437953, + "grad_norm": 2.4206385612487793, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7371528148651123, + "num_tokens": 339497878.0, + "step": 13597 + }, + { + "epoch": 1.4933011201405666, + "grad_norm": 2.3091163635253906, + "learning_rate": 1e-06, + "loss": 0.8265, + "mean_token_accuracy": 0.7341817617416382, + "num_tokens": 339521101.0, + "step": 13598 + }, + { + "epoch": 1.4934109378431804, + "grad_norm": 2.2087619304656982, + "learning_rate": 1e-06, + "loss": 0.9475, 
+ "mean_token_accuracy": 0.704901933670044, + "num_tokens": 339549786.0, + "step": 13599 + }, + { + "epoch": 1.493520755545794, + "grad_norm": 2.1729438304901123, + "learning_rate": 1e-06, + "loss": 1.0228, + "mean_token_accuracy": 0.6916120052337646, + "num_tokens": 339577559.0, + "step": 13600 + }, + { + "epoch": 1.4936305732484076, + "grad_norm": 2.6297760009765625, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7288752794265747, + "num_tokens": 339596769.0, + "step": 13601 + }, + { + "epoch": 1.4937403909510212, + "grad_norm": 2.0924227237701416, + "learning_rate": 1e-06, + "loss": 0.8473, + "mean_token_accuracy": 0.7371631264686584, + "num_tokens": 339623873.0, + "step": 13602 + }, + { + "epoch": 1.493850208653635, + "grad_norm": 2.089015483856201, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7099601626396179, + "num_tokens": 339654630.0, + "step": 13603 + }, + { + "epoch": 1.4939600263562487, + "grad_norm": 2.513373374938965, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7269803881645203, + "num_tokens": 339674232.0, + "step": 13604 + }, + { + "epoch": 1.4940698440588622, + "grad_norm": 2.2012314796447754, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7192400693893433, + "num_tokens": 339700008.0, + "step": 13605 + }, + { + "epoch": 1.494179661761476, + "grad_norm": 2.218775987625122, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7573587894439697, + "num_tokens": 339724334.0, + "step": 13606 + }, + { + "epoch": 1.4942894794640895, + "grad_norm": 2.318779230117798, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7327468395233154, + "num_tokens": 339748608.0, + "step": 13607 + }, + { + "epoch": 1.4943992971667033, + "grad_norm": 2.2656469345092773, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7156938314437866, + "num_tokens": 339775639.0, + "step": 13608 + }, + { + "epoch": 
1.494509114869317, + "grad_norm": 2.2501747608184814, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7268377542495728, + "num_tokens": 339803112.0, + "step": 13609 + }, + { + "epoch": 1.4946189325719306, + "grad_norm": 2.3126986026763916, + "learning_rate": 1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7099100351333618, + "num_tokens": 339827683.0, + "step": 13610 + }, + { + "epoch": 1.4947287502745443, + "grad_norm": 2.511425733566284, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7140769958496094, + "num_tokens": 339848179.0, + "step": 13611 + }, + { + "epoch": 1.4948385679771579, + "grad_norm": 2.0861411094665527, + "learning_rate": 1e-06, + "loss": 1.0218, + "mean_token_accuracy": 0.6916365623474121, + "num_tokens": 339877705.0, + "step": 13612 + }, + { + "epoch": 1.4949483856797716, + "grad_norm": 2.134188175201416, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.7047380805015564, + "num_tokens": 339907622.0, + "step": 13613 + }, + { + "epoch": 1.4950582033823854, + "grad_norm": 2.256826639175415, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7184632420539856, + "num_tokens": 339932512.0, + "step": 13614 + }, + { + "epoch": 1.495168021084999, + "grad_norm": 2.167850971221924, + "learning_rate": 1e-06, + "loss": 1.0111, + "mean_token_accuracy": 0.6980078220367432, + "num_tokens": 339960631.0, + "step": 13615 + }, + { + "epoch": 1.4952778387876124, + "grad_norm": 2.5176291465759277, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7245842814445496, + "num_tokens": 339982498.0, + "step": 13616 + }, + { + "epoch": 1.4953876564902262, + "grad_norm": 2.314483404159546, + "learning_rate": 1e-06, + "loss": 0.9882, + "mean_token_accuracy": 0.697876513004303, + "num_tokens": 340007210.0, + "step": 13617 + }, + { + "epoch": 1.49549747419284, + "grad_norm": 2.331780195236206, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 
0.739662766456604, + "num_tokens": 340031451.0, + "step": 13618 + }, + { + "epoch": 1.4956072918954535, + "grad_norm": 2.085517406463623, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7035566568374634, + "num_tokens": 340059992.0, + "step": 13619 + }, + { + "epoch": 1.4957171095980673, + "grad_norm": 2.2998061180114746, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.6979159712791443, + "num_tokens": 340086312.0, + "step": 13620 + }, + { + "epoch": 1.4958269273006808, + "grad_norm": 2.4820477962493896, + "learning_rate": 1e-06, + "loss": 0.9075, + "mean_token_accuracy": 0.7167934775352478, + "num_tokens": 340110606.0, + "step": 13621 + }, + { + "epoch": 1.4959367450032945, + "grad_norm": 2.3736400604248047, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7340346574783325, + "num_tokens": 340134099.0, + "step": 13622 + }, + { + "epoch": 1.4960465627059083, + "grad_norm": 2.1890289783477783, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7272418737411499, + "num_tokens": 340160552.0, + "step": 13623 + }, + { + "epoch": 1.4961563804085218, + "grad_norm": 2.3327348232269287, + "learning_rate": 1e-06, + "loss": 0.817, + "mean_token_accuracy": 0.7432184219360352, + "num_tokens": 340184395.0, + "step": 13624 + }, + { + "epoch": 1.4962661981111356, + "grad_norm": 2.017498731613159, + "learning_rate": 1e-06, + "loss": 1.0254, + "mean_token_accuracy": 0.6859971284866333, + "num_tokens": 340216898.0, + "step": 13625 + }, + { + "epoch": 1.4963760158137491, + "grad_norm": 2.0642635822296143, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.7139140367507935, + "num_tokens": 340246532.0, + "step": 13626 + }, + { + "epoch": 1.4964858335163629, + "grad_norm": 2.135326385498047, + "learning_rate": 1e-06, + "loss": 0.941, + "mean_token_accuracy": 0.7081688642501831, + "num_tokens": 340276410.0, + "step": 13627 + }, + { + "epoch": 1.4965956512189764, + "grad_norm": 
2.4753808975219727, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7258244156837463, + "num_tokens": 340298543.0, + "step": 13628 + }, + { + "epoch": 1.4967054689215902, + "grad_norm": 2.5705251693725586, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7208067774772644, + "num_tokens": 340319647.0, + "step": 13629 + }, + { + "epoch": 1.4968152866242037, + "grad_norm": 2.8426523208618164, + "learning_rate": 1e-06, + "loss": 0.8469, + "mean_token_accuracy": 0.7277599573135376, + "num_tokens": 340337502.0, + "step": 13630 + }, + { + "epoch": 1.4969251043268175, + "grad_norm": 2.3246333599090576, + "learning_rate": 1e-06, + "loss": 0.997, + "mean_token_accuracy": 0.6888030171394348, + "num_tokens": 340361184.0, + "step": 13631 + }, + { + "epoch": 1.4970349220294312, + "grad_norm": 2.3546688556671143, + "learning_rate": 1e-06, + "loss": 0.7545, + "mean_token_accuracy": 0.7545907497406006, + "num_tokens": 340382353.0, + "step": 13632 + }, + { + "epoch": 1.4971447397320448, + "grad_norm": 2.6151297092437744, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7206665277481079, + "num_tokens": 340403059.0, + "step": 13633 + }, + { + "epoch": 1.4972545574346585, + "grad_norm": 2.9102232456207275, + "learning_rate": 1e-06, + "loss": 0.7991, + "mean_token_accuracy": 0.7400170564651489, + "num_tokens": 340420053.0, + "step": 13634 + }, + { + "epoch": 1.497364375137272, + "grad_norm": 2.186079263687134, + "learning_rate": 1e-06, + "loss": 0.8876, + "mean_token_accuracy": 0.7271130681037903, + "num_tokens": 340448190.0, + "step": 13635 + }, + { + "epoch": 1.4974741928398858, + "grad_norm": 2.377976417541504, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7350815534591675, + "num_tokens": 340471001.0, + "step": 13636 + }, + { + "epoch": 1.4975840105424996, + "grad_norm": 2.4461445808410645, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7267540693283081, + 
"num_tokens": 340493666.0, + "step": 13637 + }, + { + "epoch": 1.497693828245113, + "grad_norm": 2.124969482421875, + "learning_rate": 1e-06, + "loss": 0.9055, + "mean_token_accuracy": 0.7185876369476318, + "num_tokens": 340523583.0, + "step": 13638 + }, + { + "epoch": 1.4978036459477266, + "grad_norm": 2.102175235748291, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6978482007980347, + "num_tokens": 340551816.0, + "step": 13639 + }, + { + "epoch": 1.4979134636503404, + "grad_norm": 2.2643699645996094, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7347972989082336, + "num_tokens": 340578582.0, + "step": 13640 + }, + { + "epoch": 1.4980232813529542, + "grad_norm": 2.036590337753296, + "learning_rate": 1e-06, + "loss": 0.9304, + "mean_token_accuracy": 0.7137876749038696, + "num_tokens": 340610018.0, + "step": 13641 + }, + { + "epoch": 1.4981330990555677, + "grad_norm": 2.778721332550049, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.7073523998260498, + "num_tokens": 340628970.0, + "step": 13642 + }, + { + "epoch": 1.4982429167581814, + "grad_norm": 2.7244460582733154, + "learning_rate": 1e-06, + "loss": 0.8427, + "mean_token_accuracy": 0.735181450843811, + "num_tokens": 340647839.0, + "step": 13643 + }, + { + "epoch": 1.498352734460795, + "grad_norm": 2.362253189086914, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7171187400817871, + "num_tokens": 340669444.0, + "step": 13644 + }, + { + "epoch": 1.4984625521634087, + "grad_norm": 2.1627516746520996, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7317588925361633, + "num_tokens": 340695060.0, + "step": 13645 + }, + { + "epoch": 1.4985723698660225, + "grad_norm": 2.4748635292053223, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.738052248954773, + "num_tokens": 340714401.0, + "step": 13646 + }, + { + "epoch": 1.498682187568636, + "grad_norm": 2.195038318634033, + 
"learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7348238229751587, + "num_tokens": 340739090.0, + "step": 13647 + }, + { + "epoch": 1.4987920052712498, + "grad_norm": 2.269019365310669, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7068727016448975, + "num_tokens": 340765745.0, + "step": 13648 + }, + { + "epoch": 1.4989018229738633, + "grad_norm": 2.768657684326172, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7181729078292847, + "num_tokens": 340783568.0, + "step": 13649 + }, + { + "epoch": 1.499011640676477, + "grad_norm": 2.513218402862549, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.723158061504364, + "num_tokens": 340804150.0, + "step": 13650 + }, + { + "epoch": 1.4991214583790908, + "grad_norm": 2.1567511558532715, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7240398526191711, + "num_tokens": 340831169.0, + "step": 13651 + }, + { + "epoch": 1.4992312760817044, + "grad_norm": 2.3935935497283936, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7085642218589783, + "num_tokens": 340856433.0, + "step": 13652 + }, + { + "epoch": 1.499341093784318, + "grad_norm": 2.6606075763702393, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7157869338989258, + "num_tokens": 340878822.0, + "step": 13653 + }, + { + "epoch": 1.4994509114869317, + "grad_norm": 2.028641700744629, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7123663425445557, + "num_tokens": 340906620.0, + "step": 13654 + }, + { + "epoch": 1.4995607291895454, + "grad_norm": 2.4572219848632812, + "learning_rate": 1e-06, + "loss": 0.8898, + "mean_token_accuracy": 0.7290263175964355, + "num_tokens": 340928342.0, + "step": 13655 + }, + { + "epoch": 1.499670546892159, + "grad_norm": 2.3484344482421875, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7262378931045532, + "num_tokens": 340951661.0, + "step": 
13656 + }, + { + "epoch": 1.4997803645947727, + "grad_norm": 2.2202887535095215, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.718224048614502, + "num_tokens": 340977027.0, + "step": 13657 + }, + { + "epoch": 1.4998901822973862, + "grad_norm": 2.311084032058716, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7193560600280762, + "num_tokens": 341001230.0, + "step": 13658 + }, + { + "epoch": 1.5, + "grad_norm": 2.3145344257354736, + "learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7176504731178284, + "num_tokens": 341026293.0, + "step": 13659 + }, + { + "epoch": 1.5001098177026138, + "grad_norm": 2.4314515590667725, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7074152231216431, + "num_tokens": 341049868.0, + "step": 13660 + }, + { + "epoch": 1.5002196354052273, + "grad_norm": 2.7109522819519043, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.727745771408081, + "num_tokens": 341068915.0, + "step": 13661 + }, + { + "epoch": 1.5003294531078408, + "grad_norm": 1.892099380493164, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.731863260269165, + "num_tokens": 341101754.0, + "step": 13662 + }, + { + "epoch": 1.5004392708104546, + "grad_norm": 2.2459821701049805, + "learning_rate": 1e-06, + "loss": 0.8821, + "mean_token_accuracy": 0.7260442972183228, + "num_tokens": 341126253.0, + "step": 13663 + }, + { + "epoch": 1.5005490885130683, + "grad_norm": 2.1197688579559326, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7053942680358887, + "num_tokens": 341153624.0, + "step": 13664 + }, + { + "epoch": 1.500658906215682, + "grad_norm": 2.2728967666625977, + "learning_rate": 1e-06, + "loss": 0.9518, + "mean_token_accuracy": 0.7085571885108948, + "num_tokens": 341179102.0, + "step": 13665 + }, + { + "epoch": 1.5007687239182956, + "grad_norm": 2.109952449798584, + "learning_rate": 1e-06, + "loss": 0.9173, + 
"mean_token_accuracy": 0.7099722623825073, + "num_tokens": 341208659.0, + "step": 13666 + }, + { + "epoch": 1.5008785416209092, + "grad_norm": 2.2730185985565186, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.6964294910430908, + "num_tokens": 341236044.0, + "step": 13667 + }, + { + "epoch": 1.500988359323523, + "grad_norm": 1.9579706192016602, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.711940348148346, + "num_tokens": 341268798.0, + "step": 13668 + }, + { + "epoch": 1.5010981770261367, + "grad_norm": 2.4950857162475586, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.7264834642410278, + "num_tokens": 341289130.0, + "step": 13669 + }, + { + "epoch": 1.5012079947287504, + "grad_norm": 2.299077272415161, + "learning_rate": 1e-06, + "loss": 0.7884, + "mean_token_accuracy": 0.7488389611244202, + "num_tokens": 341313654.0, + "step": 13670 + }, + { + "epoch": 1.501317812431364, + "grad_norm": 2.4814555644989014, + "learning_rate": 1e-06, + "loss": 0.8517, + "mean_token_accuracy": 0.7328696846961975, + "num_tokens": 341334175.0, + "step": 13671 + }, + { + "epoch": 1.5014276301339775, + "grad_norm": 2.3178343772888184, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7276408672332764, + "num_tokens": 341360490.0, + "step": 13672 + }, + { + "epoch": 1.5015374478365913, + "grad_norm": 2.47357177734375, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7062181234359741, + "num_tokens": 341384180.0, + "step": 13673 + }, + { + "epoch": 1.501647265539205, + "grad_norm": 2.133253335952759, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7212494611740112, + "num_tokens": 341412073.0, + "step": 13674 + }, + { + "epoch": 1.5017570832418186, + "grad_norm": 2.844982624053955, + "learning_rate": 1e-06, + "loss": 0.8275, + "mean_token_accuracy": 0.735919713973999, + "num_tokens": 341428919.0, + "step": 13675 + }, + { + "epoch": 1.501866900944432, + 
"grad_norm": 2.2424325942993164, + "learning_rate": 1e-06, + "loss": 0.9041, + "mean_token_accuracy": 0.7221522927284241, + "num_tokens": 341453294.0, + "step": 13676 + }, + { + "epoch": 1.5019767186470458, + "grad_norm": 2.5054216384887695, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7181556224822998, + "num_tokens": 341475045.0, + "step": 13677 + }, + { + "epoch": 1.5020865363496596, + "grad_norm": 2.6141152381896973, + "learning_rate": 1e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7351537346839905, + "num_tokens": 341493847.0, + "step": 13678 + }, + { + "epoch": 1.5021963540522734, + "grad_norm": 2.4098734855651855, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7195266485214233, + "num_tokens": 341518363.0, + "step": 13679 + }, + { + "epoch": 1.502306171754887, + "grad_norm": 2.177520513534546, + "learning_rate": 1e-06, + "loss": 1.0126, + "mean_token_accuracy": 0.6927937269210815, + "num_tokens": 341547685.0, + "step": 13680 + }, + { + "epoch": 1.5024159894575004, + "grad_norm": 2.8105902671813965, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7218562364578247, + "num_tokens": 341564987.0, + "step": 13681 + }, + { + "epoch": 1.5025258071601142, + "grad_norm": 1.9973437786102295, + "learning_rate": 1e-06, + "loss": 0.8329, + "mean_token_accuracy": 0.7366398572921753, + "num_tokens": 341595426.0, + "step": 13682 + }, + { + "epoch": 1.502635624862728, + "grad_norm": 2.860415458679199, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7628793716430664, + "num_tokens": 341611977.0, + "step": 13683 + }, + { + "epoch": 1.5027454425653415, + "grad_norm": 2.3180770874023438, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7199815511703491, + "num_tokens": 341635426.0, + "step": 13684 + }, + { + "epoch": 1.5028552602679552, + "grad_norm": 2.6275789737701416, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7125549912452698, + 
"num_tokens": 341656033.0, + "step": 13685 + }, + { + "epoch": 1.5029650779705688, + "grad_norm": 2.140024423599243, + "learning_rate": 1e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7418926358222961, + "num_tokens": 341683401.0, + "step": 13686 + }, + { + "epoch": 1.5030748956731825, + "grad_norm": 2.3785905838012695, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7172754406929016, + "num_tokens": 341707178.0, + "step": 13687 + }, + { + "epoch": 1.5031847133757963, + "grad_norm": 2.7097156047821045, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7183536291122437, + "num_tokens": 341727088.0, + "step": 13688 + }, + { + "epoch": 1.5032945310784098, + "grad_norm": 2.6347765922546387, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7347805500030518, + "num_tokens": 341747475.0, + "step": 13689 + }, + { + "epoch": 1.5034043487810234, + "grad_norm": 2.439523220062256, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7311797142028809, + "num_tokens": 341768556.0, + "step": 13690 + }, + { + "epoch": 1.5035141664836371, + "grad_norm": 2.3206138610839844, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.726448655128479, + "num_tokens": 341791365.0, + "step": 13691 + }, + { + "epoch": 1.5036239841862509, + "grad_norm": 1.9946362972259521, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.721440315246582, + "num_tokens": 341822252.0, + "step": 13692 + }, + { + "epoch": 1.5037338018888646, + "grad_norm": 2.0651400089263916, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7085897922515869, + "num_tokens": 341853245.0, + "step": 13693 + }, + { + "epoch": 1.5038436195914782, + "grad_norm": 2.36822772026062, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7120014429092407, + "num_tokens": 341876764.0, + "step": 13694 + }, + { + "epoch": 1.5039534372940917, + "grad_norm": 2.2843363285064697, + 
"learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7297562956809998, + "num_tokens": 341901182.0, + "step": 13695 + }, + { + "epoch": 1.5040632549967055, + "grad_norm": 2.0910804271698, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7094635963439941, + "num_tokens": 341929987.0, + "step": 13696 + }, + { + "epoch": 1.5041730726993192, + "grad_norm": 2.3913497924804688, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7280644178390503, + "num_tokens": 341954365.0, + "step": 13697 + }, + { + "epoch": 1.5042828904019327, + "grad_norm": 2.0950024127960205, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.709116518497467, + "num_tokens": 341984452.0, + "step": 13698 + }, + { + "epoch": 1.5043927081045465, + "grad_norm": 2.2396321296691895, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7350368499755859, + "num_tokens": 342009815.0, + "step": 13699 + }, + { + "epoch": 1.50450252580716, + "grad_norm": 2.1785755157470703, + "learning_rate": 1e-06, + "loss": 0.9641, + "mean_token_accuracy": 0.712410569190979, + "num_tokens": 342036428.0, + "step": 13700 + }, + { + "epoch": 1.5046123435097738, + "grad_norm": 1.994194507598877, + "learning_rate": 1e-06, + "loss": 0.9891, + "mean_token_accuracy": 0.698625385761261, + "num_tokens": 342067258.0, + "step": 13701 + }, + { + "epoch": 1.5047221612123876, + "grad_norm": 2.358762741088867, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7199050784111023, + "num_tokens": 342091698.0, + "step": 13702 + }, + { + "epoch": 1.504831978915001, + "grad_norm": 2.1243293285369873, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7210206389427185, + "num_tokens": 342118362.0, + "step": 13703 + }, + { + "epoch": 1.5049417966176146, + "grad_norm": 2.4413115978240967, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.7417467832565308, + "num_tokens": 342138509.0, + "step": 
13704 + }, + { + "epoch": 1.5050516143202284, + "grad_norm": 2.608642816543579, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7282126545906067, + "num_tokens": 342158025.0, + "step": 13705 + }, + { + "epoch": 1.5051614320228421, + "grad_norm": 2.4972944259643555, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7431430816650391, + "num_tokens": 342178854.0, + "step": 13706 + }, + { + "epoch": 1.505271249725456, + "grad_norm": 2.4604270458221436, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7227649092674255, + "num_tokens": 342201217.0, + "step": 13707 + }, + { + "epoch": 1.5053810674280694, + "grad_norm": 2.414680242538452, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7447271347045898, + "num_tokens": 342222814.0, + "step": 13708 + }, + { + "epoch": 1.505490885130683, + "grad_norm": 2.437783718109131, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7208565473556519, + "num_tokens": 342245680.0, + "step": 13709 + }, + { + "epoch": 1.5056007028332967, + "grad_norm": 2.5801093578338623, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7219488620758057, + "num_tokens": 342265757.0, + "step": 13710 + }, + { + "epoch": 1.5057105205359105, + "grad_norm": 2.4339962005615234, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7344921231269836, + "num_tokens": 342286387.0, + "step": 13711 + }, + { + "epoch": 1.505820338238524, + "grad_norm": 2.075162172317505, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7340993285179138, + "num_tokens": 342313171.0, + "step": 13712 + }, + { + "epoch": 1.5059301559411375, + "grad_norm": 2.359130382537842, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7314108610153198, + "num_tokens": 342337133.0, + "step": 13713 + }, + { + "epoch": 1.5060399736437513, + "grad_norm": 2.2122714519500732, + "learning_rate": 1e-06, + "loss": 0.9397, + 
"mean_token_accuracy": 0.7070876359939575, + "num_tokens": 342363530.0, + "step": 13714 + }, + { + "epoch": 1.506149791346365, + "grad_norm": 2.17716121673584, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7265546321868896, + "num_tokens": 342388517.0, + "step": 13715 + }, + { + "epoch": 1.5062596090489788, + "grad_norm": 2.4153358936309814, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.736387312412262, + "num_tokens": 342411150.0, + "step": 13716 + }, + { + "epoch": 1.5063694267515924, + "grad_norm": 2.557723045349121, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7197275161743164, + "num_tokens": 342432572.0, + "step": 13717 + }, + { + "epoch": 1.506479244454206, + "grad_norm": 1.9769608974456787, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7382588386535645, + "num_tokens": 342461485.0, + "step": 13718 + }, + { + "epoch": 1.5065890621568196, + "grad_norm": 2.1331710815429688, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.713920533657074, + "num_tokens": 342487687.0, + "step": 13719 + }, + { + "epoch": 1.5066988798594334, + "grad_norm": 2.229084014892578, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.714789867401123, + "num_tokens": 342512832.0, + "step": 13720 + }, + { + "epoch": 1.5068086975620472, + "grad_norm": 2.321383237838745, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7231237888336182, + "num_tokens": 342536381.0, + "step": 13721 + }, + { + "epoch": 1.5069185152646607, + "grad_norm": 1.9283345937728882, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7066348195075989, + "num_tokens": 342568349.0, + "step": 13722 + }, + { + "epoch": 1.5070283329672742, + "grad_norm": 2.8423187732696533, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7223080396652222, + "num_tokens": 342587125.0, + "step": 13723 + }, + { + "epoch": 1.507138150669888, + 
"grad_norm": 2.2693488597869873, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7132337093353271, + "num_tokens": 342613487.0, + "step": 13724 + }, + { + "epoch": 1.5072479683725017, + "grad_norm": 2.090430736541748, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7143202424049377, + "num_tokens": 342640843.0, + "step": 13725 + }, + { + "epoch": 1.5073577860751153, + "grad_norm": 2.210493803024292, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7239398956298828, + "num_tokens": 342668328.0, + "step": 13726 + }, + { + "epoch": 1.5074676037777288, + "grad_norm": 2.346870183944702, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7122797966003418, + "num_tokens": 342694006.0, + "step": 13727 + }, + { + "epoch": 1.5075774214803426, + "grad_norm": 2.1451714038848877, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7127916216850281, + "num_tokens": 342721616.0, + "step": 13728 + }, + { + "epoch": 1.5076872391829563, + "grad_norm": 2.2162528038024902, + "learning_rate": 1e-06, + "loss": 0.8713, + "mean_token_accuracy": 0.7246885895729065, + "num_tokens": 342745501.0, + "step": 13729 + }, + { + "epoch": 1.50779705688557, + "grad_norm": 2.1913599967956543, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7183760404586792, + "num_tokens": 342773112.0, + "step": 13730 + }, + { + "epoch": 1.5079068745881836, + "grad_norm": 2.068209648132324, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7447633743286133, + "num_tokens": 342802713.0, + "step": 13731 + }, + { + "epoch": 1.5080166922907972, + "grad_norm": 2.0523736476898193, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7116934657096863, + "num_tokens": 342832464.0, + "step": 13732 + }, + { + "epoch": 1.508126509993411, + "grad_norm": 2.5510900020599365, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7284971475601196, + 
"num_tokens": 342853156.0, + "step": 13733 + }, + { + "epoch": 1.5082363276960247, + "grad_norm": 2.1716578006744385, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.708614706993103, + "num_tokens": 342880808.0, + "step": 13734 + }, + { + "epoch": 1.5083461453986382, + "grad_norm": 2.260976552963257, + "learning_rate": 1e-06, + "loss": 0.8899, + "mean_token_accuracy": 0.715631365776062, + "num_tokens": 342905015.0, + "step": 13735 + }, + { + "epoch": 1.508455963101252, + "grad_norm": 2.261247158050537, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7100579738616943, + "num_tokens": 342932005.0, + "step": 13736 + }, + { + "epoch": 1.5085657808038655, + "grad_norm": 2.2989389896392822, + "learning_rate": 1e-06, + "loss": 1.0052, + "mean_token_accuracy": 0.6963796615600586, + "num_tokens": 342959225.0, + "step": 13737 + }, + { + "epoch": 1.5086755985064793, + "grad_norm": 2.241835832595825, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7252739667892456, + "num_tokens": 342983856.0, + "step": 13738 + }, + { + "epoch": 1.508785416209093, + "grad_norm": 2.1186740398406982, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7169121503829956, + "num_tokens": 343013776.0, + "step": 13739 + }, + { + "epoch": 1.5088952339117065, + "grad_norm": 1.9034572839736938, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.6958420872688293, + "num_tokens": 343051074.0, + "step": 13740 + }, + { + "epoch": 1.50900505161432, + "grad_norm": 2.4699296951293945, + "learning_rate": 1e-06, + "loss": 0.8803, + "mean_token_accuracy": 0.7280140519142151, + "num_tokens": 343073307.0, + "step": 13741 + }, + { + "epoch": 1.5091148693169338, + "grad_norm": 2.3573131561279297, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7087817192077637, + "num_tokens": 343097923.0, + "step": 13742 + }, + { + "epoch": 1.5092246870195476, + "grad_norm": 2.2258172035217285, + 
"learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.712192177772522, + "num_tokens": 343124547.0, + "step": 13743 + }, + { + "epoch": 1.5093345047221614, + "grad_norm": 2.3407602310180664, + "learning_rate": 1e-06, + "loss": 0.9676, + "mean_token_accuracy": 0.6990272998809814, + "num_tokens": 343149553.0, + "step": 13744 + }, + { + "epoch": 1.5094443224247749, + "grad_norm": 2.3161301612854004, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7181515693664551, + "num_tokens": 343174012.0, + "step": 13745 + }, + { + "epoch": 1.5095541401273884, + "grad_norm": 2.247274875640869, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7224248647689819, + "num_tokens": 343199204.0, + "step": 13746 + }, + { + "epoch": 1.5096639578300022, + "grad_norm": 2.1115591526031494, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7291077971458435, + "num_tokens": 343225751.0, + "step": 13747 + }, + { + "epoch": 1.509773775532616, + "grad_norm": 2.1052048206329346, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7212971448898315, + "num_tokens": 343255657.0, + "step": 13748 + }, + { + "epoch": 1.5098835932352295, + "grad_norm": 2.19929575920105, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7198063135147095, + "num_tokens": 343281673.0, + "step": 13749 + }, + { + "epoch": 1.5099934109378432, + "grad_norm": 2.350520610809326, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7395284175872803, + "num_tokens": 343305170.0, + "step": 13750 + }, + { + "epoch": 1.5101032286404568, + "grad_norm": 2.1952595710754395, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7181710004806519, + "num_tokens": 343332969.0, + "step": 13751 + }, + { + "epoch": 1.5102130463430705, + "grad_norm": 2.1259384155273438, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7236788272857666, + "num_tokens": 343362192.0, + "step": 
13752 + }, + { + "epoch": 1.5103228640456843, + "grad_norm": 2.3160252571105957, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7454091310501099, + "num_tokens": 343385521.0, + "step": 13753 + }, + { + "epoch": 1.5104326817482978, + "grad_norm": 2.0909206867218018, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7037129402160645, + "num_tokens": 343416193.0, + "step": 13754 + }, + { + "epoch": 1.5105424994509113, + "grad_norm": 2.0719218254089355, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7212461233139038, + "num_tokens": 343443608.0, + "step": 13755 + }, + { + "epoch": 1.510652317153525, + "grad_norm": 2.558340549468994, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7391955852508545, + "num_tokens": 343464016.0, + "step": 13756 + }, + { + "epoch": 1.5107621348561389, + "grad_norm": 2.4457268714904785, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7031910419464111, + "num_tokens": 343485175.0, + "step": 13757 + }, + { + "epoch": 1.5108719525587526, + "grad_norm": 2.0785014629364014, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.7034075260162354, + "num_tokens": 343512696.0, + "step": 13758 + }, + { + "epoch": 1.5109817702613662, + "grad_norm": 2.551948070526123, + "learning_rate": 1e-06, + "loss": 0.7403, + "mean_token_accuracy": 0.7596404552459717, + "num_tokens": 343533102.0, + "step": 13759 + }, + { + "epoch": 1.5110915879639797, + "grad_norm": 2.2344391345977783, + "learning_rate": 1e-06, + "loss": 1.0679, + "mean_token_accuracy": 0.6779868602752686, + "num_tokens": 343560569.0, + "step": 13760 + }, + { + "epoch": 1.5112014056665934, + "grad_norm": 2.4333808422088623, + "learning_rate": 1e-06, + "loss": 0.7769, + "mean_token_accuracy": 0.7458860278129578, + "num_tokens": 343581763.0, + "step": 13761 + }, + { + "epoch": 1.5113112233692072, + "grad_norm": 2.4605836868286133, + "learning_rate": 1e-06, + "loss": 
0.8381, + "mean_token_accuracy": 0.7348343133926392, + "num_tokens": 343605245.0, + "step": 13762 + }, + { + "epoch": 1.5114210410718207, + "grad_norm": 2.1538476943969727, + "learning_rate": 1e-06, + "loss": 1.0008, + "mean_token_accuracy": 0.690401554107666, + "num_tokens": 343634587.0, + "step": 13763 + }, + { + "epoch": 1.5115308587744345, + "grad_norm": 2.2849559783935547, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7206991910934448, + "num_tokens": 343660242.0, + "step": 13764 + }, + { + "epoch": 1.511640676477048, + "grad_norm": 2.254359245300293, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7221079468727112, + "num_tokens": 343686621.0, + "step": 13765 + }, + { + "epoch": 1.5117504941796618, + "grad_norm": 2.1983091831207275, + "learning_rate": 1e-06, + "loss": 0.8334, + "mean_token_accuracy": 0.7375469207763672, + "num_tokens": 343711540.0, + "step": 13766 + }, + { + "epoch": 1.5118603118822755, + "grad_norm": 2.346724271774292, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.726173996925354, + "num_tokens": 343735584.0, + "step": 13767 + }, + { + "epoch": 1.511970129584889, + "grad_norm": 1.9721479415893555, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7248328328132629, + "num_tokens": 343765821.0, + "step": 13768 + }, + { + "epoch": 1.5120799472875026, + "grad_norm": 2.5888378620147705, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.714286744594574, + "num_tokens": 343786584.0, + "step": 13769 + }, + { + "epoch": 1.5121897649901164, + "grad_norm": 2.407227039337158, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7162326574325562, + "num_tokens": 343809474.0, + "step": 13770 + }, + { + "epoch": 1.5122995826927301, + "grad_norm": 1.9616533517837524, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7157894372940063, + "num_tokens": 343842675.0, + "step": 13771 + }, + { + "epoch": 
1.5124094003953439, + "grad_norm": 2.375195264816284, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7204472422599792, + "num_tokens": 343864949.0, + "step": 13772 + }, + { + "epoch": 1.5125192180979574, + "grad_norm": 2.1120941638946533, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7033039927482605, + "num_tokens": 343892700.0, + "step": 13773 + }, + { + "epoch": 1.512629035800571, + "grad_norm": 2.1845903396606445, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7114918828010559, + "num_tokens": 343921269.0, + "step": 13774 + }, + { + "epoch": 1.5127388535031847, + "grad_norm": 2.5714547634124756, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7301636934280396, + "num_tokens": 343941116.0, + "step": 13775 + }, + { + "epoch": 1.5128486712057985, + "grad_norm": 1.93696928024292, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7206647396087646, + "num_tokens": 343971665.0, + "step": 13776 + }, + { + "epoch": 1.512958488908412, + "grad_norm": 2.1192238330841064, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7230022549629211, + "num_tokens": 343998677.0, + "step": 13777 + }, + { + "epoch": 1.5130683066110255, + "grad_norm": 2.4789621829986572, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7155234813690186, + "num_tokens": 344021289.0, + "step": 13778 + }, + { + "epoch": 1.5131781243136393, + "grad_norm": 2.5428950786590576, + "learning_rate": 1e-06, + "loss": 0.9784, + "mean_token_accuracy": 0.7076277732849121, + "num_tokens": 344043140.0, + "step": 13779 + }, + { + "epoch": 1.513287942016253, + "grad_norm": 2.4375648498535156, + "learning_rate": 1e-06, + "loss": 0.8513, + "mean_token_accuracy": 0.7333647012710571, + "num_tokens": 344065681.0, + "step": 13780 + }, + { + "epoch": 1.5133977597188668, + "grad_norm": 2.5034236907958984, + "learning_rate": 1e-06, + "loss": 0.7932, + "mean_token_accuracy": 
0.746644914150238, + "num_tokens": 344087060.0, + "step": 13781 + }, + { + "epoch": 1.5135075774214803, + "grad_norm": 2.496298313140869, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7230475544929504, + "num_tokens": 344110880.0, + "step": 13782 + }, + { + "epoch": 1.5136173951240939, + "grad_norm": 2.1004412174224854, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7308047413825989, + "num_tokens": 344140137.0, + "step": 13783 + }, + { + "epoch": 1.5137272128267076, + "grad_norm": 2.5614864826202393, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7297896146774292, + "num_tokens": 344160742.0, + "step": 13784 + }, + { + "epoch": 1.5138370305293214, + "grad_norm": 1.967960000038147, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6941999793052673, + "num_tokens": 344191335.0, + "step": 13785 + }, + { + "epoch": 1.5139468482319351, + "grad_norm": 2.3215301036834717, + "learning_rate": 1e-06, + "loss": 0.8566, + "mean_token_accuracy": 0.740700364112854, + "num_tokens": 344216472.0, + "step": 13786 + }, + { + "epoch": 1.5140566659345487, + "grad_norm": 2.4817168712615967, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7305031418800354, + "num_tokens": 344237346.0, + "step": 13787 + }, + { + "epoch": 1.5141664836371622, + "grad_norm": 2.525937795639038, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7028356790542603, + "num_tokens": 344261116.0, + "step": 13788 + }, + { + "epoch": 1.514276301339776, + "grad_norm": 2.2504923343658447, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7242549061775208, + "num_tokens": 344287704.0, + "step": 13789 + }, + { + "epoch": 1.5143861190423897, + "grad_norm": 1.9676508903503418, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7284911870956421, + "num_tokens": 344317514.0, + "step": 13790 + }, + { + "epoch": 1.5144959367450033, + "grad_norm": 
2.337282419204712, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.699108898639679, + "num_tokens": 344341159.0, + "step": 13791 + }, + { + "epoch": 1.5146057544476168, + "grad_norm": 2.3598310947418213, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.7087064981460571, + "num_tokens": 344363920.0, + "step": 13792 + }, + { + "epoch": 1.5147155721502306, + "grad_norm": 2.2771737575531006, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7118047475814819, + "num_tokens": 344389463.0, + "step": 13793 + }, + { + "epoch": 1.5148253898528443, + "grad_norm": 1.9685323238372803, + "learning_rate": 1e-06, + "loss": 0.9894, + "mean_token_accuracy": 0.7053015232086182, + "num_tokens": 344423584.0, + "step": 13794 + }, + { + "epoch": 1.514935207555458, + "grad_norm": 2.327449321746826, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7278072237968445, + "num_tokens": 344446956.0, + "step": 13795 + }, + { + "epoch": 1.5150450252580716, + "grad_norm": 2.365281820297241, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.741031289100647, + "num_tokens": 344470745.0, + "step": 13796 + }, + { + "epoch": 1.5151548429606851, + "grad_norm": 2.2400872707366943, + "learning_rate": 1e-06, + "loss": 0.9835, + "mean_token_accuracy": 0.7042266726493835, + "num_tokens": 344499736.0, + "step": 13797 + }, + { + "epoch": 1.515264660663299, + "grad_norm": 2.250089406967163, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7180718779563904, + "num_tokens": 344523131.0, + "step": 13798 + }, + { + "epoch": 1.5153744783659127, + "grad_norm": 2.6060945987701416, + "learning_rate": 1e-06, + "loss": 0.8132, + "mean_token_accuracy": 0.7410976886749268, + "num_tokens": 344542328.0, + "step": 13799 + }, + { + "epoch": 1.5154842960685262, + "grad_norm": 2.2023138999938965, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7221770882606506, + "num_tokens": 
344567390.0, + "step": 13800 + }, + { + "epoch": 1.51559411377114, + "grad_norm": 2.2430808544158936, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7187198400497437, + "num_tokens": 344592114.0, + "step": 13801 + }, + { + "epoch": 1.5157039314737535, + "grad_norm": 2.2697877883911133, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7132292985916138, + "num_tokens": 344619249.0, + "step": 13802 + }, + { + "epoch": 1.5158137491763672, + "grad_norm": 2.2072858810424805, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7188621759414673, + "num_tokens": 344645081.0, + "step": 13803 + }, + { + "epoch": 1.515923566878981, + "grad_norm": 2.1054160594940186, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.709425687789917, + "num_tokens": 344673565.0, + "step": 13804 + }, + { + "epoch": 1.5160333845815945, + "grad_norm": 2.0824453830718994, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.711995542049408, + "num_tokens": 344702655.0, + "step": 13805 + }, + { + "epoch": 1.516143202284208, + "grad_norm": 2.5713441371917725, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7310341000556946, + "num_tokens": 344723226.0, + "step": 13806 + }, + { + "epoch": 1.5162530199868218, + "grad_norm": 2.2789647579193115, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7177107334136963, + "num_tokens": 344748849.0, + "step": 13807 + }, + { + "epoch": 1.5163628376894356, + "grad_norm": 2.338181257247925, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7241091728210449, + "num_tokens": 344771794.0, + "step": 13808 + }, + { + "epoch": 1.5164726553920493, + "grad_norm": 2.5352511405944824, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7154215574264526, + "num_tokens": 344793590.0, + "step": 13809 + }, + { + "epoch": 1.5165824730946629, + "grad_norm": 2.9814999103546143, + "learning_rate": 
1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.742032527923584, + "num_tokens": 344810800.0, + "step": 13810 + }, + { + "epoch": 1.5166922907972764, + "grad_norm": 2.40006685256958, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7193598747253418, + "num_tokens": 344832806.0, + "step": 13811 + }, + { + "epoch": 1.5168021084998902, + "grad_norm": 2.3178796768188477, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7342946529388428, + "num_tokens": 344855519.0, + "step": 13812 + }, + { + "epoch": 1.516911926202504, + "grad_norm": 2.13442063331604, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.7065900564193726, + "num_tokens": 344882924.0, + "step": 13813 + }, + { + "epoch": 1.5170217439051175, + "grad_norm": 2.177861213684082, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7316495180130005, + "num_tokens": 344910244.0, + "step": 13814 + }, + { + "epoch": 1.5171315616077312, + "grad_norm": 2.246056079864502, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7085366249084473, + "num_tokens": 344935423.0, + "step": 13815 + }, + { + "epoch": 1.5172413793103448, + "grad_norm": 2.3376834392547607, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7262697219848633, + "num_tokens": 344959538.0, + "step": 13816 + }, + { + "epoch": 1.5173511970129585, + "grad_norm": 2.273425579071045, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7089649438858032, + "num_tokens": 344987278.0, + "step": 13817 + }, + { + "epoch": 1.5174610147155723, + "grad_norm": 2.2324469089508057, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7134007215499878, + "num_tokens": 345014267.0, + "step": 13818 + }, + { + "epoch": 1.5175708324181858, + "grad_norm": 2.299100637435913, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7220078110694885, + "num_tokens": 345038472.0, + "step": 13819 + }, + { + 
"epoch": 1.5176806501207993, + "grad_norm": 2.27478289604187, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7252248525619507, + "num_tokens": 345061839.0, + "step": 13820 + }, + { + "epoch": 1.517790467823413, + "grad_norm": 2.2429280281066895, + "learning_rate": 1e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7355719804763794, + "num_tokens": 345087479.0, + "step": 13821 + }, + { + "epoch": 1.5179002855260268, + "grad_norm": 2.1488139629364014, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7214667797088623, + "num_tokens": 345114208.0, + "step": 13822 + }, + { + "epoch": 1.5180101032286406, + "grad_norm": 2.093531847000122, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7243450880050659, + "num_tokens": 345143276.0, + "step": 13823 + }, + { + "epoch": 1.5181199209312541, + "grad_norm": 2.792067766189575, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7473729252815247, + "num_tokens": 345161749.0, + "step": 13824 + }, + { + "epoch": 1.5182297386338677, + "grad_norm": 2.711812973022461, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7169390320777893, + "num_tokens": 345181212.0, + "step": 13825 + }, + { + "epoch": 1.5183395563364814, + "grad_norm": 2.228389263153076, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7233842015266418, + "num_tokens": 345207770.0, + "step": 13826 + }, + { + "epoch": 1.5184493740390952, + "grad_norm": 2.3836679458618164, + "learning_rate": 1e-06, + "loss": 0.7998, + "mean_token_accuracy": 0.7479323148727417, + "num_tokens": 345230085.0, + "step": 13827 + }, + { + "epoch": 1.5185591917417087, + "grad_norm": 2.1383473873138428, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7217812538146973, + "num_tokens": 345256693.0, + "step": 13828 + }, + { + "epoch": 1.5186690094443223, + "grad_norm": 2.5099105834960938, + "learning_rate": 1e-06, + "loss": 0.974, + 
"mean_token_accuracy": 0.7029121518135071, + "num_tokens": 345278840.0, + "step": 13829 + }, + { + "epoch": 1.518778827146936, + "grad_norm": 2.514688730239868, + "learning_rate": 1e-06, + "loss": 0.7831, + "mean_token_accuracy": 0.7468506097793579, + "num_tokens": 345298985.0, + "step": 13830 + }, + { + "epoch": 1.5188886448495498, + "grad_norm": 2.317849636077881, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.740904688835144, + "num_tokens": 345321156.0, + "step": 13831 + }, + { + "epoch": 1.5189984625521635, + "grad_norm": 2.167842149734497, + "learning_rate": 1e-06, + "loss": 0.8359, + "mean_token_accuracy": 0.740343451499939, + "num_tokens": 345347012.0, + "step": 13832 + }, + { + "epoch": 1.519108280254777, + "grad_norm": 2.6375412940979004, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7220057249069214, + "num_tokens": 345368353.0, + "step": 13833 + }, + { + "epoch": 1.5192180979573906, + "grad_norm": 2.377570390701294, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7132257223129272, + "num_tokens": 345393585.0, + "step": 13834 + }, + { + "epoch": 1.5193279156600044, + "grad_norm": 2.3599321842193604, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7144585251808167, + "num_tokens": 345417236.0, + "step": 13835 + }, + { + "epoch": 1.5194377333626181, + "grad_norm": 2.2571969032287598, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7323707938194275, + "num_tokens": 345439999.0, + "step": 13836 + }, + { + "epoch": 1.5195475510652319, + "grad_norm": 2.3029398918151855, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7194803357124329, + "num_tokens": 345463156.0, + "step": 13837 + }, + { + "epoch": 1.5196573687678454, + "grad_norm": 2.3936212062835693, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.740368127822876, + "num_tokens": 345483859.0, + "step": 13838 + }, + { + "epoch": 1.519767186470459, + 
"grad_norm": 2.3037102222442627, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.715429961681366, + "num_tokens": 345510203.0, + "step": 13839 + }, + { + "epoch": 1.5198770041730727, + "grad_norm": 2.3943140506744385, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7170889377593994, + "num_tokens": 345531153.0, + "step": 13840 + }, + { + "epoch": 1.5199868218756865, + "grad_norm": 2.039497137069702, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.7231231927871704, + "num_tokens": 345560914.0, + "step": 13841 + }, + { + "epoch": 1.5200966395783, + "grad_norm": 2.218261480331421, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.689107358455658, + "num_tokens": 345587676.0, + "step": 13842 + }, + { + "epoch": 1.5202064572809135, + "grad_norm": 2.2150073051452637, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.700311541557312, + "num_tokens": 345613826.0, + "step": 13843 + }, + { + "epoch": 1.5203162749835273, + "grad_norm": 2.1738884449005127, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7108440399169922, + "num_tokens": 345641872.0, + "step": 13844 + }, + { + "epoch": 1.520426092686141, + "grad_norm": 2.431626319885254, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7445666790008545, + "num_tokens": 345664036.0, + "step": 13845 + }, + { + "epoch": 1.5205359103887548, + "grad_norm": 2.3612430095672607, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7063844203948975, + "num_tokens": 345690337.0, + "step": 13846 + }, + { + "epoch": 1.5206457280913683, + "grad_norm": 2.2347006797790527, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7140911817550659, + "num_tokens": 345715537.0, + "step": 13847 + }, + { + "epoch": 1.5207555457939819, + "grad_norm": 2.08107590675354, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7151898145675659, + 
"num_tokens": 345745156.0, + "step": 13848 + }, + { + "epoch": 1.5208653634965956, + "grad_norm": 2.3665194511413574, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.6925875544548035, + "num_tokens": 345770952.0, + "step": 13849 + }, + { + "epoch": 1.5209751811992094, + "grad_norm": 2.345578193664551, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7257992029190063, + "num_tokens": 345793384.0, + "step": 13850 + }, + { + "epoch": 1.5210849989018231, + "grad_norm": 2.054987668991089, + "learning_rate": 1e-06, + "loss": 1.015, + "mean_token_accuracy": 0.6940200924873352, + "num_tokens": 345825401.0, + "step": 13851 + }, + { + "epoch": 1.5211948166044367, + "grad_norm": 2.3950393199920654, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7183220386505127, + "num_tokens": 345849534.0, + "step": 13852 + }, + { + "epoch": 1.5213046343070502, + "grad_norm": 2.4995200634002686, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7255042791366577, + "num_tokens": 345870484.0, + "step": 13853 + }, + { + "epoch": 1.521414452009664, + "grad_norm": 2.527022123336792, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6911312341690063, + "num_tokens": 345892408.0, + "step": 13854 + }, + { + "epoch": 1.5215242697122777, + "grad_norm": 2.0683488845825195, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7231093645095825, + "num_tokens": 345922966.0, + "step": 13855 + }, + { + "epoch": 1.5216340874148913, + "grad_norm": 2.841468572616577, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7293170690536499, + "num_tokens": 345940114.0, + "step": 13856 + }, + { + "epoch": 1.5217439051175048, + "grad_norm": 2.6913676261901855, + "learning_rate": 1e-06, + "loss": 0.85, + "mean_token_accuracy": 0.7337539196014404, + "num_tokens": 345959254.0, + "step": 13857 + }, + { + "epoch": 1.5218537228201185, + "grad_norm": 2.406196355819702, + 
"learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7224587202072144, + "num_tokens": 345982253.0, + "step": 13858 + }, + { + "epoch": 1.5219635405227323, + "grad_norm": 2.247058391571045, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7155055999755859, + "num_tokens": 346007765.0, + "step": 13859 + }, + { + "epoch": 1.522073358225346, + "grad_norm": 2.2051165103912354, + "learning_rate": 1e-06, + "loss": 0.9507, + "mean_token_accuracy": 0.7063084840774536, + "num_tokens": 346034211.0, + "step": 13860 + }, + { + "epoch": 1.5221831759279596, + "grad_norm": 2.342158317565918, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7044839859008789, + "num_tokens": 346059180.0, + "step": 13861 + }, + { + "epoch": 1.5222929936305731, + "grad_norm": 2.311678171157837, + "learning_rate": 1e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7480039596557617, + "num_tokens": 346080450.0, + "step": 13862 + }, + { + "epoch": 1.522402811333187, + "grad_norm": 2.4521946907043457, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7117465734481812, + "num_tokens": 346102055.0, + "step": 13863 + }, + { + "epoch": 1.5225126290358006, + "grad_norm": 1.9768635034561157, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7124338150024414, + "num_tokens": 346132901.0, + "step": 13864 + }, + { + "epoch": 1.5226224467384142, + "grad_norm": 2.0602357387542725, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7021678686141968, + "num_tokens": 346161608.0, + "step": 13865 + }, + { + "epoch": 1.522732264441028, + "grad_norm": 2.2643556594848633, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7102195024490356, + "num_tokens": 346185936.0, + "step": 13866 + }, + { + "epoch": 1.5228420821436415, + "grad_norm": 2.2152562141418457, + "learning_rate": 1e-06, + "loss": 0.9973, + "mean_token_accuracy": 0.702759325504303, + "num_tokens": 346211820.0, + "step": 
13867 + }, + { + "epoch": 1.5229518998462552, + "grad_norm": 2.1805715560913086, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.7097005844116211, + "num_tokens": 346237509.0, + "step": 13868 + }, + { + "epoch": 1.523061717548869, + "grad_norm": 2.3886799812316895, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7357735633850098, + "num_tokens": 346258670.0, + "step": 13869 + }, + { + "epoch": 1.5231715352514825, + "grad_norm": 2.569014072418213, + "learning_rate": 1e-06, + "loss": 0.8346, + "mean_token_accuracy": 0.735048770904541, + "num_tokens": 346279776.0, + "step": 13870 + }, + { + "epoch": 1.523281352954096, + "grad_norm": 2.556068181991577, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7295681238174438, + "num_tokens": 346300984.0, + "step": 13871 + }, + { + "epoch": 1.5233911706567098, + "grad_norm": 2.3915677070617676, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7297841310501099, + "num_tokens": 346322266.0, + "step": 13872 + }, + { + "epoch": 1.5235009883593236, + "grad_norm": 2.3020904064178467, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7086896300315857, + "num_tokens": 346348389.0, + "step": 13873 + }, + { + "epoch": 1.5236108060619373, + "grad_norm": 2.3452224731445312, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.736291766166687, + "num_tokens": 346370578.0, + "step": 13874 + }, + { + "epoch": 1.5237206237645509, + "grad_norm": 2.442859411239624, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.72515469789505, + "num_tokens": 346393395.0, + "step": 13875 + }, + { + "epoch": 1.5238304414671644, + "grad_norm": 2.3646292686462402, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7052208185195923, + "num_tokens": 346421876.0, + "step": 13876 + }, + { + "epoch": 1.5239402591697782, + "grad_norm": 2.592633008956909, + "learning_rate": 1e-06, + "loss": 0.8718, + 
"mean_token_accuracy": 0.7263293266296387, + "num_tokens": 346441547.0, + "step": 13877 + }, + { + "epoch": 1.524050076872392, + "grad_norm": 2.3924317359924316, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7040266990661621, + "num_tokens": 346464607.0, + "step": 13878 + }, + { + "epoch": 1.5241598945750054, + "grad_norm": 2.095492124557495, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.6983828544616699, + "num_tokens": 346495458.0, + "step": 13879 + }, + { + "epoch": 1.5242697122776192, + "grad_norm": 2.3038489818573, + "learning_rate": 1e-06, + "loss": 0.9585, + "mean_token_accuracy": 0.7018619179725647, + "num_tokens": 346518436.0, + "step": 13880 + }, + { + "epoch": 1.5243795299802327, + "grad_norm": 2.3038671016693115, + "learning_rate": 1e-06, + "loss": 0.729, + "mean_token_accuracy": 0.7598918080329895, + "num_tokens": 346538947.0, + "step": 13881 + }, + { + "epoch": 1.5244893476828465, + "grad_norm": 2.3414430618286133, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7151694297790527, + "num_tokens": 346562630.0, + "step": 13882 + }, + { + "epoch": 1.5245991653854603, + "grad_norm": 2.414005994796753, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7048131227493286, + "num_tokens": 346584720.0, + "step": 13883 + }, + { + "epoch": 1.5247089830880738, + "grad_norm": 2.4047582149505615, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7239738702774048, + "num_tokens": 346605585.0, + "step": 13884 + }, + { + "epoch": 1.5248188007906873, + "grad_norm": 2.2930524349212646, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7282559871673584, + "num_tokens": 346630238.0, + "step": 13885 + }, + { + "epoch": 1.524928618493301, + "grad_norm": 2.4108707904815674, + "learning_rate": 1e-06, + "loss": 0.7806, + "mean_token_accuracy": 0.7492935657501221, + "num_tokens": 346651031.0, + "step": 13886 + }, + { + "epoch": 1.5250384361959148, 
+ "grad_norm": 2.4436731338500977, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7244553565979004, + "num_tokens": 346672064.0, + "step": 13887 + }, + { + "epoch": 1.5251482538985286, + "grad_norm": 2.313433885574341, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7125661373138428, + "num_tokens": 346696014.0, + "step": 13888 + }, + { + "epoch": 1.5252580716011421, + "grad_norm": 2.307478904724121, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7284569144248962, + "num_tokens": 346720069.0, + "step": 13889 + }, + { + "epoch": 1.5253678893037557, + "grad_norm": 2.247662305831909, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7152611017227173, + "num_tokens": 346746431.0, + "step": 13890 + }, + { + "epoch": 1.5254777070063694, + "grad_norm": 2.357172966003418, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6966413855552673, + "num_tokens": 346770456.0, + "step": 13891 + }, + { + "epoch": 1.5255875247089832, + "grad_norm": 2.4315710067749023, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7148244380950928, + "num_tokens": 346793334.0, + "step": 13892 + }, + { + "epoch": 1.5256973424115967, + "grad_norm": 2.5396008491516113, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7203406095504761, + "num_tokens": 346814498.0, + "step": 13893 + }, + { + "epoch": 1.5258071601142102, + "grad_norm": 2.264679193496704, + "learning_rate": 1e-06, + "loss": 0.9637, + "mean_token_accuracy": 0.7000205516815186, + "num_tokens": 346839634.0, + "step": 13894 + }, + { + "epoch": 1.525916977816824, + "grad_norm": 2.148902177810669, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7111377120018005, + "num_tokens": 346866917.0, + "step": 13895 + }, + { + "epoch": 1.5260267955194378, + "grad_norm": 2.0670435428619385, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7071223855018616, + 
"num_tokens": 346895301.0, + "step": 13896 + }, + { + "epoch": 1.5261366132220515, + "grad_norm": 2.4368839263916016, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7316713333129883, + "num_tokens": 346916648.0, + "step": 13897 + }, + { + "epoch": 1.526246430924665, + "grad_norm": 2.5713412761688232, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7232699990272522, + "num_tokens": 346936927.0, + "step": 13898 + }, + { + "epoch": 1.5263562486272786, + "grad_norm": 2.3665080070495605, + "learning_rate": 1e-06, + "loss": 0.9642, + "mean_token_accuracy": 0.722183108329773, + "num_tokens": 346959442.0, + "step": 13899 + }, + { + "epoch": 1.5264660663298923, + "grad_norm": 2.4874770641326904, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7327198386192322, + "num_tokens": 346980044.0, + "step": 13900 + }, + { + "epoch": 1.526575884032506, + "grad_norm": 2.3383402824401855, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7108057141304016, + "num_tokens": 347004063.0, + "step": 13901 + }, + { + "epoch": 1.5266857017351199, + "grad_norm": 2.364957571029663, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7441167235374451, + "num_tokens": 347026561.0, + "step": 13902 + }, + { + "epoch": 1.5267955194377334, + "grad_norm": 2.306426763534546, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7139789462089539, + "num_tokens": 347051234.0, + "step": 13903 + }, + { + "epoch": 1.526905337140347, + "grad_norm": 2.230691432952881, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7109977602958679, + "num_tokens": 347076249.0, + "step": 13904 + }, + { + "epoch": 1.5270151548429607, + "grad_norm": 2.5579886436462402, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7165396213531494, + "num_tokens": 347097092.0, + "step": 13905 + }, + { + "epoch": 1.5271249725455744, + "grad_norm": 2.407007932662964, + 
"learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7179137468338013, + "num_tokens": 347119699.0, + "step": 13906 + }, + { + "epoch": 1.527234790248188, + "grad_norm": 2.087451219558716, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7288943529129028, + "num_tokens": 347148839.0, + "step": 13907 + }, + { + "epoch": 1.5273446079508015, + "grad_norm": 2.4832286834716797, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7275196313858032, + "num_tokens": 347171146.0, + "step": 13908 + }, + { + "epoch": 1.5274544256534153, + "grad_norm": 2.3165228366851807, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7432903051376343, + "num_tokens": 347194804.0, + "step": 13909 + }, + { + "epoch": 1.527564243356029, + "grad_norm": 2.8018338680267334, + "learning_rate": 1e-06, + "loss": 0.8182, + "mean_token_accuracy": 0.738074541091919, + "num_tokens": 347211861.0, + "step": 13910 + }, + { + "epoch": 1.5276740610586428, + "grad_norm": 2.414280652999878, + "learning_rate": 1e-06, + "loss": 0.8306, + "mean_token_accuracy": 0.735889196395874, + "num_tokens": 347234039.0, + "step": 13911 + }, + { + "epoch": 1.5277838787612563, + "grad_norm": 2.225536346435547, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7160590887069702, + "num_tokens": 347259444.0, + "step": 13912 + }, + { + "epoch": 1.5278936964638699, + "grad_norm": 2.198197603225708, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7149710655212402, + "num_tokens": 347285998.0, + "step": 13913 + }, + { + "epoch": 1.5280035141664836, + "grad_norm": 2.6450002193450928, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7277356386184692, + "num_tokens": 347305542.0, + "step": 13914 + }, + { + "epoch": 1.5281133318690974, + "grad_norm": 2.322476625442505, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7180430889129639, + "num_tokens": 347329478.0, + "step": 
13915 + }, + { + "epoch": 1.5282231495717111, + "grad_norm": 2.340348482131958, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7082124948501587, + "num_tokens": 347355352.0, + "step": 13916 + }, + { + "epoch": 1.5283329672743247, + "grad_norm": 2.1633718013763428, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.7076997756958008, + "num_tokens": 347383702.0, + "step": 13917 + }, + { + "epoch": 1.5284427849769382, + "grad_norm": 2.4405720233917236, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.727586030960083, + "num_tokens": 347406928.0, + "step": 13918 + }, + { + "epoch": 1.528552602679552, + "grad_norm": 2.5588607788085938, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7435822486877441, + "num_tokens": 347428210.0, + "step": 13919 + }, + { + "epoch": 1.5286624203821657, + "grad_norm": 2.476905107498169, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7344374656677246, + "num_tokens": 347449578.0, + "step": 13920 + }, + { + "epoch": 1.5287722380847792, + "grad_norm": 1.8321001529693604, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.704395592212677, + "num_tokens": 347487376.0, + "step": 13921 + }, + { + "epoch": 1.5288820557873928, + "grad_norm": 2.4527649879455566, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7294601202011108, + "num_tokens": 347508731.0, + "step": 13922 + }, + { + "epoch": 1.5289918734900065, + "grad_norm": 2.3971903324127197, + "learning_rate": 1e-06, + "loss": 0.7662, + "mean_token_accuracy": 0.754287600517273, + "num_tokens": 347532102.0, + "step": 13923 + }, + { + "epoch": 1.5291016911926203, + "grad_norm": 2.207474946975708, + "learning_rate": 1e-06, + "loss": 0.9812, + "mean_token_accuracy": 0.7006115913391113, + "num_tokens": 347560588.0, + "step": 13924 + }, + { + "epoch": 1.529211508895234, + "grad_norm": 2.6744816303253174, + "learning_rate": 1e-06, + "loss": 0.8474, + 
"mean_token_accuracy": 0.7494850158691406, + "num_tokens": 347577766.0, + "step": 13925 + }, + { + "epoch": 1.5293213265978476, + "grad_norm": 2.172013998031616, + "learning_rate": 1e-06, + "loss": 0.8003, + "mean_token_accuracy": 0.7500735521316528, + "num_tokens": 347602102.0, + "step": 13926 + }, + { + "epoch": 1.5294311443004611, + "grad_norm": 2.28741192817688, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7210269570350647, + "num_tokens": 347626241.0, + "step": 13927 + }, + { + "epoch": 1.5295409620030749, + "grad_norm": 2.0857644081115723, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7152212858200073, + "num_tokens": 347654812.0, + "step": 13928 + }, + { + "epoch": 1.5296507797056886, + "grad_norm": 2.4493699073791504, + "learning_rate": 1e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7331610918045044, + "num_tokens": 347678356.0, + "step": 13929 + }, + { + "epoch": 1.5297605974083022, + "grad_norm": 2.1231324672698975, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7110483646392822, + "num_tokens": 347706824.0, + "step": 13930 + }, + { + "epoch": 1.529870415110916, + "grad_norm": 2.1640167236328125, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7340627312660217, + "num_tokens": 347733793.0, + "step": 13931 + }, + { + "epoch": 1.5299802328135295, + "grad_norm": 2.3585448265075684, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7416559457778931, + "num_tokens": 347755426.0, + "step": 13932 + }, + { + "epoch": 1.5300900505161432, + "grad_norm": 2.464693784713745, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7336753606796265, + "num_tokens": 347775798.0, + "step": 13933 + }, + { + "epoch": 1.530199868218757, + "grad_norm": 2.5094456672668457, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.736775279045105, + "num_tokens": 347798345.0, + "step": 13934 + }, + { + "epoch": 
1.5303096859213705, + "grad_norm": 2.658066749572754, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7295225858688354, + "num_tokens": 347820069.0, + "step": 13935 + }, + { + "epoch": 1.530419503623984, + "grad_norm": 2.2187654972076416, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7099119424819946, + "num_tokens": 347848913.0, + "step": 13936 + }, + { + "epoch": 1.5305293213265978, + "grad_norm": 2.281787633895874, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.708463191986084, + "num_tokens": 347873398.0, + "step": 13937 + }, + { + "epoch": 1.5306391390292116, + "grad_norm": 2.2365832328796387, + "learning_rate": 1e-06, + "loss": 0.8345, + "mean_token_accuracy": 0.7374837398529053, + "num_tokens": 347896945.0, + "step": 13938 + }, + { + "epoch": 1.5307489567318253, + "grad_norm": 2.2867677211761475, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7184396386146545, + "num_tokens": 347921214.0, + "step": 13939 + }, + { + "epoch": 1.5308587744344389, + "grad_norm": 1.9937798976898193, + "learning_rate": 1e-06, + "loss": 0.9228, + "mean_token_accuracy": 0.7156869173049927, + "num_tokens": 347952388.0, + "step": 13940 + }, + { + "epoch": 1.5309685921370524, + "grad_norm": 2.206416606903076, + "learning_rate": 1e-06, + "loss": 1.0638, + "mean_token_accuracy": 0.6762334108352661, + "num_tokens": 347983079.0, + "step": 13941 + }, + { + "epoch": 1.5310784098396661, + "grad_norm": 2.5299460887908936, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7084335088729858, + "num_tokens": 348004136.0, + "step": 13942 + }, + { + "epoch": 1.53118822754228, + "grad_norm": 2.195488214492798, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7304897308349609, + "num_tokens": 348030458.0, + "step": 13943 + }, + { + "epoch": 1.5312980452448934, + "grad_norm": 2.0923893451690674, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 
0.7057352066040039, + "num_tokens": 348059962.0, + "step": 13944 + }, + { + "epoch": 1.5314078629475072, + "grad_norm": 2.198777914047241, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7148093581199646, + "num_tokens": 348088664.0, + "step": 13945 + }, + { + "epoch": 1.5315176806501207, + "grad_norm": 2.403715133666992, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7148658037185669, + "num_tokens": 348110484.0, + "step": 13946 + }, + { + "epoch": 1.5316274983527345, + "grad_norm": 2.037989854812622, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7283947467803955, + "num_tokens": 348140490.0, + "step": 13947 + }, + { + "epoch": 1.5317373160553482, + "grad_norm": 2.0598700046539307, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7253903150558472, + "num_tokens": 348171922.0, + "step": 13948 + }, + { + "epoch": 1.5318471337579618, + "grad_norm": 2.25443172454834, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7083817720413208, + "num_tokens": 348196932.0, + "step": 13949 + }, + { + "epoch": 1.5319569514605753, + "grad_norm": 2.30757737159729, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7157107591629028, + "num_tokens": 348221942.0, + "step": 13950 + }, + { + "epoch": 1.532066769163189, + "grad_norm": 2.359964609146118, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7190857529640198, + "num_tokens": 348245521.0, + "step": 13951 + }, + { + "epoch": 1.5321765868658028, + "grad_norm": 1.9717392921447754, + "learning_rate": 1e-06, + "loss": 1.025, + "mean_token_accuracy": 0.6827254295349121, + "num_tokens": 348280655.0, + "step": 13952 + }, + { + "epoch": 1.5322864045684166, + "grad_norm": 2.3257973194122314, + "learning_rate": 1e-06, + "loss": 0.9438, + "mean_token_accuracy": 0.7108712196350098, + "num_tokens": 348307672.0, + "step": 13953 + }, + { + "epoch": 1.5323962222710301, + "grad_norm": 
2.390626907348633, + "learning_rate": 1e-06, + "loss": 0.8212, + "mean_token_accuracy": 0.7438395023345947, + "num_tokens": 348331288.0, + "step": 13954 + }, + { + "epoch": 1.5325060399736437, + "grad_norm": 2.520934581756592, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.709507942199707, + "num_tokens": 348354889.0, + "step": 13955 + }, + { + "epoch": 1.5326158576762574, + "grad_norm": 2.404873847961426, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7152113318443298, + "num_tokens": 348380610.0, + "step": 13956 + }, + { + "epoch": 1.5327256753788712, + "grad_norm": 2.176234722137451, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7130955457687378, + "num_tokens": 348408515.0, + "step": 13957 + }, + { + "epoch": 1.5328354930814847, + "grad_norm": 2.0525245666503906, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7099767923355103, + "num_tokens": 348437805.0, + "step": 13958 + }, + { + "epoch": 1.5329453107840982, + "grad_norm": 2.3384952545166016, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7248882055282593, + "num_tokens": 348461351.0, + "step": 13959 + }, + { + "epoch": 1.533055128486712, + "grad_norm": 2.1463825702667236, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.726303219795227, + "num_tokens": 348489220.0, + "step": 13960 + }, + { + "epoch": 1.5331649461893258, + "grad_norm": 2.1632065773010254, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7238011360168457, + "num_tokens": 348516840.0, + "step": 13961 + }, + { + "epoch": 1.5332747638919395, + "grad_norm": 2.105001449584961, + "learning_rate": 1e-06, + "loss": 0.9945, + "mean_token_accuracy": 0.6967569589614868, + "num_tokens": 348544649.0, + "step": 13962 + }, + { + "epoch": 1.533384581594553, + "grad_norm": 2.603145122528076, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7313805818557739, + "num_tokens": 
348565289.0, + "step": 13963 + }, + { + "epoch": 1.5334943992971666, + "grad_norm": 2.133572816848755, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.6975809931755066, + "num_tokens": 348596854.0, + "step": 13964 + }, + { + "epoch": 1.5336042169997803, + "grad_norm": 2.572058916091919, + "learning_rate": 1e-06, + "loss": 0.9168, + "mean_token_accuracy": 0.7111718654632568, + "num_tokens": 348616617.0, + "step": 13965 + }, + { + "epoch": 1.533714034702394, + "grad_norm": 2.451648473739624, + "learning_rate": 1e-06, + "loss": 0.9261, + "mean_token_accuracy": 0.7173892855644226, + "num_tokens": 348640313.0, + "step": 13966 + }, + { + "epoch": 1.5338238524050078, + "grad_norm": 2.4111149311065674, + "learning_rate": 1e-06, + "loss": 1.033, + "mean_token_accuracy": 0.6924452781677246, + "num_tokens": 348665555.0, + "step": 13967 + }, + { + "epoch": 1.5339336701076214, + "grad_norm": 2.5272741317749023, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7240586876869202, + "num_tokens": 348685701.0, + "step": 13968 + }, + { + "epoch": 1.534043487810235, + "grad_norm": 2.4044549465179443, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.724968433380127, + "num_tokens": 348706438.0, + "step": 13969 + }, + { + "epoch": 1.5341533055128487, + "grad_norm": 2.0741114616394043, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7030535340309143, + "num_tokens": 348736328.0, + "step": 13970 + }, + { + "epoch": 1.5342631232154624, + "grad_norm": 2.8000741004943848, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7252436280250549, + "num_tokens": 348755052.0, + "step": 13971 + }, + { + "epoch": 1.534372940918076, + "grad_norm": 2.0385353565216064, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6959668397903442, + "num_tokens": 348787149.0, + "step": 13972 + }, + { + "epoch": 1.5344827586206895, + "grad_norm": 2.8303420543670654, + "learning_rate": 
1e-06, + "loss": 0.7783, + "mean_token_accuracy": 0.7495616674423218, + "num_tokens": 348804276.0, + "step": 13973 + }, + { + "epoch": 1.5345925763233033, + "grad_norm": 2.0283215045928955, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7028105854988098, + "num_tokens": 348834285.0, + "step": 13974 + }, + { + "epoch": 1.534702394025917, + "grad_norm": 2.5875487327575684, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7327149510383606, + "num_tokens": 348854399.0, + "step": 13975 + }, + { + "epoch": 1.5348122117285308, + "grad_norm": 2.4221675395965576, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7186356782913208, + "num_tokens": 348875772.0, + "step": 13976 + }, + { + "epoch": 1.5349220294311443, + "grad_norm": 2.0911331176757812, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6927237510681152, + "num_tokens": 348904463.0, + "step": 13977 + }, + { + "epoch": 1.5350318471337578, + "grad_norm": 2.3154468536376953, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.728056788444519, + "num_tokens": 348928008.0, + "step": 13978 + }, + { + "epoch": 1.5351416648363716, + "grad_norm": 2.3938815593719482, + "learning_rate": 1e-06, + "loss": 0.9491, + "mean_token_accuracy": 0.7097899317741394, + "num_tokens": 348953057.0, + "step": 13979 + }, + { + "epoch": 1.5352514825389854, + "grad_norm": 2.4698128700256348, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7212902307510376, + "num_tokens": 348975741.0, + "step": 13980 + }, + { + "epoch": 1.535361300241599, + "grad_norm": 2.7324204444885254, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7282200455665588, + "num_tokens": 348995665.0, + "step": 13981 + }, + { + "epoch": 1.5354711179442126, + "grad_norm": 2.261662483215332, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7203046083450317, + "num_tokens": 349020902.0, + "step": 13982 + }, + { 
+ "epoch": 1.5355809356468262, + "grad_norm": 2.4181392192840576, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7081000804901123, + "num_tokens": 349045908.0, + "step": 13983 + }, + { + "epoch": 1.53569075334944, + "grad_norm": 2.1661536693573, + "learning_rate": 1e-06, + "loss": 0.9316, + "mean_token_accuracy": 0.708301305770874, + "num_tokens": 349074515.0, + "step": 13984 + }, + { + "epoch": 1.5358005710520537, + "grad_norm": 2.482440710067749, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7220278978347778, + "num_tokens": 349095468.0, + "step": 13985 + }, + { + "epoch": 1.5359103887546672, + "grad_norm": 2.348071336746216, + "learning_rate": 1e-06, + "loss": 0.9797, + "mean_token_accuracy": 0.7013201713562012, + "num_tokens": 349120502.0, + "step": 13986 + }, + { + "epoch": 1.5360202064572808, + "grad_norm": 2.2025747299194336, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7106563448905945, + "num_tokens": 349145454.0, + "step": 13987 + }, + { + "epoch": 1.5361300241598945, + "grad_norm": 2.433873176574707, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7345070838928223, + "num_tokens": 349167179.0, + "step": 13988 + }, + { + "epoch": 1.5362398418625083, + "grad_norm": 2.047698736190796, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.7287118434906006, + "num_tokens": 349195617.0, + "step": 13989 + }, + { + "epoch": 1.536349659565122, + "grad_norm": 2.2673323154449463, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.708530843257904, + "num_tokens": 349220958.0, + "step": 13990 + }, + { + "epoch": 1.5364594772677356, + "grad_norm": 2.186976194381714, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7147417664527893, + "num_tokens": 349247741.0, + "step": 13991 + }, + { + "epoch": 1.536569294970349, + "grad_norm": 2.242260217666626, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 
0.6973084211349487, + "num_tokens": 349273400.0, + "step": 13992 + }, + { + "epoch": 1.5366791126729629, + "grad_norm": 2.4416568279266357, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7100401520729065, + "num_tokens": 349296377.0, + "step": 13993 + }, + { + "epoch": 1.5367889303755766, + "grad_norm": 2.4581546783447266, + "learning_rate": 1e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7295513153076172, + "num_tokens": 349318252.0, + "step": 13994 + }, + { + "epoch": 1.5368987480781902, + "grad_norm": 2.4513275623321533, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7055329084396362, + "num_tokens": 349339914.0, + "step": 13995 + }, + { + "epoch": 1.537008565780804, + "grad_norm": 2.3558132648468018, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7386265993118286, + "num_tokens": 349361178.0, + "step": 13996 + }, + { + "epoch": 1.5371183834834174, + "grad_norm": 2.352825164794922, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7529346942901611, + "num_tokens": 349382983.0, + "step": 13997 + }, + { + "epoch": 1.5372282011860312, + "grad_norm": 1.9863325357437134, + "learning_rate": 1e-06, + "loss": 0.8153, + "mean_token_accuracy": 0.74028480052948, + "num_tokens": 349414259.0, + "step": 13998 + }, + { + "epoch": 1.537338018888645, + "grad_norm": 2.2550759315490723, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7254284024238586, + "num_tokens": 349439390.0, + "step": 13999 + }, + { + "epoch": 1.5374478365912585, + "grad_norm": 2.266392230987549, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.704373836517334, + "num_tokens": 349462779.0, + "step": 14000 + }, + { + "epoch": 1.537557654293872, + "grad_norm": 2.6302011013031006, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7211318016052246, + "num_tokens": 349482449.0, + "step": 14001 + }, + { + "epoch": 1.5376674719964858, + "grad_norm": 
1.9706467390060425, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7142472267150879, + "num_tokens": 349512955.0, + "step": 14002 + }, + { + "epoch": 1.5377772896990995, + "grad_norm": 2.2495408058166504, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7213512659072876, + "num_tokens": 349538888.0, + "step": 14003 + }, + { + "epoch": 1.5378871074017133, + "grad_norm": 2.5691628456115723, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7079052925109863, + "num_tokens": 349560199.0, + "step": 14004 + }, + { + "epoch": 1.5379969251043268, + "grad_norm": 2.344067335128784, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7247304916381836, + "num_tokens": 349584565.0, + "step": 14005 + }, + { + "epoch": 1.5381067428069404, + "grad_norm": 2.279008388519287, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.7028816342353821, + "num_tokens": 349612550.0, + "step": 14006 + }, + { + "epoch": 1.5382165605095541, + "grad_norm": 2.1093618869781494, + "learning_rate": 1e-06, + "loss": 0.796, + "mean_token_accuracy": 0.7477012872695923, + "num_tokens": 349638064.0, + "step": 14007 + }, + { + "epoch": 1.5383263782121679, + "grad_norm": 2.4026362895965576, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7259180545806885, + "num_tokens": 349660264.0, + "step": 14008 + }, + { + "epoch": 1.5384361959147814, + "grad_norm": 2.1238656044006348, + "learning_rate": 1e-06, + "loss": 0.8644, + "mean_token_accuracy": 0.7392784357070923, + "num_tokens": 349688522.0, + "step": 14009 + }, + { + "epoch": 1.538546013617395, + "grad_norm": 1.9581815004348755, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6875097751617432, + "num_tokens": 349721474.0, + "step": 14010 + }, + { + "epoch": 1.5386558313200087, + "grad_norm": 2.242621898651123, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7301275134086609, + 
"num_tokens": 349746061.0, + "step": 14011 + }, + { + "epoch": 1.5387656490226225, + "grad_norm": 2.486978769302368, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7438653707504272, + "num_tokens": 349766372.0, + "step": 14012 + }, + { + "epoch": 1.5388754667252362, + "grad_norm": 2.323845148086548, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7059499025344849, + "num_tokens": 349790542.0, + "step": 14013 + }, + { + "epoch": 1.5389852844278498, + "grad_norm": 2.0762834548950195, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.7124467492103577, + "num_tokens": 349820221.0, + "step": 14014 + }, + { + "epoch": 1.5390951021304633, + "grad_norm": 2.5335004329681396, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7518242597579956, + "num_tokens": 349842009.0, + "step": 14015 + }, + { + "epoch": 1.539204919833077, + "grad_norm": 2.5315065383911133, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7467417120933533, + "num_tokens": 349863883.0, + "step": 14016 + }, + { + "epoch": 1.5393147375356908, + "grad_norm": 2.338716983795166, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7263786792755127, + "num_tokens": 349887181.0, + "step": 14017 + }, + { + "epoch": 1.5394245552383046, + "grad_norm": 2.2229814529418945, + "learning_rate": 1e-06, + "loss": 0.9275, + "mean_token_accuracy": 0.7213542461395264, + "num_tokens": 349912459.0, + "step": 14018 + }, + { + "epoch": 1.539534372940918, + "grad_norm": 2.6086676120758057, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7420606017112732, + "num_tokens": 349933421.0, + "step": 14019 + }, + { + "epoch": 1.5396441906435316, + "grad_norm": 2.21197509765625, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7191698551177979, + "num_tokens": 349960476.0, + "step": 14020 + }, + { + "epoch": 1.5397540083461454, + "grad_norm": 2.4720160961151123, + 
"learning_rate": 1e-06, + "loss": 0.9885, + "mean_token_accuracy": 0.6914076805114746, + "num_tokens": 349983269.0, + "step": 14021 + }, + { + "epoch": 1.5398638260487592, + "grad_norm": 2.1264758110046387, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7128593921661377, + "num_tokens": 350009492.0, + "step": 14022 + }, + { + "epoch": 1.5399736437513727, + "grad_norm": 2.157696485519409, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7193604707717896, + "num_tokens": 350037172.0, + "step": 14023 + }, + { + "epoch": 1.5400834614539862, + "grad_norm": 2.7064762115478516, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7433676719665527, + "num_tokens": 350055699.0, + "step": 14024 + }, + { + "epoch": 1.5401932791566, + "grad_norm": 2.3730528354644775, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7306574583053589, + "num_tokens": 350077768.0, + "step": 14025 + }, + { + "epoch": 1.5403030968592137, + "grad_norm": 1.8042106628417969, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6853897571563721, + "num_tokens": 350113309.0, + "step": 14026 + }, + { + "epoch": 1.5404129145618275, + "grad_norm": 2.075382947921753, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7114583253860474, + "num_tokens": 350143676.0, + "step": 14027 + }, + { + "epoch": 1.540522732264441, + "grad_norm": 2.4454848766326904, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.723345160484314, + "num_tokens": 350165078.0, + "step": 14028 + }, + { + "epoch": 1.5406325499670546, + "grad_norm": 1.9020752906799316, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.7161054015159607, + "num_tokens": 350199064.0, + "step": 14029 + }, + { + "epoch": 1.5407423676696683, + "grad_norm": 2.3483898639678955, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.6994040012359619, + "num_tokens": 350222082.0, + "step": 
14030 + }, + { + "epoch": 1.540852185372282, + "grad_norm": 2.034226894378662, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7081242203712463, + "num_tokens": 350252142.0, + "step": 14031 + }, + { + "epoch": 1.5409620030748958, + "grad_norm": 2.285294771194458, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7286087274551392, + "num_tokens": 350275662.0, + "step": 14032 + }, + { + "epoch": 1.5410718207775094, + "grad_norm": 2.403045415878296, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7301932573318481, + "num_tokens": 350298211.0, + "step": 14033 + }, + { + "epoch": 1.541181638480123, + "grad_norm": 2.4970452785491943, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7114946842193604, + "num_tokens": 350320329.0, + "step": 14034 + }, + { + "epoch": 1.5412914561827367, + "grad_norm": 2.3451385498046875, + "learning_rate": 1e-06, + "loss": 0.9396, + "mean_token_accuracy": 0.7054412364959717, + "num_tokens": 350345583.0, + "step": 14035 + }, + { + "epoch": 1.5414012738853504, + "grad_norm": 2.380922555923462, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.7260805368423462, + "num_tokens": 350368373.0, + "step": 14036 + }, + { + "epoch": 1.541511091587964, + "grad_norm": 1.9587981700897217, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7049544453620911, + "num_tokens": 350401049.0, + "step": 14037 + }, + { + "epoch": 1.5416209092905775, + "grad_norm": 2.719658374786377, + "learning_rate": 1e-06, + "loss": 0.7978, + "mean_token_accuracy": 0.7439121007919312, + "num_tokens": 350418768.0, + "step": 14038 + }, + { + "epoch": 1.5417307269931912, + "grad_norm": 2.5023982524871826, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.70698082447052, + "num_tokens": 350440263.0, + "step": 14039 + }, + { + "epoch": 1.541840544695805, + "grad_norm": 2.785907506942749, + "learning_rate": 1e-06, + "loss": 0.901, + 
"mean_token_accuracy": 0.7185746431350708, + "num_tokens": 350458794.0, + "step": 14040 + }, + { + "epoch": 1.5419503623984188, + "grad_norm": 2.1276071071624756, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7270099520683289, + "num_tokens": 350488386.0, + "step": 14041 + }, + { + "epoch": 1.5420601801010323, + "grad_norm": 2.5995402336120605, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7145447731018066, + "num_tokens": 350510506.0, + "step": 14042 + }, + { + "epoch": 1.5421699978036458, + "grad_norm": 2.3367671966552734, + "learning_rate": 1e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7321898937225342, + "num_tokens": 350533511.0, + "step": 14043 + }, + { + "epoch": 1.5422798155062596, + "grad_norm": 2.0596446990966797, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7053172588348389, + "num_tokens": 350563185.0, + "step": 14044 + }, + { + "epoch": 1.5423896332088733, + "grad_norm": 2.5179505348205566, + "learning_rate": 1e-06, + "loss": 0.9655, + "mean_token_accuracy": 0.7078825831413269, + "num_tokens": 350584800.0, + "step": 14045 + }, + { + "epoch": 1.5424994509114869, + "grad_norm": 2.329681634902954, + "learning_rate": 1e-06, + "loss": 0.9947, + "mean_token_accuracy": 0.700846791267395, + "num_tokens": 350610241.0, + "step": 14046 + }, + { + "epoch": 1.5426092686141006, + "grad_norm": 2.120069742202759, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7143909931182861, + "num_tokens": 350637948.0, + "step": 14047 + }, + { + "epoch": 1.5427190863167142, + "grad_norm": 2.295349359512329, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7163926362991333, + "num_tokens": 350663901.0, + "step": 14048 + }, + { + "epoch": 1.542828904019328, + "grad_norm": 2.271996259689331, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7049094438552856, + "num_tokens": 350690485.0, + "step": 14049 + }, + { + "epoch": 
1.5429387217219417, + "grad_norm": 2.45076322555542, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7294338941574097, + "num_tokens": 350713884.0, + "step": 14050 + }, + { + "epoch": 1.5430485394245552, + "grad_norm": 2.0367093086242676, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.6844072341918945, + "num_tokens": 350747105.0, + "step": 14051 + }, + { + "epoch": 1.5431583571271688, + "grad_norm": 2.312335729598999, + "learning_rate": 1e-06, + "loss": 0.9608, + "mean_token_accuracy": 0.7035397291183472, + "num_tokens": 350773303.0, + "step": 14052 + }, + { + "epoch": 1.5432681748297825, + "grad_norm": 2.2343220710754395, + "learning_rate": 1e-06, + "loss": 0.9921, + "mean_token_accuracy": 0.692166805267334, + "num_tokens": 350800937.0, + "step": 14053 + }, + { + "epoch": 1.5433779925323963, + "grad_norm": 2.233374834060669, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 0.7049686908721924, + "num_tokens": 350826541.0, + "step": 14054 + }, + { + "epoch": 1.54348781023501, + "grad_norm": 2.461158275604248, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.741643488407135, + "num_tokens": 350847203.0, + "step": 14055 + }, + { + "epoch": 1.5435976279376236, + "grad_norm": 2.197218179702759, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.705367386341095, + "num_tokens": 350873128.0, + "step": 14056 + }, + { + "epoch": 1.543707445640237, + "grad_norm": 2.2544007301330566, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7138070464134216, + "num_tokens": 350901034.0, + "step": 14057 + }, + { + "epoch": 1.5438172633428509, + "grad_norm": 1.8923321962356567, + "learning_rate": 1e-06, + "loss": 0.9851, + "mean_token_accuracy": 0.7023320198059082, + "num_tokens": 350934948.0, + "step": 14058 + }, + { + "epoch": 1.5439270810454646, + "grad_norm": 2.913586139678955, + "learning_rate": 1e-06, + "loss": 0.9176, + "mean_token_accuracy": 
0.7151347398757935, + "num_tokens": 350952937.0, + "step": 14059 + }, + { + "epoch": 1.5440368987480781, + "grad_norm": 2.2941861152648926, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7104755640029907, + "num_tokens": 350977318.0, + "step": 14060 + }, + { + "epoch": 1.544146716450692, + "grad_norm": 2.180324077606201, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7065560817718506, + "num_tokens": 351003755.0, + "step": 14061 + }, + { + "epoch": 1.5442565341533054, + "grad_norm": 2.4430480003356934, + "learning_rate": 1e-06, + "loss": 0.7302, + "mean_token_accuracy": 0.7631980180740356, + "num_tokens": 351023857.0, + "step": 14062 + }, + { + "epoch": 1.5443663518559192, + "grad_norm": 2.3160691261291504, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.74165940284729, + "num_tokens": 351047540.0, + "step": 14063 + }, + { + "epoch": 1.544476169558533, + "grad_norm": 2.223167896270752, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7193589210510254, + "num_tokens": 351074432.0, + "step": 14064 + }, + { + "epoch": 1.5445859872611465, + "grad_norm": 2.299450635910034, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7425026893615723, + "num_tokens": 351097497.0, + "step": 14065 + }, + { + "epoch": 1.54469580496376, + "grad_norm": 2.4902219772338867, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.7245523929595947, + "num_tokens": 351117759.0, + "step": 14066 + }, + { + "epoch": 1.5448056226663738, + "grad_norm": 2.1988492012023926, + "learning_rate": 1e-06, + "loss": 0.7587, + "mean_token_accuracy": 0.7528589367866516, + "num_tokens": 351140967.0, + "step": 14067 + }, + { + "epoch": 1.5449154403689875, + "grad_norm": 2.329204559326172, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7276804447174072, + "num_tokens": 351165356.0, + "step": 14068 + }, + { + "epoch": 1.5450252580716013, + "grad_norm": 
2.368903398513794, + "learning_rate": 1e-06, + "loss": 0.9525, + "mean_token_accuracy": 0.701714038848877, + "num_tokens": 351188973.0, + "step": 14069 + }, + { + "epoch": 1.5451350757742148, + "grad_norm": 2.044858932495117, + "learning_rate": 1e-06, + "loss": 0.996, + "mean_token_accuracy": 0.6928286552429199, + "num_tokens": 351219710.0, + "step": 14070 + }, + { + "epoch": 1.5452448934768284, + "grad_norm": 2.4436910152435303, + "learning_rate": 1e-06, + "loss": 0.8493, + "mean_token_accuracy": 0.7283044457435608, + "num_tokens": 351240927.0, + "step": 14071 + }, + { + "epoch": 1.5453547111794421, + "grad_norm": 1.964350700378418, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7115710973739624, + "num_tokens": 351274120.0, + "step": 14072 + }, + { + "epoch": 1.5454645288820559, + "grad_norm": 2.2250328063964844, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7003722190856934, + "num_tokens": 351302705.0, + "step": 14073 + }, + { + "epoch": 1.5455743465846694, + "grad_norm": 2.4038796424865723, + "learning_rate": 1e-06, + "loss": 0.8541, + "mean_token_accuracy": 0.7303410768508911, + "num_tokens": 351323623.0, + "step": 14074 + }, + { + "epoch": 1.545684164287283, + "grad_norm": 2.4632718563079834, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7337546944618225, + "num_tokens": 351344618.0, + "step": 14075 + }, + { + "epoch": 1.5457939819898967, + "grad_norm": 2.1427056789398193, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7070775032043457, + "num_tokens": 351371024.0, + "step": 14076 + }, + { + "epoch": 1.5459037996925105, + "grad_norm": 2.4089105129241943, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.7057440876960754, + "num_tokens": 351395236.0, + "step": 14077 + }, + { + "epoch": 1.5460136173951242, + "grad_norm": 2.3942959308624268, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.710616946220398, + "num_tokens": 
351418261.0, + "step": 14078 + }, + { + "epoch": 1.5461234350977378, + "grad_norm": 2.1498587131500244, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.6962025165557861, + "num_tokens": 351449213.0, + "step": 14079 + }, + { + "epoch": 1.5462332528003513, + "grad_norm": 2.4527413845062256, + "learning_rate": 1e-06, + "loss": 0.869, + "mean_token_accuracy": 0.733383059501648, + "num_tokens": 351471767.0, + "step": 14080 + }, + { + "epoch": 1.546343070502965, + "grad_norm": 2.364917278289795, + "learning_rate": 1e-06, + "loss": 0.8916, + "mean_token_accuracy": 0.7209596633911133, + "num_tokens": 351497414.0, + "step": 14081 + }, + { + "epoch": 1.5464528882055788, + "grad_norm": 2.520580768585205, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7159171104431152, + "num_tokens": 351518568.0, + "step": 14082 + }, + { + "epoch": 1.5465627059081926, + "grad_norm": 2.2413880825042725, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7003657221794128, + "num_tokens": 351545956.0, + "step": 14083 + }, + { + "epoch": 1.546672523610806, + "grad_norm": 2.57088041305542, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7162013649940491, + "num_tokens": 351567675.0, + "step": 14084 + }, + { + "epoch": 1.5467823413134196, + "grad_norm": 2.0800795555114746, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7198310494422913, + "num_tokens": 351594969.0, + "step": 14085 + }, + { + "epoch": 1.5468921590160334, + "grad_norm": 2.234015464782715, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7157024145126343, + "num_tokens": 351619552.0, + "step": 14086 + }, + { + "epoch": 1.5470019767186471, + "grad_norm": 2.2286391258239746, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7256731986999512, + "num_tokens": 351645806.0, + "step": 14087 + }, + { + "epoch": 1.5471117944212607, + "grad_norm": 2.4740381240844727, + "learning_rate": 1e-06, 
+ "loss": 0.8817, + "mean_token_accuracy": 0.7206063866615295, + "num_tokens": 351668379.0, + "step": 14088 + }, + { + "epoch": 1.5472216121238742, + "grad_norm": 2.7683603763580322, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.729479193687439, + "num_tokens": 351687676.0, + "step": 14089 + }, + { + "epoch": 1.547331429826488, + "grad_norm": 2.470057249069214, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7346014380455017, + "num_tokens": 351708719.0, + "step": 14090 + }, + { + "epoch": 1.5474412475291017, + "grad_norm": 2.2483394145965576, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.709530234336853, + "num_tokens": 351734691.0, + "step": 14091 + }, + { + "epoch": 1.5475510652317155, + "grad_norm": 2.727907180786133, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7366117238998413, + "num_tokens": 351753168.0, + "step": 14092 + }, + { + "epoch": 1.547660882934329, + "grad_norm": 2.4496355056762695, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.734215497970581, + "num_tokens": 351775484.0, + "step": 14093 + }, + { + "epoch": 1.5477707006369426, + "grad_norm": 2.388263463973999, + "learning_rate": 1e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7243311405181885, + "num_tokens": 351797745.0, + "step": 14094 + }, + { + "epoch": 1.5478805183395563, + "grad_norm": 2.575815439224243, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7232829332351685, + "num_tokens": 351820340.0, + "step": 14095 + }, + { + "epoch": 1.54799033604217, + "grad_norm": 2.373324394226074, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7246854901313782, + "num_tokens": 351846134.0, + "step": 14096 + }, + { + "epoch": 1.5481001537447838, + "grad_norm": 2.0853991508483887, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7311161160469055, + "num_tokens": 351873871.0, + "step": 14097 + }, + { + "epoch": 
1.5482099714473974, + "grad_norm": 2.473544120788574, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7154508233070374, + "num_tokens": 351896331.0, + "step": 14098 + }, + { + "epoch": 1.548319789150011, + "grad_norm": 2.136403799057007, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7248239517211914, + "num_tokens": 351924765.0, + "step": 14099 + }, + { + "epoch": 1.5484296068526247, + "grad_norm": 1.9695580005645752, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7053859233856201, + "num_tokens": 351957037.0, + "step": 14100 + }, + { + "epoch": 1.5485394245552384, + "grad_norm": 2.3631138801574707, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7254616618156433, + "num_tokens": 351979828.0, + "step": 14101 + }, + { + "epoch": 1.548649242257852, + "grad_norm": 2.166595458984375, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.6970950365066528, + "num_tokens": 352008517.0, + "step": 14102 + }, + { + "epoch": 1.5487590599604655, + "grad_norm": 2.2512855529785156, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7313084006309509, + "num_tokens": 352032860.0, + "step": 14103 + }, + { + "epoch": 1.5488688776630792, + "grad_norm": 2.1543161869049072, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7180283665657043, + "num_tokens": 352060044.0, + "step": 14104 + }, + { + "epoch": 1.548978695365693, + "grad_norm": 2.0485196113586426, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.6983695030212402, + "num_tokens": 352090507.0, + "step": 14105 + }, + { + "epoch": 1.5490885130683067, + "grad_norm": 2.174281120300293, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.6973402500152588, + "num_tokens": 352118276.0, + "step": 14106 + }, + { + "epoch": 1.5491983307709203, + "grad_norm": 2.3313095569610596, + "learning_rate": 1e-06, + "loss": 0.9484, + "mean_token_accuracy": 
0.720312774181366, + "num_tokens": 352140659.0, + "step": 14107 + }, + { + "epoch": 1.5493081484735338, + "grad_norm": 2.5358293056488037, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7208278179168701, + "num_tokens": 352162794.0, + "step": 14108 + }, + { + "epoch": 1.5494179661761476, + "grad_norm": 2.4332213401794434, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.717094361782074, + "num_tokens": 352183040.0, + "step": 14109 + }, + { + "epoch": 1.5495277838787613, + "grad_norm": 2.312842607498169, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7147424817085266, + "num_tokens": 352209000.0, + "step": 14110 + }, + { + "epoch": 1.5496376015813749, + "grad_norm": 2.6730971336364746, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7271824479103088, + "num_tokens": 352229428.0, + "step": 14111 + }, + { + "epoch": 1.5497474192839886, + "grad_norm": 2.23470139503479, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6934626698493958, + "num_tokens": 352257705.0, + "step": 14112 + }, + { + "epoch": 1.5498572369866022, + "grad_norm": 1.9739396572113037, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7251707911491394, + "num_tokens": 352290445.0, + "step": 14113 + }, + { + "epoch": 1.549967054689216, + "grad_norm": 2.544834613800049, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7098177671432495, + "num_tokens": 352311871.0, + "step": 14114 + }, + { + "epoch": 1.5500768723918297, + "grad_norm": 2.418290853500366, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7291661500930786, + "num_tokens": 352333596.0, + "step": 14115 + }, + { + "epoch": 1.5501866900944432, + "grad_norm": 2.2329983711242676, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7211565375328064, + "num_tokens": 352359003.0, + "step": 14116 + }, + { + "epoch": 1.5502965077970567, + "grad_norm": 
2.7176246643066406, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7235704064369202, + "num_tokens": 352378389.0, + "step": 14117 + }, + { + "epoch": 1.5504063254996705, + "grad_norm": 2.350663185119629, + "learning_rate": 1e-06, + "loss": 0.7754, + "mean_token_accuracy": 0.7547542452812195, + "num_tokens": 352400081.0, + "step": 14118 + }, + { + "epoch": 1.5505161432022843, + "grad_norm": 2.4787118434906006, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7351369857788086, + "num_tokens": 352420061.0, + "step": 14119 + }, + { + "epoch": 1.550625960904898, + "grad_norm": 2.2851712703704834, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7353025674819946, + "num_tokens": 352444144.0, + "step": 14120 + }, + { + "epoch": 1.5507357786075116, + "grad_norm": 2.5086560249328613, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7205044031143188, + "num_tokens": 352465675.0, + "step": 14121 + }, + { + "epoch": 1.550845596310125, + "grad_norm": 2.0280964374542236, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7229591608047485, + "num_tokens": 352496946.0, + "step": 14122 + }, + { + "epoch": 1.5509554140127388, + "grad_norm": 2.1785945892333984, + "learning_rate": 1e-06, + "loss": 0.9552, + "mean_token_accuracy": 0.7070558071136475, + "num_tokens": 352524343.0, + "step": 14123 + }, + { + "epoch": 1.5510652317153526, + "grad_norm": 2.3544726371765137, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7210661768913269, + "num_tokens": 352549030.0, + "step": 14124 + }, + { + "epoch": 1.5511750494179661, + "grad_norm": 2.133543014526367, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7265505790710449, + "num_tokens": 352576223.0, + "step": 14125 + }, + { + "epoch": 1.55128486712058, + "grad_norm": 2.1993048191070557, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7068199515342712, + "num_tokens": 
352603416.0, + "step": 14126 + }, + { + "epoch": 1.5513946848231934, + "grad_norm": 2.4018681049346924, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7478606700897217, + "num_tokens": 352624610.0, + "step": 14127 + }, + { + "epoch": 1.5515045025258072, + "grad_norm": 2.2326459884643555, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7391778826713562, + "num_tokens": 352648489.0, + "step": 14128 + }, + { + "epoch": 1.551614320228421, + "grad_norm": 2.3149585723876953, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7138875722885132, + "num_tokens": 352673393.0, + "step": 14129 + }, + { + "epoch": 1.5517241379310345, + "grad_norm": 2.4055631160736084, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7395378947257996, + "num_tokens": 352696660.0, + "step": 14130 + }, + { + "epoch": 1.551833955633648, + "grad_norm": 2.1234965324401855, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7236694693565369, + "num_tokens": 352722841.0, + "step": 14131 + }, + { + "epoch": 1.5519437733362618, + "grad_norm": 2.306055784225464, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7221699953079224, + "num_tokens": 352750047.0, + "step": 14132 + }, + { + "epoch": 1.5520535910388755, + "grad_norm": 2.3068792819976807, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7299331426620483, + "num_tokens": 352774649.0, + "step": 14133 + }, + { + "epoch": 1.5521634087414893, + "grad_norm": 2.7193267345428467, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7436782121658325, + "num_tokens": 352793991.0, + "step": 14134 + }, + { + "epoch": 1.5522732264441028, + "grad_norm": 2.167351484298706, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7412484884262085, + "num_tokens": 352822570.0, + "step": 14135 + }, + { + "epoch": 1.5523830441467164, + "grad_norm": 2.200216770172119, + "learning_rate": 
1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7183369398117065, + "num_tokens": 352848688.0, + "step": 14136 + }, + { + "epoch": 1.55249286184933, + "grad_norm": 1.9969247579574585, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7376357316970825, + "num_tokens": 352878376.0, + "step": 14137 + }, + { + "epoch": 1.5526026795519439, + "grad_norm": 2.3906235694885254, + "learning_rate": 1e-06, + "loss": 0.9803, + "mean_token_accuracy": 0.6968945264816284, + "num_tokens": 352902527.0, + "step": 14138 + }, + { + "epoch": 1.5527124972545574, + "grad_norm": 2.306941032409668, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7100112438201904, + "num_tokens": 352927048.0, + "step": 14139 + }, + { + "epoch": 1.552822314957171, + "grad_norm": 2.4631834030151367, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7197646498680115, + "num_tokens": 352950742.0, + "step": 14140 + }, + { + "epoch": 1.5529321326597847, + "grad_norm": 2.309375286102295, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7151798009872437, + "num_tokens": 352974022.0, + "step": 14141 + }, + { + "epoch": 1.5530419503623984, + "grad_norm": 2.0412979125976562, + "learning_rate": 1e-06, + "loss": 0.9681, + "mean_token_accuracy": 0.7028439044952393, + "num_tokens": 353006112.0, + "step": 14142 + }, + { + "epoch": 1.5531517680650122, + "grad_norm": 2.383660078048706, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7224282026290894, + "num_tokens": 353028492.0, + "step": 14143 + }, + { + "epoch": 1.5532615857676257, + "grad_norm": 2.548884153366089, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7198516726493835, + "num_tokens": 353048456.0, + "step": 14144 + }, + { + "epoch": 1.5533714034702393, + "grad_norm": 2.1363306045532227, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7130864858627319, + "num_tokens": 353074172.0, + "step": 14145 + }, + { + 
"epoch": 1.553481221172853, + "grad_norm": 2.156703472137451, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7306511402130127, + "num_tokens": 353100243.0, + "step": 14146 + }, + { + "epoch": 1.5535910388754668, + "grad_norm": 2.3240795135498047, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7109065055847168, + "num_tokens": 353125339.0, + "step": 14147 + }, + { + "epoch": 1.5537008565780805, + "grad_norm": 2.527953863143921, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7330707311630249, + "num_tokens": 353147204.0, + "step": 14148 + }, + { + "epoch": 1.553810674280694, + "grad_norm": 2.304851770401001, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7259248495101929, + "num_tokens": 353170176.0, + "step": 14149 + }, + { + "epoch": 1.5539204919833076, + "grad_norm": 2.287061929702759, + "learning_rate": 1e-06, + "loss": 0.9873, + "mean_token_accuracy": 0.7030807733535767, + "num_tokens": 353193299.0, + "step": 14150 + }, + { + "epoch": 1.5540303096859214, + "grad_norm": 2.4178221225738525, + "learning_rate": 1e-06, + "loss": 0.8252, + "mean_token_accuracy": 0.7477177381515503, + "num_tokens": 353214700.0, + "step": 14151 + }, + { + "epoch": 1.5541401273885351, + "grad_norm": 2.3689522743225098, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7230212688446045, + "num_tokens": 353238618.0, + "step": 14152 + }, + { + "epoch": 1.5542499450911487, + "grad_norm": 2.0749192237854004, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7206965088844299, + "num_tokens": 353267221.0, + "step": 14153 + }, + { + "epoch": 1.5543597627937622, + "grad_norm": 2.6469414234161377, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7274364233016968, + "num_tokens": 353286887.0, + "step": 14154 + }, + { + "epoch": 1.554469580496376, + "grad_norm": 2.172236204147339, + "learning_rate": 1e-06, + "loss": 0.8465, + 
"mean_token_accuracy": 0.7328487038612366, + "num_tokens": 353314172.0, + "step": 14155 + }, + { + "epoch": 1.5545793981989897, + "grad_norm": 2.1605334281921387, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6976350545883179, + "num_tokens": 353345324.0, + "step": 14156 + }, + { + "epoch": 1.5546892159016035, + "grad_norm": 2.274566173553467, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.7294743657112122, + "num_tokens": 353373638.0, + "step": 14157 + }, + { + "epoch": 1.554799033604217, + "grad_norm": 2.0659472942352295, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7090855836868286, + "num_tokens": 353404074.0, + "step": 14158 + }, + { + "epoch": 1.5549088513068305, + "grad_norm": 2.010200262069702, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7068494558334351, + "num_tokens": 353433976.0, + "step": 14159 + }, + { + "epoch": 1.5550186690094443, + "grad_norm": 2.186384916305542, + "learning_rate": 1e-06, + "loss": 0.9679, + "mean_token_accuracy": 0.7001888751983643, + "num_tokens": 353464008.0, + "step": 14160 + }, + { + "epoch": 1.555128486712058, + "grad_norm": 2.418256998062134, + "learning_rate": 1e-06, + "loss": 0.9027, + "mean_token_accuracy": 0.7150604128837585, + "num_tokens": 353488270.0, + "step": 14161 + }, + { + "epoch": 1.5552383044146716, + "grad_norm": 2.526785135269165, + "learning_rate": 1e-06, + "loss": 1.0014, + "mean_token_accuracy": 0.6960644721984863, + "num_tokens": 353511090.0, + "step": 14162 + }, + { + "epoch": 1.5553481221172853, + "grad_norm": 2.2770822048187256, + "learning_rate": 1e-06, + "loss": 0.9837, + "mean_token_accuracy": 0.699882984161377, + "num_tokens": 353534235.0, + "step": 14163 + }, + { + "epoch": 1.5554579398198989, + "grad_norm": 2.5290040969848633, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7326250672340393, + "num_tokens": 353554600.0, + "step": 14164 + }, + { + "epoch": 
1.5555677575225126, + "grad_norm": 2.3467752933502197, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7179343700408936, + "num_tokens": 353577699.0, + "step": 14165 + }, + { + "epoch": 1.5556775752251264, + "grad_norm": 2.4754080772399902, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7027361392974854, + "num_tokens": 353599147.0, + "step": 14166 + }, + { + "epoch": 1.55578739292774, + "grad_norm": 2.247316360473633, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.712332546710968, + "num_tokens": 353624551.0, + "step": 14167 + }, + { + "epoch": 1.5558972106303535, + "grad_norm": 2.2168362140655518, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7061773538589478, + "num_tokens": 353651068.0, + "step": 14168 + }, + { + "epoch": 1.5560070283329672, + "grad_norm": 2.2793703079223633, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.721930980682373, + "num_tokens": 353675565.0, + "step": 14169 + }, + { + "epoch": 1.556116846035581, + "grad_norm": 2.167492389678955, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7184875011444092, + "num_tokens": 353703592.0, + "step": 14170 + }, + { + "epoch": 1.5562266637381947, + "grad_norm": 2.236621618270874, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.717143714427948, + "num_tokens": 353729081.0, + "step": 14171 + }, + { + "epoch": 1.5563364814408083, + "grad_norm": 2.624112367630005, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.7383326292037964, + "num_tokens": 353748112.0, + "step": 14172 + }, + { + "epoch": 1.5564462991434218, + "grad_norm": 2.368940830230713, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7131372690200806, + "num_tokens": 353772511.0, + "step": 14173 + }, + { + "epoch": 1.5565561168460356, + "grad_norm": 2.4955382347106934, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 
0.7251554727554321, + "num_tokens": 353794775.0, + "step": 14174 + }, + { + "epoch": 1.5566659345486493, + "grad_norm": 2.2987160682678223, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.70291668176651, + "num_tokens": 353821066.0, + "step": 14175 + }, + { + "epoch": 1.5567757522512629, + "grad_norm": 2.7282469272613525, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7144242525100708, + "num_tokens": 353840576.0, + "step": 14176 + }, + { + "epoch": 1.5568855699538766, + "grad_norm": 2.5853447914123535, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7342818975448608, + "num_tokens": 353861880.0, + "step": 14177 + }, + { + "epoch": 1.5569953876564901, + "grad_norm": 2.5276148319244385, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7218583822250366, + "num_tokens": 353881787.0, + "step": 14178 + }, + { + "epoch": 1.557105205359104, + "grad_norm": 2.302290201187134, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7203103303909302, + "num_tokens": 353906102.0, + "step": 14179 + }, + { + "epoch": 1.5572150230617177, + "grad_norm": 2.132545232772827, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7043485045433044, + "num_tokens": 353931800.0, + "step": 14180 + }, + { + "epoch": 1.5573248407643312, + "grad_norm": 2.2566237449645996, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7335793375968933, + "num_tokens": 353955190.0, + "step": 14181 + }, + { + "epoch": 1.5574346584669447, + "grad_norm": 2.285200357437134, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7247005701065063, + "num_tokens": 353978891.0, + "step": 14182 + }, + { + "epoch": 1.5575444761695585, + "grad_norm": 2.3681960105895996, + "learning_rate": 1e-06, + "loss": 0.7766, + "mean_token_accuracy": 0.7504879236221313, + "num_tokens": 353999437.0, + "step": 14183 + }, + { + "epoch": 1.5576542938721722, + "grad_norm": 
2.462113857269287, + "learning_rate": 1e-06, + "loss": 0.8919, + "mean_token_accuracy": 0.7217459678649902, + "num_tokens": 354021691.0, + "step": 14184 + }, + { + "epoch": 1.557764111574786, + "grad_norm": 2.4049689769744873, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7287135124206543, + "num_tokens": 354044674.0, + "step": 14185 + }, + { + "epoch": 1.5578739292773995, + "grad_norm": 2.2960493564605713, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7216833829879761, + "num_tokens": 354068511.0, + "step": 14186 + }, + { + "epoch": 1.557983746980013, + "grad_norm": 2.384995698928833, + "learning_rate": 1e-06, + "loss": 0.9824, + "mean_token_accuracy": 0.7060133218765259, + "num_tokens": 354093128.0, + "step": 14187 + }, + { + "epoch": 1.5580935646826268, + "grad_norm": 2.2144393920898438, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7269496917724609, + "num_tokens": 354122089.0, + "step": 14188 + }, + { + "epoch": 1.5582033823852406, + "grad_norm": 1.9332083463668823, + "learning_rate": 1e-06, + "loss": 0.9609, + "mean_token_accuracy": 0.7038815021514893, + "num_tokens": 354154185.0, + "step": 14189 + }, + { + "epoch": 1.5583132000878541, + "grad_norm": 2.343553304672241, + "learning_rate": 1e-06, + "loss": 0.9892, + "mean_token_accuracy": 0.6993979215621948, + "num_tokens": 354180576.0, + "step": 14190 + }, + { + "epoch": 1.5584230177904677, + "grad_norm": 2.3147292137145996, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7199087142944336, + "num_tokens": 354207124.0, + "step": 14191 + }, + { + "epoch": 1.5585328354930814, + "grad_norm": 2.171255350112915, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7142646312713623, + "num_tokens": 354236239.0, + "step": 14192 + }, + { + "epoch": 1.5586426531956952, + "grad_norm": 2.3276071548461914, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7376179695129395, + "num_tokens": 
354258587.0, + "step": 14193 + }, + { + "epoch": 1.558752470898309, + "grad_norm": 2.1572961807250977, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7331110239028931, + "num_tokens": 354283462.0, + "step": 14194 + }, + { + "epoch": 1.5588622886009225, + "grad_norm": 2.3988256454467773, + "learning_rate": 1e-06, + "loss": 0.7609, + "mean_token_accuracy": 0.7522399425506592, + "num_tokens": 354306749.0, + "step": 14195 + }, + { + "epoch": 1.558972106303536, + "grad_norm": 2.8270461559295654, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7323473691940308, + "num_tokens": 354326027.0, + "step": 14196 + }, + { + "epoch": 1.5590819240061498, + "grad_norm": 2.336657762527466, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.7106592059135437, + "num_tokens": 354351560.0, + "step": 14197 + }, + { + "epoch": 1.5591917417087635, + "grad_norm": 2.210780382156372, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7346778512001038, + "num_tokens": 354378327.0, + "step": 14198 + }, + { + "epoch": 1.5593015594113773, + "grad_norm": 2.249774217605591, + "learning_rate": 1e-06, + "loss": 0.8855, + "mean_token_accuracy": 0.7325686812400818, + "num_tokens": 354403437.0, + "step": 14199 + }, + { + "epoch": 1.5594113771139908, + "grad_norm": 2.0436723232269287, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7188109159469604, + "num_tokens": 354432244.0, + "step": 14200 + }, + { + "epoch": 1.5595211948166043, + "grad_norm": 2.4543728828430176, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7047115564346313, + "num_tokens": 354456855.0, + "step": 14201 + }, + { + "epoch": 1.559631012519218, + "grad_norm": 2.117887020111084, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7217058539390564, + "num_tokens": 354483740.0, + "step": 14202 + }, + { + "epoch": 1.5597408302218319, + "grad_norm": 2.5294501781463623, + "learning_rate": 
1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7498735785484314, + "num_tokens": 354505246.0, + "step": 14203 + }, + { + "epoch": 1.5598506479244454, + "grad_norm": 2.354611873626709, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7123343348503113, + "num_tokens": 354530210.0, + "step": 14204 + }, + { + "epoch": 1.559960465627059, + "grad_norm": 2.164116859436035, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7244640588760376, + "num_tokens": 354556972.0, + "step": 14205 + }, + { + "epoch": 1.5600702833296727, + "grad_norm": 2.348288059234619, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6988827586174011, + "num_tokens": 354582042.0, + "step": 14206 + }, + { + "epoch": 1.5601801010322864, + "grad_norm": 2.313701868057251, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7100642919540405, + "num_tokens": 354605827.0, + "step": 14207 + }, + { + "epoch": 1.5602899187349002, + "grad_norm": 2.326385736465454, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7364829778671265, + "num_tokens": 354630269.0, + "step": 14208 + }, + { + "epoch": 1.5603997364375137, + "grad_norm": 2.020637035369873, + "learning_rate": 1e-06, + "loss": 0.9613, + "mean_token_accuracy": 0.7052067518234253, + "num_tokens": 354661480.0, + "step": 14209 + }, + { + "epoch": 1.5605095541401273, + "grad_norm": 2.2299964427948, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7205551862716675, + "num_tokens": 354685599.0, + "step": 14210 + }, + { + "epoch": 1.560619371842741, + "grad_norm": 2.59274959564209, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7314929962158203, + "num_tokens": 354704008.0, + "step": 14211 + }, + { + "epoch": 1.5607291895453548, + "grad_norm": 2.1496193408966064, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7258701920509338, + "num_tokens": 354730419.0, + "step": 14212 + }, + { + 
"epoch": 1.5608390072479685, + "grad_norm": 2.184084892272949, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7268040180206299, + "num_tokens": 354757278.0, + "step": 14213 + }, + { + "epoch": 1.560948824950582, + "grad_norm": 2.151412010192871, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7204682230949402, + "num_tokens": 354783955.0, + "step": 14214 + }, + { + "epoch": 1.5610586426531956, + "grad_norm": 2.212332010269165, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7156447768211365, + "num_tokens": 354811276.0, + "step": 14215 + }, + { + "epoch": 1.5611684603558094, + "grad_norm": 2.1277120113372803, + "learning_rate": 1e-06, + "loss": 0.9883, + "mean_token_accuracy": 0.7043558359146118, + "num_tokens": 354841244.0, + "step": 14216 + }, + { + "epoch": 1.5612782780584231, + "grad_norm": 2.3851234912872314, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7182058095932007, + "num_tokens": 354863773.0, + "step": 14217 + }, + { + "epoch": 1.5613880957610367, + "grad_norm": 1.9848366975784302, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7272509336471558, + "num_tokens": 354895739.0, + "step": 14218 + }, + { + "epoch": 1.5614979134636502, + "grad_norm": 2.499990940093994, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7355976104736328, + "num_tokens": 354917641.0, + "step": 14219 + }, + { + "epoch": 1.561607731166264, + "grad_norm": 2.6398277282714844, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7255080938339233, + "num_tokens": 354937408.0, + "step": 14220 + }, + { + "epoch": 1.5617175488688777, + "grad_norm": 2.5667810440063477, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.730625569820404, + "num_tokens": 354959042.0, + "step": 14221 + }, + { + "epoch": 1.5618273665714915, + "grad_norm": 2.3697452545166016, + "learning_rate": 1e-06, + "loss": 0.9116, + 
"mean_token_accuracy": 0.7217305302619934, + "num_tokens": 354984252.0, + "step": 14222 + }, + { + "epoch": 1.561937184274105, + "grad_norm": 2.3969955444335938, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7111393809318542, + "num_tokens": 355009036.0, + "step": 14223 + }, + { + "epoch": 1.5620470019767185, + "grad_norm": 2.2080674171447754, + "learning_rate": 1e-06, + "loss": 0.8319, + "mean_token_accuracy": 0.7314451336860657, + "num_tokens": 355035744.0, + "step": 14224 + }, + { + "epoch": 1.5621568196793323, + "grad_norm": 2.5043911933898926, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7128624320030212, + "num_tokens": 355055943.0, + "step": 14225 + }, + { + "epoch": 1.562266637381946, + "grad_norm": 2.3941667079925537, + "learning_rate": 1e-06, + "loss": 0.967, + "mean_token_accuracy": 0.6980644464492798, + "num_tokens": 355079086.0, + "step": 14226 + }, + { + "epoch": 1.5623764550845596, + "grad_norm": 2.2814371585845947, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7236050367355347, + "num_tokens": 355105869.0, + "step": 14227 + }, + { + "epoch": 1.5624862727871733, + "grad_norm": 2.5554757118225098, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7236347198486328, + "num_tokens": 355125692.0, + "step": 14228 + }, + { + "epoch": 1.5625960904897869, + "grad_norm": 2.304375648498535, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7194550633430481, + "num_tokens": 355151502.0, + "step": 14229 + }, + { + "epoch": 1.5627059081924006, + "grad_norm": 2.2165067195892334, + "learning_rate": 1e-06, + "loss": 1.0076, + "mean_token_accuracy": 0.6957117915153503, + "num_tokens": 355179192.0, + "step": 14230 + }, + { + "epoch": 1.5628157258950144, + "grad_norm": 2.313948631286621, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7217922806739807, + "num_tokens": 355204615.0, + "step": 14231 + }, + { + "epoch": 
1.562925543597628, + "grad_norm": 2.3275344371795654, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7285192012786865, + "num_tokens": 355227647.0, + "step": 14232 + }, + { + "epoch": 1.5630353613002415, + "grad_norm": 2.32657790184021, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7204124927520752, + "num_tokens": 355251014.0, + "step": 14233 + }, + { + "epoch": 1.5631451790028552, + "grad_norm": 2.287409543991089, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7034944295883179, + "num_tokens": 355275683.0, + "step": 14234 + }, + { + "epoch": 1.563254996705469, + "grad_norm": 2.614830493927002, + "learning_rate": 1e-06, + "loss": 0.8302, + "mean_token_accuracy": 0.7486637830734253, + "num_tokens": 355294181.0, + "step": 14235 + }, + { + "epoch": 1.5633648144080827, + "grad_norm": 2.0929722785949707, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7129002809524536, + "num_tokens": 355323999.0, + "step": 14236 + }, + { + "epoch": 1.5634746321106963, + "grad_norm": 2.43416428565979, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7267663478851318, + "num_tokens": 355344728.0, + "step": 14237 + }, + { + "epoch": 1.5635844498133098, + "grad_norm": 2.1919970512390137, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7237380743026733, + "num_tokens": 355371678.0, + "step": 14238 + }, + { + "epoch": 1.5636942675159236, + "grad_norm": 2.3238582611083984, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.7026082277297974, + "num_tokens": 355398768.0, + "step": 14239 + }, + { + "epoch": 1.5638040852185373, + "grad_norm": 2.445173740386963, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7345269322395325, + "num_tokens": 355419498.0, + "step": 14240 + }, + { + "epoch": 1.5639139029211508, + "grad_norm": 2.1585750579833984, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 
0.7251055836677551, + "num_tokens": 355447231.0, + "step": 14241 + }, + { + "epoch": 1.5640237206237646, + "grad_norm": 2.074810266494751, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7009792327880859, + "num_tokens": 355477493.0, + "step": 14242 + }, + { + "epoch": 1.5641335383263781, + "grad_norm": 2.5328307151794434, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7262436151504517, + "num_tokens": 355499170.0, + "step": 14243 + }, + { + "epoch": 1.564243356028992, + "grad_norm": 2.0477447509765625, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7004847526550293, + "num_tokens": 355529484.0, + "step": 14244 + }, + { + "epoch": 1.5643531737316057, + "grad_norm": 2.0370843410491943, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.711118221282959, + "num_tokens": 355559842.0, + "step": 14245 + }, + { + "epoch": 1.5644629914342192, + "grad_norm": 2.1016428470611572, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7177379131317139, + "num_tokens": 355588343.0, + "step": 14246 + }, + { + "epoch": 1.5645728091368327, + "grad_norm": 2.189263343811035, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.742974042892456, + "num_tokens": 355613510.0, + "step": 14247 + }, + { + "epoch": 1.5646826268394465, + "grad_norm": 2.409722089767456, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7114635705947876, + "num_tokens": 355638082.0, + "step": 14248 + }, + { + "epoch": 1.5647924445420602, + "grad_norm": 2.2495229244232178, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7288516759872437, + "num_tokens": 355663585.0, + "step": 14249 + }, + { + "epoch": 1.564902262244674, + "grad_norm": 2.11814284324646, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7162113785743713, + "num_tokens": 355689836.0, + "step": 14250 + }, + { + "epoch": 1.5650120799472875, + "grad_norm": 
2.571450710296631, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7242521047592163, + "num_tokens": 355709520.0, + "step": 14251 + }, + { + "epoch": 1.565121897649901, + "grad_norm": 2.3916895389556885, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.708919882774353, + "num_tokens": 355732842.0, + "step": 14252 + }, + { + "epoch": 1.5652317153525148, + "grad_norm": 2.132120132446289, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7221295833587646, + "num_tokens": 355760657.0, + "step": 14253 + }, + { + "epoch": 1.5653415330551286, + "grad_norm": 2.1571826934814453, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7109878063201904, + "num_tokens": 355788501.0, + "step": 14254 + }, + { + "epoch": 1.565451350757742, + "grad_norm": 2.7990543842315674, + "learning_rate": 1e-06, + "loss": 0.7957, + "mean_token_accuracy": 0.7467960715293884, + "num_tokens": 355806013.0, + "step": 14255 + }, + { + "epoch": 1.5655611684603556, + "grad_norm": 2.608760356903076, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.6996319890022278, + "num_tokens": 355827602.0, + "step": 14256 + }, + { + "epoch": 1.5656709861629694, + "grad_norm": 2.00002121925354, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7176935076713562, + "num_tokens": 355861991.0, + "step": 14257 + }, + { + "epoch": 1.5657808038655832, + "grad_norm": 2.5218183994293213, + "learning_rate": 1e-06, + "loss": 0.8109, + "mean_token_accuracy": 0.7440797090530396, + "num_tokens": 355884236.0, + "step": 14258 + }, + { + "epoch": 1.565890621568197, + "grad_norm": 2.097344398498535, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7148803472518921, + "num_tokens": 355915793.0, + "step": 14259 + }, + { + "epoch": 1.5660004392708105, + "grad_norm": 2.2308712005615234, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7362228631973267, + "num_tokens": 
355938110.0, + "step": 14260 + }, + { + "epoch": 1.566110256973424, + "grad_norm": 2.1529242992401123, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.7079258561134338, + "num_tokens": 355967059.0, + "step": 14261 + }, + { + "epoch": 1.5662200746760377, + "grad_norm": 2.0785984992980957, + "learning_rate": 1e-06, + "loss": 0.9683, + "mean_token_accuracy": 0.7045360207557678, + "num_tokens": 355996549.0, + "step": 14262 + }, + { + "epoch": 1.5663298923786515, + "grad_norm": 2.320920944213867, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7099783420562744, + "num_tokens": 356020823.0, + "step": 14263 + }, + { + "epoch": 1.5664397100812653, + "grad_norm": 2.3351094722747803, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7169477343559265, + "num_tokens": 356043619.0, + "step": 14264 + }, + { + "epoch": 1.5665495277838788, + "grad_norm": 2.155222177505493, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7130855917930603, + "num_tokens": 356071800.0, + "step": 14265 + }, + { + "epoch": 1.5666593454864923, + "grad_norm": 2.1463587284088135, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7104403972625732, + "num_tokens": 356100003.0, + "step": 14266 + }, + { + "epoch": 1.566769163189106, + "grad_norm": 1.9094038009643555, + "learning_rate": 1e-06, + "loss": 0.9971, + "mean_token_accuracy": 0.6990903615951538, + "num_tokens": 356134202.0, + "step": 14267 + }, + { + "epoch": 1.5668789808917198, + "grad_norm": 2.2425520420074463, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7178515791893005, + "num_tokens": 356159587.0, + "step": 14268 + }, + { + "epoch": 1.5669887985943334, + "grad_norm": 2.2454216480255127, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7079693675041199, + "num_tokens": 356184321.0, + "step": 14269 + }, + { + "epoch": 1.567098616296947, + "grad_norm": 2.249680995941162, + "learning_rate": 
1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.7288135290145874, + "num_tokens": 356209432.0, + "step": 14270 + }, + { + "epoch": 1.5672084339995607, + "grad_norm": 2.0442676544189453, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.724685549736023, + "num_tokens": 356240887.0, + "step": 14271 + }, + { + "epoch": 1.5673182517021744, + "grad_norm": 2.269274950027466, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7201565504074097, + "num_tokens": 356264114.0, + "step": 14272 + }, + { + "epoch": 1.5674280694047882, + "grad_norm": 2.2949297428131104, + "learning_rate": 1e-06, + "loss": 0.872, + "mean_token_accuracy": 0.7277390360832214, + "num_tokens": 356289412.0, + "step": 14273 + }, + { + "epoch": 1.5675378871074017, + "grad_norm": 2.273922920227051, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 0.715694785118103, + "num_tokens": 356314293.0, + "step": 14274 + }, + { + "epoch": 1.5676477048100153, + "grad_norm": 2.3566055297851562, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7149339318275452, + "num_tokens": 356339515.0, + "step": 14275 + }, + { + "epoch": 1.567757522512629, + "grad_norm": 2.4359543323516846, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7341710329055786, + "num_tokens": 356360713.0, + "step": 14276 + }, + { + "epoch": 1.5678673402152428, + "grad_norm": 2.259002447128296, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7348189949989319, + "num_tokens": 356385986.0, + "step": 14277 + }, + { + "epoch": 1.5679771579178565, + "grad_norm": 2.0186917781829834, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7127771377563477, + "num_tokens": 356416516.0, + "step": 14278 + }, + { + "epoch": 1.56808697562047, + "grad_norm": 2.4240317344665527, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7215521335601807, + "num_tokens": 356440045.0, + "step": 14279 + }, + { + 
"epoch": 1.5681967933230836, + "grad_norm": 2.1572353839874268, + "learning_rate": 1e-06, + "loss": 1.0484, + "mean_token_accuracy": 0.684798002243042, + "num_tokens": 356470400.0, + "step": 14280 + }, + { + "epoch": 1.5683066110256974, + "grad_norm": 2.133819341659546, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7233166694641113, + "num_tokens": 356497818.0, + "step": 14281 + }, + { + "epoch": 1.568416428728311, + "grad_norm": 2.3646440505981445, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7097226977348328, + "num_tokens": 356522795.0, + "step": 14282 + }, + { + "epoch": 1.5685262464309246, + "grad_norm": 1.9380543231964111, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.721638560295105, + "num_tokens": 356553468.0, + "step": 14283 + }, + { + "epoch": 1.5686360641335382, + "grad_norm": 2.109800338745117, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7104321718215942, + "num_tokens": 356580040.0, + "step": 14284 + }, + { + "epoch": 1.568745881836152, + "grad_norm": 2.445136308670044, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7323940992355347, + "num_tokens": 356601422.0, + "step": 14285 + }, + { + "epoch": 1.5688556995387657, + "grad_norm": 2.248854875564575, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7251744866371155, + "num_tokens": 356624313.0, + "step": 14286 + }, + { + "epoch": 1.5689655172413794, + "grad_norm": 2.2740590572357178, + "learning_rate": 1e-06, + "loss": 0.778, + "mean_token_accuracy": 0.7539624571800232, + "num_tokens": 356646944.0, + "step": 14287 + }, + { + "epoch": 1.569075334943993, + "grad_norm": 2.224961519241333, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7273603081703186, + "num_tokens": 356671341.0, + "step": 14288 + }, + { + "epoch": 1.5691851526466065, + "grad_norm": 2.1037657260894775, + "learning_rate": 1e-06, + "loss": 0.9602, + 
"mean_token_accuracy": 0.7122542858123779, + "num_tokens": 356699155.0, + "step": 14289 + }, + { + "epoch": 1.5692949703492203, + "grad_norm": 2.185702085494995, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7314258813858032, + "num_tokens": 356723706.0, + "step": 14290 + }, + { + "epoch": 1.569404788051834, + "grad_norm": 1.9927544593811035, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7186349630355835, + "num_tokens": 356754764.0, + "step": 14291 + }, + { + "epoch": 1.5695146057544476, + "grad_norm": 2.215459108352661, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7300718426704407, + "num_tokens": 356780153.0, + "step": 14292 + }, + { + "epoch": 1.5696244234570613, + "grad_norm": 2.4106035232543945, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7374488115310669, + "num_tokens": 356803466.0, + "step": 14293 + }, + { + "epoch": 1.5697342411596749, + "grad_norm": 2.110527515411377, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7250148057937622, + "num_tokens": 356831618.0, + "step": 14294 + }, + { + "epoch": 1.5698440588622886, + "grad_norm": 2.4802322387695312, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.713817834854126, + "num_tokens": 356852467.0, + "step": 14295 + }, + { + "epoch": 1.5699538765649024, + "grad_norm": 2.367879867553711, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7459145784378052, + "num_tokens": 356874038.0, + "step": 14296 + }, + { + "epoch": 1.570063694267516, + "grad_norm": 2.0670130252838135, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.700329065322876, + "num_tokens": 356905218.0, + "step": 14297 + }, + { + "epoch": 1.5701735119701294, + "grad_norm": 2.3571481704711914, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.732027530670166, + "num_tokens": 356927758.0, + "step": 14298 + }, + { + "epoch": 1.5702833296727432, + 
"grad_norm": 2.1689505577087402, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7059222459793091, + "num_tokens": 356955890.0, + "step": 14299 + }, + { + "epoch": 1.570393147375357, + "grad_norm": 2.2327165603637695, + "learning_rate": 1e-06, + "loss": 0.9913, + "mean_token_accuracy": 0.7042827606201172, + "num_tokens": 356984699.0, + "step": 14300 + }, + { + "epoch": 1.5705029650779707, + "grad_norm": 2.2130775451660156, + "learning_rate": 1e-06, + "loss": 1.0231, + "mean_token_accuracy": 0.6923394203186035, + "num_tokens": 357011755.0, + "step": 14301 + }, + { + "epoch": 1.5706127827805842, + "grad_norm": 2.452310562133789, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.711045503616333, + "num_tokens": 357034306.0, + "step": 14302 + }, + { + "epoch": 1.5707226004831978, + "grad_norm": 2.378770351409912, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7411973476409912, + "num_tokens": 357056455.0, + "step": 14303 + }, + { + "epoch": 1.5708324181858115, + "grad_norm": 2.030353546142578, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7093172669410706, + "num_tokens": 357086479.0, + "step": 14304 + }, + { + "epoch": 1.5709422358884253, + "grad_norm": 2.2406933307647705, + "learning_rate": 1e-06, + "loss": 0.8059, + "mean_token_accuracy": 0.7412030696868896, + "num_tokens": 357109382.0, + "step": 14305 + }, + { + "epoch": 1.5710520535910388, + "grad_norm": 2.1169235706329346, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7056402564048767, + "num_tokens": 357139004.0, + "step": 14306 + }, + { + "epoch": 1.5711618712936526, + "grad_norm": 2.2866322994232178, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7031140327453613, + "num_tokens": 357164230.0, + "step": 14307 + }, + { + "epoch": 1.5712716889962661, + "grad_norm": 2.1255712509155273, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.7449458241462708, + 
"num_tokens": 357188672.0, + "step": 14308 + }, + { + "epoch": 1.5713815066988799, + "grad_norm": 2.2912182807922363, + "learning_rate": 1e-06, + "loss": 0.8052, + "mean_token_accuracy": 0.7414706349372864, + "num_tokens": 357211481.0, + "step": 14309 + }, + { + "epoch": 1.5714913244014936, + "grad_norm": 2.1730377674102783, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7307759523391724, + "num_tokens": 357238098.0, + "step": 14310 + }, + { + "epoch": 1.5716011421041072, + "grad_norm": 2.045048475265503, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7082263231277466, + "num_tokens": 357267709.0, + "step": 14311 + }, + { + "epoch": 1.5717109598067207, + "grad_norm": 2.1549644470214844, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7164354920387268, + "num_tokens": 357294624.0, + "step": 14312 + }, + { + "epoch": 1.5718207775093345, + "grad_norm": 2.374913215637207, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7197103500366211, + "num_tokens": 357317038.0, + "step": 14313 + }, + { + "epoch": 1.5719305952119482, + "grad_norm": 2.6670796871185303, + "learning_rate": 1e-06, + "loss": 0.7804, + "mean_token_accuracy": 0.7474307417869568, + "num_tokens": 357335763.0, + "step": 14314 + }, + { + "epoch": 1.572040412914562, + "grad_norm": 2.3239943981170654, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7441171407699585, + "num_tokens": 357358924.0, + "step": 14315 + }, + { + "epoch": 1.5721502306171755, + "grad_norm": 2.0607686042785645, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7172306180000305, + "num_tokens": 357389626.0, + "step": 14316 + }, + { + "epoch": 1.572260048319789, + "grad_norm": 2.27116060256958, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.695080578327179, + "num_tokens": 357415487.0, + "step": 14317 + }, + { + "epoch": 1.5723698660224028, + "grad_norm": 2.325662612915039, + 
"learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7033389210700989, + "num_tokens": 357439985.0, + "step": 14318 + }, + { + "epoch": 1.5724796837250166, + "grad_norm": 2.2235090732574463, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7056208252906799, + "num_tokens": 357467597.0, + "step": 14319 + }, + { + "epoch": 1.57258950142763, + "grad_norm": 2.241027593612671, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7214433550834656, + "num_tokens": 357493712.0, + "step": 14320 + }, + { + "epoch": 1.5726993191302436, + "grad_norm": 2.0514190196990967, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7018780708312988, + "num_tokens": 357524716.0, + "step": 14321 + }, + { + "epoch": 1.5728091368328574, + "grad_norm": 2.527693748474121, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7182668447494507, + "num_tokens": 357545617.0, + "step": 14322 + }, + { + "epoch": 1.5729189545354711, + "grad_norm": 2.116811752319336, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7153480648994446, + "num_tokens": 357574359.0, + "step": 14323 + }, + { + "epoch": 1.573028772238085, + "grad_norm": 2.1863420009613037, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7152836322784424, + "num_tokens": 357601094.0, + "step": 14324 + }, + { + "epoch": 1.5731385899406984, + "grad_norm": 2.4954171180725098, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7183663845062256, + "num_tokens": 357622679.0, + "step": 14325 + }, + { + "epoch": 1.573248407643312, + "grad_norm": 2.1849870681762695, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.72245854139328, + "num_tokens": 357648263.0, + "step": 14326 + }, + { + "epoch": 1.5733582253459257, + "grad_norm": 1.951695442199707, + "learning_rate": 1e-06, + "loss": 0.9528, + "mean_token_accuracy": 0.7035865783691406, + "num_tokens": 357680610.0, + "step": 
14327 + }, + { + "epoch": 1.5734680430485395, + "grad_norm": 2.404202461242676, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7394486665725708, + "num_tokens": 357702885.0, + "step": 14328 + }, + { + "epoch": 1.5735778607511532, + "grad_norm": 2.3166871070861816, + "learning_rate": 1e-06, + "loss": 0.9795, + "mean_token_accuracy": 0.6981036067008972, + "num_tokens": 357727293.0, + "step": 14329 + }, + { + "epoch": 1.5736876784537668, + "grad_norm": 2.1894614696502686, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7026113271713257, + "num_tokens": 357753155.0, + "step": 14330 + }, + { + "epoch": 1.5737974961563803, + "grad_norm": 2.653003215789795, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7405174970626831, + "num_tokens": 357771311.0, + "step": 14331 + }, + { + "epoch": 1.573907313858994, + "grad_norm": 2.157042980194092, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7190858125686646, + "num_tokens": 357796339.0, + "step": 14332 + }, + { + "epoch": 1.5740171315616078, + "grad_norm": 2.3519580364227295, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7142891883850098, + "num_tokens": 357819618.0, + "step": 14333 + }, + { + "epoch": 1.5741269492642214, + "grad_norm": 2.228060245513916, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6868472695350647, + "num_tokens": 357847050.0, + "step": 14334 + }, + { + "epoch": 1.574236766966835, + "grad_norm": 2.1508333683013916, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6996673345565796, + "num_tokens": 357873116.0, + "step": 14335 + }, + { + "epoch": 1.5743465846694487, + "grad_norm": 2.1859703063964844, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7245720028877258, + "num_tokens": 357899286.0, + "step": 14336 + }, + { + "epoch": 1.5744564023720624, + "grad_norm": 2.3351516723632812, + "learning_rate": 1e-06, + "loss": 0.9705, 
+ "mean_token_accuracy": 0.6993361711502075, + "num_tokens": 357923230.0, + "step": 14337 + }, + { + "epoch": 1.5745662200746762, + "grad_norm": 2.589784860610962, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7257899641990662, + "num_tokens": 357942693.0, + "step": 14338 + }, + { + "epoch": 1.5746760377772897, + "grad_norm": 2.216259717941284, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7240777015686035, + "num_tokens": 357967277.0, + "step": 14339 + }, + { + "epoch": 1.5747858554799032, + "grad_norm": 2.118598461151123, + "learning_rate": 1e-06, + "loss": 0.96, + "mean_token_accuracy": 0.7066617012023926, + "num_tokens": 357994570.0, + "step": 14340 + }, + { + "epoch": 1.574895673182517, + "grad_norm": 2.1305105686187744, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7317363619804382, + "num_tokens": 358021917.0, + "step": 14341 + }, + { + "epoch": 1.5750054908851308, + "grad_norm": 2.5132131576538086, + "learning_rate": 1e-06, + "loss": 0.8606, + "mean_token_accuracy": 0.7399488687515259, + "num_tokens": 358044010.0, + "step": 14342 + }, + { + "epoch": 1.5751153085877443, + "grad_norm": 2.527977705001831, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7199198007583618, + "num_tokens": 358064868.0, + "step": 14343 + }, + { + "epoch": 1.575225126290358, + "grad_norm": 2.089482307434082, + "learning_rate": 1e-06, + "loss": 1.0153, + "mean_token_accuracy": 0.6896427869796753, + "num_tokens": 358096086.0, + "step": 14344 + }, + { + "epoch": 1.5753349439929716, + "grad_norm": 2.134093761444092, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7322002649307251, + "num_tokens": 358123004.0, + "step": 14345 + }, + { + "epoch": 1.5754447616955853, + "grad_norm": 2.543337345123291, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7298277616500854, + "num_tokens": 358142258.0, + "step": 14346 + }, + { + "epoch": 1.575554579398199, + 
"grad_norm": 2.2126290798187256, + "learning_rate": 1e-06, + "loss": 0.9937, + "mean_token_accuracy": 0.7004445791244507, + "num_tokens": 358169140.0, + "step": 14347 + }, + { + "epoch": 1.5756643971008126, + "grad_norm": 2.429060459136963, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7285128831863403, + "num_tokens": 358191347.0, + "step": 14348 + }, + { + "epoch": 1.5757742148034262, + "grad_norm": 2.101259469985962, + "learning_rate": 1e-06, + "loss": 1.0339, + "mean_token_accuracy": 0.699614405632019, + "num_tokens": 358225858.0, + "step": 14349 + }, + { + "epoch": 1.57588403250604, + "grad_norm": 2.419417142868042, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7259858846664429, + "num_tokens": 358247921.0, + "step": 14350 + }, + { + "epoch": 1.5759938502086537, + "grad_norm": 2.3861191272735596, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7316096425056458, + "num_tokens": 358270501.0, + "step": 14351 + }, + { + "epoch": 1.5761036679112674, + "grad_norm": 2.229743719100952, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7016003727912903, + "num_tokens": 358297958.0, + "step": 14352 + }, + { + "epoch": 1.576213485613881, + "grad_norm": 2.5889246463775635, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7235099077224731, + "num_tokens": 358317836.0, + "step": 14353 + }, + { + "epoch": 1.5763233033164945, + "grad_norm": 2.2768540382385254, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7201187610626221, + "num_tokens": 358341626.0, + "step": 14354 + }, + { + "epoch": 1.5764331210191083, + "grad_norm": 2.395890474319458, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7401122450828552, + "num_tokens": 358363486.0, + "step": 14355 + }, + { + "epoch": 1.576542938721722, + "grad_norm": 2.1883256435394287, + "learning_rate": 1e-06, + "loss": 1.0049, + "mean_token_accuracy": 0.6915252208709717, + 
"num_tokens": 358393132.0, + "step": 14356 + }, + { + "epoch": 1.5766527564243356, + "grad_norm": 2.291066884994507, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7447496056556702, + "num_tokens": 358416018.0, + "step": 14357 + }, + { + "epoch": 1.5767625741269493, + "grad_norm": 2.2302944660186768, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7399100065231323, + "num_tokens": 358442114.0, + "step": 14358 + }, + { + "epoch": 1.5768723918295628, + "grad_norm": 2.128471612930298, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7229777574539185, + "num_tokens": 358470368.0, + "step": 14359 + }, + { + "epoch": 1.5769822095321766, + "grad_norm": 2.2540953159332275, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7020825147628784, + "num_tokens": 358495884.0, + "step": 14360 + }, + { + "epoch": 1.5770920272347904, + "grad_norm": 2.214362621307373, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7120643854141235, + "num_tokens": 358522771.0, + "step": 14361 + }, + { + "epoch": 1.577201844937404, + "grad_norm": 2.3648414611816406, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.7208578586578369, + "num_tokens": 358546845.0, + "step": 14362 + }, + { + "epoch": 1.5773116626400174, + "grad_norm": 2.012153148651123, + "learning_rate": 1e-06, + "loss": 0.977, + "mean_token_accuracy": 0.7020978331565857, + "num_tokens": 358578747.0, + "step": 14363 + }, + { + "epoch": 1.5774214803426312, + "grad_norm": 2.311133623123169, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7376760244369507, + "num_tokens": 358604294.0, + "step": 14364 + }, + { + "epoch": 1.577531298045245, + "grad_norm": 2.1566004753112793, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.6983771324157715, + "num_tokens": 358632932.0, + "step": 14365 + }, + { + "epoch": 1.5776411157478587, + "grad_norm": 1.983901023864746, + 
"learning_rate": 1e-06, + "loss": 0.9954, + "mean_token_accuracy": 0.6978722810745239, + "num_tokens": 358664731.0, + "step": 14366 + }, + { + "epoch": 1.5777509334504722, + "grad_norm": 2.2212204933166504, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7127798795700073, + "num_tokens": 358691760.0, + "step": 14367 + }, + { + "epoch": 1.5778607511530858, + "grad_norm": 2.41732120513916, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.731153666973114, + "num_tokens": 358712819.0, + "step": 14368 + }, + { + "epoch": 1.5779705688556995, + "grad_norm": 2.0480759143829346, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7149808406829834, + "num_tokens": 358742203.0, + "step": 14369 + }, + { + "epoch": 1.5780803865583133, + "grad_norm": 2.3168914318084717, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.707221269607544, + "num_tokens": 358766747.0, + "step": 14370 + }, + { + "epoch": 1.5781902042609268, + "grad_norm": 2.1920409202575684, + "learning_rate": 1e-06, + "loss": 0.9286, + "mean_token_accuracy": 0.7093936204910278, + "num_tokens": 358793956.0, + "step": 14371 + }, + { + "epoch": 1.5783000219635406, + "grad_norm": 2.2700798511505127, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7353862524032593, + "num_tokens": 358818194.0, + "step": 14372 + }, + { + "epoch": 1.5784098396661541, + "grad_norm": 2.3241348266601562, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7195114493370056, + "num_tokens": 358841066.0, + "step": 14373 + }, + { + "epoch": 1.5785196573687679, + "grad_norm": 2.198103189468384, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7079036831855774, + "num_tokens": 358867334.0, + "step": 14374 + }, + { + "epoch": 1.5786294750713816, + "grad_norm": 2.5928561687469482, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7447444200515747, + "num_tokens": 358886871.0, + 
"step": 14375 + }, + { + "epoch": 1.5787392927739952, + "grad_norm": 2.169797897338867, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7444981932640076, + "num_tokens": 358913064.0, + "step": 14376 + }, + { + "epoch": 1.5788491104766087, + "grad_norm": 2.340461254119873, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.7071800231933594, + "num_tokens": 358935659.0, + "step": 14377 + }, + { + "epoch": 1.5789589281792225, + "grad_norm": 2.191433906555176, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7038266062736511, + "num_tokens": 358961908.0, + "step": 14378 + }, + { + "epoch": 1.5790687458818362, + "grad_norm": 2.130399703979492, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7123816013336182, + "num_tokens": 358990058.0, + "step": 14379 + }, + { + "epoch": 1.57917856358445, + "grad_norm": 2.290530204772949, + "learning_rate": 1e-06, + "loss": 0.782, + "mean_token_accuracy": 0.74803626537323, + "num_tokens": 359012294.0, + "step": 14380 + }, + { + "epoch": 1.5792883812870635, + "grad_norm": 2.187281608581543, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7111427783966064, + "num_tokens": 359037744.0, + "step": 14381 + }, + { + "epoch": 1.579398198989677, + "grad_norm": 2.369635581970215, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7249530553817749, + "num_tokens": 359059412.0, + "step": 14382 + }, + { + "epoch": 1.5795080166922908, + "grad_norm": 2.267932415008545, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7122769951820374, + "num_tokens": 359084471.0, + "step": 14383 + }, + { + "epoch": 1.5796178343949046, + "grad_norm": 2.0561280250549316, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.725515604019165, + "num_tokens": 359111594.0, + "step": 14384 + }, + { + "epoch": 1.579727652097518, + "grad_norm": 2.2074434757232666, + "learning_rate": 1e-06, + "loss": 0.8959, + 
"mean_token_accuracy": 0.7199990749359131, + "num_tokens": 359137273.0, + "step": 14385 + }, + { + "epoch": 1.5798374698001316, + "grad_norm": 2.430917501449585, + "learning_rate": 1e-06, + "loss": 1.0432, + "mean_token_accuracy": 0.6789597868919373, + "num_tokens": 359162510.0, + "step": 14386 + }, + { + "epoch": 1.5799472875027454, + "grad_norm": 2.2491214275360107, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7145309448242188, + "num_tokens": 359187610.0, + "step": 14387 + }, + { + "epoch": 1.5800571052053591, + "grad_norm": 2.8703293800354004, + "learning_rate": 1e-06, + "loss": 0.8106, + "mean_token_accuracy": 0.7441340684890747, + "num_tokens": 359203619.0, + "step": 14388 + }, + { + "epoch": 1.580166922907973, + "grad_norm": 2.144649028778076, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.729174017906189, + "num_tokens": 359228188.0, + "step": 14389 + }, + { + "epoch": 1.5802767406105864, + "grad_norm": 2.166166067123413, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7020935416221619, + "num_tokens": 359256030.0, + "step": 14390 + }, + { + "epoch": 1.5803865583132, + "grad_norm": 2.1268153190612793, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.7064980268478394, + "num_tokens": 359285599.0, + "step": 14391 + }, + { + "epoch": 1.5804963760158137, + "grad_norm": 2.182159423828125, + "learning_rate": 1e-06, + "loss": 1.0102, + "mean_token_accuracy": 0.6837294101715088, + "num_tokens": 359315043.0, + "step": 14392 + }, + { + "epoch": 1.5806061937184275, + "grad_norm": 2.1994807720184326, + "learning_rate": 1e-06, + "loss": 0.855, + "mean_token_accuracy": 0.7386071085929871, + "num_tokens": 359340817.0, + "step": 14393 + }, + { + "epoch": 1.5807160114210412, + "grad_norm": 2.67612361907959, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7200934886932373, + "num_tokens": 359361262.0, + "step": 14394 + }, + { + "epoch": 1.5808258291236548, + 
"grad_norm": 2.8028132915496826, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7385499477386475, + "num_tokens": 359379919.0, + "step": 14395 + }, + { + "epoch": 1.5809356468262683, + "grad_norm": 2.3323967456817627, + "learning_rate": 1e-06, + "loss": 0.9249, + "mean_token_accuracy": 0.7125698328018188, + "num_tokens": 359403937.0, + "step": 14396 + }, + { + "epoch": 1.581045464528882, + "grad_norm": 2.532632350921631, + "learning_rate": 1e-06, + "loss": 0.9881, + "mean_token_accuracy": 0.7139841318130493, + "num_tokens": 359426864.0, + "step": 14397 + }, + { + "epoch": 1.5811552822314958, + "grad_norm": 2.3053605556488037, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.721989095211029, + "num_tokens": 359452345.0, + "step": 14398 + }, + { + "epoch": 1.5812650999341094, + "grad_norm": 2.2279162406921387, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7089844942092896, + "num_tokens": 359476659.0, + "step": 14399 + }, + { + "epoch": 1.5813749176367229, + "grad_norm": 2.5325639247894287, + "learning_rate": 1e-06, + "loss": 0.8395, + "mean_token_accuracy": 0.733649492263794, + "num_tokens": 359495813.0, + "step": 14400 + }, + { + "epoch": 1.5814847353393366, + "grad_norm": 2.2238314151763916, + "learning_rate": 1e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7318872213363647, + "num_tokens": 359520430.0, + "step": 14401 + }, + { + "epoch": 1.5815945530419504, + "grad_norm": 2.3523316383361816, + "learning_rate": 1e-06, + "loss": 0.8763, + "mean_token_accuracy": 0.7242254614830017, + "num_tokens": 359545824.0, + "step": 14402 + }, + { + "epoch": 1.5817043707445642, + "grad_norm": 2.6448447704315186, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7176289558410645, + "num_tokens": 359566248.0, + "step": 14403 + }, + { + "epoch": 1.5818141884471777, + "grad_norm": 2.0694527626037598, + "learning_rate": 1e-06, + "loss": 0.89, + "mean_token_accuracy": 0.7225146293640137, + 
"num_tokens": 359595170.0, + "step": 14404 + }, + { + "epoch": 1.5819240061497912, + "grad_norm": 2.118588924407959, + "learning_rate": 1e-06, + "loss": 1.017, + "mean_token_accuracy": 0.698357105255127, + "num_tokens": 359623974.0, + "step": 14405 + }, + { + "epoch": 1.582033823852405, + "grad_norm": 2.2104122638702393, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7260480523109436, + "num_tokens": 359648960.0, + "step": 14406 + }, + { + "epoch": 1.5821436415550187, + "grad_norm": 2.1009018421173096, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.724232017993927, + "num_tokens": 359675676.0, + "step": 14407 + }, + { + "epoch": 1.5822534592576323, + "grad_norm": 2.0723822116851807, + "learning_rate": 1e-06, + "loss": 0.795, + "mean_token_accuracy": 0.7479815483093262, + "num_tokens": 359703391.0, + "step": 14408 + }, + { + "epoch": 1.582363276960246, + "grad_norm": 2.1127445697784424, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7388260364532471, + "num_tokens": 359732523.0, + "step": 14409 + }, + { + "epoch": 1.5824730946628596, + "grad_norm": 2.131382465362549, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.70546954870224, + "num_tokens": 359762462.0, + "step": 14410 + }, + { + "epoch": 1.5825829123654733, + "grad_norm": 2.2798752784729004, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7171179056167603, + "num_tokens": 359786733.0, + "step": 14411 + }, + { + "epoch": 1.582692730068087, + "grad_norm": 2.178347110748291, + "learning_rate": 1e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7525045275688171, + "num_tokens": 359812020.0, + "step": 14412 + }, + { + "epoch": 1.5828025477707006, + "grad_norm": 2.322709798812866, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7221883535385132, + "num_tokens": 359835182.0, + "step": 14413 + }, + { + "epoch": 1.5829123654733142, + "grad_norm": 2.055143117904663, + 
"learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7209601402282715, + "num_tokens": 359863761.0, + "step": 14414 + }, + { + "epoch": 1.583022183175928, + "grad_norm": 2.166576862335205, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7117385268211365, + "num_tokens": 359891705.0, + "step": 14415 + }, + { + "epoch": 1.5831320008785417, + "grad_norm": 2.509333610534668, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7215933799743652, + "num_tokens": 359913390.0, + "step": 14416 + }, + { + "epoch": 1.5832418185811554, + "grad_norm": 2.2850892543792725, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7175990343093872, + "num_tokens": 359938150.0, + "step": 14417 + }, + { + "epoch": 1.583351636283769, + "grad_norm": 2.202909231185913, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7104004621505737, + "num_tokens": 359964079.0, + "step": 14418 + }, + { + "epoch": 1.5834614539863825, + "grad_norm": 2.6470160484313965, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7124135494232178, + "num_tokens": 359984482.0, + "step": 14419 + }, + { + "epoch": 1.5835712716889963, + "grad_norm": 2.351639747619629, + "learning_rate": 1e-06, + "loss": 0.943, + "mean_token_accuracy": 0.706031084060669, + "num_tokens": 360010285.0, + "step": 14420 + }, + { + "epoch": 1.58368108939161, + "grad_norm": 2.6166164875030518, + "learning_rate": 1e-06, + "loss": 0.7975, + "mean_token_accuracy": 0.7453829646110535, + "num_tokens": 360029078.0, + "step": 14421 + }, + { + "epoch": 1.5837909070942235, + "grad_norm": 2.454265832901001, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7330588102340698, + "num_tokens": 360050465.0, + "step": 14422 + }, + { + "epoch": 1.5839007247968373, + "grad_norm": 2.6636364459991455, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.726125180721283, + "num_tokens": 360069740.0, + "step": 14423 
+ }, + { + "epoch": 1.5840105424994508, + "grad_norm": 2.477764129638672, + "learning_rate": 1e-06, + "loss": 0.8603, + "mean_token_accuracy": 0.7354397773742676, + "num_tokens": 360089525.0, + "step": 14424 + }, + { + "epoch": 1.5841203602020646, + "grad_norm": 2.3377552032470703, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7202888131141663, + "num_tokens": 360111685.0, + "step": 14425 + }, + { + "epoch": 1.5842301779046783, + "grad_norm": 2.2309067249298096, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7189119458198547, + "num_tokens": 360137016.0, + "step": 14426 + }, + { + "epoch": 1.5843399956072919, + "grad_norm": 1.9863537549972534, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7178162336349487, + "num_tokens": 360168521.0, + "step": 14427 + }, + { + "epoch": 1.5844498133099054, + "grad_norm": 2.746399402618408, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.718323826789856, + "num_tokens": 360187846.0, + "step": 14428 + }, + { + "epoch": 1.5845596310125192, + "grad_norm": 1.9677318334579468, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.7116541862487793, + "num_tokens": 360218273.0, + "step": 14429 + }, + { + "epoch": 1.584669448715133, + "grad_norm": 2.2928168773651123, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7248529195785522, + "num_tokens": 360241822.0, + "step": 14430 + }, + { + "epoch": 1.5847792664177467, + "grad_norm": 2.6131064891815186, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6879383325576782, + "num_tokens": 360263655.0, + "step": 14431 + }, + { + "epoch": 1.5848890841203602, + "grad_norm": 2.224926233291626, + "learning_rate": 1e-06, + "loss": 0.974, + "mean_token_accuracy": 0.6950538158416748, + "num_tokens": 360293146.0, + "step": 14432 + }, + { + "epoch": 1.5849989018229738, + "grad_norm": 2.3640058040618896, + "learning_rate": 1e-06, + "loss": 0.9277, + 
"mean_token_accuracy": 0.7168091535568237, + "num_tokens": 360317329.0, + "step": 14433 + }, + { + "epoch": 1.5851087195255875, + "grad_norm": 2.792564630508423, + "learning_rate": 1e-06, + "loss": 0.7637, + "mean_token_accuracy": 0.751825213432312, + "num_tokens": 360334313.0, + "step": 14434 + }, + { + "epoch": 1.5852185372282013, + "grad_norm": 2.1488518714904785, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7357227802276611, + "num_tokens": 360359280.0, + "step": 14435 + }, + { + "epoch": 1.5853283549308148, + "grad_norm": 2.3916661739349365, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7239694595336914, + "num_tokens": 360381996.0, + "step": 14436 + }, + { + "epoch": 1.5854381726334283, + "grad_norm": 2.7985405921936035, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7320571541786194, + "num_tokens": 360400550.0, + "step": 14437 + }, + { + "epoch": 1.585547990336042, + "grad_norm": 2.6578755378723145, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7262501120567322, + "num_tokens": 360428760.0, + "step": 14438 + }, + { + "epoch": 1.5856578080386559, + "grad_norm": 2.1307592391967773, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7374398708343506, + "num_tokens": 360455590.0, + "step": 14439 + }, + { + "epoch": 1.5857676257412696, + "grad_norm": 2.36987042427063, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7107144594192505, + "num_tokens": 360481288.0, + "step": 14440 + }, + { + "epoch": 1.5858774434438832, + "grad_norm": 2.394479274749756, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7197428941726685, + "num_tokens": 360505311.0, + "step": 14441 + }, + { + "epoch": 1.5859872611464967, + "grad_norm": 2.2270073890686035, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7105898261070251, + "num_tokens": 360531311.0, + "step": 14442 + }, + { + "epoch": 
1.5860970788491104, + "grad_norm": 2.2319130897521973, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7191375494003296, + "num_tokens": 360555906.0, + "step": 14443 + }, + { + "epoch": 1.5862068965517242, + "grad_norm": 2.385883092880249, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7098181247711182, + "num_tokens": 360578529.0, + "step": 14444 + }, + { + "epoch": 1.586316714254338, + "grad_norm": 2.3103418350219727, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7250230312347412, + "num_tokens": 360602250.0, + "step": 14445 + }, + { + "epoch": 1.5864265319569515, + "grad_norm": 2.0558385848999023, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.722170352935791, + "num_tokens": 360632163.0, + "step": 14446 + }, + { + "epoch": 1.586536349659565, + "grad_norm": 2.365488052368164, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7084361910820007, + "num_tokens": 360656178.0, + "step": 14447 + }, + { + "epoch": 1.5866461673621788, + "grad_norm": 2.536853075027466, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7170044183731079, + "num_tokens": 360677122.0, + "step": 14448 + }, + { + "epoch": 1.5867559850647925, + "grad_norm": 2.28898549079895, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.723484992980957, + "num_tokens": 360701172.0, + "step": 14449 + }, + { + "epoch": 1.586865802767406, + "grad_norm": 2.380540609359741, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7479386925697327, + "num_tokens": 360722903.0, + "step": 14450 + }, + { + "epoch": 1.5869756204700196, + "grad_norm": 2.3623592853546143, + "learning_rate": 1e-06, + "loss": 0.8397, + "mean_token_accuracy": 0.7411639094352722, + "num_tokens": 360745738.0, + "step": 14451 + }, + { + "epoch": 1.5870854381726334, + "grad_norm": 2.7043561935424805, + "learning_rate": 1e-06, + "loss": 0.8697, + "mean_token_accuracy": 
0.7299615740776062, + "num_tokens": 360763829.0, + "step": 14452 + }, + { + "epoch": 1.5871952558752471, + "grad_norm": 2.4502153396606445, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7168915271759033, + "num_tokens": 360786694.0, + "step": 14453 + }, + { + "epoch": 1.5873050735778609, + "grad_norm": 2.394345760345459, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7171010971069336, + "num_tokens": 360811634.0, + "step": 14454 + }, + { + "epoch": 1.5874148912804744, + "grad_norm": 2.3013012409210205, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7352743148803711, + "num_tokens": 360835001.0, + "step": 14455 + }, + { + "epoch": 1.587524708983088, + "grad_norm": 2.229623556137085, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7007693648338318, + "num_tokens": 360860701.0, + "step": 14456 + }, + { + "epoch": 1.5876345266857017, + "grad_norm": 2.1563687324523926, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7328805923461914, + "num_tokens": 360887719.0, + "step": 14457 + }, + { + "epoch": 1.5877443443883155, + "grad_norm": 2.2145440578460693, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.7049784064292908, + "num_tokens": 360913876.0, + "step": 14458 + }, + { + "epoch": 1.5878541620909292, + "grad_norm": 2.142518997192383, + "learning_rate": 1e-06, + "loss": 0.9956, + "mean_token_accuracy": 0.6993370056152344, + "num_tokens": 360942776.0, + "step": 14459 + }, + { + "epoch": 1.5879639797935428, + "grad_norm": 2.274631977081299, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7192644476890564, + "num_tokens": 360968608.0, + "step": 14460 + }, + { + "epoch": 1.5880737974961563, + "grad_norm": 2.160809278488159, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7102924585342407, + "num_tokens": 360994614.0, + "step": 14461 + }, + { + "epoch": 1.58818361519877, + "grad_norm": 
2.2255709171295166, + "learning_rate": 1e-06, + "loss": 0.9903, + "mean_token_accuracy": 0.6955488920211792, + "num_tokens": 361021369.0, + "step": 14462 + }, + { + "epoch": 1.5882934329013838, + "grad_norm": 2.2610599994659424, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7183367609977722, + "num_tokens": 361048206.0, + "step": 14463 + }, + { + "epoch": 1.5884032506039973, + "grad_norm": 2.641049385070801, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.734950065612793, + "num_tokens": 361068132.0, + "step": 14464 + }, + { + "epoch": 1.5885130683066109, + "grad_norm": 2.2226524353027344, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7509320974349976, + "num_tokens": 361092920.0, + "step": 14465 + }, + { + "epoch": 1.5886228860092246, + "grad_norm": 2.2412781715393066, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7095182538032532, + "num_tokens": 361119140.0, + "step": 14466 + }, + { + "epoch": 1.5887327037118384, + "grad_norm": 2.116771936416626, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7128193378448486, + "num_tokens": 361146899.0, + "step": 14467 + }, + { + "epoch": 1.5888425214144521, + "grad_norm": 2.2922937870025635, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7420887351036072, + "num_tokens": 361170004.0, + "step": 14468 + }, + { + "epoch": 1.5889523391170657, + "grad_norm": 2.354546308517456, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7210960388183594, + "num_tokens": 361191584.0, + "step": 14469 + }, + { + "epoch": 1.5890621568196792, + "grad_norm": 2.1594202518463135, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7249940037727356, + "num_tokens": 361218752.0, + "step": 14470 + }, + { + "epoch": 1.589171974522293, + "grad_norm": 1.957811951637268, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7180495262145996, + "num_tokens": 
361251064.0, + "step": 14471 + }, + { + "epoch": 1.5892817922249067, + "grad_norm": 2.273864984512329, + "learning_rate": 1e-06, + "loss": 0.8068, + "mean_token_accuracy": 0.7352930307388306, + "num_tokens": 361274311.0, + "step": 14472 + }, + { + "epoch": 1.5893916099275203, + "grad_norm": 2.5169849395751953, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.730751633644104, + "num_tokens": 361293145.0, + "step": 14473 + }, + { + "epoch": 1.589501427630134, + "grad_norm": 2.0010406970977783, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7152325510978699, + "num_tokens": 361322993.0, + "step": 14474 + }, + { + "epoch": 1.5896112453327476, + "grad_norm": 2.2107620239257812, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7254403829574585, + "num_tokens": 361347531.0, + "step": 14475 + }, + { + "epoch": 1.5897210630353613, + "grad_norm": 2.207847833633423, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7125481367111206, + "num_tokens": 361375279.0, + "step": 14476 + }, + { + "epoch": 1.589830880737975, + "grad_norm": 1.95393705368042, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7258549928665161, + "num_tokens": 361409419.0, + "step": 14477 + }, + { + "epoch": 1.5899406984405886, + "grad_norm": 2.078216552734375, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.7203184366226196, + "num_tokens": 361438343.0, + "step": 14478 + }, + { + "epoch": 1.5900505161432021, + "grad_norm": 2.062537431716919, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7243853807449341, + "num_tokens": 361465893.0, + "step": 14479 + }, + { + "epoch": 1.590160333845816, + "grad_norm": 2.400918483734131, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7058855295181274, + "num_tokens": 361490755.0, + "step": 14480 + }, + { + "epoch": 1.5902701515484297, + "grad_norm": 2.419743061065674, + "learning_rate": 1e-06, 
+ "loss": 0.9248, + "mean_token_accuracy": 0.7166732549667358, + "num_tokens": 361513006.0, + "step": 14481 + }, + { + "epoch": 1.5903799692510434, + "grad_norm": 2.366006374359131, + "learning_rate": 1e-06, + "loss": 0.7866, + "mean_token_accuracy": 0.7539939880371094, + "num_tokens": 361534746.0, + "step": 14482 + }, + { + "epoch": 1.590489786953657, + "grad_norm": 2.376345157623291, + "learning_rate": 1e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.7413966059684753, + "num_tokens": 361555722.0, + "step": 14483 + }, + { + "epoch": 1.5905996046562705, + "grad_norm": 2.0441200733184814, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 0.7407625317573547, + "num_tokens": 361584279.0, + "step": 14484 + }, + { + "epoch": 1.5907094223588842, + "grad_norm": 2.343167781829834, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.7002787590026855, + "num_tokens": 361610114.0, + "step": 14485 + }, + { + "epoch": 1.590819240061498, + "grad_norm": 2.10577130317688, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7205322980880737, + "num_tokens": 361638313.0, + "step": 14486 + }, + { + "epoch": 1.5909290577641115, + "grad_norm": 2.340587615966797, + "learning_rate": 1e-06, + "loss": 0.8435, + "mean_token_accuracy": 0.7330783605575562, + "num_tokens": 361662285.0, + "step": 14487 + }, + { + "epoch": 1.5910388754667253, + "grad_norm": 2.2246851921081543, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7211010456085205, + "num_tokens": 361687882.0, + "step": 14488 + }, + { + "epoch": 1.5911486931693388, + "grad_norm": 2.5959017276763916, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7115216851234436, + "num_tokens": 361708622.0, + "step": 14489 + }, + { + "epoch": 1.5912585108719526, + "grad_norm": 2.0702898502349854, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7317205667495728, + "num_tokens": 361736761.0, + "step": 14490 + }, + { + 
"epoch": 1.5913683285745663, + "grad_norm": 1.9517492055892944, + "learning_rate": 1e-06, + "loss": 1.02, + "mean_token_accuracy": 0.6862822771072388, + "num_tokens": 361771059.0, + "step": 14491 + }, + { + "epoch": 1.5914781462771799, + "grad_norm": 1.9433943033218384, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7010796666145325, + "num_tokens": 361804278.0, + "step": 14492 + }, + { + "epoch": 1.5915879639797934, + "grad_norm": 2.0153908729553223, + "learning_rate": 1e-06, + "loss": 0.9558, + "mean_token_accuracy": 0.7019858360290527, + "num_tokens": 361836090.0, + "step": 14493 + }, + { + "epoch": 1.5916977816824072, + "grad_norm": 2.582052707672119, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7162913084030151, + "num_tokens": 361855993.0, + "step": 14494 + }, + { + "epoch": 1.591807599385021, + "grad_norm": 2.4013795852661133, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7354888916015625, + "num_tokens": 361878889.0, + "step": 14495 + }, + { + "epoch": 1.5919174170876347, + "grad_norm": 2.2002406120300293, + "learning_rate": 1e-06, + "loss": 0.765, + "mean_token_accuracy": 0.7636396884918213, + "num_tokens": 361903358.0, + "step": 14496 + }, + { + "epoch": 1.5920272347902482, + "grad_norm": 1.831266164779663, + "learning_rate": 1e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7402915954589844, + "num_tokens": 361935843.0, + "step": 14497 + }, + { + "epoch": 1.5921370524928617, + "grad_norm": 1.9637677669525146, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.6992273330688477, + "num_tokens": 361969701.0, + "step": 14498 + }, + { + "epoch": 1.5922468701954755, + "grad_norm": 2.0540125370025635, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7052746415138245, + "num_tokens": 362000865.0, + "step": 14499 + }, + { + "epoch": 1.5923566878980893, + "grad_norm": 2.4485301971435547, + "learning_rate": 1e-06, + "loss": 0.8252, + 
"mean_token_accuracy": 0.7399334907531738, + "num_tokens": 362022571.0, + "step": 14500 + }, + { + "epoch": 1.5924665056007028, + "grad_norm": 2.7586135864257812, + "learning_rate": 1e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7558382153511047, + "num_tokens": 362040106.0, + "step": 14501 + }, + { + "epoch": 1.5925763233033163, + "grad_norm": 2.0970778465270996, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7187567353248596, + "num_tokens": 362069496.0, + "step": 14502 + }, + { + "epoch": 1.59268614100593, + "grad_norm": 2.3991944789886475, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7070112228393555, + "num_tokens": 362095882.0, + "step": 14503 + }, + { + "epoch": 1.5927959587085438, + "grad_norm": 2.394662857055664, + "learning_rate": 1e-06, + "loss": 1.0112, + "mean_token_accuracy": 0.6937048435211182, + "num_tokens": 362119717.0, + "step": 14504 + }, + { + "epoch": 1.5929057764111576, + "grad_norm": 2.534052610397339, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7165191769599915, + "num_tokens": 362139598.0, + "step": 14505 + }, + { + "epoch": 1.5930155941137711, + "grad_norm": 2.367035388946533, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.722994863986969, + "num_tokens": 362163401.0, + "step": 14506 + }, + { + "epoch": 1.5931254118163847, + "grad_norm": 2.1498777866363525, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7220011949539185, + "num_tokens": 362190051.0, + "step": 14507 + }, + { + "epoch": 1.5932352295189984, + "grad_norm": 2.0267174243927, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7137324213981628, + "num_tokens": 362219378.0, + "step": 14508 + }, + { + "epoch": 1.5933450472216122, + "grad_norm": 2.464683771133423, + "learning_rate": 1e-06, + "loss": 0.876, + "mean_token_accuracy": 0.7207234501838684, + "num_tokens": 362240304.0, + "step": 14509 + }, + { + "epoch": 1.593454864924226, + 
"grad_norm": 2.2782623767852783, + "learning_rate": 1e-06, + "loss": 0.9101, + "mean_token_accuracy": 0.7173665761947632, + "num_tokens": 362264942.0, + "step": 14510 + }, + { + "epoch": 1.5935646826268395, + "grad_norm": 2.2597098350524902, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7100762128829956, + "num_tokens": 362288567.0, + "step": 14511 + }, + { + "epoch": 1.593674500329453, + "grad_norm": 2.1942403316497803, + "learning_rate": 1e-06, + "loss": 0.8621, + "mean_token_accuracy": 0.72705078125, + "num_tokens": 362314611.0, + "step": 14512 + }, + { + "epoch": 1.5937843180320668, + "grad_norm": 2.4444873332977295, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7084012031555176, + "num_tokens": 362337025.0, + "step": 14513 + }, + { + "epoch": 1.5938941357346805, + "grad_norm": 2.334625005722046, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7076767683029175, + "num_tokens": 362362211.0, + "step": 14514 + }, + { + "epoch": 1.594003953437294, + "grad_norm": 2.2814407348632812, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7107895612716675, + "num_tokens": 362386680.0, + "step": 14515 + }, + { + "epoch": 1.5941137711399076, + "grad_norm": 2.6746160984039307, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7275798320770264, + "num_tokens": 362405535.0, + "step": 14516 + }, + { + "epoch": 1.5942235888425214, + "grad_norm": 2.368955135345459, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.7087337374687195, + "num_tokens": 362429705.0, + "step": 14517 + }, + { + "epoch": 1.5943334065451351, + "grad_norm": 2.0867371559143066, + "learning_rate": 1e-06, + "loss": 0.9943, + "mean_token_accuracy": 0.6965964436531067, + "num_tokens": 362460933.0, + "step": 14518 + }, + { + "epoch": 1.5944432242477489, + "grad_norm": 2.2462477684020996, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.738056480884552, + 
"num_tokens": 362485319.0, + "step": 14519 + }, + { + "epoch": 1.5945530419503624, + "grad_norm": 2.6315951347351074, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.7298407554626465, + "num_tokens": 362503978.0, + "step": 14520 + }, + { + "epoch": 1.594662859652976, + "grad_norm": 2.4071171283721924, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7266721129417419, + "num_tokens": 362526799.0, + "step": 14521 + }, + { + "epoch": 1.5947726773555897, + "grad_norm": 2.3641927242279053, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7110353708267212, + "num_tokens": 362550458.0, + "step": 14522 + }, + { + "epoch": 1.5948824950582035, + "grad_norm": 2.192317008972168, + "learning_rate": 1e-06, + "loss": 1.014, + "mean_token_accuracy": 0.6948325037956238, + "num_tokens": 362578337.0, + "step": 14523 + }, + { + "epoch": 1.5949923127608172, + "grad_norm": 2.098043918609619, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7366994023323059, + "num_tokens": 362606486.0, + "step": 14524 + }, + { + "epoch": 1.5951021304634307, + "grad_norm": 2.4042105674743652, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.714471161365509, + "num_tokens": 362628977.0, + "step": 14525 + }, + { + "epoch": 1.5952119481660443, + "grad_norm": 2.255247116088867, + "learning_rate": 1e-06, + "loss": 0.9764, + "mean_token_accuracy": 0.7016660571098328, + "num_tokens": 362653998.0, + "step": 14526 + }, + { + "epoch": 1.595321765868658, + "grad_norm": 2.446148633956909, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7107487320899963, + "num_tokens": 362676102.0, + "step": 14527 + }, + { + "epoch": 1.5954315835712718, + "grad_norm": 2.1942522525787354, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.725003719329834, + "num_tokens": 362702983.0, + "step": 14528 + }, + { + "epoch": 1.5955414012738853, + "grad_norm": 2.653536081314087, + 
"learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7205684185028076, + "num_tokens": 362724593.0, + "step": 14529 + }, + { + "epoch": 1.5956512189764989, + "grad_norm": 2.106210470199585, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7210522890090942, + "num_tokens": 362753269.0, + "step": 14530 + }, + { + "epoch": 1.5957610366791126, + "grad_norm": 2.452326536178589, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7269284725189209, + "num_tokens": 362776223.0, + "step": 14531 + }, + { + "epoch": 1.5958708543817264, + "grad_norm": 2.5782644748687744, + "learning_rate": 1e-06, + "loss": 0.7995, + "mean_token_accuracy": 0.7529206275939941, + "num_tokens": 362794737.0, + "step": 14532 + }, + { + "epoch": 1.5959806720843401, + "grad_norm": 2.3542962074279785, + "learning_rate": 1e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7371129989624023, + "num_tokens": 362817386.0, + "step": 14533 + }, + { + "epoch": 1.5960904897869537, + "grad_norm": 2.539928674697876, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7352277040481567, + "num_tokens": 362837663.0, + "step": 14534 + }, + { + "epoch": 1.5962003074895672, + "grad_norm": 2.445838689804077, + "learning_rate": 1e-06, + "loss": 0.9826, + "mean_token_accuracy": 0.701543927192688, + "num_tokens": 362861769.0, + "step": 14535 + }, + { + "epoch": 1.596310125192181, + "grad_norm": 2.279524087905884, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7178702354431152, + "num_tokens": 362886478.0, + "step": 14536 + }, + { + "epoch": 1.5964199428947947, + "grad_norm": 2.3127479553222656, + "learning_rate": 1e-06, + "loss": 0.9626, + "mean_token_accuracy": 0.7009771466255188, + "num_tokens": 362913088.0, + "step": 14537 + }, + { + "epoch": 1.5965297605974083, + "grad_norm": 2.2091522216796875, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7376185655593872, + "num_tokens": 362938826.0, + "step": 
14538 + }, + { + "epoch": 1.596639578300022, + "grad_norm": 2.205775499343872, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7222418189048767, + "num_tokens": 362965352.0, + "step": 14539 + }, + { + "epoch": 1.5967493960026355, + "grad_norm": 2.182314872741699, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7127442359924316, + "num_tokens": 362992823.0, + "step": 14540 + }, + { + "epoch": 1.5968592137052493, + "grad_norm": 2.3571364879608154, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7146867513656616, + "num_tokens": 363015557.0, + "step": 14541 + }, + { + "epoch": 1.596969031407863, + "grad_norm": 2.522413492202759, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7224688529968262, + "num_tokens": 363036308.0, + "step": 14542 + }, + { + "epoch": 1.5970788491104766, + "grad_norm": 2.4228296279907227, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7278276085853577, + "num_tokens": 363056747.0, + "step": 14543 + }, + { + "epoch": 1.5971886668130901, + "grad_norm": 2.3462610244750977, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7262551784515381, + "num_tokens": 363079001.0, + "step": 14544 + }, + { + "epoch": 1.5972984845157039, + "grad_norm": 2.585350751876831, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.722805917263031, + "num_tokens": 363098854.0, + "step": 14545 + }, + { + "epoch": 1.5974083022183176, + "grad_norm": 2.1410186290740967, + "learning_rate": 1e-06, + "loss": 0.9867, + "mean_token_accuracy": 0.7014037370681763, + "num_tokens": 363126465.0, + "step": 14546 + }, + { + "epoch": 1.5975181199209314, + "grad_norm": 2.508169651031494, + "learning_rate": 1e-06, + "loss": 0.8962, + "mean_token_accuracy": 0.7246816158294678, + "num_tokens": 363147756.0, + "step": 14547 + }, + { + "epoch": 1.597627937623545, + "grad_norm": 2.1034998893737793, + "learning_rate": 1e-06, + "loss": 0.8353, + 
"mean_token_accuracy": 0.7379934191703796, + "num_tokens": 363176234.0, + "step": 14548 + }, + { + "epoch": 1.5977377553261585, + "grad_norm": 2.2214457988739014, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7335537672042847, + "num_tokens": 363202126.0, + "step": 14549 + }, + { + "epoch": 1.5978475730287722, + "grad_norm": 2.2936766147613525, + "learning_rate": 1e-06, + "loss": 0.9702, + "mean_token_accuracy": 0.7075233459472656, + "num_tokens": 363227454.0, + "step": 14550 + }, + { + "epoch": 1.597957390731386, + "grad_norm": 2.3067753314971924, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7411937713623047, + "num_tokens": 363249985.0, + "step": 14551 + }, + { + "epoch": 1.5980672084339995, + "grad_norm": 2.309288501739502, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.7191796898841858, + "num_tokens": 363274929.0, + "step": 14552 + }, + { + "epoch": 1.5981770261366133, + "grad_norm": 2.2887158393859863, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7189093828201294, + "num_tokens": 363299761.0, + "step": 14553 + }, + { + "epoch": 1.5982868438392268, + "grad_norm": 2.2919323444366455, + "learning_rate": 1e-06, + "loss": 0.8239, + "mean_token_accuracy": 0.7390544414520264, + "num_tokens": 363323957.0, + "step": 14554 + }, + { + "epoch": 1.5983966615418406, + "grad_norm": 2.7811152935028076, + "learning_rate": 1e-06, + "loss": 0.7958, + "mean_token_accuracy": 0.7466000318527222, + "num_tokens": 363342156.0, + "step": 14555 + }, + { + "epoch": 1.5985064792444543, + "grad_norm": 2.7142722606658936, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7193022966384888, + "num_tokens": 363361855.0, + "step": 14556 + }, + { + "epoch": 1.5986162969470679, + "grad_norm": 2.1989805698394775, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7107126712799072, + "num_tokens": 363391495.0, + "step": 14557 + }, + { + "epoch": 
1.5987261146496814, + "grad_norm": 2.7279574871063232, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7214434146881104, + "num_tokens": 363408783.0, + "step": 14558 + }, + { + "epoch": 1.5988359323522952, + "grad_norm": 2.1763033866882324, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.725609540939331, + "num_tokens": 363435072.0, + "step": 14559 + }, + { + "epoch": 1.598945750054909, + "grad_norm": 2.319817066192627, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7221049070358276, + "num_tokens": 363459516.0, + "step": 14560 + }, + { + "epoch": 1.5990555677575227, + "grad_norm": 2.293607473373413, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.716159462928772, + "num_tokens": 363484978.0, + "step": 14561 + }, + { + "epoch": 1.5991653854601362, + "grad_norm": 2.3810088634490967, + "learning_rate": 1e-06, + "loss": 0.9688, + "mean_token_accuracy": 0.7169615626335144, + "num_tokens": 363510764.0, + "step": 14562 + }, + { + "epoch": 1.5992752031627497, + "grad_norm": 2.459494113922119, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7243740558624268, + "num_tokens": 363532375.0, + "step": 14563 + }, + { + "epoch": 1.5993850208653635, + "grad_norm": 2.2926547527313232, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7040186524391174, + "num_tokens": 363557629.0, + "step": 14564 + }, + { + "epoch": 1.5994948385679773, + "grad_norm": 2.2530572414398193, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.723263144493103, + "num_tokens": 363584407.0, + "step": 14565 + }, + { + "epoch": 1.5996046562705908, + "grad_norm": 2.159198522567749, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7307844758033752, + "num_tokens": 363609549.0, + "step": 14566 + }, + { + "epoch": 1.5997144739732043, + "grad_norm": 2.236473321914673, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 
0.7112103700637817, + "num_tokens": 363636560.0, + "step": 14567 + }, + { + "epoch": 1.599824291675818, + "grad_norm": 2.2893428802490234, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7089911103248596, + "num_tokens": 363661911.0, + "step": 14568 + }, + { + "epoch": 1.5999341093784318, + "grad_norm": 2.1817626953125, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7341153621673584, + "num_tokens": 363687498.0, + "step": 14569 + }, + { + "epoch": 1.6000439270810456, + "grad_norm": 2.2387571334838867, + "learning_rate": 1e-06, + "loss": 0.9389, + "mean_token_accuracy": 0.7105154991149902, + "num_tokens": 363713733.0, + "step": 14570 + }, + { + "epoch": 1.6001537447836591, + "grad_norm": 2.226928949356079, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7186403870582581, + "num_tokens": 363738712.0, + "step": 14571 + }, + { + "epoch": 1.6002635624862727, + "grad_norm": 2.0877139568328857, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7216049432754517, + "num_tokens": 363766027.0, + "step": 14572 + }, + { + "epoch": 1.6003733801888864, + "grad_norm": 2.3653786182403564, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7012085914611816, + "num_tokens": 363791364.0, + "step": 14573 + }, + { + "epoch": 1.6004831978915002, + "grad_norm": 2.179234027862549, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7396714687347412, + "num_tokens": 363818219.0, + "step": 14574 + }, + { + "epoch": 1.600593015594114, + "grad_norm": 2.6149086952209473, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7298117876052856, + "num_tokens": 363838566.0, + "step": 14575 + }, + { + "epoch": 1.6007028332967275, + "grad_norm": 2.185551881790161, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.708238959312439, + "num_tokens": 363864919.0, + "step": 14576 + }, + { + "epoch": 1.600812650999341, + "grad_norm": 
2.338960886001587, + "learning_rate": 1e-06, + "loss": 0.9475, + "mean_token_accuracy": 0.7027319669723511, + "num_tokens": 363888163.0, + "step": 14577 + }, + { + "epoch": 1.6009224687019548, + "grad_norm": 2.74275803565979, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7490746974945068, + "num_tokens": 363904865.0, + "step": 14578 + }, + { + "epoch": 1.6010322864045685, + "grad_norm": 2.747605562210083, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7337250113487244, + "num_tokens": 363923667.0, + "step": 14579 + }, + { + "epoch": 1.601142104107182, + "grad_norm": 2.079461097717285, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7218779921531677, + "num_tokens": 363953131.0, + "step": 14580 + }, + { + "epoch": 1.6012519218097956, + "grad_norm": 1.9837007522583008, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7264313101768494, + "num_tokens": 363984511.0, + "step": 14581 + }, + { + "epoch": 1.6013617395124093, + "grad_norm": 2.23122501373291, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.708297610282898, + "num_tokens": 364013324.0, + "step": 14582 + }, + { + "epoch": 1.601471557215023, + "grad_norm": 2.5697314739227295, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7211237549781799, + "num_tokens": 364034215.0, + "step": 14583 + }, + { + "epoch": 1.6015813749176369, + "grad_norm": 2.1958200931549072, + "learning_rate": 1e-06, + "loss": 1.0245, + "mean_token_accuracy": 0.6922694444656372, + "num_tokens": 364064143.0, + "step": 14584 + }, + { + "epoch": 1.6016911926202504, + "grad_norm": 2.279879331588745, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7307000160217285, + "num_tokens": 364087919.0, + "step": 14585 + }, + { + "epoch": 1.601801010322864, + "grad_norm": 2.2954235076904297, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7156347036361694, + "num_tokens": 
364113980.0, + "step": 14586 + }, + { + "epoch": 1.6019108280254777, + "grad_norm": 2.239441156387329, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6926062703132629, + "num_tokens": 364139796.0, + "step": 14587 + }, + { + "epoch": 1.6020206457280914, + "grad_norm": 2.5456583499908447, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7080792784690857, + "num_tokens": 364160400.0, + "step": 14588 + }, + { + "epoch": 1.602130463430705, + "grad_norm": 2.2494375705718994, + "learning_rate": 1e-06, + "loss": 1.0194, + "mean_token_accuracy": 0.6880403757095337, + "num_tokens": 364187414.0, + "step": 14589 + }, + { + "epoch": 1.6022402811333187, + "grad_norm": 2.2505693435668945, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7188506126403809, + "num_tokens": 364210737.0, + "step": 14590 + }, + { + "epoch": 1.6023500988359323, + "grad_norm": 2.0325586795806885, + "learning_rate": 1e-06, + "loss": 0.9901, + "mean_token_accuracy": 0.6978542804718018, + "num_tokens": 364241821.0, + "step": 14591 + }, + { + "epoch": 1.602459916538546, + "grad_norm": 2.7843830585479736, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7279638051986694, + "num_tokens": 364260652.0, + "step": 14592 + }, + { + "epoch": 1.6025697342411598, + "grad_norm": 2.201371192932129, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7101895809173584, + "num_tokens": 364289443.0, + "step": 14593 + }, + { + "epoch": 1.6026795519437733, + "grad_norm": 2.1103105545043945, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7217941284179688, + "num_tokens": 364318046.0, + "step": 14594 + }, + { + "epoch": 1.6027893696463869, + "grad_norm": 2.2591629028320312, + "learning_rate": 1e-06, + "loss": 1.0072, + "mean_token_accuracy": 0.6998063921928406, + "num_tokens": 364345917.0, + "step": 14595 + }, + { + "epoch": 1.6028991873490006, + "grad_norm": 2.0912976264953613, + "learning_rate": 
1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7302206754684448, + "num_tokens": 364373121.0, + "step": 14596 + }, + { + "epoch": 1.6030090050516144, + "grad_norm": 2.0475828647613525, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7256714105606079, + "num_tokens": 364401767.0, + "step": 14597 + }, + { + "epoch": 1.6031188227542281, + "grad_norm": 2.1228721141815186, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7119994163513184, + "num_tokens": 364427078.0, + "step": 14598 + }, + { + "epoch": 1.6032286404568417, + "grad_norm": 2.2612040042877197, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7073270082473755, + "num_tokens": 364452296.0, + "step": 14599 + }, + { + "epoch": 1.6033384581594552, + "grad_norm": 2.1987733840942383, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7014874219894409, + "num_tokens": 364480763.0, + "step": 14600 + }, + { + "epoch": 1.603448275862069, + "grad_norm": 2.065955877304077, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7246062755584717, + "num_tokens": 364507943.0, + "step": 14601 + }, + { + "epoch": 1.6035580935646827, + "grad_norm": 2.3502936363220215, + "learning_rate": 1e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7551921606063843, + "num_tokens": 364531476.0, + "step": 14602 + }, + { + "epoch": 1.6036679112672962, + "grad_norm": 2.1416831016540527, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7255420088768005, + "num_tokens": 364558027.0, + "step": 14603 + }, + { + "epoch": 1.60377772896991, + "grad_norm": 2.3704609870910645, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7367023825645447, + "num_tokens": 364579851.0, + "step": 14604 + }, + { + "epoch": 1.6038875466725235, + "grad_norm": 2.5061585903167725, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7448792457580566, + "num_tokens": 364601375.0, + "step": 14605 + }, + { 
+ "epoch": 1.6039973643751373, + "grad_norm": 2.2427101135253906, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7273619771003723, + "num_tokens": 364625577.0, + "step": 14606 + }, + { + "epoch": 1.604107182077751, + "grad_norm": 2.2365055084228516, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.717767596244812, + "num_tokens": 364652620.0, + "step": 14607 + }, + { + "epoch": 1.6042169997803646, + "grad_norm": 2.40598201751709, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7426154613494873, + "num_tokens": 364674596.0, + "step": 14608 + }, + { + "epoch": 1.6043268174829781, + "grad_norm": 2.5986554622650146, + "learning_rate": 1e-06, + "loss": 0.7733, + "mean_token_accuracy": 0.7572360634803772, + "num_tokens": 364693448.0, + "step": 14609 + }, + { + "epoch": 1.6044366351855919, + "grad_norm": 1.9734960794448853, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7280277013778687, + "num_tokens": 364726003.0, + "step": 14610 + }, + { + "epoch": 1.6045464528882056, + "grad_norm": 2.1390349864959717, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7008584141731262, + "num_tokens": 364755351.0, + "step": 14611 + }, + { + "epoch": 1.6046562705908194, + "grad_norm": 2.1257989406585693, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7003226280212402, + "num_tokens": 364785934.0, + "step": 14612 + }, + { + "epoch": 1.604766088293433, + "grad_norm": 2.263695478439331, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.7313849925994873, + "num_tokens": 364810109.0, + "step": 14613 + }, + { + "epoch": 1.6048759059960465, + "grad_norm": 2.2526023387908936, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7266005873680115, + "num_tokens": 364833682.0, + "step": 14614 + }, + { + "epoch": 1.6049857236986602, + "grad_norm": 2.1777405738830566, + "learning_rate": 1e-06, + "loss": 0.947, + 
"mean_token_accuracy": 0.7108115553855896, + "num_tokens": 364861515.0, + "step": 14615 + }, + { + "epoch": 1.605095541401274, + "grad_norm": 2.324519395828247, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7087483406066895, + "num_tokens": 364884872.0, + "step": 14616 + }, + { + "epoch": 1.6052053591038875, + "grad_norm": 2.2063796520233154, + "learning_rate": 1e-06, + "loss": 0.9753, + "mean_token_accuracy": 0.7092788219451904, + "num_tokens": 364910796.0, + "step": 14617 + }, + { + "epoch": 1.605315176806501, + "grad_norm": 1.9922460317611694, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7183672189712524, + "num_tokens": 364941558.0, + "step": 14618 + }, + { + "epoch": 1.6054249945091148, + "grad_norm": 2.4232873916625977, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7245504856109619, + "num_tokens": 364962606.0, + "step": 14619 + }, + { + "epoch": 1.6055348122117286, + "grad_norm": 2.186776876449585, + "learning_rate": 1e-06, + "loss": 0.9794, + "mean_token_accuracy": 0.7021045088768005, + "num_tokens": 364989023.0, + "step": 14620 + }, + { + "epoch": 1.6056446299143423, + "grad_norm": 2.1596055030822754, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7154055833816528, + "num_tokens": 365016304.0, + "step": 14621 + }, + { + "epoch": 1.6057544476169558, + "grad_norm": 2.1810190677642822, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7158452868461609, + "num_tokens": 365043910.0, + "step": 14622 + }, + { + "epoch": 1.6058642653195694, + "grad_norm": 2.151621103286743, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7444812059402466, + "num_tokens": 365069853.0, + "step": 14623 + }, + { + "epoch": 1.6059740830221831, + "grad_norm": 2.187619686126709, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.6993410587310791, + "num_tokens": 365096115.0, + "step": 14624 + }, + { + "epoch": 
1.606083900724797, + "grad_norm": 2.772306203842163, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7294747233390808, + "num_tokens": 365114527.0, + "step": 14625 + }, + { + "epoch": 1.6061937184274107, + "grad_norm": 2.5853078365325928, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7320384979248047, + "num_tokens": 365133913.0, + "step": 14626 + }, + { + "epoch": 1.6063035361300242, + "grad_norm": 2.549928903579712, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7479726672172546, + "num_tokens": 365153723.0, + "step": 14627 + }, + { + "epoch": 1.6064133538326377, + "grad_norm": 2.126081705093384, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7133369445800781, + "num_tokens": 365181313.0, + "step": 14628 + }, + { + "epoch": 1.6065231715352515, + "grad_norm": 2.379791736602783, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7209404110908508, + "num_tokens": 365205041.0, + "step": 14629 + }, + { + "epoch": 1.6066329892378652, + "grad_norm": 2.319488525390625, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.705551266670227, + "num_tokens": 365231147.0, + "step": 14630 + }, + { + "epoch": 1.6067428069404788, + "grad_norm": 2.1609840393066406, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6980544924736023, + "num_tokens": 365261396.0, + "step": 14631 + }, + { + "epoch": 1.6068526246430923, + "grad_norm": 2.2033302783966064, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7200131416320801, + "num_tokens": 365286390.0, + "step": 14632 + }, + { + "epoch": 1.606962442345706, + "grad_norm": 2.4256527423858643, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7105487585067749, + "num_tokens": 365309244.0, + "step": 14633 + }, + { + "epoch": 1.6070722600483198, + "grad_norm": 2.3106956481933594, + "learning_rate": 1e-06, + "loss": 0.9672, + "mean_token_accuracy": 
0.7143319845199585, + "num_tokens": 365333830.0, + "step": 14634 + }, + { + "epoch": 1.6071820777509336, + "grad_norm": 2.065964460372925, + "learning_rate": 1e-06, + "loss": 0.9271, + "mean_token_accuracy": 0.7125842571258545, + "num_tokens": 365364670.0, + "step": 14635 + }, + { + "epoch": 1.6072918954535471, + "grad_norm": 2.336582660675049, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7090269923210144, + "num_tokens": 365390818.0, + "step": 14636 + }, + { + "epoch": 1.6074017131561606, + "grad_norm": 2.237633466720581, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.720950722694397, + "num_tokens": 365416223.0, + "step": 14637 + }, + { + "epoch": 1.6075115308587744, + "grad_norm": 2.1087255477905273, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7265288233757019, + "num_tokens": 365445355.0, + "step": 14638 + }, + { + "epoch": 1.6076213485613882, + "grad_norm": 2.0882325172424316, + "learning_rate": 1e-06, + "loss": 0.9481, + "mean_token_accuracy": 0.7089205384254456, + "num_tokens": 365474441.0, + "step": 14639 + }, + { + "epoch": 1.607731166264002, + "grad_norm": 2.372948408126831, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7126690149307251, + "num_tokens": 365498679.0, + "step": 14640 + }, + { + "epoch": 1.6078409839666155, + "grad_norm": 2.520400285720825, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.735749363899231, + "num_tokens": 365519329.0, + "step": 14641 + }, + { + "epoch": 1.607950801669229, + "grad_norm": 1.9301068782806396, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7305705547332764, + "num_tokens": 365551105.0, + "step": 14642 + }, + { + "epoch": 1.6080606193718427, + "grad_norm": 1.9948147535324097, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.71787428855896, + "num_tokens": 365582994.0, + "step": 14643 + }, + { + "epoch": 1.6081704370744565, + "grad_norm": 
2.235417127609253, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.7212727665901184, + "num_tokens": 365609272.0, + "step": 14644 + }, + { + "epoch": 1.60828025477707, + "grad_norm": 2.6080825328826904, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7070859670639038, + "num_tokens": 365630907.0, + "step": 14645 + }, + { + "epoch": 1.6083900724796836, + "grad_norm": 2.362837553024292, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7295079231262207, + "num_tokens": 365654579.0, + "step": 14646 + }, + { + "epoch": 1.6084998901822973, + "grad_norm": 2.386406898498535, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7182067632675171, + "num_tokens": 365677991.0, + "step": 14647 + }, + { + "epoch": 1.608609707884911, + "grad_norm": 2.3569111824035645, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7284648418426514, + "num_tokens": 365700945.0, + "step": 14648 + }, + { + "epoch": 1.6087195255875248, + "grad_norm": 2.1730716228485107, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7207480072975159, + "num_tokens": 365727010.0, + "step": 14649 + }, + { + "epoch": 1.6088293432901384, + "grad_norm": 2.1738598346710205, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.71427321434021, + "num_tokens": 365753323.0, + "step": 14650 + }, + { + "epoch": 1.608939160992752, + "grad_norm": 2.477752208709717, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7344725131988525, + "num_tokens": 365773433.0, + "step": 14651 + }, + { + "epoch": 1.6090489786953657, + "grad_norm": 2.381230592727661, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7235080003738403, + "num_tokens": 365797323.0, + "step": 14652 + }, + { + "epoch": 1.6091587963979794, + "grad_norm": 2.7098991870880127, + "learning_rate": 1e-06, + "loss": 0.9501, + "mean_token_accuracy": 0.7118735313415527, + "num_tokens": 
365817474.0, + "step": 14653 + }, + { + "epoch": 1.609268614100593, + "grad_norm": 2.362947463989258, + "learning_rate": 1e-06, + "loss": 0.9149, + "mean_token_accuracy": 0.7128222584724426, + "num_tokens": 365841027.0, + "step": 14654 + }, + { + "epoch": 1.6093784318032067, + "grad_norm": 2.2330331802368164, + "learning_rate": 1e-06, + "loss": 0.9974, + "mean_token_accuracy": 0.6941915154457092, + "num_tokens": 365868621.0, + "step": 14655 + }, + { + "epoch": 1.6094882495058203, + "grad_norm": 2.2461934089660645, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.713367223739624, + "num_tokens": 365894278.0, + "step": 14656 + }, + { + "epoch": 1.609598067208434, + "grad_norm": 2.2293038368225098, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.70771723985672, + "num_tokens": 365921900.0, + "step": 14657 + }, + { + "epoch": 1.6097078849110478, + "grad_norm": 2.6863441467285156, + "learning_rate": 1e-06, + "loss": 0.8123, + "mean_token_accuracy": 0.7359558939933777, + "num_tokens": 365940389.0, + "step": 14658 + }, + { + "epoch": 1.6098177026136613, + "grad_norm": 2.3352956771850586, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7233213186264038, + "num_tokens": 365963455.0, + "step": 14659 + }, + { + "epoch": 1.6099275203162748, + "grad_norm": 2.277090549468994, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7174278497695923, + "num_tokens": 365987871.0, + "step": 14660 + }, + { + "epoch": 1.6100373380188886, + "grad_norm": 1.9752830266952515, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7315041422843933, + "num_tokens": 366019547.0, + "step": 14661 + }, + { + "epoch": 1.6101471557215024, + "grad_norm": 2.335850238800049, + "learning_rate": 1e-06, + "loss": 0.8241, + "mean_token_accuracy": 0.7431297898292542, + "num_tokens": 366044717.0, + "step": 14662 + }, + { + "epoch": 1.6102569734241161, + "grad_norm": 2.111201047897339, + "learning_rate": 
1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7065379619598389, + "num_tokens": 366075549.0, + "step": 14663 + }, + { + "epoch": 1.6103667911267296, + "grad_norm": 2.113271713256836, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.703840970993042, + "num_tokens": 366104814.0, + "step": 14664 + }, + { + "epoch": 1.6104766088293432, + "grad_norm": 2.7323033809661865, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.7207800149917603, + "num_tokens": 366124168.0, + "step": 14665 + }, + { + "epoch": 1.610586426531957, + "grad_norm": 2.7645115852355957, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7148395776748657, + "num_tokens": 366142476.0, + "step": 14666 + }, + { + "epoch": 1.6106962442345707, + "grad_norm": 2.1479547023773193, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7417627573013306, + "num_tokens": 366170739.0, + "step": 14667 + }, + { + "epoch": 1.6108060619371842, + "grad_norm": 2.2648606300354004, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7212899923324585, + "num_tokens": 366197109.0, + "step": 14668 + }, + { + "epoch": 1.610915879639798, + "grad_norm": 2.099963426589966, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7013221979141235, + "num_tokens": 366226296.0, + "step": 14669 + }, + { + "epoch": 1.6110256973424115, + "grad_norm": 2.2908217906951904, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7207034230232239, + "num_tokens": 366249601.0, + "step": 14670 + }, + { + "epoch": 1.6111355150450253, + "grad_norm": 2.2826590538024902, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7090478539466858, + "num_tokens": 366275959.0, + "step": 14671 + }, + { + "epoch": 1.611245332747639, + "grad_norm": 2.1545708179473877, + "learning_rate": 1e-06, + "loss": 0.9512, + "mean_token_accuracy": 0.7070192694664001, + "num_tokens": 366304241.0, + "step": 14672 + }, + { + 
"epoch": 1.6113551504502526, + "grad_norm": 2.320645570755005, + "learning_rate": 1e-06, + "loss": 0.9624, + "mean_token_accuracy": 0.7008689641952515, + "num_tokens": 366329846.0, + "step": 14673 + }, + { + "epoch": 1.611464968152866, + "grad_norm": 2.2947561740875244, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7195443511009216, + "num_tokens": 366355144.0, + "step": 14674 + }, + { + "epoch": 1.6115747858554799, + "grad_norm": 2.415754795074463, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.724529504776001, + "num_tokens": 366377584.0, + "step": 14675 + }, + { + "epoch": 1.6116846035580936, + "grad_norm": 2.4515929222106934, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7165915369987488, + "num_tokens": 366398802.0, + "step": 14676 + }, + { + "epoch": 1.6117944212607074, + "grad_norm": 1.9798136949539185, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7121700048446655, + "num_tokens": 366429238.0, + "step": 14677 + }, + { + "epoch": 1.611904238963321, + "grad_norm": 2.5416159629821777, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7120739221572876, + "num_tokens": 366450016.0, + "step": 14678 + }, + { + "epoch": 1.6120140566659344, + "grad_norm": 2.221310615539551, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7410575151443481, + "num_tokens": 366475841.0, + "step": 14679 + }, + { + "epoch": 1.6121238743685482, + "grad_norm": 2.6529619693756104, + "learning_rate": 1e-06, + "loss": 0.8001, + "mean_token_accuracy": 0.7409570217132568, + "num_tokens": 366494008.0, + "step": 14680 + }, + { + "epoch": 1.612233692071162, + "grad_norm": 2.5027196407318115, + "learning_rate": 1e-06, + "loss": 0.9667, + "mean_token_accuracy": 0.7026035785675049, + "num_tokens": 366514550.0, + "step": 14681 + }, + { + "epoch": 1.6123435097737755, + "grad_norm": 2.670297622680664, + "learning_rate": 1e-06, + "loss": 0.8207, + 
"mean_token_accuracy": 0.7305864095687866, + "num_tokens": 366532580.0, + "step": 14682 + }, + { + "epoch": 1.612453327476389, + "grad_norm": 2.204045534133911, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7318325042724609, + "num_tokens": 366562246.0, + "step": 14683 + }, + { + "epoch": 1.6125631451790028, + "grad_norm": 2.2968571186065674, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7145361304283142, + "num_tokens": 366588410.0, + "step": 14684 + }, + { + "epoch": 1.6126729628816165, + "grad_norm": 2.2648959159851074, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7305423021316528, + "num_tokens": 366614344.0, + "step": 14685 + }, + { + "epoch": 1.6127827805842303, + "grad_norm": 2.3101844787597656, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7299975752830505, + "num_tokens": 366638944.0, + "step": 14686 + }, + { + "epoch": 1.6128925982868438, + "grad_norm": 2.1092987060546875, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7155166864395142, + "num_tokens": 366667869.0, + "step": 14687 + }, + { + "epoch": 1.6130024159894574, + "grad_norm": 2.3524320125579834, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7195669412612915, + "num_tokens": 366691158.0, + "step": 14688 + }, + { + "epoch": 1.6131122336920711, + "grad_norm": 2.4549975395202637, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.723722517490387, + "num_tokens": 366712885.0, + "step": 14689 + }, + { + "epoch": 1.6132220513946849, + "grad_norm": 2.545098304748535, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7095326781272888, + "num_tokens": 366733658.0, + "step": 14690 + }, + { + "epoch": 1.6133318690972986, + "grad_norm": 2.5244531631469727, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7254979014396667, + "num_tokens": 366754593.0, + "step": 14691 + }, + { + "epoch": 
1.6134416867999122, + "grad_norm": 2.3036961555480957, + "learning_rate": 1e-06, + "loss": 0.9066, + "mean_token_accuracy": 0.7141039371490479, + "num_tokens": 366779946.0, + "step": 14692 + }, + { + "epoch": 1.6135515045025257, + "grad_norm": 2.3028671741485596, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7152493596076965, + "num_tokens": 366803733.0, + "step": 14693 + }, + { + "epoch": 1.6136613222051395, + "grad_norm": 2.2955894470214844, + "learning_rate": 1e-06, + "loss": 0.9138, + "mean_token_accuracy": 0.7162396311759949, + "num_tokens": 366829095.0, + "step": 14694 + }, + { + "epoch": 1.6137711399077532, + "grad_norm": 2.9956157207489014, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7282137870788574, + "num_tokens": 366844850.0, + "step": 14695 + }, + { + "epoch": 1.6138809576103668, + "grad_norm": 2.4023494720458984, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7228736281394958, + "num_tokens": 366866724.0, + "step": 14696 + }, + { + "epoch": 1.6139907753129803, + "grad_norm": 2.189455986022949, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7209222912788391, + "num_tokens": 366892995.0, + "step": 14697 + }, + { + "epoch": 1.614100593015594, + "grad_norm": 2.2429277896881104, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7074534893035889, + "num_tokens": 366918092.0, + "step": 14698 + }, + { + "epoch": 1.6142104107182078, + "grad_norm": 2.7802071571350098, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7268607020378113, + "num_tokens": 366935133.0, + "step": 14699 + }, + { + "epoch": 1.6143202284208216, + "grad_norm": 2.249185085296631, + "learning_rate": 1e-06, + "loss": 0.892, + "mean_token_accuracy": 0.7209999561309814, + "num_tokens": 366962258.0, + "step": 14700 + }, + { + "epoch": 1.614430046123435, + "grad_norm": 2.2974815368652344, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 
0.7107583284378052, + "num_tokens": 366988324.0, + "step": 14701 + }, + { + "epoch": 1.6145398638260486, + "grad_norm": 2.310086727142334, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7194530963897705, + "num_tokens": 367014045.0, + "step": 14702 + }, + { + "epoch": 1.6146496815286624, + "grad_norm": 2.311521291732788, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.717825710773468, + "num_tokens": 367038359.0, + "step": 14703 + }, + { + "epoch": 1.6147594992312762, + "grad_norm": 2.430748701095581, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7204292416572571, + "num_tokens": 367058960.0, + "step": 14704 + }, + { + "epoch": 1.61486931693389, + "grad_norm": 2.2746505737304688, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7107387781143188, + "num_tokens": 367085722.0, + "step": 14705 + }, + { + "epoch": 1.6149791346365034, + "grad_norm": 2.1952924728393555, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7232285141944885, + "num_tokens": 367112071.0, + "step": 14706 + }, + { + "epoch": 1.615088952339117, + "grad_norm": 2.3884687423706055, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7208731770515442, + "num_tokens": 367133911.0, + "step": 14707 + }, + { + "epoch": 1.6151987700417307, + "grad_norm": 2.3664348125457764, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7087100744247437, + "num_tokens": 367158042.0, + "step": 14708 + }, + { + "epoch": 1.6153085877443445, + "grad_norm": 2.0880966186523438, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7356712818145752, + "num_tokens": 367186740.0, + "step": 14709 + }, + { + "epoch": 1.615418405446958, + "grad_norm": 1.9103658199310303, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 0.7013353109359741, + "num_tokens": 367220303.0, + "step": 14710 + }, + { + "epoch": 1.6155282231495716, + "grad_norm": 
2.009032726287842, + "learning_rate": 1e-06, + "loss": 0.9862, + "mean_token_accuracy": 0.6943422555923462, + "num_tokens": 367252007.0, + "step": 14711 + }, + { + "epoch": 1.6156380408521853, + "grad_norm": 2.2208211421966553, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7143408060073853, + "num_tokens": 367276795.0, + "step": 14712 + }, + { + "epoch": 1.615747858554799, + "grad_norm": 2.2025973796844482, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7097903490066528, + "num_tokens": 367301957.0, + "step": 14713 + }, + { + "epoch": 1.6158576762574128, + "grad_norm": 2.8264575004577637, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7224437594413757, + "num_tokens": 367321030.0, + "step": 14714 + }, + { + "epoch": 1.6159674939600264, + "grad_norm": 2.896108865737915, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7445175051689148, + "num_tokens": 367338333.0, + "step": 14715 + }, + { + "epoch": 1.61607731166264, + "grad_norm": 2.3562827110290527, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7246086597442627, + "num_tokens": 367361010.0, + "step": 14716 + }, + { + "epoch": 1.6161871293652537, + "grad_norm": 2.314145565032959, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7275598645210266, + "num_tokens": 367383455.0, + "step": 14717 + }, + { + "epoch": 1.6162969470678674, + "grad_norm": 2.2789106369018555, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7375223636627197, + "num_tokens": 367406128.0, + "step": 14718 + }, + { + "epoch": 1.616406764770481, + "grad_norm": 2.1452128887176514, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7139605283737183, + "num_tokens": 367434097.0, + "step": 14719 + }, + { + "epoch": 1.6165165824730947, + "grad_norm": 2.5451066493988037, + "learning_rate": 1e-06, + "loss": 0.879, + "mean_token_accuracy": 0.7196231484413147, + "num_tokens": 
367456507.0, + "step": 14720 + }, + { + "epoch": 1.6166264001757082, + "grad_norm": 2.413405418395996, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7058759927749634, + "num_tokens": 367480484.0, + "step": 14721 + }, + { + "epoch": 1.616736217878322, + "grad_norm": 2.411604404449463, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7319348454475403, + "num_tokens": 367502128.0, + "step": 14722 + }, + { + "epoch": 1.6168460355809358, + "grad_norm": 2.0928120613098145, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7304980754852295, + "num_tokens": 367530589.0, + "step": 14723 + }, + { + "epoch": 1.6169558532835493, + "grad_norm": 2.1157748699188232, + "learning_rate": 1e-06, + "loss": 0.9395, + "mean_token_accuracy": 0.7072352766990662, + "num_tokens": 367560372.0, + "step": 14724 + }, + { + "epoch": 1.6170656709861628, + "grad_norm": 2.2404661178588867, + "learning_rate": 1e-06, + "loss": 1.0018, + "mean_token_accuracy": 0.6881791353225708, + "num_tokens": 367585763.0, + "step": 14725 + }, + { + "epoch": 1.6171754886887766, + "grad_norm": 1.9468225240707397, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7163813710212708, + "num_tokens": 367618287.0, + "step": 14726 + }, + { + "epoch": 1.6172853063913903, + "grad_norm": 2.288127899169922, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7176454663276672, + "num_tokens": 367643128.0, + "step": 14727 + }, + { + "epoch": 1.617395124094004, + "grad_norm": 2.3872756958007812, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7071403861045837, + "num_tokens": 367667530.0, + "step": 14728 + }, + { + "epoch": 1.6175049417966176, + "grad_norm": 2.1504878997802734, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7175881862640381, + "num_tokens": 367696986.0, + "step": 14729 + }, + { + "epoch": 1.6176147594992312, + "grad_norm": 2.343245029449463, + "learning_rate": 
1e-06, + "loss": 0.9358, + "mean_token_accuracy": 0.7092942595481873, + "num_tokens": 367723482.0, + "step": 14730 + }, + { + "epoch": 1.617724577201845, + "grad_norm": 2.3007683753967285, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.6990761160850525, + "num_tokens": 367752186.0, + "step": 14731 + }, + { + "epoch": 1.6178343949044587, + "grad_norm": 2.425581932067871, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7336830496788025, + "num_tokens": 367774248.0, + "step": 14732 + }, + { + "epoch": 1.6179442126070722, + "grad_norm": 1.8414499759674072, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6974554061889648, + "num_tokens": 367812522.0, + "step": 14733 + }, + { + "epoch": 1.618054030309686, + "grad_norm": 2.5350003242492676, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7134747505187988, + "num_tokens": 367836420.0, + "step": 14734 + }, + { + "epoch": 1.6181638480122995, + "grad_norm": 2.312340021133423, + "learning_rate": 1e-06, + "loss": 0.8814, + "mean_token_accuracy": 0.7239986658096313, + "num_tokens": 367859616.0, + "step": 14735 + }, + { + "epoch": 1.6182736657149133, + "grad_norm": 2.5957462787628174, + "learning_rate": 1e-06, + "loss": 0.7982, + "mean_token_accuracy": 0.7445797920227051, + "num_tokens": 367879522.0, + "step": 14736 + }, + { + "epoch": 1.618383483417527, + "grad_norm": 2.5197737216949463, + "learning_rate": 1e-06, + "loss": 1.0197, + "mean_token_accuracy": 0.7273367047309875, + "num_tokens": 367904974.0, + "step": 14737 + }, + { + "epoch": 1.6184933011201406, + "grad_norm": 2.3075177669525146, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7203476428985596, + "num_tokens": 367931659.0, + "step": 14738 + }, + { + "epoch": 1.618603118822754, + "grad_norm": 2.2542543411254883, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.6981828212738037, + "num_tokens": 367956397.0, + "step": 14739 + }, + { + 
"epoch": 1.6187129365253679, + "grad_norm": 2.473254919052124, + "learning_rate": 1e-06, + "loss": 0.759, + "mean_token_accuracy": 0.755191445350647, + "num_tokens": 367977141.0, + "step": 14740 + }, + { + "epoch": 1.6188227542279816, + "grad_norm": 2.40681529045105, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7218540906906128, + "num_tokens": 368000287.0, + "step": 14741 + }, + { + "epoch": 1.6189325719305954, + "grad_norm": 2.1497609615325928, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.713853657245636, + "num_tokens": 368029214.0, + "step": 14742 + }, + { + "epoch": 1.619042389633209, + "grad_norm": 2.0923216342926025, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7186017036437988, + "num_tokens": 368058573.0, + "step": 14743 + }, + { + "epoch": 1.6191522073358224, + "grad_norm": 2.5401573181152344, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7444099187850952, + "num_tokens": 368078843.0, + "step": 14744 + }, + { + "epoch": 1.6192620250384362, + "grad_norm": 2.2067148685455322, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.730794370174408, + "num_tokens": 368104457.0, + "step": 14745 + }, + { + "epoch": 1.61937184274105, + "grad_norm": 2.2020843029022217, + "learning_rate": 1e-06, + "loss": 0.9354, + "mean_token_accuracy": 0.7144961357116699, + "num_tokens": 368129874.0, + "step": 14746 + }, + { + "epoch": 1.6194816604436635, + "grad_norm": 2.446577787399292, + "learning_rate": 1e-06, + "loss": 0.9169, + "mean_token_accuracy": 0.7236205339431763, + "num_tokens": 368152564.0, + "step": 14747 + }, + { + "epoch": 1.619591478146277, + "grad_norm": 2.127157211303711, + "learning_rate": 1e-06, + "loss": 0.9716, + "mean_token_accuracy": 0.696157693862915, + "num_tokens": 368180700.0, + "step": 14748 + }, + { + "epoch": 1.6197012958488908, + "grad_norm": 2.1900792121887207, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 
0.7299693822860718, + "num_tokens": 368207397.0, + "step": 14749 + }, + { + "epoch": 1.6198111135515045, + "grad_norm": 2.289440631866455, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7364120483398438, + "num_tokens": 368231308.0, + "step": 14750 + }, + { + "epoch": 1.6199209312541183, + "grad_norm": 2.26855206489563, + "learning_rate": 1e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.7517819404602051, + "num_tokens": 368253335.0, + "step": 14751 + }, + { + "epoch": 1.6200307489567318, + "grad_norm": 2.1319525241851807, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.7271680235862732, + "num_tokens": 368279583.0, + "step": 14752 + }, + { + "epoch": 1.6201405666593454, + "grad_norm": 2.3101186752319336, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7333163619041443, + "num_tokens": 368302743.0, + "step": 14753 + }, + { + "epoch": 1.6202503843619591, + "grad_norm": 2.4238133430480957, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7426823377609253, + "num_tokens": 368322948.0, + "step": 14754 + }, + { + "epoch": 1.6203602020645729, + "grad_norm": 2.1499531269073486, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7244852781295776, + "num_tokens": 368351542.0, + "step": 14755 + }, + { + "epoch": 1.6204700197671866, + "grad_norm": 2.461304187774658, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.70695960521698, + "num_tokens": 368373320.0, + "step": 14756 + }, + { + "epoch": 1.6205798374698002, + "grad_norm": 2.0976576805114746, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7139283418655396, + "num_tokens": 368402042.0, + "step": 14757 + }, + { + "epoch": 1.6206896551724137, + "grad_norm": 2.1337203979492188, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7129732966423035, + "num_tokens": 368431845.0, + "step": 14758 + }, + { + "epoch": 1.6207994728750275, + "grad_norm": 
2.6687021255493164, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7144215703010559, + "num_tokens": 368450749.0, + "step": 14759 + }, + { + "epoch": 1.6209092905776412, + "grad_norm": 2.1116280555725098, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7357476353645325, + "num_tokens": 368479794.0, + "step": 14760 + }, + { + "epoch": 1.6210191082802548, + "grad_norm": 2.4516708850860596, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7173331379890442, + "num_tokens": 368502819.0, + "step": 14761 + }, + { + "epoch": 1.6211289259828683, + "grad_norm": 2.1813318729400635, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7213742733001709, + "num_tokens": 368530210.0, + "step": 14762 + }, + { + "epoch": 1.621238743685482, + "grad_norm": 2.094663619995117, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7065790891647339, + "num_tokens": 368561780.0, + "step": 14763 + }, + { + "epoch": 1.6213485613880958, + "grad_norm": 2.2677829265594482, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7318814396858215, + "num_tokens": 368585705.0, + "step": 14764 + }, + { + "epoch": 1.6214583790907096, + "grad_norm": 2.117649793624878, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7227725982666016, + "num_tokens": 368613001.0, + "step": 14765 + }, + { + "epoch": 1.621568196793323, + "grad_norm": 2.596925735473633, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7155146598815918, + "num_tokens": 368634153.0, + "step": 14766 + }, + { + "epoch": 1.6216780144959366, + "grad_norm": 2.4146506786346436, + "learning_rate": 1e-06, + "loss": 0.9109, + "mean_token_accuracy": 0.7151170372962952, + "num_tokens": 368657459.0, + "step": 14767 + }, + { + "epoch": 1.6217878321985504, + "grad_norm": 2.440175771713257, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7389934062957764, + "num_tokens": 
368680669.0, + "step": 14768 + }, + { + "epoch": 1.6218976499011641, + "grad_norm": 2.369127035140991, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7143650054931641, + "num_tokens": 368706931.0, + "step": 14769 + }, + { + "epoch": 1.6220074676037777, + "grad_norm": 2.449838161468506, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7272973656654358, + "num_tokens": 368729618.0, + "step": 14770 + }, + { + "epoch": 1.6221172853063914, + "grad_norm": 2.029470205307007, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6965092420578003, + "num_tokens": 368760750.0, + "step": 14771 + }, + { + "epoch": 1.622227103009005, + "grad_norm": 2.229111671447754, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6915411949157715, + "num_tokens": 368789733.0, + "step": 14772 + }, + { + "epoch": 1.6223369207116187, + "grad_norm": 2.647143840789795, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7106298208236694, + "num_tokens": 368810396.0, + "step": 14773 + }, + { + "epoch": 1.6224467384142325, + "grad_norm": 2.1879749298095703, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.708059549331665, + "num_tokens": 368836469.0, + "step": 14774 + }, + { + "epoch": 1.622556556116846, + "grad_norm": 2.11507511138916, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7393729090690613, + "num_tokens": 368863141.0, + "step": 14775 + }, + { + "epoch": 1.6226663738194596, + "grad_norm": 2.5855212211608887, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7237787842750549, + "num_tokens": 368883319.0, + "step": 14776 + }, + { + "epoch": 1.6227761915220733, + "grad_norm": 2.3483879566192627, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7261736392974854, + "num_tokens": 368907457.0, + "step": 14777 + }, + { + "epoch": 1.622886009224687, + "grad_norm": 2.2869651317596436, + "learning_rate": 1e-06, 
+ "loss": 0.9537, + "mean_token_accuracy": 0.6998069286346436, + "num_tokens": 368934044.0, + "step": 14778 + }, + { + "epoch": 1.6229958269273008, + "grad_norm": 2.034186840057373, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7163301110267639, + "num_tokens": 368964498.0, + "step": 14779 + }, + { + "epoch": 1.6231056446299144, + "grad_norm": 2.270690679550171, + "learning_rate": 1e-06, + "loss": 0.9335, + "mean_token_accuracy": 0.7140311002731323, + "num_tokens": 368990468.0, + "step": 14780 + }, + { + "epoch": 1.623215462332528, + "grad_norm": 2.417562961578369, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7099128365516663, + "num_tokens": 369014333.0, + "step": 14781 + }, + { + "epoch": 1.6233252800351416, + "grad_norm": 2.1205358505249023, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7237921357154846, + "num_tokens": 369042235.0, + "step": 14782 + }, + { + "epoch": 1.6234350977377554, + "grad_norm": 2.3451099395751953, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7180720567703247, + "num_tokens": 369064530.0, + "step": 14783 + }, + { + "epoch": 1.623544915440369, + "grad_norm": 2.173003673553467, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7293312549591064, + "num_tokens": 369092012.0, + "step": 14784 + }, + { + "epoch": 1.6236547331429827, + "grad_norm": 2.499772548675537, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7243846654891968, + "num_tokens": 369112715.0, + "step": 14785 + }, + { + "epoch": 1.6237645508455962, + "grad_norm": 1.9450535774230957, + "learning_rate": 1e-06, + "loss": 0.9239, + "mean_token_accuracy": 0.7151452898979187, + "num_tokens": 369144584.0, + "step": 14786 + }, + { + "epoch": 1.62387436854821, + "grad_norm": 2.086829900741577, + "learning_rate": 1e-06, + "loss": 0.8516, + "mean_token_accuracy": 0.7260648012161255, + "num_tokens": 369172480.0, + "step": 14787 + }, + { + "epoch": 
1.6239841862508237, + "grad_norm": 2.3534743785858154, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7311968803405762, + "num_tokens": 369195420.0, + "step": 14788 + }, + { + "epoch": 1.6240940039534373, + "grad_norm": 2.1975386142730713, + "learning_rate": 1e-06, + "loss": 0.9349, + "mean_token_accuracy": 0.7101545333862305, + "num_tokens": 369222895.0, + "step": 14789 + }, + { + "epoch": 1.6242038216560508, + "grad_norm": 2.276942253112793, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7090479135513306, + "num_tokens": 369250330.0, + "step": 14790 + }, + { + "epoch": 1.6243136393586646, + "grad_norm": 1.9316236972808838, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.6995244026184082, + "num_tokens": 369284815.0, + "step": 14791 + }, + { + "epoch": 1.6244234570612783, + "grad_norm": 2.067373037338257, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7006595134735107, + "num_tokens": 369316013.0, + "step": 14792 + }, + { + "epoch": 1.624533274763892, + "grad_norm": 1.9742536544799805, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7152237892150879, + "num_tokens": 369346642.0, + "step": 14793 + }, + { + "epoch": 1.6246430924665056, + "grad_norm": 2.332195281982422, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7354039549827576, + "num_tokens": 369369771.0, + "step": 14794 + }, + { + "epoch": 1.6247529101691192, + "grad_norm": 2.3349359035491943, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.723907470703125, + "num_tokens": 369393635.0, + "step": 14795 + }, + { + "epoch": 1.624862727871733, + "grad_norm": 2.1092276573181152, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6962610483169556, + "num_tokens": 369423892.0, + "step": 14796 + }, + { + "epoch": 1.6249725455743467, + "grad_norm": 2.6186091899871826, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 
0.753585696220398, + "num_tokens": 369442134.0, + "step": 14797 + }, + { + "epoch": 1.6250823632769602, + "grad_norm": 2.2524943351745605, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7063342332839966, + "num_tokens": 369467904.0, + "step": 14798 + }, + { + "epoch": 1.6251921809795737, + "grad_norm": 2.3539767265319824, + "learning_rate": 1e-06, + "loss": 0.839, + "mean_token_accuracy": 0.7360631227493286, + "num_tokens": 369491563.0, + "step": 14799 + }, + { + "epoch": 1.6253019986821875, + "grad_norm": 2.4522554874420166, + "learning_rate": 1e-06, + "loss": 0.8987, + "mean_token_accuracy": 0.7138761281967163, + "num_tokens": 369515375.0, + "step": 14800 + }, + { + "epoch": 1.6254118163848013, + "grad_norm": 2.073207139968872, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7106368541717529, + "num_tokens": 369546046.0, + "step": 14801 + }, + { + "epoch": 1.625521634087415, + "grad_norm": 2.168715000152588, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7271497249603271, + "num_tokens": 369571267.0, + "step": 14802 + }, + { + "epoch": 1.6256314517900285, + "grad_norm": 2.305603265762329, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7125198245048523, + "num_tokens": 369595801.0, + "step": 14803 + }, + { + "epoch": 1.625741269492642, + "grad_norm": 2.380570888519287, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7304008603096008, + "num_tokens": 369618400.0, + "step": 14804 + }, + { + "epoch": 1.6258510871952558, + "grad_norm": 2.2056400775909424, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7437953948974609, + "num_tokens": 369643443.0, + "step": 14805 + }, + { + "epoch": 1.6259609048978696, + "grad_norm": 1.967042088508606, + "learning_rate": 1e-06, + "loss": 0.9783, + "mean_token_accuracy": 0.7056476473808289, + "num_tokens": 369674852.0, + "step": 14806 + }, + { + "epoch": 1.6260707226004834, + "grad_norm": 
2.118643283843994, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7221988439559937, + "num_tokens": 369702803.0, + "step": 14807 + }, + { + "epoch": 1.6261805403030969, + "grad_norm": 2.335679769515991, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7210711240768433, + "num_tokens": 369728289.0, + "step": 14808 + }, + { + "epoch": 1.6262903580057104, + "grad_norm": 2.192777395248413, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7176154851913452, + "num_tokens": 369754573.0, + "step": 14809 + }, + { + "epoch": 1.6264001757083242, + "grad_norm": 2.2090964317321777, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7434473037719727, + "num_tokens": 369778518.0, + "step": 14810 + }, + { + "epoch": 1.626509993410938, + "grad_norm": 2.7184460163116455, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.7126548886299133, + "num_tokens": 369796811.0, + "step": 14811 + }, + { + "epoch": 1.6266198111135515, + "grad_norm": 2.465827226638794, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7164039611816406, + "num_tokens": 369818706.0, + "step": 14812 + }, + { + "epoch": 1.626729628816165, + "grad_norm": 2.3768417835235596, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7383564710617065, + "num_tokens": 369842739.0, + "step": 14813 + }, + { + "epoch": 1.6268394465187788, + "grad_norm": 2.0972139835357666, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.7212327122688293, + "num_tokens": 369873366.0, + "step": 14814 + }, + { + "epoch": 1.6269492642213925, + "grad_norm": 2.505033493041992, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7105029225349426, + "num_tokens": 369895174.0, + "step": 14815 + }, + { + "epoch": 1.6270590819240063, + "grad_norm": 2.1378252506256104, + "learning_rate": 1e-06, + "loss": 0.891, + "mean_token_accuracy": 0.7247577905654907, + "num_tokens": 
369921865.0, + "step": 14816 + }, + { + "epoch": 1.6271688996266198, + "grad_norm": 2.14766788482666, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7121841907501221, + "num_tokens": 369950526.0, + "step": 14817 + }, + { + "epoch": 1.6272787173292333, + "grad_norm": 3.066281795501709, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7339487075805664, + "num_tokens": 369965665.0, + "step": 14818 + }, + { + "epoch": 1.627388535031847, + "grad_norm": 2.2754390239715576, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7115094065666199, + "num_tokens": 369991217.0, + "step": 14819 + }, + { + "epoch": 1.6274983527344609, + "grad_norm": 2.028510093688965, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7164711356163025, + "num_tokens": 370019934.0, + "step": 14820 + }, + { + "epoch": 1.6276081704370746, + "grad_norm": 2.079653263092041, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7214916944503784, + "num_tokens": 370048192.0, + "step": 14821 + }, + { + "epoch": 1.6277179881396882, + "grad_norm": 2.224832773208618, + "learning_rate": 1e-06, + "loss": 0.823, + "mean_token_accuracy": 0.7454870939254761, + "num_tokens": 370071794.0, + "step": 14822 + }, + { + "epoch": 1.6278278058423017, + "grad_norm": 2.4139318466186523, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 0.7364034652709961, + "num_tokens": 370094265.0, + "step": 14823 + }, + { + "epoch": 1.6279376235449154, + "grad_norm": 2.384974479675293, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7317819595336914, + "num_tokens": 370118211.0, + "step": 14824 + }, + { + "epoch": 1.6280474412475292, + "grad_norm": 2.0944762229919434, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7180624008178711, + "num_tokens": 370147411.0, + "step": 14825 + }, + { + "epoch": 1.6281572589501427, + "grad_norm": 2.329432487487793, + "learning_rate": 
1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7047073841094971, + "num_tokens": 370172612.0, + "step": 14826 + }, + { + "epoch": 1.6282670766527563, + "grad_norm": 2.11464786529541, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7168257236480713, + "num_tokens": 370202350.0, + "step": 14827 + }, + { + "epoch": 1.62837689435537, + "grad_norm": 2.212404489517212, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7176647186279297, + "num_tokens": 370229153.0, + "step": 14828 + }, + { + "epoch": 1.6284867120579838, + "grad_norm": 2.3059563636779785, + "learning_rate": 1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7012860774993896, + "num_tokens": 370255447.0, + "step": 14829 + }, + { + "epoch": 1.6285965297605975, + "grad_norm": 2.1213197708129883, + "learning_rate": 1e-06, + "loss": 0.9612, + "mean_token_accuracy": 0.7025035619735718, + "num_tokens": 370283772.0, + "step": 14830 + }, + { + "epoch": 1.628706347463211, + "grad_norm": 2.15740704536438, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7227650880813599, + "num_tokens": 370311305.0, + "step": 14831 + }, + { + "epoch": 1.6288161651658246, + "grad_norm": 2.206876277923584, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7111606597900391, + "num_tokens": 370336973.0, + "step": 14832 + }, + { + "epoch": 1.6289259828684384, + "grad_norm": 2.4269816875457764, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.727631688117981, + "num_tokens": 370360930.0, + "step": 14833 + }, + { + "epoch": 1.6290358005710521, + "grad_norm": 2.3479764461517334, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7159382104873657, + "num_tokens": 370383397.0, + "step": 14834 + }, + { + "epoch": 1.6291456182736657, + "grad_norm": 2.611717700958252, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7456790804862976, + "num_tokens": 370403295.0, + "step": 14835 + }, + { + 
"epoch": 1.6292554359762794, + "grad_norm": 2.1463840007781982, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7019174695014954, + "num_tokens": 370431336.0, + "step": 14836 + }, + { + "epoch": 1.629365253678893, + "grad_norm": 1.9476065635681152, + "learning_rate": 1e-06, + "loss": 0.9904, + "mean_token_accuracy": 0.6905295848846436, + "num_tokens": 370465005.0, + "step": 14837 + }, + { + "epoch": 1.6294750713815067, + "grad_norm": 2.720165491104126, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7348207235336304, + "num_tokens": 370482348.0, + "step": 14838 + }, + { + "epoch": 1.6295848890841205, + "grad_norm": 2.428706407546997, + "learning_rate": 1e-06, + "loss": 0.9848, + "mean_token_accuracy": 0.6960451602935791, + "num_tokens": 370506929.0, + "step": 14839 + }, + { + "epoch": 1.629694706786734, + "grad_norm": 2.2307095527648926, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7088377475738525, + "num_tokens": 370533116.0, + "step": 14840 + }, + { + "epoch": 1.6298045244893475, + "grad_norm": 2.481428384780884, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7384073734283447, + "num_tokens": 370554113.0, + "step": 14841 + }, + { + "epoch": 1.6299143421919613, + "grad_norm": 2.3283369541168213, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.7234412431716919, + "num_tokens": 370577651.0, + "step": 14842 + }, + { + "epoch": 1.630024159894575, + "grad_norm": 2.2390995025634766, + "learning_rate": 1e-06, + "loss": 0.9494, + "mean_token_accuracy": 0.6998587250709534, + "num_tokens": 370603752.0, + "step": 14843 + }, + { + "epoch": 1.6301339775971888, + "grad_norm": 2.2852511405944824, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7228001356124878, + "num_tokens": 370628803.0, + "step": 14844 + }, + { + "epoch": 1.6302437952998023, + "grad_norm": 2.311739444732666, + "learning_rate": 1e-06, + "loss": 0.8931, + 
"mean_token_accuracy": 0.7165579795837402, + "num_tokens": 370653624.0, + "step": 14845 + }, + { + "epoch": 1.6303536130024159, + "grad_norm": 2.27758526802063, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.716545581817627, + "num_tokens": 370682006.0, + "step": 14846 + }, + { + "epoch": 1.6304634307050296, + "grad_norm": 2.214770793914795, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7342205047607422, + "num_tokens": 370709437.0, + "step": 14847 + }, + { + "epoch": 1.6305732484076434, + "grad_norm": 2.2626073360443115, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7104646563529968, + "num_tokens": 370734620.0, + "step": 14848 + }, + { + "epoch": 1.630683066110257, + "grad_norm": 2.6836299896240234, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7479599118232727, + "num_tokens": 370752859.0, + "step": 14849 + }, + { + "epoch": 1.6307928838128707, + "grad_norm": 2.412729024887085, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7277874946594238, + "num_tokens": 370777811.0, + "step": 14850 + }, + { + "epoch": 1.6309027015154842, + "grad_norm": 2.372575044631958, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7325440049171448, + "num_tokens": 370800827.0, + "step": 14851 + }, + { + "epoch": 1.631012519218098, + "grad_norm": 2.2568719387054443, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7154090404510498, + "num_tokens": 370826252.0, + "step": 14852 + }, + { + "epoch": 1.6311223369207117, + "grad_norm": 2.4355523586273193, + "learning_rate": 1e-06, + "loss": 0.7872, + "mean_token_accuracy": 0.7474570274353027, + "num_tokens": 370846332.0, + "step": 14853 + }, + { + "epoch": 1.6312321546233253, + "grad_norm": 2.6193127632141113, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7256196737289429, + "num_tokens": 370865863.0, + "step": 14854 + }, + { + "epoch": 1.6313419723259388, 
+ "grad_norm": 2.016749143600464, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7312443256378174, + "num_tokens": 370895851.0, + "step": 14855 + }, + { + "epoch": 1.6314517900285526, + "grad_norm": 2.042710065841675, + "learning_rate": 1e-06, + "loss": 0.9844, + "mean_token_accuracy": 0.717200756072998, + "num_tokens": 370926438.0, + "step": 14856 + }, + { + "epoch": 1.6315616077311663, + "grad_norm": 2.065615653991699, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7647081613540649, + "num_tokens": 370952844.0, + "step": 14857 + }, + { + "epoch": 1.63167142543378, + "grad_norm": 2.535330295562744, + "learning_rate": 1e-06, + "loss": 0.9662, + "mean_token_accuracy": 0.7019686698913574, + "num_tokens": 370974569.0, + "step": 14858 + }, + { + "epoch": 1.6317812431363936, + "grad_norm": 2.3127660751342773, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7276981472969055, + "num_tokens": 370998840.0, + "step": 14859 + }, + { + "epoch": 1.6318910608390071, + "grad_norm": 2.2465391159057617, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7083097696304321, + "num_tokens": 371027439.0, + "step": 14860 + }, + { + "epoch": 1.632000878541621, + "grad_norm": 2.491521120071411, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7112826108932495, + "num_tokens": 371048600.0, + "step": 14861 + }, + { + "epoch": 1.6321106962442347, + "grad_norm": 2.395709991455078, + "learning_rate": 1e-06, + "loss": 0.7819, + "mean_token_accuracy": 0.7498956322669983, + "num_tokens": 371070771.0, + "step": 14862 + }, + { + "epoch": 1.6322205139468482, + "grad_norm": 2.093909740447998, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.726291298866272, + "num_tokens": 371099520.0, + "step": 14863 + }, + { + "epoch": 1.6323303316494617, + "grad_norm": 2.134653091430664, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7111142873764038, + 
"num_tokens": 371126518.0, + "step": 14864 + }, + { + "epoch": 1.6324401493520755, + "grad_norm": 2.3700673580169678, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7323914766311646, + "num_tokens": 371148080.0, + "step": 14865 + }, + { + "epoch": 1.6325499670546892, + "grad_norm": 2.2298686504364014, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.6998021006584167, + "num_tokens": 371173894.0, + "step": 14866 + }, + { + "epoch": 1.632659784757303, + "grad_norm": 2.251835584640503, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7250509262084961, + "num_tokens": 371199218.0, + "step": 14867 + }, + { + "epoch": 1.6327696024599165, + "grad_norm": 2.5701000690460205, + "learning_rate": 1e-06, + "loss": 0.8156, + "mean_token_accuracy": 0.7404084205627441, + "num_tokens": 371218992.0, + "step": 14868 + }, + { + "epoch": 1.63287942016253, + "grad_norm": 2.07613205909729, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.6992807984352112, + "num_tokens": 371249387.0, + "step": 14869 + }, + { + "epoch": 1.6329892378651438, + "grad_norm": 2.141636848449707, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7269918918609619, + "num_tokens": 371275507.0, + "step": 14870 + }, + { + "epoch": 1.6330990555677576, + "grad_norm": 2.424546718597412, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6880161166191101, + "num_tokens": 371300477.0, + "step": 14871 + }, + { + "epoch": 1.6332088732703713, + "grad_norm": 2.3881993293762207, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7237317562103271, + "num_tokens": 371323473.0, + "step": 14872 + }, + { + "epoch": 1.6333186909729849, + "grad_norm": 2.3748950958251953, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7079002857208252, + "num_tokens": 371349607.0, + "step": 14873 + }, + { + "epoch": 1.6334285086755984, + "grad_norm": 2.446932792663574, + 
"learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7273145318031311, + "num_tokens": 371369971.0, + "step": 14874 + }, + { + "epoch": 1.6335383263782122, + "grad_norm": 2.3909122943878174, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7318485975265503, + "num_tokens": 371394978.0, + "step": 14875 + }, + { + "epoch": 1.633648144080826, + "grad_norm": 2.1860456466674805, + "learning_rate": 1e-06, + "loss": 0.7833, + "mean_token_accuracy": 0.7554782032966614, + "num_tokens": 371419553.0, + "step": 14876 + }, + { + "epoch": 1.6337579617834395, + "grad_norm": 2.0124318599700928, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.7071018218994141, + "num_tokens": 371452273.0, + "step": 14877 + }, + { + "epoch": 1.633867779486053, + "grad_norm": 2.561857223510742, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7299088835716248, + "num_tokens": 371472604.0, + "step": 14878 + }, + { + "epoch": 1.6339775971886668, + "grad_norm": 2.4519410133361816, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7213137149810791, + "num_tokens": 371496073.0, + "step": 14879 + }, + { + "epoch": 1.6340874148912805, + "grad_norm": 2.225409984588623, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7067352533340454, + "num_tokens": 371522595.0, + "step": 14880 + }, + { + "epoch": 1.6341972325938943, + "grad_norm": 2.635676145553589, + "learning_rate": 1e-06, + "loss": 0.9339, + "mean_token_accuracy": 0.706122636795044, + "num_tokens": 371543550.0, + "step": 14881 + }, + { + "epoch": 1.6343070502965078, + "grad_norm": 2.5347588062286377, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7311570048332214, + "num_tokens": 371564551.0, + "step": 14882 + }, + { + "epoch": 1.6344168679991213, + "grad_norm": 2.300410032272339, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7244151830673218, + "num_tokens": 371590432.0, + "step": 
14883 + }, + { + "epoch": 1.634526685701735, + "grad_norm": 2.190633773803711, + "learning_rate": 1e-06, + "loss": 0.849, + "mean_token_accuracy": 0.72261643409729, + "num_tokens": 371617209.0, + "step": 14884 + }, + { + "epoch": 1.6346365034043489, + "grad_norm": 2.3570094108581543, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7286628484725952, + "num_tokens": 371641225.0, + "step": 14885 + }, + { + "epoch": 1.6347463211069626, + "grad_norm": 2.015209197998047, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7315680980682373, + "num_tokens": 371672065.0, + "step": 14886 + }, + { + "epoch": 1.6348561388095761, + "grad_norm": 2.3662638664245605, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.729082465171814, + "num_tokens": 371694522.0, + "step": 14887 + }, + { + "epoch": 1.6349659565121897, + "grad_norm": 2.4119887351989746, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7262727618217468, + "num_tokens": 371718814.0, + "step": 14888 + }, + { + "epoch": 1.6350757742148034, + "grad_norm": 2.7664129734039307, + "learning_rate": 1e-06, + "loss": 0.8113, + "mean_token_accuracy": 0.7388992309570312, + "num_tokens": 371737655.0, + "step": 14889 + }, + { + "epoch": 1.6351855919174172, + "grad_norm": 2.862098217010498, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.749113917350769, + "num_tokens": 371754184.0, + "step": 14890 + }, + { + "epoch": 1.6352954096200307, + "grad_norm": 2.523422956466675, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.6988328099250793, + "num_tokens": 371777277.0, + "step": 14891 + }, + { + "epoch": 1.6354052273226443, + "grad_norm": 2.543895721435547, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.7244627475738525, + "num_tokens": 371799099.0, + "step": 14892 + }, + { + "epoch": 1.635515045025258, + "grad_norm": 2.1242740154266357, + "learning_rate": 1e-06, + "loss": 0.9275, + 
"mean_token_accuracy": 0.7126439213752747, + "num_tokens": 371830018.0, + "step": 14893 + }, + { + "epoch": 1.6356248627278718, + "grad_norm": 2.179550886154175, + "learning_rate": 1e-06, + "loss": 1.0036, + "mean_token_accuracy": 0.6885223984718323, + "num_tokens": 371857470.0, + "step": 14894 + }, + { + "epoch": 1.6357346804304855, + "grad_norm": 2.4829554557800293, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7226704359054565, + "num_tokens": 371878249.0, + "step": 14895 + }, + { + "epoch": 1.635844498133099, + "grad_norm": 2.319589614868164, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7359952330589294, + "num_tokens": 371901542.0, + "step": 14896 + }, + { + "epoch": 1.6359543158357126, + "grad_norm": 2.0937814712524414, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7133246064186096, + "num_tokens": 371929669.0, + "step": 14897 + }, + { + "epoch": 1.6360641335383264, + "grad_norm": 2.2405333518981934, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7309229969978333, + "num_tokens": 371955455.0, + "step": 14898 + }, + { + "epoch": 1.6361739512409401, + "grad_norm": 2.3104448318481445, + "learning_rate": 1e-06, + "loss": 0.9646, + "mean_token_accuracy": 0.7056572437286377, + "num_tokens": 371980159.0, + "step": 14899 + }, + { + "epoch": 1.6362837689435537, + "grad_norm": 2.571922540664673, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7230110168457031, + "num_tokens": 372001386.0, + "step": 14900 + }, + { + "epoch": 1.6363935866461674, + "grad_norm": 2.8614330291748047, + "learning_rate": 1e-06, + "loss": 0.7673, + "mean_token_accuracy": 0.7514886856079102, + "num_tokens": 372018356.0, + "step": 14901 + }, + { + "epoch": 1.636503404348781, + "grad_norm": 2.352829694747925, + "learning_rate": 1e-06, + "loss": 0.8065, + "mean_token_accuracy": 0.7425755262374878, + "num_tokens": 372042278.0, + "step": 14902 + }, + { + "epoch": 
1.6366132220513947, + "grad_norm": 2.06911039352417, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7216082811355591, + "num_tokens": 372070935.0, + "step": 14903 + }, + { + "epoch": 1.6367230397540085, + "grad_norm": 2.249976634979248, + "learning_rate": 1e-06, + "loss": 0.9081, + "mean_token_accuracy": 0.7184983491897583, + "num_tokens": 372096900.0, + "step": 14904 + }, + { + "epoch": 1.636832857456622, + "grad_norm": 2.2574996948242188, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7214930653572083, + "num_tokens": 372125886.0, + "step": 14905 + }, + { + "epoch": 1.6369426751592355, + "grad_norm": 2.086266279220581, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7190225124359131, + "num_tokens": 372153561.0, + "step": 14906 + }, + { + "epoch": 1.6370524928618493, + "grad_norm": 2.7422940731048584, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7301366329193115, + "num_tokens": 372173985.0, + "step": 14907 + }, + { + "epoch": 1.637162310564463, + "grad_norm": 2.0166897773742676, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7307726740837097, + "num_tokens": 372201999.0, + "step": 14908 + }, + { + "epoch": 1.6372721282670768, + "grad_norm": 2.2196123600006104, + "learning_rate": 1e-06, + "loss": 0.9607, + "mean_token_accuracy": 0.7009168863296509, + "num_tokens": 372229496.0, + "step": 14909 + }, + { + "epoch": 1.6373819459696903, + "grad_norm": 2.195302724838257, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7276142239570618, + "num_tokens": 372253917.0, + "step": 14910 + }, + { + "epoch": 1.6374917636723039, + "grad_norm": 2.3123621940612793, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.6937031745910645, + "num_tokens": 372278839.0, + "step": 14911 + }, + { + "epoch": 1.6376015813749176, + "grad_norm": 2.32464861869812, + "learning_rate": 1e-06, + "loss": 0.8437, + "mean_token_accuracy": 
0.7379553914070129, + "num_tokens": 372302478.0, + "step": 14912 + }, + { + "epoch": 1.6377113990775314, + "grad_norm": 2.3297698497772217, + "learning_rate": 1e-06, + "loss": 0.9631, + "mean_token_accuracy": 0.69774329662323, + "num_tokens": 372326282.0, + "step": 14913 + }, + { + "epoch": 1.637821216780145, + "grad_norm": 2.0917675495147705, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7352014780044556, + "num_tokens": 372355234.0, + "step": 14914 + }, + { + "epoch": 1.6379310344827587, + "grad_norm": 2.2953684329986572, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.7024922370910645, + "num_tokens": 372380377.0, + "step": 14915 + }, + { + "epoch": 1.6380408521853722, + "grad_norm": 2.5403358936309814, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.713245689868927, + "num_tokens": 372400416.0, + "step": 14916 + }, + { + "epoch": 1.638150669887986, + "grad_norm": 2.2871673107147217, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7158154249191284, + "num_tokens": 372426053.0, + "step": 14917 + }, + { + "epoch": 1.6382604875905997, + "grad_norm": 2.6682751178741455, + "learning_rate": 1e-06, + "loss": 0.8973, + "mean_token_accuracy": 0.7194063663482666, + "num_tokens": 372445522.0, + "step": 14918 + }, + { + "epoch": 1.6383703052932133, + "grad_norm": 2.479050874710083, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.7429969906806946, + "num_tokens": 372466276.0, + "step": 14919 + }, + { + "epoch": 1.6384801229958268, + "grad_norm": 2.373183250427246, + "learning_rate": 1e-06, + "loss": 0.8751, + "mean_token_accuracy": 0.722965657711029, + "num_tokens": 372490134.0, + "step": 14920 + }, + { + "epoch": 1.6385899406984406, + "grad_norm": 2.2195775508880615, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7066066265106201, + "num_tokens": 372517482.0, + "step": 14921 + }, + { + "epoch": 1.6386997584010543, + "grad_norm": 
2.2125680446624756, + "learning_rate": 1e-06, + "loss": 0.972, + "mean_token_accuracy": 0.7166535258293152, + "num_tokens": 372543687.0, + "step": 14922 + }, + { + "epoch": 1.638809576103668, + "grad_norm": 2.2732386589050293, + "learning_rate": 1e-06, + "loss": 0.8015, + "mean_token_accuracy": 0.7509864568710327, + "num_tokens": 372566762.0, + "step": 14923 + }, + { + "epoch": 1.6389193938062816, + "grad_norm": 2.074833869934082, + "learning_rate": 1e-06, + "loss": 0.9834, + "mean_token_accuracy": 0.7144508361816406, + "num_tokens": 372595740.0, + "step": 14924 + }, + { + "epoch": 1.6390292115088951, + "grad_norm": 2.341872215270996, + "learning_rate": 1e-06, + "loss": 0.9462, + "mean_token_accuracy": 0.7089323997497559, + "num_tokens": 372620405.0, + "step": 14925 + }, + { + "epoch": 1.639139029211509, + "grad_norm": 2.3550872802734375, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7179168462753296, + "num_tokens": 372644391.0, + "step": 14926 + }, + { + "epoch": 1.6392488469141226, + "grad_norm": 2.3603272438049316, + "learning_rate": 1e-06, + "loss": 0.7894, + "mean_token_accuracy": 0.7449755668640137, + "num_tokens": 372665505.0, + "step": 14927 + }, + { + "epoch": 1.6393586646167362, + "grad_norm": 2.1241884231567383, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.7021586894989014, + "num_tokens": 372696156.0, + "step": 14928 + }, + { + "epoch": 1.6394684823193497, + "grad_norm": 2.0595076084136963, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7312061190605164, + "num_tokens": 372724333.0, + "step": 14929 + }, + { + "epoch": 1.6395783000219635, + "grad_norm": 2.0482139587402344, + "learning_rate": 1e-06, + "loss": 0.9378, + "mean_token_accuracy": 0.7209508419036865, + "num_tokens": 372753950.0, + "step": 14930 + }, + { + "epoch": 1.6396881177245772, + "grad_norm": 1.8882343769073486, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7156400680541992, + "num_tokens": 
372787137.0, + "step": 14931 + }, + { + "epoch": 1.639797935427191, + "grad_norm": 2.252246856689453, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.7369522452354431, + "num_tokens": 372810381.0, + "step": 14932 + }, + { + "epoch": 1.6399077531298045, + "grad_norm": 2.0799291133880615, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7058978080749512, + "num_tokens": 372840930.0, + "step": 14933 + }, + { + "epoch": 1.640017570832418, + "grad_norm": 2.4822258949279785, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 0.7114916443824768, + "num_tokens": 372862547.0, + "step": 14934 + }, + { + "epoch": 1.6401273885350318, + "grad_norm": 2.2698187828063965, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7455422282218933, + "num_tokens": 372886921.0, + "step": 14935 + }, + { + "epoch": 1.6402372062376456, + "grad_norm": 2.341113567352295, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7221450805664062, + "num_tokens": 372910146.0, + "step": 14936 + }, + { + "epoch": 1.6403470239402593, + "grad_norm": 2.281388282775879, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7375797033309937, + "num_tokens": 372934494.0, + "step": 14937 + }, + { + "epoch": 1.6404568416428729, + "grad_norm": 2.207448720932007, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7156013250350952, + "num_tokens": 372960458.0, + "step": 14938 + }, + { + "epoch": 1.6405666593454864, + "grad_norm": 2.085710048675537, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7135528326034546, + "num_tokens": 372991069.0, + "step": 14939 + }, + { + "epoch": 1.6406764770481002, + "grad_norm": 2.1703131198883057, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7150241136550903, + "num_tokens": 373017564.0, + "step": 14940 + }, + { + "epoch": 1.640786294750714, + "grad_norm": 2.3451523780822754, + "learning_rate": 
1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7157410383224487, + "num_tokens": 373042149.0, + "step": 14941 + }, + { + "epoch": 1.6408961124533274, + "grad_norm": 2.4322712421417236, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7244205474853516, + "num_tokens": 373063587.0, + "step": 14942 + }, + { + "epoch": 1.641005930155941, + "grad_norm": 2.19527268409729, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7282283306121826, + "num_tokens": 373089925.0, + "step": 14943 + }, + { + "epoch": 1.6411157478585547, + "grad_norm": 2.384124994277954, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7034866809844971, + "num_tokens": 373113581.0, + "step": 14944 + }, + { + "epoch": 1.6412255655611685, + "grad_norm": 2.3087995052337646, + "learning_rate": 1e-06, + "loss": 0.8914, + "mean_token_accuracy": 0.7141939997673035, + "num_tokens": 373139436.0, + "step": 14945 + }, + { + "epoch": 1.6413353832637823, + "grad_norm": 2.743837356567383, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7325382232666016, + "num_tokens": 373157998.0, + "step": 14946 + }, + { + "epoch": 1.6414452009663958, + "grad_norm": 2.366396188735962, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7293931245803833, + "num_tokens": 373180468.0, + "step": 14947 + }, + { + "epoch": 1.6415550186690093, + "grad_norm": 1.9964237213134766, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7213091850280762, + "num_tokens": 373210316.0, + "step": 14948 + }, + { + "epoch": 1.641664836371623, + "grad_norm": 2.2929458618164062, + "learning_rate": 1e-06, + "loss": 0.9111, + "mean_token_accuracy": 0.7153799533843994, + "num_tokens": 373237017.0, + "step": 14949 + }, + { + "epoch": 1.6417746540742368, + "grad_norm": 2.3782074451446533, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7108885645866394, + "num_tokens": 373260272.0, + "step": 14950 + }, + { + 
"epoch": 1.6418844717768504, + "grad_norm": 2.412180185317993, + "learning_rate": 1e-06, + "loss": 0.8172, + "mean_token_accuracy": 0.746239960193634, + "num_tokens": 373281344.0, + "step": 14951 + }, + { + "epoch": 1.6419942894794641, + "grad_norm": 1.9725372791290283, + "learning_rate": 1e-06, + "loss": 0.9178, + "mean_token_accuracy": 0.7041555643081665, + "num_tokens": 373314156.0, + "step": 14952 + }, + { + "epoch": 1.6421041071820777, + "grad_norm": 2.2572309970855713, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.712558388710022, + "num_tokens": 373338688.0, + "step": 14953 + }, + { + "epoch": 1.6422139248846914, + "grad_norm": 2.8446872234344482, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7328565120697021, + "num_tokens": 373356438.0, + "step": 14954 + }, + { + "epoch": 1.6423237425873052, + "grad_norm": 2.2958028316497803, + "learning_rate": 1e-06, + "loss": 0.9805, + "mean_token_accuracy": 0.7009574174880981, + "num_tokens": 373382652.0, + "step": 14955 + }, + { + "epoch": 1.6424335602899187, + "grad_norm": 2.283501148223877, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7243313789367676, + "num_tokens": 373409253.0, + "step": 14956 + }, + { + "epoch": 1.6425433779925322, + "grad_norm": 2.84293270111084, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7377591133117676, + "num_tokens": 373425760.0, + "step": 14957 + }, + { + "epoch": 1.642653195695146, + "grad_norm": 2.519618511199951, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7358000874519348, + "num_tokens": 373445112.0, + "step": 14958 + }, + { + "epoch": 1.6427630133977598, + "grad_norm": 2.229597568511963, + "learning_rate": 1e-06, + "loss": 0.8901, + "mean_token_accuracy": 0.733032763004303, + "num_tokens": 373471845.0, + "step": 14959 + }, + { + "epoch": 1.6428728311003735, + "grad_norm": 2.252514123916626, + "learning_rate": 1e-06, + "loss": 0.9171, + 
"mean_token_accuracy": 0.7121081948280334, + "num_tokens": 373500427.0, + "step": 14960 + }, + { + "epoch": 1.642982648802987, + "grad_norm": 2.6475589275360107, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7281310558319092, + "num_tokens": 373519040.0, + "step": 14961 + }, + { + "epoch": 1.6430924665056006, + "grad_norm": 2.2319345474243164, + "learning_rate": 1e-06, + "loss": 0.8867, + "mean_token_accuracy": 0.7173198461532593, + "num_tokens": 373543426.0, + "step": 14962 + }, + { + "epoch": 1.6432022842082143, + "grad_norm": 2.259584903717041, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7202553749084473, + "num_tokens": 373568809.0, + "step": 14963 + }, + { + "epoch": 1.643312101910828, + "grad_norm": 2.1134841442108154, + "learning_rate": 1e-06, + "loss": 1.0309, + "mean_token_accuracy": 0.685020387172699, + "num_tokens": 373598942.0, + "step": 14964 + }, + { + "epoch": 1.6434219196134416, + "grad_norm": 2.301805257797241, + "learning_rate": 1e-06, + "loss": 0.9925, + "mean_token_accuracy": 0.7013562321662903, + "num_tokens": 373624388.0, + "step": 14965 + }, + { + "epoch": 1.6435317373160554, + "grad_norm": 2.154611825942993, + "learning_rate": 1e-06, + "loss": 0.9511, + "mean_token_accuracy": 0.708817183971405, + "num_tokens": 373650839.0, + "step": 14966 + }, + { + "epoch": 1.643641555018669, + "grad_norm": 2.254509449005127, + "learning_rate": 1e-06, + "loss": 0.8105, + "mean_token_accuracy": 0.742017924785614, + "num_tokens": 373674641.0, + "step": 14967 + }, + { + "epoch": 1.6437513727212827, + "grad_norm": 2.5317702293395996, + "learning_rate": 1e-06, + "loss": 0.8254, + "mean_token_accuracy": 0.7357552647590637, + "num_tokens": 373694883.0, + "step": 14968 + }, + { + "epoch": 1.6438611904238964, + "grad_norm": 2.0578455924987793, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7193796634674072, + "num_tokens": 373727499.0, + "step": 14969 + }, + { + "epoch": 1.64397100812651, + 
"grad_norm": 2.2963640689849854, + "learning_rate": 1e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.7286115884780884, + "num_tokens": 373750469.0, + "step": 14970 + }, + { + "epoch": 1.6440808258291235, + "grad_norm": 2.2082443237304688, + "learning_rate": 1e-06, + "loss": 0.9551, + "mean_token_accuracy": 0.703559935092926, + "num_tokens": 373779651.0, + "step": 14971 + }, + { + "epoch": 1.6441906435317373, + "grad_norm": 2.3272156715393066, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7297064065933228, + "num_tokens": 373803862.0, + "step": 14972 + }, + { + "epoch": 1.644300461234351, + "grad_norm": 2.554687976837158, + "learning_rate": 1e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7428876757621765, + "num_tokens": 373824109.0, + "step": 14973 + }, + { + "epoch": 1.6444102789369648, + "grad_norm": 2.1838464736938477, + "learning_rate": 1e-06, + "loss": 0.8913, + "mean_token_accuracy": 0.7184480428695679, + "num_tokens": 373849798.0, + "step": 14974 + }, + { + "epoch": 1.6445200966395783, + "grad_norm": 2.2713534832000732, + "learning_rate": 1e-06, + "loss": 0.8222, + "mean_token_accuracy": 0.7407979965209961, + "num_tokens": 373875535.0, + "step": 14975 + }, + { + "epoch": 1.6446299143421919, + "grad_norm": 2.6487598419189453, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7221596240997314, + "num_tokens": 373895304.0, + "step": 14976 + }, + { + "epoch": 1.6447397320448056, + "grad_norm": 2.310004949569702, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7034631371498108, + "num_tokens": 373920232.0, + "step": 14977 + }, + { + "epoch": 1.6448495497474194, + "grad_norm": 2.2019195556640625, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.711261510848999, + "num_tokens": 373945706.0, + "step": 14978 + }, + { + "epoch": 1.644959367450033, + "grad_norm": 2.0696799755096436, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7092997431755066, + 
"num_tokens": 373976849.0, + "step": 14979 + }, + { + "epoch": 1.6450691851526464, + "grad_norm": 2.322786808013916, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7336634397506714, + "num_tokens": 374001024.0, + "step": 14980 + }, + { + "epoch": 1.6451790028552602, + "grad_norm": 2.620159864425659, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.7051100730895996, + "num_tokens": 374021511.0, + "step": 14981 + }, + { + "epoch": 1.645288820557874, + "grad_norm": 2.238372564315796, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7222463488578796, + "num_tokens": 374048047.0, + "step": 14982 + }, + { + "epoch": 1.6453986382604877, + "grad_norm": 2.2771048545837402, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7208508253097534, + "num_tokens": 374073666.0, + "step": 14983 + }, + { + "epoch": 1.6455084559631012, + "grad_norm": 2.175520181655884, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6930504441261292, + "num_tokens": 374103538.0, + "step": 14984 + }, + { + "epoch": 1.6456182736657148, + "grad_norm": 2.382293701171875, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7384902238845825, + "num_tokens": 374127182.0, + "step": 14985 + }, + { + "epoch": 1.6457280913683285, + "grad_norm": 1.95863676071167, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.7021981477737427, + "num_tokens": 374159848.0, + "step": 14986 + }, + { + "epoch": 1.6458379090709423, + "grad_norm": 2.266514539718628, + "learning_rate": 1e-06, + "loss": 1.0215, + "mean_token_accuracy": 0.6881098747253418, + "num_tokens": 374185705.0, + "step": 14987 + }, + { + "epoch": 1.645947726773556, + "grad_norm": 2.240511178970337, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7212373614311218, + "num_tokens": 374211162.0, + "step": 14988 + }, + { + "epoch": 1.6460575444761696, + "grad_norm": 2.5422439575195312, + 
"learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7268391847610474, + "num_tokens": 374233548.0, + "step": 14989 + }, + { + "epoch": 1.6461673621787831, + "grad_norm": 2.4497458934783936, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.6997500658035278, + "num_tokens": 374257840.0, + "step": 14990 + }, + { + "epoch": 1.6462771798813969, + "grad_norm": 2.2254233360290527, + "learning_rate": 1e-06, + "loss": 0.7891, + "mean_token_accuracy": 0.7497390508651733, + "num_tokens": 374281504.0, + "step": 14991 + }, + { + "epoch": 1.6463869975840106, + "grad_norm": 2.199357748031616, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7125239372253418, + "num_tokens": 374310555.0, + "step": 14992 + }, + { + "epoch": 1.6464968152866242, + "grad_norm": 2.4157588481903076, + "learning_rate": 1e-06, + "loss": 0.8022, + "mean_token_accuracy": 0.7418255805969238, + "num_tokens": 374331676.0, + "step": 14993 + }, + { + "epoch": 1.6466066329892377, + "grad_norm": 2.3895933628082275, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7192403078079224, + "num_tokens": 374356733.0, + "step": 14994 + }, + { + "epoch": 1.6467164506918515, + "grad_norm": 2.1803526878356934, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7063151001930237, + "num_tokens": 374382980.0, + "step": 14995 + }, + { + "epoch": 1.6468262683944652, + "grad_norm": 2.4421095848083496, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7188311815261841, + "num_tokens": 374404762.0, + "step": 14996 + }, + { + "epoch": 1.646936086097079, + "grad_norm": 2.2521250247955322, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7067104578018188, + "num_tokens": 374431329.0, + "step": 14997 + }, + { + "epoch": 1.6470459037996925, + "grad_norm": 2.2086098194122314, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6970276832580566, + "num_tokens": 374457291.0, + 
"step": 14998 + }, + { + "epoch": 1.647155721502306, + "grad_norm": 2.298790216445923, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7280057072639465, + "num_tokens": 374483156.0, + "step": 14999 + }, + { + "epoch": 1.6472655392049198, + "grad_norm": 2.449876070022583, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7059371471405029, + "num_tokens": 374506223.0, + "step": 15000 + }, + { + "epoch": 1.6473753569075336, + "grad_norm": 2.2413392066955566, + "learning_rate": 1e-06, + "loss": 0.7295, + "mean_token_accuracy": 0.764784574508667, + "num_tokens": 374528014.0, + "step": 15001 + }, + { + "epoch": 1.6474851746101473, + "grad_norm": 2.232938766479492, + "learning_rate": 1e-06, + "loss": 0.9924, + "mean_token_accuracy": 0.6889849901199341, + "num_tokens": 374552326.0, + "step": 15002 + }, + { + "epoch": 1.6475949923127609, + "grad_norm": 2.274789810180664, + "learning_rate": 1e-06, + "loss": 0.9371, + "mean_token_accuracy": 0.7106879949569702, + "num_tokens": 374577328.0, + "step": 15003 + }, + { + "epoch": 1.6477048100153744, + "grad_norm": 2.548325777053833, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7121433019638062, + "num_tokens": 374600045.0, + "step": 15004 + }, + { + "epoch": 1.6478146277179881, + "grad_norm": 2.286608934402466, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7110179662704468, + "num_tokens": 374625328.0, + "step": 15005 + }, + { + "epoch": 1.647924445420602, + "grad_norm": 2.512233018875122, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7345842123031616, + "num_tokens": 374645672.0, + "step": 15006 + }, + { + "epoch": 1.6480342631232154, + "grad_norm": 2.2888190746307373, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7298825979232788, + "num_tokens": 374669781.0, + "step": 15007 + }, + { + "epoch": 1.648144080825829, + "grad_norm": 2.4114246368408203, + "learning_rate": 1e-06, + "loss": 
0.9022, + "mean_token_accuracy": 0.7204389572143555, + "num_tokens": 374693467.0, + "step": 15008 + }, + { + "epoch": 1.6482538985284427, + "grad_norm": 2.1366686820983887, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7211315631866455, + "num_tokens": 374722431.0, + "step": 15009 + }, + { + "epoch": 1.6483637162310565, + "grad_norm": 2.082322120666504, + "learning_rate": 1e-06, + "loss": 1.0192, + "mean_token_accuracy": 0.6921778917312622, + "num_tokens": 374752809.0, + "step": 15010 + }, + { + "epoch": 1.6484735339336702, + "grad_norm": 2.468491792678833, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.706528902053833, + "num_tokens": 374774909.0, + "step": 15011 + }, + { + "epoch": 1.6485833516362838, + "grad_norm": 2.007552146911621, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6921377778053284, + "num_tokens": 374807972.0, + "step": 15012 + }, + { + "epoch": 1.6486931693388973, + "grad_norm": 2.032028913497925, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7227156162261963, + "num_tokens": 374838515.0, + "step": 15013 + }, + { + "epoch": 1.648802987041511, + "grad_norm": 2.4414477348327637, + "learning_rate": 1e-06, + "loss": 0.9649, + "mean_token_accuracy": 0.70589280128479, + "num_tokens": 374861313.0, + "step": 15014 + }, + { + "epoch": 1.6489128047441248, + "grad_norm": 2.2623794078826904, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7348310351371765, + "num_tokens": 374887462.0, + "step": 15015 + }, + { + "epoch": 1.6490226224467384, + "grad_norm": 2.3376433849334717, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7289832830429077, + "num_tokens": 374912192.0, + "step": 15016 + }, + { + "epoch": 1.6491324401493521, + "grad_norm": 2.2203235626220703, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.7173937559127808, + "num_tokens": 374938502.0, + "step": 15017 + }, + { + "epoch": 
1.6492422578519657, + "grad_norm": 2.2062203884124756, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7550063133239746, + "num_tokens": 374963010.0, + "step": 15018 + }, + { + "epoch": 1.6493520755545794, + "grad_norm": 1.9961130619049072, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7156674861907959, + "num_tokens": 374992833.0, + "step": 15019 + }, + { + "epoch": 1.6494618932571932, + "grad_norm": 2.1497886180877686, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.704592227935791, + "num_tokens": 375021983.0, + "step": 15020 + }, + { + "epoch": 1.6495717109598067, + "grad_norm": 2.099116086959839, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7198367118835449, + "num_tokens": 375052434.0, + "step": 15021 + }, + { + "epoch": 1.6496815286624202, + "grad_norm": 1.8180572986602783, + "learning_rate": 1e-06, + "loss": 0.9668, + "mean_token_accuracy": 0.7007673382759094, + "num_tokens": 375089584.0, + "step": 15022 + }, + { + "epoch": 1.649791346365034, + "grad_norm": 2.660682439804077, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7217001914978027, + "num_tokens": 375109995.0, + "step": 15023 + }, + { + "epoch": 1.6499011640676478, + "grad_norm": 2.160311222076416, + "learning_rate": 1e-06, + "loss": 0.8232, + "mean_token_accuracy": 0.734508752822876, + "num_tokens": 375136144.0, + "step": 15024 + }, + { + "epoch": 1.6500109817702615, + "grad_norm": 2.2349328994750977, + "learning_rate": 1e-06, + "loss": 0.9579, + "mean_token_accuracy": 0.7084963917732239, + "num_tokens": 375162135.0, + "step": 15025 + }, + { + "epoch": 1.650120799472875, + "grad_norm": 2.233306646347046, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7184814810752869, + "num_tokens": 375189909.0, + "step": 15026 + }, + { + "epoch": 1.6502306171754886, + "grad_norm": 2.302337646484375, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 
0.6983911991119385, + "num_tokens": 375215087.0, + "step": 15027 + }, + { + "epoch": 1.6503404348781023, + "grad_norm": 2.143855333328247, + "learning_rate": 1e-06, + "loss": 0.9651, + "mean_token_accuracy": 0.705106794834137, + "num_tokens": 375244769.0, + "step": 15028 + }, + { + "epoch": 1.650450252580716, + "grad_norm": 2.342679977416992, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7067156434059143, + "num_tokens": 375270033.0, + "step": 15029 + }, + { + "epoch": 1.6505600702833296, + "grad_norm": 2.2352919578552246, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7522085309028625, + "num_tokens": 375294604.0, + "step": 15030 + }, + { + "epoch": 1.6506698879859434, + "grad_norm": 2.3359084129333496, + "learning_rate": 1e-06, + "loss": 0.9698, + "mean_token_accuracy": 0.7046914100646973, + "num_tokens": 375318796.0, + "step": 15031 + }, + { + "epoch": 1.650779705688557, + "grad_norm": 2.2065720558166504, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7170053124427795, + "num_tokens": 375346010.0, + "step": 15032 + }, + { + "epoch": 1.6508895233911707, + "grad_norm": 2.128552198410034, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7209684252738953, + "num_tokens": 375371509.0, + "step": 15033 + }, + { + "epoch": 1.6509993410937844, + "grad_norm": 2.28355073928833, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7266225814819336, + "num_tokens": 375399184.0, + "step": 15034 + }, + { + "epoch": 1.651109158796398, + "grad_norm": 2.2802913188934326, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7325236201286316, + "num_tokens": 375423831.0, + "step": 15035 + }, + { + "epoch": 1.6512189764990115, + "grad_norm": 2.2762348651885986, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7217742204666138, + "num_tokens": 375448542.0, + "step": 15036 + }, + { + "epoch": 1.6513287942016253, + "grad_norm": 
2.2842726707458496, + "learning_rate": 1e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7382615804672241, + "num_tokens": 375471313.0, + "step": 15037 + }, + { + "epoch": 1.651438611904239, + "grad_norm": 2.507824420928955, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7208762764930725, + "num_tokens": 375492668.0, + "step": 15038 + }, + { + "epoch": 1.6515484296068528, + "grad_norm": 2.3711893558502197, + "learning_rate": 1e-06, + "loss": 0.9869, + "mean_token_accuracy": 0.6966370344161987, + "num_tokens": 375517172.0, + "step": 15039 + }, + { + "epoch": 1.6516582473094663, + "grad_norm": 2.3025271892547607, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7295114398002625, + "num_tokens": 375540634.0, + "step": 15040 + }, + { + "epoch": 1.6517680650120798, + "grad_norm": 2.20959734916687, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7091372013092041, + "num_tokens": 375567323.0, + "step": 15041 + }, + { + "epoch": 1.6518778827146936, + "grad_norm": 2.1298274993896484, + "learning_rate": 1e-06, + "loss": 0.9264, + "mean_token_accuracy": 0.7177410125732422, + "num_tokens": 375596465.0, + "step": 15042 + }, + { + "epoch": 1.6519877004173074, + "grad_norm": 2.064512252807617, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6910654306411743, + "num_tokens": 375624864.0, + "step": 15043 + }, + { + "epoch": 1.652097518119921, + "grad_norm": 2.1190974712371826, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7120423316955566, + "num_tokens": 375655187.0, + "step": 15044 + }, + { + "epoch": 1.6522073358225344, + "grad_norm": 2.427036762237549, + "learning_rate": 1e-06, + "loss": 0.9444, + "mean_token_accuracy": 0.7121115922927856, + "num_tokens": 375677418.0, + "step": 15045 + }, + { + "epoch": 1.6523171535251482, + "grad_norm": 2.51775860786438, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.7154271602630615, + "num_tokens": 
375699030.0, + "step": 15046 + }, + { + "epoch": 1.652426971227762, + "grad_norm": 2.37640380859375, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7174832820892334, + "num_tokens": 375722285.0, + "step": 15047 + }, + { + "epoch": 1.6525367889303757, + "grad_norm": 2.1434590816497803, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7208156585693359, + "num_tokens": 375749762.0, + "step": 15048 + }, + { + "epoch": 1.6526466066329892, + "grad_norm": 2.1794521808624268, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7037764191627502, + "num_tokens": 375776440.0, + "step": 15049 + }, + { + "epoch": 1.6527564243356028, + "grad_norm": 2.259493350982666, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7263633012771606, + "num_tokens": 375801932.0, + "step": 15050 + }, + { + "epoch": 1.6528662420382165, + "grad_norm": 2.623734712600708, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7246305346488953, + "num_tokens": 375820917.0, + "step": 15051 + }, + { + "epoch": 1.6529760597408303, + "grad_norm": 2.4804646968841553, + "learning_rate": 1e-06, + "loss": 0.8581, + "mean_token_accuracy": 0.7305516600608826, + "num_tokens": 375840338.0, + "step": 15052 + }, + { + "epoch": 1.653085877443444, + "grad_norm": 2.3210887908935547, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7264133095741272, + "num_tokens": 375864683.0, + "step": 15053 + }, + { + "epoch": 1.6531956951460576, + "grad_norm": 2.436035633087158, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7081922888755798, + "num_tokens": 375886475.0, + "step": 15054 + }, + { + "epoch": 1.653305512848671, + "grad_norm": 2.6459224224090576, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7347554564476013, + "num_tokens": 375905140.0, + "step": 15055 + }, + { + "epoch": 1.6534153305512849, + "grad_norm": 2.310044765472412, + "learning_rate": 
1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.6996391415596008, + "num_tokens": 375929758.0, + "step": 15056 + }, + { + "epoch": 1.6535251482538986, + "grad_norm": 2.372061252593994, + "learning_rate": 1e-06, + "loss": 0.7946, + "mean_token_accuracy": 0.7411391139030457, + "num_tokens": 375950894.0, + "step": 15057 + }, + { + "epoch": 1.6536349659565122, + "grad_norm": 2.3774821758270264, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7258483171463013, + "num_tokens": 375974414.0, + "step": 15058 + }, + { + "epoch": 1.6537447836591257, + "grad_norm": 2.1606955528259277, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7177512645721436, + "num_tokens": 376000033.0, + "step": 15059 + }, + { + "epoch": 1.6538546013617395, + "grad_norm": 2.511005401611328, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7318407297134399, + "num_tokens": 376020269.0, + "step": 15060 + }, + { + "epoch": 1.6539644190643532, + "grad_norm": 2.3301308155059814, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7553696632385254, + "num_tokens": 376041451.0, + "step": 15061 + }, + { + "epoch": 1.654074236766967, + "grad_norm": 2.38349986076355, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7077528238296509, + "num_tokens": 376065239.0, + "step": 15062 + }, + { + "epoch": 1.6541840544695805, + "grad_norm": 2.0693929195404053, + "learning_rate": 1e-06, + "loss": 0.9452, + "mean_token_accuracy": 0.7014164924621582, + "num_tokens": 376094666.0, + "step": 15063 + }, + { + "epoch": 1.654293872172194, + "grad_norm": 2.2345311641693115, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7178907990455627, + "num_tokens": 376120229.0, + "step": 15064 + }, + { + "epoch": 1.6544036898748078, + "grad_norm": 2.3589460849761963, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.7275096774101257, + "num_tokens": 376145321.0, + "step": 15065 + }, + { 
+ "epoch": 1.6545135075774215, + "grad_norm": 2.294992685317993, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.7263248562812805, + "num_tokens": 376170277.0, + "step": 15066 + }, + { + "epoch": 1.6546233252800353, + "grad_norm": 2.3432259559631348, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7274348735809326, + "num_tokens": 376195838.0, + "step": 15067 + }, + { + "epoch": 1.6547331429826488, + "grad_norm": 2.448178768157959, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7373583316802979, + "num_tokens": 376219256.0, + "step": 15068 + }, + { + "epoch": 1.6548429606852624, + "grad_norm": 2.5674121379852295, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7199349403381348, + "num_tokens": 376238479.0, + "step": 15069 + }, + { + "epoch": 1.6549527783878761, + "grad_norm": 2.4012796878814697, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7245969772338867, + "num_tokens": 376262331.0, + "step": 15070 + }, + { + "epoch": 1.65506259609049, + "grad_norm": 2.024862289428711, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7280913591384888, + "num_tokens": 376290851.0, + "step": 15071 + }, + { + "epoch": 1.6551724137931034, + "grad_norm": 2.1714298725128174, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7213114500045776, + "num_tokens": 376317454.0, + "step": 15072 + }, + { + "epoch": 1.655282231495717, + "grad_norm": 2.677157163619995, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.729046642780304, + "num_tokens": 376335917.0, + "step": 15073 + }, + { + "epoch": 1.6553920491983307, + "grad_norm": 2.7967848777770996, + "learning_rate": 1e-06, + "loss": 0.8833, + "mean_token_accuracy": 0.7225822806358337, + "num_tokens": 376353060.0, + "step": 15074 + }, + { + "epoch": 1.6555018669009445, + "grad_norm": 2.0183396339416504, + "learning_rate": 1e-06, + "loss": 0.9651, + 
"mean_token_accuracy": 0.7025519609451294, + "num_tokens": 376383323.0, + "step": 15075 + }, + { + "epoch": 1.6556116846035582, + "grad_norm": 2.0188968181610107, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7248364090919495, + "num_tokens": 376413958.0, + "step": 15076 + }, + { + "epoch": 1.6557215023061718, + "grad_norm": 2.3702170848846436, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7267827987670898, + "num_tokens": 376437879.0, + "step": 15077 + }, + { + "epoch": 1.6558313200087853, + "grad_norm": 2.478179693222046, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7185612320899963, + "num_tokens": 376460113.0, + "step": 15078 + }, + { + "epoch": 1.655941137711399, + "grad_norm": 2.7716078758239746, + "learning_rate": 1e-06, + "loss": 0.8702, + "mean_token_accuracy": 0.7272917032241821, + "num_tokens": 376479749.0, + "step": 15079 + }, + { + "epoch": 1.6560509554140128, + "grad_norm": 2.179356575012207, + "learning_rate": 1e-06, + "loss": 0.8922, + "mean_token_accuracy": 0.7277787923812866, + "num_tokens": 376509050.0, + "step": 15080 + }, + { + "epoch": 1.6561607731166264, + "grad_norm": 2.3698182106018066, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.7052900791168213, + "num_tokens": 376532045.0, + "step": 15081 + }, + { + "epoch": 1.65627059081924, + "grad_norm": 2.0573041439056396, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7026093006134033, + "num_tokens": 376562460.0, + "step": 15082 + }, + { + "epoch": 1.6563804085218536, + "grad_norm": 2.1141304969787598, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7155618667602539, + "num_tokens": 376589230.0, + "step": 15083 + }, + { + "epoch": 1.6564902262244674, + "grad_norm": 2.525453805923462, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7411835193634033, + "num_tokens": 376608926.0, + "step": 15084 + }, + { + "epoch": 
1.6566000439270812, + "grad_norm": 2.246225357055664, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7326305508613586, + "num_tokens": 376635840.0, + "step": 15085 + }, + { + "epoch": 1.6567098616296947, + "grad_norm": 1.9995518922805786, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7216286659240723, + "num_tokens": 376664808.0, + "step": 15086 + }, + { + "epoch": 1.6568196793323082, + "grad_norm": 2.006343364715576, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7232096195220947, + "num_tokens": 376695359.0, + "step": 15087 + }, + { + "epoch": 1.656929497034922, + "grad_norm": 2.444441080093384, + "learning_rate": 1e-06, + "loss": 0.7985, + "mean_token_accuracy": 0.7438802719116211, + "num_tokens": 376716586.0, + "step": 15088 + }, + { + "epoch": 1.6570393147375357, + "grad_norm": 2.2618730068206787, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7458503246307373, + "num_tokens": 376741589.0, + "step": 15089 + }, + { + "epoch": 1.6571491324401495, + "grad_norm": 2.1938400268554688, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.721550464630127, + "num_tokens": 376768263.0, + "step": 15090 + }, + { + "epoch": 1.657258950142763, + "grad_norm": 1.9886648654937744, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7302378416061401, + "num_tokens": 376797701.0, + "step": 15091 + }, + { + "epoch": 1.6573687678453766, + "grad_norm": 2.192064046859741, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7047519683837891, + "num_tokens": 376825460.0, + "step": 15092 + }, + { + "epoch": 1.6574785855479903, + "grad_norm": 2.4204046726226807, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7403853535652161, + "num_tokens": 376846852.0, + "step": 15093 + }, + { + "epoch": 1.657588403250604, + "grad_norm": 2.2898924350738525, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 
0.7401277422904968, + "num_tokens": 376870704.0, + "step": 15094 + }, + { + "epoch": 1.6576982209532176, + "grad_norm": 2.533237934112549, + "learning_rate": 1e-06, + "loss": 0.8611, + "mean_token_accuracy": 0.729161262512207, + "num_tokens": 376892452.0, + "step": 15095 + }, + { + "epoch": 1.6578080386558314, + "grad_norm": 2.3003718852996826, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.71103835105896, + "num_tokens": 376919556.0, + "step": 15096 + }, + { + "epoch": 1.657917856358445, + "grad_norm": 2.499479055404663, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7267554998397827, + "num_tokens": 376941187.0, + "step": 15097 + }, + { + "epoch": 1.6580276740610587, + "grad_norm": 2.0901038646698, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7139223217964172, + "num_tokens": 376970928.0, + "step": 15098 + }, + { + "epoch": 1.6581374917636724, + "grad_norm": 2.1341147422790527, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7268201112747192, + "num_tokens": 376998881.0, + "step": 15099 + }, + { + "epoch": 1.658247309466286, + "grad_norm": 2.3691697120666504, + "learning_rate": 1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.739276647567749, + "num_tokens": 377020376.0, + "step": 15100 + }, + { + "epoch": 1.6583571271688995, + "grad_norm": 2.7327942848205566, + "learning_rate": 1e-06, + "loss": 0.8049, + "mean_token_accuracy": 0.7407471537590027, + "num_tokens": 377038923.0, + "step": 15101 + }, + { + "epoch": 1.6584669448715132, + "grad_norm": 2.219708204269409, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7302650213241577, + "num_tokens": 377063020.0, + "step": 15102 + }, + { + "epoch": 1.658576762574127, + "grad_norm": 2.198692798614502, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7180964946746826, + "num_tokens": 377088632.0, + "step": 15103 + }, + { + "epoch": 1.6586865802767408, + "grad_norm": 
2.52694034576416, + "learning_rate": 1e-06, + "loss": 0.7871, + "mean_token_accuracy": 0.7522236108779907, + "num_tokens": 377107813.0, + "step": 15104 + }, + { + "epoch": 1.6587963979793543, + "grad_norm": 2.3428664207458496, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7167763113975525, + "num_tokens": 377133992.0, + "step": 15105 + }, + { + "epoch": 1.6589062156819678, + "grad_norm": 2.3817591667175293, + "learning_rate": 1e-06, + "loss": 0.9404, + "mean_token_accuracy": 0.7019569277763367, + "num_tokens": 377156893.0, + "step": 15106 + }, + { + "epoch": 1.6590160333845816, + "grad_norm": 2.061707019805908, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.6968689560890198, + "num_tokens": 377186604.0, + "step": 15107 + }, + { + "epoch": 1.6591258510871953, + "grad_norm": 2.183377504348755, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7240293025970459, + "num_tokens": 377212409.0, + "step": 15108 + }, + { + "epoch": 1.6592356687898089, + "grad_norm": 2.2332475185394287, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7579847574234009, + "num_tokens": 377234790.0, + "step": 15109 + }, + { + "epoch": 1.6593454864924224, + "grad_norm": 2.401926040649414, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.7216474413871765, + "num_tokens": 377255996.0, + "step": 15110 + }, + { + "epoch": 1.6594553041950362, + "grad_norm": 2.1089377403259277, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7022531032562256, + "num_tokens": 377282834.0, + "step": 15111 + }, + { + "epoch": 1.65956512189765, + "grad_norm": 2.0581130981445312, + "learning_rate": 1e-06, + "loss": 0.9964, + "mean_token_accuracy": 0.6966986656188965, + "num_tokens": 377311073.0, + "step": 15112 + }, + { + "epoch": 1.6596749396002637, + "grad_norm": 2.3154191970825195, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7396354079246521, + "num_tokens": 
377331853.0, + "step": 15113 + }, + { + "epoch": 1.6597847573028772, + "grad_norm": 2.2940237522125244, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7221283912658691, + "num_tokens": 377356410.0, + "step": 15114 + }, + { + "epoch": 1.6598945750054908, + "grad_norm": 2.283510684967041, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7093478441238403, + "num_tokens": 377380720.0, + "step": 15115 + }, + { + "epoch": 1.6600043927081045, + "grad_norm": 2.238966226577759, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7149454355239868, + "num_tokens": 377408144.0, + "step": 15116 + }, + { + "epoch": 1.6601142104107183, + "grad_norm": 2.184149980545044, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7234455347061157, + "num_tokens": 377435011.0, + "step": 15117 + }, + { + "epoch": 1.660224028113332, + "grad_norm": 2.00643253326416, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7096671462059021, + "num_tokens": 377464825.0, + "step": 15118 + }, + { + "epoch": 1.6603338458159456, + "grad_norm": 2.2729239463806152, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7142784595489502, + "num_tokens": 377491367.0, + "step": 15119 + }, + { + "epoch": 1.660443663518559, + "grad_norm": 2.5716590881347656, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7324669361114502, + "num_tokens": 377512083.0, + "step": 15120 + }, + { + "epoch": 1.6605534812211729, + "grad_norm": 2.0872724056243896, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7365289330482483, + "num_tokens": 377537533.0, + "step": 15121 + }, + { + "epoch": 1.6606632989237866, + "grad_norm": 2.1397135257720947, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.751014232635498, + "num_tokens": 377561864.0, + "step": 15122 + }, + { + "epoch": 1.6607731166264001, + "grad_norm": 2.0791141986846924, + "learning_rate": 
1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7156527638435364, + "num_tokens": 377591034.0, + "step": 15123 + }, + { + "epoch": 1.6608829343290137, + "grad_norm": 2.6657817363739014, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7221939563751221, + "num_tokens": 377611951.0, + "step": 15124 + }, + { + "epoch": 1.6609927520316274, + "grad_norm": 2.301224708557129, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7229442596435547, + "num_tokens": 377637372.0, + "step": 15125 + }, + { + "epoch": 1.6611025697342412, + "grad_norm": 2.086927890777588, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7139161229133606, + "num_tokens": 377666073.0, + "step": 15126 + }, + { + "epoch": 1.661212387436855, + "grad_norm": 2.433349847793579, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7095797657966614, + "num_tokens": 377688045.0, + "step": 15127 + }, + { + "epoch": 1.6613222051394685, + "grad_norm": 2.0301313400268555, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7128832936286926, + "num_tokens": 377717229.0, + "step": 15128 + }, + { + "epoch": 1.661432022842082, + "grad_norm": 2.3049612045288086, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7290744185447693, + "num_tokens": 377742062.0, + "step": 15129 + }, + { + "epoch": 1.6615418405446958, + "grad_norm": 2.2322404384613037, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7350319623947144, + "num_tokens": 377767477.0, + "step": 15130 + }, + { + "epoch": 1.6616516582473095, + "grad_norm": 2.130016803741455, + "learning_rate": 1e-06, + "loss": 0.9806, + "mean_token_accuracy": 0.7024854421615601, + "num_tokens": 377797969.0, + "step": 15131 + }, + { + "epoch": 1.661761475949923, + "grad_norm": 2.0763282775878906, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.714448869228363, + "num_tokens": 377825078.0, + "step": 15132 + }, + { + 
"epoch": 1.6618712936525368, + "grad_norm": 2.241180896759033, + "learning_rate": 1e-06, + "loss": 0.8819, + "mean_token_accuracy": 0.7262297868728638, + "num_tokens": 377851572.0, + "step": 15133 + }, + { + "epoch": 1.6619811113551504, + "grad_norm": 2.0596704483032227, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7252553701400757, + "num_tokens": 377880325.0, + "step": 15134 + }, + { + "epoch": 1.6620909290577641, + "grad_norm": 2.280977249145508, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.6999533772468567, + "num_tokens": 377906298.0, + "step": 15135 + }, + { + "epoch": 1.6622007467603779, + "grad_norm": 2.1827895641326904, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7078138589859009, + "num_tokens": 377933261.0, + "step": 15136 + }, + { + "epoch": 1.6623105644629914, + "grad_norm": 2.385469436645508, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7231333255767822, + "num_tokens": 377954863.0, + "step": 15137 + }, + { + "epoch": 1.662420382165605, + "grad_norm": 2.116933584213257, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7225174903869629, + "num_tokens": 377986076.0, + "step": 15138 + }, + { + "epoch": 1.6625301998682187, + "grad_norm": 2.2967679500579834, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7168205976486206, + "num_tokens": 378009432.0, + "step": 15139 + }, + { + "epoch": 1.6626400175708325, + "grad_norm": 1.7669456005096436, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6914547681808472, + "num_tokens": 378047071.0, + "step": 15140 + }, + { + "epoch": 1.6627498352734462, + "grad_norm": 2.7288880348205566, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.7461308240890503, + "num_tokens": 378064990.0, + "step": 15141 + }, + { + "epoch": 1.6628596529760598, + "grad_norm": 2.1059281826019287, + "learning_rate": 1e-06, + "loss": 0.8415, + 
"mean_token_accuracy": 0.73423171043396, + "num_tokens": 378090762.0, + "step": 15142 + }, + { + "epoch": 1.6629694706786733, + "grad_norm": 2.595416784286499, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7120828628540039, + "num_tokens": 378111607.0, + "step": 15143 + }, + { + "epoch": 1.663079288381287, + "grad_norm": 2.267526388168335, + "learning_rate": 1e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7361249923706055, + "num_tokens": 378135090.0, + "step": 15144 + }, + { + "epoch": 1.6631891060839008, + "grad_norm": 2.4604618549346924, + "learning_rate": 1e-06, + "loss": 0.8324, + "mean_token_accuracy": 0.7390649318695068, + "num_tokens": 378159067.0, + "step": 15145 + }, + { + "epoch": 1.6632989237865143, + "grad_norm": 2.3924763202667236, + "learning_rate": 1e-06, + "loss": 0.7992, + "mean_token_accuracy": 0.7440481781959534, + "num_tokens": 378180520.0, + "step": 15146 + }, + { + "epoch": 1.663408741489128, + "grad_norm": 2.171508550643921, + "learning_rate": 1e-06, + "loss": 0.9531, + "mean_token_accuracy": 0.705367922782898, + "num_tokens": 378207529.0, + "step": 15147 + }, + { + "epoch": 1.6635185591917416, + "grad_norm": 2.3032474517822266, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7370679974555969, + "num_tokens": 378229337.0, + "step": 15148 + }, + { + "epoch": 1.6636283768943554, + "grad_norm": 2.1806693077087402, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.6944372057914734, + "num_tokens": 378257183.0, + "step": 15149 + }, + { + "epoch": 1.6637381945969691, + "grad_norm": 2.0502662658691406, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7324576377868652, + "num_tokens": 378284907.0, + "step": 15150 + }, + { + "epoch": 1.6638480122995827, + "grad_norm": 2.601857900619507, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.721473217010498, + "num_tokens": 378304567.0, + "step": 15151 + }, + { + "epoch": 1.6639578300021962, + 
"grad_norm": 2.368892192840576, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.6973188519477844, + "num_tokens": 378328111.0, + "step": 15152 + }, + { + "epoch": 1.66406764770481, + "grad_norm": 2.659971237182617, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.726730227470398, + "num_tokens": 378347889.0, + "step": 15153 + }, + { + "epoch": 1.6641774654074237, + "grad_norm": 2.263155221939087, + "learning_rate": 1e-06, + "loss": 0.983, + "mean_token_accuracy": 0.7044979929924011, + "num_tokens": 378373464.0, + "step": 15154 + }, + { + "epoch": 1.6642872831100375, + "grad_norm": 2.0641372203826904, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7163177728652954, + "num_tokens": 378403236.0, + "step": 15155 + }, + { + "epoch": 1.664397100812651, + "grad_norm": 2.2443857192993164, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7155040502548218, + "num_tokens": 378428871.0, + "step": 15156 + }, + { + "epoch": 1.6645069185152646, + "grad_norm": 2.108952760696411, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7429838180541992, + "num_tokens": 378457054.0, + "step": 15157 + }, + { + "epoch": 1.6646167362178783, + "grad_norm": 2.5637433528900146, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.706547200679779, + "num_tokens": 378476876.0, + "step": 15158 + }, + { + "epoch": 1.664726553920492, + "grad_norm": 2.3300366401672363, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.7089919447898865, + "num_tokens": 378503024.0, + "step": 15159 + }, + { + "epoch": 1.6648363716231056, + "grad_norm": 2.3098652362823486, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7331517934799194, + "num_tokens": 378525950.0, + "step": 15160 + }, + { + "epoch": 1.6649461893257194, + "grad_norm": 2.2203307151794434, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7369680404663086, + 
"num_tokens": 378550405.0, + "step": 15161 + }, + { + "epoch": 1.665056007028333, + "grad_norm": 1.964030385017395, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7065062522888184, + "num_tokens": 378584976.0, + "step": 15162 + }, + { + "epoch": 1.6651658247309467, + "grad_norm": 1.940887689590454, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7110216617584229, + "num_tokens": 378618603.0, + "step": 15163 + }, + { + "epoch": 1.6652756424335604, + "grad_norm": 2.2500174045562744, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6928845047950745, + "num_tokens": 378644513.0, + "step": 15164 + }, + { + "epoch": 1.665385460136174, + "grad_norm": 2.353421211242676, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7181211709976196, + "num_tokens": 378667040.0, + "step": 15165 + }, + { + "epoch": 1.6654952778387875, + "grad_norm": 2.2426345348358154, + "learning_rate": 1e-06, + "loss": 0.8585, + "mean_token_accuracy": 0.7285077571868896, + "num_tokens": 378693212.0, + "step": 15166 + }, + { + "epoch": 1.6656050955414012, + "grad_norm": 2.39385986328125, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7269136905670166, + "num_tokens": 378714677.0, + "step": 15167 + }, + { + "epoch": 1.665714913244015, + "grad_norm": 2.1331045627593994, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7081197500228882, + "num_tokens": 378743184.0, + "step": 15168 + }, + { + "epoch": 1.6658247309466288, + "grad_norm": 2.375173568725586, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7424963712692261, + "num_tokens": 378766123.0, + "step": 15169 + }, + { + "epoch": 1.6659345486492423, + "grad_norm": 2.096733808517456, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7068910598754883, + "num_tokens": 378793582.0, + "step": 15170 + }, + { + "epoch": 1.6660443663518558, + "grad_norm": 2.1586077213287354, + 
"learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7328211665153503, + "num_tokens": 378821909.0, + "step": 15171 + }, + { + "epoch": 1.6661541840544696, + "grad_norm": 2.105614185333252, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.6971561312675476, + "num_tokens": 378852382.0, + "step": 15172 + }, + { + "epoch": 1.6662640017570833, + "grad_norm": 2.1572372913360596, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7233951091766357, + "num_tokens": 378878941.0, + "step": 15173 + }, + { + "epoch": 1.6663738194596969, + "grad_norm": 2.0509161949157715, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7167977094650269, + "num_tokens": 378908745.0, + "step": 15174 + }, + { + "epoch": 1.6664836371623104, + "grad_norm": 2.140791416168213, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.7272039651870728, + "num_tokens": 378934409.0, + "step": 15175 + }, + { + "epoch": 1.6665934548649242, + "grad_norm": 2.0208394527435303, + "learning_rate": 1e-06, + "loss": 0.8475, + "mean_token_accuracy": 0.7344974279403687, + "num_tokens": 378965602.0, + "step": 15176 + }, + { + "epoch": 1.666703272567538, + "grad_norm": 2.3058178424835205, + "learning_rate": 1e-06, + "loss": 0.9786, + "mean_token_accuracy": 0.6949887871742249, + "num_tokens": 378992303.0, + "step": 15177 + }, + { + "epoch": 1.6668130902701517, + "grad_norm": 2.038569211959839, + "learning_rate": 1e-06, + "loss": 0.8975, + "mean_token_accuracy": 0.729667067527771, + "num_tokens": 379020111.0, + "step": 15178 + }, + { + "epoch": 1.6669229079727652, + "grad_norm": 2.2283360958099365, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7225997447967529, + "num_tokens": 379045496.0, + "step": 15179 + }, + { + "epoch": 1.6670327256753787, + "grad_norm": 2.0410687923431396, + "learning_rate": 1e-06, + "loss": 0.9879, + "mean_token_accuracy": 0.693108856678009, + "num_tokens": 379076783.0, + "step": 
15180 + }, + { + "epoch": 1.6671425433779925, + "grad_norm": 2.464578151702881, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7334634065628052, + "num_tokens": 379099075.0, + "step": 15181 + }, + { + "epoch": 1.6672523610806063, + "grad_norm": 2.3988800048828125, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7375123500823975, + "num_tokens": 379121326.0, + "step": 15182 + }, + { + "epoch": 1.66736217878322, + "grad_norm": 2.2550220489501953, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7381922006607056, + "num_tokens": 379144208.0, + "step": 15183 + }, + { + "epoch": 1.6674719964858336, + "grad_norm": 2.3973183631896973, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7348004579544067, + "num_tokens": 379166195.0, + "step": 15184 + }, + { + "epoch": 1.667581814188447, + "grad_norm": 2.658001661300659, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.720691978931427, + "num_tokens": 379185869.0, + "step": 15185 + }, + { + "epoch": 1.6676916318910608, + "grad_norm": 2.123469591140747, + "learning_rate": 1e-06, + "loss": 0.9499, + "mean_token_accuracy": 0.7056335210800171, + "num_tokens": 379214481.0, + "step": 15186 + }, + { + "epoch": 1.6678014495936746, + "grad_norm": 2.4697227478027344, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7173870205879211, + "num_tokens": 379236690.0, + "step": 15187 + }, + { + "epoch": 1.6679112672962881, + "grad_norm": 2.3972582817077637, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7078485488891602, + "num_tokens": 379260659.0, + "step": 15188 + }, + { + "epoch": 1.6680210849989017, + "grad_norm": 2.348266363143921, + "learning_rate": 1e-06, + "loss": 0.9761, + "mean_token_accuracy": 0.7102611064910889, + "num_tokens": 379284366.0, + "step": 15189 + }, + { + "epoch": 1.6681309027015154, + "grad_norm": 2.4818553924560547, + "learning_rate": 1e-06, + "loss": 0.9348, + 
"mean_token_accuracy": 0.7112307548522949, + "num_tokens": 379306403.0, + "step": 15190 + }, + { + "epoch": 1.6682407204041292, + "grad_norm": 2.044081687927246, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7427271008491516, + "num_tokens": 379335247.0, + "step": 15191 + }, + { + "epoch": 1.668350538106743, + "grad_norm": 2.0042595863342285, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7278667688369751, + "num_tokens": 379366375.0, + "step": 15192 + }, + { + "epoch": 1.6684603558093565, + "grad_norm": 2.2000389099121094, + "learning_rate": 1e-06, + "loss": 0.9739, + "mean_token_accuracy": 0.7095359563827515, + "num_tokens": 379392715.0, + "step": 15193 + }, + { + "epoch": 1.66857017351197, + "grad_norm": 2.460209608078003, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7411419749259949, + "num_tokens": 379413404.0, + "step": 15194 + }, + { + "epoch": 1.6686799912145838, + "grad_norm": 2.5364484786987305, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7346035242080688, + "num_tokens": 379432286.0, + "step": 15195 + }, + { + "epoch": 1.6687898089171975, + "grad_norm": 2.319546699523926, + "learning_rate": 1e-06, + "loss": 0.8812, + "mean_token_accuracy": 0.7211923599243164, + "num_tokens": 379456181.0, + "step": 15196 + }, + { + "epoch": 1.668899626619811, + "grad_norm": 2.2305409908294678, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7281193137168884, + "num_tokens": 379480828.0, + "step": 15197 + }, + { + "epoch": 1.6690094443224248, + "grad_norm": 2.29547381401062, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7138040065765381, + "num_tokens": 379506726.0, + "step": 15198 + }, + { + "epoch": 1.6691192620250384, + "grad_norm": 1.8864928483963013, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7391895651817322, + "num_tokens": 379539533.0, + "step": 15199 + }, + { + "epoch": 1.669229079727652, + 
"grad_norm": 2.243450164794922, + "learning_rate": 1e-06, + "loss": 0.8838, + "mean_token_accuracy": 0.7229746580123901, + "num_tokens": 379565173.0, + "step": 15200 + }, + { + "epoch": 1.6693388974302659, + "grad_norm": 2.299978017807007, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7216914296150208, + "num_tokens": 379590603.0, + "step": 15201 + }, + { + "epoch": 1.6694487151328794, + "grad_norm": 2.322463035583496, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7117601037025452, + "num_tokens": 379615220.0, + "step": 15202 + }, + { + "epoch": 1.669558532835493, + "grad_norm": 2.3163034915924072, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7218078374862671, + "num_tokens": 379639138.0, + "step": 15203 + }, + { + "epoch": 1.6696683505381067, + "grad_norm": 2.4199533462524414, + "learning_rate": 1e-06, + "loss": 0.7881, + "mean_token_accuracy": 0.7479748725891113, + "num_tokens": 379659767.0, + "step": 15204 + }, + { + "epoch": 1.6697781682407205, + "grad_norm": 2.5569372177124023, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7138440608978271, + "num_tokens": 379681674.0, + "step": 15205 + }, + { + "epoch": 1.6698879859433342, + "grad_norm": 2.294816493988037, + "learning_rate": 1e-06, + "loss": 0.9222, + "mean_token_accuracy": 0.7203752994537354, + "num_tokens": 379707471.0, + "step": 15206 + }, + { + "epoch": 1.6699978036459477, + "grad_norm": 2.063835620880127, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7084457874298096, + "num_tokens": 379736459.0, + "step": 15207 + }, + { + "epoch": 1.6701076213485613, + "grad_norm": 2.1680729389190674, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7147514820098877, + "num_tokens": 379764107.0, + "step": 15208 + }, + { + "epoch": 1.670217439051175, + "grad_norm": 2.558202028274536, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7103279829025269, + 
"num_tokens": 379784397.0, + "step": 15209 + }, + { + "epoch": 1.6703272567537888, + "grad_norm": 2.5412235260009766, + "learning_rate": 1e-06, + "loss": 0.9083, + "mean_token_accuracy": 0.7138914465904236, + "num_tokens": 379805584.0, + "step": 15210 + }, + { + "epoch": 1.6704370744564023, + "grad_norm": 2.4402213096618652, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7146220803260803, + "num_tokens": 379828080.0, + "step": 15211 + }, + { + "epoch": 1.670546892159016, + "grad_norm": 2.338750123977661, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7383885383605957, + "num_tokens": 379850700.0, + "step": 15212 + }, + { + "epoch": 1.6706567098616296, + "grad_norm": 2.776458501815796, + "learning_rate": 1e-06, + "loss": 0.8202, + "mean_token_accuracy": 0.7358055710792542, + "num_tokens": 379868212.0, + "step": 15213 + }, + { + "epoch": 1.6707665275642434, + "grad_norm": 2.3462886810302734, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7287065982818604, + "num_tokens": 379889327.0, + "step": 15214 + }, + { + "epoch": 1.6708763452668571, + "grad_norm": 2.5424342155456543, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7163347005844116, + "num_tokens": 379911129.0, + "step": 15215 + }, + { + "epoch": 1.6709861629694707, + "grad_norm": 2.421273708343506, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7184714078903198, + "num_tokens": 379935353.0, + "step": 15216 + }, + { + "epoch": 1.6710959806720842, + "grad_norm": 2.1736435890197754, + "learning_rate": 1e-06, + "loss": 0.9423, + "mean_token_accuracy": 0.716455340385437, + "num_tokens": 379961175.0, + "step": 15217 + }, + { + "epoch": 1.671205798374698, + "grad_norm": 2.2672066688537598, + "learning_rate": 1e-06, + "loss": 0.9338, + "mean_token_accuracy": 0.7192987203598022, + "num_tokens": 379986797.0, + "step": 15218 + }, + { + "epoch": 1.6713156160773117, + "grad_norm": 2.1913516521453857, + 
"learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7365155816078186, + "num_tokens": 380011596.0, + "step": 15219 + }, + { + "epoch": 1.6714254337799255, + "grad_norm": 2.150663137435913, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7123929858207703, + "num_tokens": 380041019.0, + "step": 15220 + }, + { + "epoch": 1.671535251482539, + "grad_norm": 2.0839192867279053, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7286132574081421, + "num_tokens": 380068300.0, + "step": 15221 + }, + { + "epoch": 1.6716450691851525, + "grad_norm": 2.22670316696167, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6970025897026062, + "num_tokens": 380095159.0, + "step": 15222 + }, + { + "epoch": 1.6717548868877663, + "grad_norm": 2.4839797019958496, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.713550329208374, + "num_tokens": 380115410.0, + "step": 15223 + }, + { + "epoch": 1.67186470459038, + "grad_norm": 2.27036714553833, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7205432057380676, + "num_tokens": 380141901.0, + "step": 15224 + }, + { + "epoch": 1.6719745222929936, + "grad_norm": 2.211808681488037, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7159430384635925, + "num_tokens": 380167747.0, + "step": 15225 + }, + { + "epoch": 1.6720843399956071, + "grad_norm": 2.437351942062378, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7110280394554138, + "num_tokens": 380189501.0, + "step": 15226 + }, + { + "epoch": 1.6721941576982209, + "grad_norm": 1.9478213787078857, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7250709533691406, + "num_tokens": 380220267.0, + "step": 15227 + }, + { + "epoch": 1.6723039754008346, + "grad_norm": 2.734625816345215, + "learning_rate": 1e-06, + "loss": 0.7902, + "mean_token_accuracy": 0.7463398575782776, + "num_tokens": 380237966.0, + "step": 15228 
+ }, + { + "epoch": 1.6724137931034484, + "grad_norm": 2.0923898220062256, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.7056245803833008, + "num_tokens": 380267651.0, + "step": 15229 + }, + { + "epoch": 1.672523610806062, + "grad_norm": 2.185652256011963, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.7219699025154114, + "num_tokens": 380295281.0, + "step": 15230 + }, + { + "epoch": 1.6726334285086755, + "grad_norm": 2.544921875, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7331694960594177, + "num_tokens": 380315822.0, + "step": 15231 + }, + { + "epoch": 1.6727432462112892, + "grad_norm": 2.1789278984069824, + "learning_rate": 1e-06, + "loss": 0.9748, + "mean_token_accuracy": 0.7116480469703674, + "num_tokens": 380343312.0, + "step": 15232 + }, + { + "epoch": 1.672853063913903, + "grad_norm": 2.6062140464782715, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7308838367462158, + "num_tokens": 380363717.0, + "step": 15233 + }, + { + "epoch": 1.6729628816165167, + "grad_norm": 2.9461145401000977, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7372128367424011, + "num_tokens": 380378539.0, + "step": 15234 + }, + { + "epoch": 1.6730726993191303, + "grad_norm": 2.6396491527557373, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7295119762420654, + "num_tokens": 380398967.0, + "step": 15235 + }, + { + "epoch": 1.6731825170217438, + "grad_norm": 2.2866780757904053, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7363101840019226, + "num_tokens": 380422852.0, + "step": 15236 + }, + { + "epoch": 1.6732923347243576, + "grad_norm": 2.2254552841186523, + "learning_rate": 1e-06, + "loss": 0.9292, + "mean_token_accuracy": 0.7113336324691772, + "num_tokens": 380449906.0, + "step": 15237 + }, + { + "epoch": 1.6734021524269713, + "grad_norm": 2.5458054542541504, + "learning_rate": 1e-06, + "loss": 0.8718, + 
"mean_token_accuracy": 0.7369957566261292, + "num_tokens": 380469094.0, + "step": 15238 + }, + { + "epoch": 1.6735119701295849, + "grad_norm": 2.906010866165161, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7428840398788452, + "num_tokens": 380486451.0, + "step": 15239 + }, + { + "epoch": 1.6736217878321984, + "grad_norm": 2.2893269062042236, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7254350781440735, + "num_tokens": 380509970.0, + "step": 15240 + }, + { + "epoch": 1.6737316055348122, + "grad_norm": 2.418545722961426, + "learning_rate": 1e-06, + "loss": 0.818, + "mean_token_accuracy": 0.7368766069412231, + "num_tokens": 380531516.0, + "step": 15241 + }, + { + "epoch": 1.673841423237426, + "grad_norm": 2.393394947052002, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.732649564743042, + "num_tokens": 380553253.0, + "step": 15242 + }, + { + "epoch": 1.6739512409400397, + "grad_norm": 2.238434314727783, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7190696597099304, + "num_tokens": 380579376.0, + "step": 15243 + }, + { + "epoch": 1.6740610586426532, + "grad_norm": 2.4374191761016846, + "learning_rate": 1e-06, + "loss": 0.8349, + "mean_token_accuracy": 0.7452439069747925, + "num_tokens": 380600307.0, + "step": 15244 + }, + { + "epoch": 1.6741708763452667, + "grad_norm": 2.2721288204193115, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.695300817489624, + "num_tokens": 380626283.0, + "step": 15245 + }, + { + "epoch": 1.6742806940478805, + "grad_norm": 2.483569860458374, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7274916172027588, + "num_tokens": 380647209.0, + "step": 15246 + }, + { + "epoch": 1.6743905117504942, + "grad_norm": 2.045534133911133, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7187376022338867, + "num_tokens": 380676014.0, + "step": 15247 + }, + { + "epoch": 1.674500329453108, + 
"grad_norm": 1.9449304342269897, + "learning_rate": 1e-06, + "loss": 0.8532, + "mean_token_accuracy": 0.7271114587783813, + "num_tokens": 380709919.0, + "step": 15248 + }, + { + "epoch": 1.6746101471557215, + "grad_norm": 2.3104803562164307, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7130089402198792, + "num_tokens": 380733249.0, + "step": 15249 + }, + { + "epoch": 1.674719964858335, + "grad_norm": 2.137589931488037, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7265876531600952, + "num_tokens": 380760930.0, + "step": 15250 + }, + { + "epoch": 1.6748297825609488, + "grad_norm": 1.964159369468689, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7221552729606628, + "num_tokens": 380792791.0, + "step": 15251 + }, + { + "epoch": 1.6749396002635626, + "grad_norm": 2.4429805278778076, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7057074904441833, + "num_tokens": 380813926.0, + "step": 15252 + }, + { + "epoch": 1.6750494179661761, + "grad_norm": 2.3628034591674805, + "learning_rate": 1e-06, + "loss": 0.9632, + "mean_token_accuracy": 0.698316216468811, + "num_tokens": 380837671.0, + "step": 15253 + }, + { + "epoch": 1.6751592356687897, + "grad_norm": 2.208742141723633, + "learning_rate": 1e-06, + "loss": 0.9418, + "mean_token_accuracy": 0.7101987600326538, + "num_tokens": 380864237.0, + "step": 15254 + }, + { + "epoch": 1.6752690533714034, + "grad_norm": 2.843378782272339, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7333264946937561, + "num_tokens": 380882997.0, + "step": 15255 + }, + { + "epoch": 1.6753788710740172, + "grad_norm": 2.30716872215271, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7279254198074341, + "num_tokens": 380906135.0, + "step": 15256 + }, + { + "epoch": 1.675488688776631, + "grad_norm": 2.449528217315674, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7086483240127563, + 
"num_tokens": 380929124.0, + "step": 15257 + }, + { + "epoch": 1.6755985064792445, + "grad_norm": 2.1205673217773438, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.705906867980957, + "num_tokens": 380956680.0, + "step": 15258 + }, + { + "epoch": 1.675708324181858, + "grad_norm": 2.1485559940338135, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7178406715393066, + "num_tokens": 380983818.0, + "step": 15259 + }, + { + "epoch": 1.6758181418844718, + "grad_norm": 2.002211332321167, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7194656729698181, + "num_tokens": 381013174.0, + "step": 15260 + }, + { + "epoch": 1.6759279595870855, + "grad_norm": 2.3067269325256348, + "learning_rate": 1e-06, + "loss": 0.9661, + "mean_token_accuracy": 0.7074969410896301, + "num_tokens": 381038669.0, + "step": 15261 + }, + { + "epoch": 1.676037777289699, + "grad_norm": 2.085923910140991, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7076518535614014, + "num_tokens": 381068081.0, + "step": 15262 + }, + { + "epoch": 1.6761475949923128, + "grad_norm": 2.0494790077209473, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7123464345932007, + "num_tokens": 381100440.0, + "step": 15263 + }, + { + "epoch": 1.6762574126949263, + "grad_norm": 2.436286449432373, + "learning_rate": 1e-06, + "loss": 0.7911, + "mean_token_accuracy": 0.7502121925354004, + "num_tokens": 381121584.0, + "step": 15264 + }, + { + "epoch": 1.67636723039754, + "grad_norm": 2.028998374938965, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7013800740242004, + "num_tokens": 381150785.0, + "step": 15265 + }, + { + "epoch": 1.6764770481001539, + "grad_norm": 2.2639706134796143, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7415338754653931, + "num_tokens": 381174400.0, + "step": 15266 + }, + { + "epoch": 1.6765868658027674, + "grad_norm": 2.04569673538208, + 
"learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7131275534629822, + "num_tokens": 381204354.0, + "step": 15267 + }, + { + "epoch": 1.676696683505381, + "grad_norm": 2.2905704975128174, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7294820547103882, + "num_tokens": 381229172.0, + "step": 15268 + }, + { + "epoch": 1.6768065012079947, + "grad_norm": 2.1640191078186035, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7110995054244995, + "num_tokens": 381256635.0, + "step": 15269 + }, + { + "epoch": 1.6769163189106084, + "grad_norm": 2.0358824729919434, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7321962714195251, + "num_tokens": 381284511.0, + "step": 15270 + }, + { + "epoch": 1.6770261366132222, + "grad_norm": 2.2133867740631104, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7255415916442871, + "num_tokens": 381310781.0, + "step": 15271 + }, + { + "epoch": 1.6771359543158357, + "grad_norm": 2.006331443786621, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7103140950202942, + "num_tokens": 381342768.0, + "step": 15272 + }, + { + "epoch": 1.6772457720184493, + "grad_norm": 2.1769919395446777, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7104742527008057, + "num_tokens": 381371065.0, + "step": 15273 + }, + { + "epoch": 1.677355589721063, + "grad_norm": 2.11983585357666, + "learning_rate": 1e-06, + "loss": 0.9684, + "mean_token_accuracy": 0.7045639753341675, + "num_tokens": 381399700.0, + "step": 15274 + }, + { + "epoch": 1.6774654074236768, + "grad_norm": 2.0231471061706543, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7084546089172363, + "num_tokens": 381429879.0, + "step": 15275 + }, + { + "epoch": 1.6775752251262903, + "grad_norm": 2.4340474605560303, + "learning_rate": 1e-06, + "loss": 0.8944, + "mean_token_accuracy": 0.7161701321601868, + "num_tokens": 381452198.0, + 
"step": 15276 + }, + { + "epoch": 1.677685042828904, + "grad_norm": 2.1397528648376465, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7202552556991577, + "num_tokens": 381480930.0, + "step": 15277 + }, + { + "epoch": 1.6777948605315176, + "grad_norm": 2.1964480876922607, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.7485306859016418, + "num_tokens": 381505889.0, + "step": 15278 + }, + { + "epoch": 1.6779046782341314, + "grad_norm": 2.507207155227661, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7201570272445679, + "num_tokens": 381526725.0, + "step": 15279 + }, + { + "epoch": 1.6780144959367451, + "grad_norm": 2.7597029209136963, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7143541574478149, + "num_tokens": 381551966.0, + "step": 15280 + }, + { + "epoch": 1.6781243136393587, + "grad_norm": 2.049926519393921, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7041980624198914, + "num_tokens": 381582636.0, + "step": 15281 + }, + { + "epoch": 1.6782341313419722, + "grad_norm": 2.345501184463501, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7285323143005371, + "num_tokens": 381605036.0, + "step": 15282 + }, + { + "epoch": 1.678343949044586, + "grad_norm": 2.338914632797241, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7318335175514221, + "num_tokens": 381628500.0, + "step": 15283 + }, + { + "epoch": 1.6784537667471997, + "grad_norm": 2.33805775642395, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7149016261100769, + "num_tokens": 381653062.0, + "step": 15284 + }, + { + "epoch": 1.6785635844498135, + "grad_norm": 2.4326024055480957, + "learning_rate": 1e-06, + "loss": 0.7505, + "mean_token_accuracy": 0.7661640644073486, + "num_tokens": 381673698.0, + "step": 15285 + }, + { + "epoch": 1.678673402152427, + "grad_norm": 2.543024778366089, + "learning_rate": 1e-06, + "loss": 
0.8125, + "mean_token_accuracy": 0.7378032207489014, + "num_tokens": 381693827.0, + "step": 15286 + }, + { + "epoch": 1.6787832198550405, + "grad_norm": 2.1509432792663574, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7088973522186279, + "num_tokens": 381723400.0, + "step": 15287 + }, + { + "epoch": 1.6788930375576543, + "grad_norm": 2.634659767150879, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7313428521156311, + "num_tokens": 381741770.0, + "step": 15288 + }, + { + "epoch": 1.679002855260268, + "grad_norm": 2.3817145824432373, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7035441398620605, + "num_tokens": 381765968.0, + "step": 15289 + }, + { + "epoch": 1.6791126729628816, + "grad_norm": 2.4150218963623047, + "learning_rate": 1e-06, + "loss": 0.847, + "mean_token_accuracy": 0.7317575216293335, + "num_tokens": 381788101.0, + "step": 15290 + }, + { + "epoch": 1.6792224906654951, + "grad_norm": 2.545139789581299, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7119161486625671, + "num_tokens": 381809557.0, + "step": 15291 + }, + { + "epoch": 1.6793323083681089, + "grad_norm": 2.0229995250701904, + "learning_rate": 1e-06, + "loss": 0.831, + "mean_token_accuracy": 0.7419591546058655, + "num_tokens": 381837785.0, + "step": 15292 + }, + { + "epoch": 1.6794421260707226, + "grad_norm": 2.363192081451416, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7107205390930176, + "num_tokens": 381863771.0, + "step": 15293 + }, + { + "epoch": 1.6795519437733364, + "grad_norm": 2.2308509349823, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7147175073623657, + "num_tokens": 381890089.0, + "step": 15294 + }, + { + "epoch": 1.67966176147595, + "grad_norm": 2.2514822483062744, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7143856287002563, + "num_tokens": 381915450.0, + "step": 15295 + }, + { + "epoch": 
1.6797715791785635, + "grad_norm": 2.283745527267456, + "learning_rate": 1e-06, + "loss": 0.9471, + "mean_token_accuracy": 0.710320234298706, + "num_tokens": 381940132.0, + "step": 15296 + }, + { + "epoch": 1.6798813968811772, + "grad_norm": 2.4317421913146973, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.745681643486023, + "num_tokens": 381960685.0, + "step": 15297 + }, + { + "epoch": 1.679991214583791, + "grad_norm": 2.3315021991729736, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7067625522613525, + "num_tokens": 381984856.0, + "step": 15298 + }, + { + "epoch": 1.6801010322864047, + "grad_norm": 2.3950629234313965, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7109371423721313, + "num_tokens": 382008634.0, + "step": 15299 + }, + { + "epoch": 1.6802108499890183, + "grad_norm": 2.1484017372131348, + "learning_rate": 1e-06, + "loss": 0.8311, + "mean_token_accuracy": 0.7398909330368042, + "num_tokens": 382035545.0, + "step": 15300 + }, + { + "epoch": 1.6803206676916318, + "grad_norm": 2.381112575531006, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.6963508129119873, + "num_tokens": 382061144.0, + "step": 15301 + }, + { + "epoch": 1.6804304853942456, + "grad_norm": 2.000601053237915, + "learning_rate": 1e-06, + "loss": 1.0043, + "mean_token_accuracy": 0.6902386546134949, + "num_tokens": 382096954.0, + "step": 15302 + }, + { + "epoch": 1.6805403030968593, + "grad_norm": 2.186016798019409, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7191705703735352, + "num_tokens": 382124862.0, + "step": 15303 + }, + { + "epoch": 1.6806501207994728, + "grad_norm": 2.2219936847686768, + "learning_rate": 1e-06, + "loss": 0.8353, + "mean_token_accuracy": 0.7339138984680176, + "num_tokens": 382153151.0, + "step": 15304 + }, + { + "epoch": 1.6807599385020864, + "grad_norm": 2.3650636672973633, + "learning_rate": 1e-06, + "loss": 0.9139, + "mean_token_accuracy": 
0.7136561870574951, + "num_tokens": 382175495.0, + "step": 15305 + }, + { + "epoch": 1.6808697562047001, + "grad_norm": 2.1037867069244385, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.7004554271697998, + "num_tokens": 382205286.0, + "step": 15306 + }, + { + "epoch": 1.680979573907314, + "grad_norm": 1.955139398574829, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7102171182632446, + "num_tokens": 382237570.0, + "step": 15307 + }, + { + "epoch": 1.6810893916099277, + "grad_norm": 2.404296875, + "learning_rate": 1e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7278540730476379, + "num_tokens": 382258151.0, + "step": 15308 + }, + { + "epoch": 1.6811992093125412, + "grad_norm": 2.5854551792144775, + "learning_rate": 1e-06, + "loss": 0.8192, + "mean_token_accuracy": 0.7390363216400146, + "num_tokens": 382277790.0, + "step": 15309 + }, + { + "epoch": 1.6813090270151547, + "grad_norm": 2.1578710079193115, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6971364617347717, + "num_tokens": 382308212.0, + "step": 15310 + }, + { + "epoch": 1.6814188447177685, + "grad_norm": 2.177863597869873, + "learning_rate": 1e-06, + "loss": 0.9853, + "mean_token_accuracy": 0.7013331651687622, + "num_tokens": 382334489.0, + "step": 15311 + }, + { + "epoch": 1.6815286624203822, + "grad_norm": 2.1626460552215576, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7068837881088257, + "num_tokens": 382363451.0, + "step": 15312 + }, + { + "epoch": 1.681638480122996, + "grad_norm": 2.5432281494140625, + "learning_rate": 1e-06, + "loss": 0.7669, + "mean_token_accuracy": 0.7518746852874756, + "num_tokens": 382384762.0, + "step": 15313 + }, + { + "epoch": 1.6817482978256095, + "grad_norm": 2.8068034648895264, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7254588603973389, + "num_tokens": 382401942.0, + "step": 15314 + }, + { + "epoch": 1.681858115528223, + "grad_norm": 
2.572249412536621, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7335636615753174, + "num_tokens": 382421115.0, + "step": 15315 + }, + { + "epoch": 1.6819679332308368, + "grad_norm": 2.555217981338501, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7310087084770203, + "num_tokens": 382443446.0, + "step": 15316 + }, + { + "epoch": 1.6820777509334506, + "grad_norm": 2.420719861984253, + "learning_rate": 1e-06, + "loss": 0.8278, + "mean_token_accuracy": 0.7367300987243652, + "num_tokens": 382466764.0, + "step": 15317 + }, + { + "epoch": 1.6821875686360641, + "grad_norm": 2.022984027862549, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7136057615280151, + "num_tokens": 382498676.0, + "step": 15318 + }, + { + "epoch": 1.6822973863386776, + "grad_norm": 2.2186312675476074, + "learning_rate": 1e-06, + "loss": 1.0211, + "mean_token_accuracy": 0.6825187802314758, + "num_tokens": 382526835.0, + "step": 15319 + }, + { + "epoch": 1.6824072040412914, + "grad_norm": 2.4873664379119873, + "learning_rate": 1e-06, + "loss": 0.9369, + "mean_token_accuracy": 0.707635223865509, + "num_tokens": 382550137.0, + "step": 15320 + }, + { + "epoch": 1.6825170217439052, + "grad_norm": 2.165231943130493, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7333083152770996, + "num_tokens": 382577208.0, + "step": 15321 + }, + { + "epoch": 1.682626839446519, + "grad_norm": 2.069181442260742, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7164584398269653, + "num_tokens": 382607385.0, + "step": 15322 + }, + { + "epoch": 1.6827366571491325, + "grad_norm": 2.5823445320129395, + "learning_rate": 1e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.732325553894043, + "num_tokens": 382627398.0, + "step": 15323 + }, + { + "epoch": 1.682846474851746, + "grad_norm": 2.159050464630127, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.710899293422699, + "num_tokens": 
382653244.0, + "step": 15324 + }, + { + "epoch": 1.6829562925543597, + "grad_norm": 2.6123790740966797, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7656574249267578, + "num_tokens": 382672182.0, + "step": 15325 + }, + { + "epoch": 1.6830661102569735, + "grad_norm": 2.356428384780884, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7309820055961609, + "num_tokens": 382694640.0, + "step": 15326 + }, + { + "epoch": 1.683175927959587, + "grad_norm": 2.2572317123413086, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7052875757217407, + "num_tokens": 382723069.0, + "step": 15327 + }, + { + "epoch": 1.6832857456622008, + "grad_norm": 2.6105217933654785, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.7219801545143127, + "num_tokens": 382742749.0, + "step": 15328 + }, + { + "epoch": 1.6833955633648143, + "grad_norm": 2.580538272857666, + "learning_rate": 1e-06, + "loss": 0.8692, + "mean_token_accuracy": 0.7262670993804932, + "num_tokens": 382761556.0, + "step": 15329 + }, + { + "epoch": 1.683505381067428, + "grad_norm": 2.4059088230133057, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7458961009979248, + "num_tokens": 382782151.0, + "step": 15330 + }, + { + "epoch": 1.6836151987700418, + "grad_norm": 2.3526105880737305, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7316505908966064, + "num_tokens": 382804257.0, + "step": 15331 + }, + { + "epoch": 1.6837250164726554, + "grad_norm": 2.0647521018981934, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7274637818336487, + "num_tokens": 382832405.0, + "step": 15332 + }, + { + "epoch": 1.683834834175269, + "grad_norm": 2.2478199005126953, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7091981172561646, + "num_tokens": 382858149.0, + "step": 15333 + }, + { + "epoch": 1.6839446518778827, + "grad_norm": 2.2524890899658203, + "learning_rate": 
1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7303366661071777, + "num_tokens": 382881047.0, + "step": 15334 + }, + { + "epoch": 1.6840544695804964, + "grad_norm": 1.9223147630691528, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7267741560935974, + "num_tokens": 382914518.0, + "step": 15335 + }, + { + "epoch": 1.6841642872831102, + "grad_norm": 2.468761920928955, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7178512811660767, + "num_tokens": 382938618.0, + "step": 15336 + }, + { + "epoch": 1.6842741049857237, + "grad_norm": 2.0592238903045654, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7238390445709229, + "num_tokens": 382967007.0, + "step": 15337 + }, + { + "epoch": 1.6843839226883373, + "grad_norm": 2.0774307250976562, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7281079292297363, + "num_tokens": 382994575.0, + "step": 15338 + }, + { + "epoch": 1.684493740390951, + "grad_norm": 2.2069640159606934, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7096575498580933, + "num_tokens": 383022512.0, + "step": 15339 + }, + { + "epoch": 1.6846035580935648, + "grad_norm": 2.3440632820129395, + "learning_rate": 1e-06, + "loss": 0.7517, + "mean_token_accuracy": 0.761559247970581, + "num_tokens": 383044710.0, + "step": 15340 + }, + { + "epoch": 1.6847133757961783, + "grad_norm": 2.4529876708984375, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7508420944213867, + "num_tokens": 383065332.0, + "step": 15341 + }, + { + "epoch": 1.684823193498792, + "grad_norm": 2.430234670639038, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7159122228622437, + "num_tokens": 383089402.0, + "step": 15342 + }, + { + "epoch": 1.6849330112014056, + "grad_norm": 2.2034716606140137, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7328425645828247, + "num_tokens": 383114589.0, + "step": 15343 + }, + { 
+ "epoch": 1.6850428289040194, + "grad_norm": 2.5964434146881104, + "learning_rate": 1e-06, + "loss": 0.7486, + "mean_token_accuracy": 0.7522675395011902, + "num_tokens": 383132936.0, + "step": 15344 + }, + { + "epoch": 1.685152646606633, + "grad_norm": 2.3861708641052246, + "learning_rate": 1e-06, + "loss": 0.7824, + "mean_token_accuracy": 0.7550367116928101, + "num_tokens": 383153787.0, + "step": 15345 + }, + { + "epoch": 1.6852624643092466, + "grad_norm": 2.1544699668884277, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7080698013305664, + "num_tokens": 383182349.0, + "step": 15346 + }, + { + "epoch": 1.6853722820118602, + "grad_norm": 2.157541036605835, + "learning_rate": 1e-06, + "loss": 0.852, + "mean_token_accuracy": 0.7426252365112305, + "num_tokens": 383208145.0, + "step": 15347 + }, + { + "epoch": 1.685482099714474, + "grad_norm": 2.1685500144958496, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7310184240341187, + "num_tokens": 383234492.0, + "step": 15348 + }, + { + "epoch": 1.6855919174170877, + "grad_norm": 2.4524269104003906, + "learning_rate": 1e-06, + "loss": 0.9611, + "mean_token_accuracy": 0.7040531635284424, + "num_tokens": 383257592.0, + "step": 15349 + }, + { + "epoch": 1.6857017351197014, + "grad_norm": 2.206322431564331, + "learning_rate": 1e-06, + "loss": 0.8203, + "mean_token_accuracy": 0.741163969039917, + "num_tokens": 383283583.0, + "step": 15350 + }, + { + "epoch": 1.685811552822315, + "grad_norm": 2.1184933185577393, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7134886980056763, + "num_tokens": 383311549.0, + "step": 15351 + }, + { + "epoch": 1.6859213705249285, + "grad_norm": 2.244473934173584, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7158767580986023, + "num_tokens": 383336560.0, + "step": 15352 + }, + { + "epoch": 1.6860311882275423, + "grad_norm": 2.1142258644104004, + "learning_rate": 1e-06, + "loss": 0.9457, + 
"mean_token_accuracy": 0.7080514430999756, + "num_tokens": 383365639.0, + "step": 15353 + }, + { + "epoch": 1.686141005930156, + "grad_norm": 2.009995698928833, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7011089324951172, + "num_tokens": 383396059.0, + "step": 15354 + }, + { + "epoch": 1.6862508236327696, + "grad_norm": 2.201286554336548, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.7194294929504395, + "num_tokens": 383423514.0, + "step": 15355 + }, + { + "epoch": 1.686360641335383, + "grad_norm": 2.4675381183624268, + "learning_rate": 1e-06, + "loss": 0.8985, + "mean_token_accuracy": 0.7178380489349365, + "num_tokens": 383444797.0, + "step": 15356 + }, + { + "epoch": 1.6864704590379969, + "grad_norm": 3.0140738487243652, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7338317632675171, + "num_tokens": 383461467.0, + "step": 15357 + }, + { + "epoch": 1.6865802767406106, + "grad_norm": 2.118886947631836, + "learning_rate": 1e-06, + "loss": 1.05, + "mean_token_accuracy": 0.6809988021850586, + "num_tokens": 383491872.0, + "step": 15358 + }, + { + "epoch": 1.6866900944432244, + "grad_norm": 2.454404354095459, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7332955002784729, + "num_tokens": 383513716.0, + "step": 15359 + }, + { + "epoch": 1.686799912145838, + "grad_norm": 2.884495735168457, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7224029302597046, + "num_tokens": 383531867.0, + "step": 15360 + }, + { + "epoch": 1.6869097298484514, + "grad_norm": 2.4024693965911865, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7249006032943726, + "num_tokens": 383555113.0, + "step": 15361 + }, + { + "epoch": 1.6870195475510652, + "grad_norm": 2.1793196201324463, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7140292525291443, + "num_tokens": 383582217.0, + "step": 15362 + }, + { + "epoch": 1.687129365253679, + 
"grad_norm": 2.3367135524749756, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7060719132423401, + "num_tokens": 383606585.0, + "step": 15363 + }, + { + "epoch": 1.6872391829562927, + "grad_norm": 2.297438383102417, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7302894592285156, + "num_tokens": 383631116.0, + "step": 15364 + }, + { + "epoch": 1.6873490006589063, + "grad_norm": 2.419481039047241, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7345933318138123, + "num_tokens": 383655551.0, + "step": 15365 + }, + { + "epoch": 1.6874588183615198, + "grad_norm": 2.0702478885650635, + "learning_rate": 1e-06, + "loss": 1.0267, + "mean_token_accuracy": 0.703677773475647, + "num_tokens": 383687430.0, + "step": 15366 + }, + { + "epoch": 1.6875686360641335, + "grad_norm": 2.435697317123413, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6950840950012207, + "num_tokens": 383710789.0, + "step": 15367 + }, + { + "epoch": 1.6876784537667473, + "grad_norm": 2.01098895072937, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7279718518257141, + "num_tokens": 383739693.0, + "step": 15368 + }, + { + "epoch": 1.6877882714693608, + "grad_norm": 2.405496120452881, + "learning_rate": 1e-06, + "loss": 0.7323, + "mean_token_accuracy": 0.7583858966827393, + "num_tokens": 383760716.0, + "step": 15369 + }, + { + "epoch": 1.6878980891719744, + "grad_norm": 2.4119062423706055, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7193729877471924, + "num_tokens": 383783946.0, + "step": 15370 + }, + { + "epoch": 1.6880079068745881, + "grad_norm": 2.3119394779205322, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7045619487762451, + "num_tokens": 383808070.0, + "step": 15371 + }, + { + "epoch": 1.6881177245772019, + "grad_norm": 2.6226701736450195, + "learning_rate": 1e-06, + "loss": 0.8377, + "mean_token_accuracy": 0.7361396551132202, + 
"num_tokens": 383827776.0, + "step": 15372 + }, + { + "epoch": 1.6882275422798156, + "grad_norm": 1.9959713220596313, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7128431797027588, + "num_tokens": 383858662.0, + "step": 15373 + }, + { + "epoch": 1.6883373599824292, + "grad_norm": 2.047523021697998, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.716259241104126, + "num_tokens": 383888472.0, + "step": 15374 + }, + { + "epoch": 1.6884471776850427, + "grad_norm": 2.2369229793548584, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.7003068327903748, + "num_tokens": 383913881.0, + "step": 15375 + }, + { + "epoch": 1.6885569953876565, + "grad_norm": 2.5703160762786865, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7223465442657471, + "num_tokens": 383935643.0, + "step": 15376 + }, + { + "epoch": 1.6886668130902702, + "grad_norm": 2.2798855304718018, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7130707502365112, + "num_tokens": 383962234.0, + "step": 15377 + }, + { + "epoch": 1.6887766307928838, + "grad_norm": 2.1208336353302, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7020817995071411, + "num_tokens": 383993201.0, + "step": 15378 + }, + { + "epoch": 1.6888864484954975, + "grad_norm": 2.3841629028320312, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7167962193489075, + "num_tokens": 384018146.0, + "step": 15379 + }, + { + "epoch": 1.688996266198111, + "grad_norm": 2.577238082885742, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7342281937599182, + "num_tokens": 384038510.0, + "step": 15380 + }, + { + "epoch": 1.6891060839007248, + "grad_norm": 2.190389633178711, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7195369601249695, + "num_tokens": 384066156.0, + "step": 15381 + }, + { + "epoch": 1.6892159016033386, + "grad_norm": 2.223834753036499, + 
"learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7122395038604736, + "num_tokens": 384091247.0, + "step": 15382 + }, + { + "epoch": 1.689325719305952, + "grad_norm": 1.9055917263031006, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.7015656232833862, + "num_tokens": 384127996.0, + "step": 15383 + }, + { + "epoch": 1.6894355370085656, + "grad_norm": 2.2192788124084473, + "learning_rate": 1e-06, + "loss": 0.9343, + "mean_token_accuracy": 0.7149181365966797, + "num_tokens": 384155590.0, + "step": 15384 + }, + { + "epoch": 1.6895453547111794, + "grad_norm": 2.4146785736083984, + "learning_rate": 1e-06, + "loss": 0.9554, + "mean_token_accuracy": 0.7043249607086182, + "num_tokens": 384181946.0, + "step": 15385 + }, + { + "epoch": 1.6896551724137931, + "grad_norm": 2.4361135959625244, + "learning_rate": 1e-06, + "loss": 0.9807, + "mean_token_accuracy": 0.698395311832428, + "num_tokens": 384204470.0, + "step": 15386 + }, + { + "epoch": 1.689764990116407, + "grad_norm": 2.2867202758789062, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7055310010910034, + "num_tokens": 384230351.0, + "step": 15387 + }, + { + "epoch": 1.6898748078190204, + "grad_norm": 2.615159511566162, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7068608999252319, + "num_tokens": 384250897.0, + "step": 15388 + }, + { + "epoch": 1.689984625521634, + "grad_norm": 2.274726152420044, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7347279787063599, + "num_tokens": 384274519.0, + "step": 15389 + }, + { + "epoch": 1.6900944432242477, + "grad_norm": 2.4059536457061768, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.705875813961029, + "num_tokens": 384297190.0, + "step": 15390 + }, + { + "epoch": 1.6902042609268615, + "grad_norm": 2.2681453227996826, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7273939847946167, + "num_tokens": 384323295.0, + "step": 
15391 + }, + { + "epoch": 1.690314078629475, + "grad_norm": 1.8831377029418945, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7254195213317871, + "num_tokens": 384355539.0, + "step": 15392 + }, + { + "epoch": 1.6904238963320888, + "grad_norm": 2.09118390083313, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7154146432876587, + "num_tokens": 384385757.0, + "step": 15393 + }, + { + "epoch": 1.6905337140347023, + "grad_norm": 2.418248176574707, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.730957567691803, + "num_tokens": 384408631.0, + "step": 15394 + }, + { + "epoch": 1.690643531737316, + "grad_norm": 2.4539692401885986, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.713219165802002, + "num_tokens": 384430917.0, + "step": 15395 + }, + { + "epoch": 1.6907533494399298, + "grad_norm": 2.310081720352173, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7277811765670776, + "num_tokens": 384454871.0, + "step": 15396 + }, + { + "epoch": 1.6908631671425434, + "grad_norm": 2.1901633739471436, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7040016651153564, + "num_tokens": 384480479.0, + "step": 15397 + }, + { + "epoch": 1.690972984845157, + "grad_norm": 2.071035385131836, + "learning_rate": 1e-06, + "loss": 1.0026, + "mean_token_accuracy": 0.6951744556427002, + "num_tokens": 384514822.0, + "step": 15398 + }, + { + "epoch": 1.6910828025477707, + "grad_norm": 2.348031997680664, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7211095094680786, + "num_tokens": 384539245.0, + "step": 15399 + }, + { + "epoch": 1.6911926202503844, + "grad_norm": 2.158874750137329, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.709420919418335, + "num_tokens": 384566510.0, + "step": 15400 + }, + { + "epoch": 1.6913024379529982, + "grad_norm": 2.236011028289795, + "learning_rate": 1e-06, + "loss": 0.9314, + 
"mean_token_accuracy": 0.7123602628707886, + "num_tokens": 384592766.0, + "step": 15401 + }, + { + "epoch": 1.6914122556556117, + "grad_norm": 2.2377922534942627, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7253445982933044, + "num_tokens": 384618975.0, + "step": 15402 + }, + { + "epoch": 1.6915220733582252, + "grad_norm": 2.242121696472168, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7169774770736694, + "num_tokens": 384645390.0, + "step": 15403 + }, + { + "epoch": 1.691631891060839, + "grad_norm": 1.9361809492111206, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7020965814590454, + "num_tokens": 384679872.0, + "step": 15404 + }, + { + "epoch": 1.6917417087634528, + "grad_norm": 2.1502439975738525, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7199631929397583, + "num_tokens": 384707449.0, + "step": 15405 + }, + { + "epoch": 1.6918515264660663, + "grad_norm": 2.1779839992523193, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7149640321731567, + "num_tokens": 384734655.0, + "step": 15406 + }, + { + "epoch": 1.6919613441686798, + "grad_norm": 1.87302565574646, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.6993425488471985, + "num_tokens": 384768681.0, + "step": 15407 + }, + { + "epoch": 1.6920711618712936, + "grad_norm": 2.4036128520965576, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7269747257232666, + "num_tokens": 384790762.0, + "step": 15408 + }, + { + "epoch": 1.6921809795739073, + "grad_norm": 2.39831280708313, + "learning_rate": 1e-06, + "loss": 0.97, + "mean_token_accuracy": 0.7091583609580994, + "num_tokens": 384818147.0, + "step": 15409 + }, + { + "epoch": 1.692290797276521, + "grad_norm": 2.0455830097198486, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.722792387008667, + "num_tokens": 384846683.0, + "step": 15410 + }, + { + "epoch": 1.6924006149791346, 
+ "grad_norm": 1.9323307275772095, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7010748982429504, + "num_tokens": 384880690.0, + "step": 15411 + }, + { + "epoch": 1.6925104326817482, + "grad_norm": 2.2816920280456543, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7270618677139282, + "num_tokens": 384903748.0, + "step": 15412 + }, + { + "epoch": 1.692620250384362, + "grad_norm": 2.2252886295318604, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.699211061000824, + "num_tokens": 384931194.0, + "step": 15413 + }, + { + "epoch": 1.6927300680869757, + "grad_norm": 1.9323922395706177, + "learning_rate": 1e-06, + "loss": 0.8121, + "mean_token_accuracy": 0.7446250915527344, + "num_tokens": 384961694.0, + "step": 15414 + }, + { + "epoch": 1.6928398857895894, + "grad_norm": 2.41654372215271, + "learning_rate": 1e-06, + "loss": 0.8759, + "mean_token_accuracy": 0.7336776256561279, + "num_tokens": 384983492.0, + "step": 15415 + }, + { + "epoch": 1.692949703492203, + "grad_norm": 2.1116392612457275, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.7222806811332703, + "num_tokens": 385009113.0, + "step": 15416 + }, + { + "epoch": 1.6930595211948165, + "grad_norm": 2.37178897857666, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7335363030433655, + "num_tokens": 385030331.0, + "step": 15417 + }, + { + "epoch": 1.6931693388974303, + "grad_norm": 2.261117935180664, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7286999821662903, + "num_tokens": 385055353.0, + "step": 15418 + }, + { + "epoch": 1.693279156600044, + "grad_norm": 2.2182390689849854, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7369159460067749, + "num_tokens": 385079892.0, + "step": 15419 + }, + { + "epoch": 1.6933889743026576, + "grad_norm": 2.4541871547698975, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7342691421508789, + 
"num_tokens": 385100408.0, + "step": 15420 + }, + { + "epoch": 1.693498792005271, + "grad_norm": 2.94705867767334, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.7390281558036804, + "num_tokens": 385117059.0, + "step": 15421 + }, + { + "epoch": 1.6936086097078848, + "grad_norm": 1.880816102027893, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.6995694637298584, + "num_tokens": 385153493.0, + "step": 15422 + }, + { + "epoch": 1.6937184274104986, + "grad_norm": 2.35516357421875, + "learning_rate": 1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7138433456420898, + "num_tokens": 385176675.0, + "step": 15423 + }, + { + "epoch": 1.6938282451131124, + "grad_norm": 2.2937710285186768, + "learning_rate": 1e-06, + "loss": 0.9815, + "mean_token_accuracy": 0.697002649307251, + "num_tokens": 385201364.0, + "step": 15424 + }, + { + "epoch": 1.693938062815726, + "grad_norm": 2.2607998847961426, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7068474292755127, + "num_tokens": 385227816.0, + "step": 15425 + }, + { + "epoch": 1.6940478805183394, + "grad_norm": 2.195295810699463, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7043780088424683, + "num_tokens": 385254099.0, + "step": 15426 + }, + { + "epoch": 1.6941576982209532, + "grad_norm": 2.5085737705230713, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7331075668334961, + "num_tokens": 385275817.0, + "step": 15427 + }, + { + "epoch": 1.694267515923567, + "grad_norm": 2.405815362930298, + "learning_rate": 1e-06, + "loss": 0.7928, + "mean_token_accuracy": 0.7462334632873535, + "num_tokens": 385299025.0, + "step": 15428 + }, + { + "epoch": 1.6943773336261807, + "grad_norm": 2.4761226177215576, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7298661470413208, + "num_tokens": 385321483.0, + "step": 15429 + }, + { + "epoch": 1.6944871513287942, + "grad_norm": 2.2003262042999268, + 
"learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7176179885864258, + "num_tokens": 385348133.0, + "step": 15430 + }, + { + "epoch": 1.6945969690314078, + "grad_norm": 2.3176188468933105, + "learning_rate": 1e-06, + "loss": 0.7984, + "mean_token_accuracy": 0.7440636157989502, + "num_tokens": 385372665.0, + "step": 15431 + }, + { + "epoch": 1.6947067867340215, + "grad_norm": 2.214266538619995, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7246803045272827, + "num_tokens": 385400175.0, + "step": 15432 + }, + { + "epoch": 1.6948166044366353, + "grad_norm": 2.125725507736206, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7212517261505127, + "num_tokens": 385426641.0, + "step": 15433 + }, + { + "epoch": 1.6949264221392488, + "grad_norm": 2.29528546333313, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7202801704406738, + "num_tokens": 385449942.0, + "step": 15434 + }, + { + "epoch": 1.6950362398418624, + "grad_norm": 2.080125331878662, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.6970635652542114, + "num_tokens": 385479624.0, + "step": 15435 + }, + { + "epoch": 1.6951460575444761, + "grad_norm": 2.472938299179077, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7145286798477173, + "num_tokens": 385500989.0, + "step": 15436 + }, + { + "epoch": 1.6952558752470899, + "grad_norm": 2.6046414375305176, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7066347599029541, + "num_tokens": 385521359.0, + "step": 15437 + }, + { + "epoch": 1.6953656929497036, + "grad_norm": 2.1377384662628174, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7054359912872314, + "num_tokens": 385546036.0, + "step": 15438 + }, + { + "epoch": 1.6954755106523172, + "grad_norm": 2.0887672901153564, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.7060045003890991, + "num_tokens": 385577301.0, + "step": 
15439 + }, + { + "epoch": 1.6955853283549307, + "grad_norm": 2.134148120880127, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 0.7176017761230469, + "num_tokens": 385602869.0, + "step": 15440 + }, + { + "epoch": 1.6956951460575445, + "grad_norm": 2.28243088722229, + "learning_rate": 1e-06, + "loss": 0.9258, + "mean_token_accuracy": 0.7125744819641113, + "num_tokens": 385627819.0, + "step": 15441 + }, + { + "epoch": 1.6958049637601582, + "grad_norm": 2.716888666152954, + "learning_rate": 1e-06, + "loss": 0.867, + "mean_token_accuracy": 0.7302746772766113, + "num_tokens": 385646891.0, + "step": 15442 + }, + { + "epoch": 1.6959147814627717, + "grad_norm": 2.2754485607147217, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7234208583831787, + "num_tokens": 385670943.0, + "step": 15443 + }, + { + "epoch": 1.6960245991653855, + "grad_norm": 2.4608888626098633, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.715456485748291, + "num_tokens": 385693065.0, + "step": 15444 + }, + { + "epoch": 1.696134416867999, + "grad_norm": 2.5702199935913086, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7199865579605103, + "num_tokens": 385713956.0, + "step": 15445 + }, + { + "epoch": 1.6962442345706128, + "grad_norm": 2.255274534225464, + "learning_rate": 1e-06, + "loss": 0.7582, + "mean_token_accuracy": 0.7581090331077576, + "num_tokens": 385737810.0, + "step": 15446 + }, + { + "epoch": 1.6963540522732266, + "grad_norm": 2.1366522312164307, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.7196366786956787, + "num_tokens": 385763130.0, + "step": 15447 + }, + { + "epoch": 1.69646386997584, + "grad_norm": 2.2615649700164795, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7376666069030762, + "num_tokens": 385786513.0, + "step": 15448 + }, + { + "epoch": 1.6965736876784536, + "grad_norm": 2.417767286300659, + "learning_rate": 1e-06, + "loss": 0.8817, + 
"mean_token_accuracy": 0.720604419708252, + "num_tokens": 385807982.0, + "step": 15449 + }, + { + "epoch": 1.6966835053810674, + "grad_norm": 2.029712677001953, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7368650436401367, + "num_tokens": 385837742.0, + "step": 15450 + }, + { + "epoch": 1.6967933230836811, + "grad_norm": 2.1655266284942627, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7296678423881531, + "num_tokens": 385863425.0, + "step": 15451 + }, + { + "epoch": 1.696903140786295, + "grad_norm": 2.459346055984497, + "learning_rate": 1e-06, + "loss": 0.7739, + "mean_token_accuracy": 0.7544654607772827, + "num_tokens": 385883821.0, + "step": 15452 + }, + { + "epoch": 1.6970129584889084, + "grad_norm": 1.979614496231079, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7042001485824585, + "num_tokens": 385914715.0, + "step": 15453 + }, + { + "epoch": 1.697122776191522, + "grad_norm": 2.3477895259857178, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7260255813598633, + "num_tokens": 385939169.0, + "step": 15454 + }, + { + "epoch": 1.6972325938941357, + "grad_norm": 2.27870774269104, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7262341380119324, + "num_tokens": 385965650.0, + "step": 15455 + }, + { + "epoch": 1.6973424115967495, + "grad_norm": 2.427957773208618, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7224515676498413, + "num_tokens": 385988067.0, + "step": 15456 + }, + { + "epoch": 1.697452229299363, + "grad_norm": 2.6061973571777344, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7374448776245117, + "num_tokens": 386007768.0, + "step": 15457 + }, + { + "epoch": 1.6975620470019768, + "grad_norm": 2.087938070297241, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7088371515274048, + "num_tokens": 386036486.0, + "step": 15458 + }, + { + "epoch": 1.6976718647045903, + 
"grad_norm": 2.4165046215057373, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7024752497673035, + "num_tokens": 386059108.0, + "step": 15459 + }, + { + "epoch": 1.697781682407204, + "grad_norm": 2.4207165241241455, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7377207279205322, + "num_tokens": 386081108.0, + "step": 15460 + }, + { + "epoch": 1.6978915001098178, + "grad_norm": 2.5381953716278076, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7310374975204468, + "num_tokens": 386102399.0, + "step": 15461 + }, + { + "epoch": 1.6980013178124314, + "grad_norm": 2.3167710304260254, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.713340163230896, + "num_tokens": 386126474.0, + "step": 15462 + }, + { + "epoch": 1.698111135515045, + "grad_norm": 2.5531365871429443, + "learning_rate": 1e-06, + "loss": 0.884, + "mean_token_accuracy": 0.7250957489013672, + "num_tokens": 386149374.0, + "step": 15463 + }, + { + "epoch": 1.6982209532176586, + "grad_norm": 1.9757821559906006, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7302404642105103, + "num_tokens": 386177744.0, + "step": 15464 + }, + { + "epoch": 1.6983307709202724, + "grad_norm": 2.3407468795776367, + "learning_rate": 1e-06, + "loss": 0.9032, + "mean_token_accuracy": 0.7200799584388733, + "num_tokens": 386200065.0, + "step": 15465 + }, + { + "epoch": 1.6984405886228862, + "grad_norm": 2.2653374671936035, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7131650447845459, + "num_tokens": 386225084.0, + "step": 15466 + }, + { + "epoch": 1.6985504063254997, + "grad_norm": 2.299715995788574, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7088868618011475, + "num_tokens": 386250082.0, + "step": 15467 + }, + { + "epoch": 1.6986602240281132, + "grad_norm": 2.2800304889678955, + "learning_rate": 1e-06, + "loss": 0.8705, + "mean_token_accuracy": 0.7259097099304199, + 
"num_tokens": 386276848.0, + "step": 15468 + }, + { + "epoch": 1.698770041730727, + "grad_norm": 1.9068795442581177, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7189105749130249, + "num_tokens": 386310591.0, + "step": 15469 + }, + { + "epoch": 1.6988798594333407, + "grad_norm": 1.9457511901855469, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7057396173477173, + "num_tokens": 386340728.0, + "step": 15470 + }, + { + "epoch": 1.6989896771359543, + "grad_norm": 2.3940446376800537, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7098646759986877, + "num_tokens": 386363685.0, + "step": 15471 + }, + { + "epoch": 1.6990994948385678, + "grad_norm": 2.0106656551361084, + "learning_rate": 1e-06, + "loss": 0.9172, + "mean_token_accuracy": 0.7064987421035767, + "num_tokens": 386393236.0, + "step": 15472 + }, + { + "epoch": 1.6992093125411816, + "grad_norm": 2.097552537918091, + "learning_rate": 1e-06, + "loss": 1.003, + "mean_token_accuracy": 0.7025381326675415, + "num_tokens": 386419867.0, + "step": 15473 + }, + { + "epoch": 1.6993191302437953, + "grad_norm": 2.7776083946228027, + "learning_rate": 1e-06, + "loss": 0.9104, + "mean_token_accuracy": 0.713946521282196, + "num_tokens": 386437790.0, + "step": 15474 + }, + { + "epoch": 1.699428947946409, + "grad_norm": 2.433830738067627, + "learning_rate": 1e-06, + "loss": 0.8243, + "mean_token_accuracy": 0.7413117289543152, + "num_tokens": 386458428.0, + "step": 15475 + }, + { + "epoch": 1.6995387656490226, + "grad_norm": 2.1984732151031494, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7387242317199707, + "num_tokens": 386481925.0, + "step": 15476 + }, + { + "epoch": 1.6996485833516362, + "grad_norm": 2.3646528720855713, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7055208683013916, + "num_tokens": 386506517.0, + "step": 15477 + }, + { + "epoch": 1.69975840105425, + "grad_norm": 2.363755702972412, + 
"learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7215113639831543, + "num_tokens": 386527910.0, + "step": 15478 + }, + { + "epoch": 1.6998682187568637, + "grad_norm": 2.526439666748047, + "learning_rate": 1e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7390635013580322, + "num_tokens": 386548145.0, + "step": 15479 + }, + { + "epoch": 1.6999780364594774, + "grad_norm": 2.465491771697998, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.71884685754776, + "num_tokens": 386571285.0, + "step": 15480 + }, + { + "epoch": 1.700087854162091, + "grad_norm": 2.736380100250244, + "learning_rate": 1e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.747982382774353, + "num_tokens": 386588743.0, + "step": 15481 + }, + { + "epoch": 1.7001976718647045, + "grad_norm": 2.1864731311798096, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7160381078720093, + "num_tokens": 386615461.0, + "step": 15482 + }, + { + "epoch": 1.7003074895673183, + "grad_norm": 2.4517910480499268, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7413422465324402, + "num_tokens": 386636084.0, + "step": 15483 + }, + { + "epoch": 1.700417307269932, + "grad_norm": 2.2314958572387695, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7074549794197083, + "num_tokens": 386661872.0, + "step": 15484 + }, + { + "epoch": 1.7005271249725455, + "grad_norm": 1.961050271987915, + "learning_rate": 1e-06, + "loss": 0.8946, + "mean_token_accuracy": 0.7197691202163696, + "num_tokens": 386692344.0, + "step": 15485 + }, + { + "epoch": 1.700636942675159, + "grad_norm": 1.9270997047424316, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.7052085399627686, + "num_tokens": 386725862.0, + "step": 15486 + }, + { + "epoch": 1.7007467603777728, + "grad_norm": 2.24477481842041, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7227557301521301, + "num_tokens": 386751083.0, + "step": 
15487 + }, + { + "epoch": 1.7008565780803866, + "grad_norm": 2.4010415077209473, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7277002334594727, + "num_tokens": 386773817.0, + "step": 15488 + }, + { + "epoch": 1.7009663957830004, + "grad_norm": 2.3707242012023926, + "learning_rate": 1e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7291557788848877, + "num_tokens": 386796819.0, + "step": 15489 + }, + { + "epoch": 1.7010762134856139, + "grad_norm": 2.419598340988159, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7326387166976929, + "num_tokens": 386819940.0, + "step": 15490 + }, + { + "epoch": 1.7011860311882274, + "grad_norm": 2.0773651599884033, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7015765905380249, + "num_tokens": 386850735.0, + "step": 15491 + }, + { + "epoch": 1.7012958488908412, + "grad_norm": 2.165682077407837, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7259706258773804, + "num_tokens": 386875009.0, + "step": 15492 + }, + { + "epoch": 1.701405666593455, + "grad_norm": 2.161818742752075, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7359923124313354, + "num_tokens": 386899528.0, + "step": 15493 + }, + { + "epoch": 1.7015154842960687, + "grad_norm": 2.3046114444732666, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7256541848182678, + "num_tokens": 386925389.0, + "step": 15494 + }, + { + "epoch": 1.7016253019986822, + "grad_norm": 2.161135673522949, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7111714482307434, + "num_tokens": 386952903.0, + "step": 15495 + }, + { + "epoch": 1.7017351197012958, + "grad_norm": 2.3243966102600098, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7128342986106873, + "num_tokens": 386977634.0, + "step": 15496 + }, + { + "epoch": 1.7018449374039095, + "grad_norm": 2.393336296081543, + "learning_rate": 1e-06, + "loss": 0.8436, 
+ "mean_token_accuracy": 0.7370860576629639, + "num_tokens": 386998673.0, + "step": 15497 + }, + { + "epoch": 1.7019547551065233, + "grad_norm": 2.34812593460083, + "learning_rate": 1e-06, + "loss": 0.9814, + "mean_token_accuracy": 0.714832067489624, + "num_tokens": 387024153.0, + "step": 15498 + }, + { + "epoch": 1.7020645728091368, + "grad_norm": 2.2820982933044434, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7482528686523438, + "num_tokens": 387047686.0, + "step": 15499 + }, + { + "epoch": 1.7021743905117503, + "grad_norm": 2.082688093185425, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.6987142562866211, + "num_tokens": 387075020.0, + "step": 15500 + }, + { + "epoch": 1.702284208214364, + "grad_norm": 2.2142703533172607, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7160367369651794, + "num_tokens": 387099936.0, + "step": 15501 + }, + { + "epoch": 1.7023940259169779, + "grad_norm": 2.5696709156036377, + "learning_rate": 1e-06, + "loss": 0.8186, + "mean_token_accuracy": 0.7375579476356506, + "num_tokens": 387119536.0, + "step": 15502 + }, + { + "epoch": 1.7025038436195916, + "grad_norm": 2.008183240890503, + "learning_rate": 1e-06, + "loss": 1.0037, + "mean_token_accuracy": 0.6885058283805847, + "num_tokens": 387150910.0, + "step": 15503 + }, + { + "epoch": 1.7026136613222052, + "grad_norm": 2.4528467655181885, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7145000696182251, + "num_tokens": 387176221.0, + "step": 15504 + }, + { + "epoch": 1.7027234790248187, + "grad_norm": 2.7112629413604736, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7283421754837036, + "num_tokens": 387195621.0, + "step": 15505 + }, + { + "epoch": 1.7028332967274324, + "grad_norm": 2.619028091430664, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7279856204986572, + "num_tokens": 387216683.0, + "step": 15506 + }, + { + "epoch": 
1.7029431144300462, + "grad_norm": 2.0716464519500732, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7107571363449097, + "num_tokens": 387245567.0, + "step": 15507 + }, + { + "epoch": 1.7030529321326597, + "grad_norm": 2.312399387359619, + "learning_rate": 1e-06, + "loss": 0.9235, + "mean_token_accuracy": 0.7164449691772461, + "num_tokens": 387272141.0, + "step": 15508 + }, + { + "epoch": 1.7031627498352735, + "grad_norm": 2.5199620723724365, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7374197840690613, + "num_tokens": 387293783.0, + "step": 15509 + }, + { + "epoch": 1.703272567537887, + "grad_norm": 2.2697579860687256, + "learning_rate": 1e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7407101988792419, + "num_tokens": 387317109.0, + "step": 15510 + }, + { + "epoch": 1.7033823852405008, + "grad_norm": 1.8700981140136719, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7151483297348022, + "num_tokens": 387351290.0, + "step": 15511 + }, + { + "epoch": 1.7034922029431145, + "grad_norm": 2.5600593090057373, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7267782688140869, + "num_tokens": 387373272.0, + "step": 15512 + }, + { + "epoch": 1.703602020645728, + "grad_norm": 2.772517442703247, + "learning_rate": 1e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7416704297065735, + "num_tokens": 387389935.0, + "step": 15513 + }, + { + "epoch": 1.7037118383483416, + "grad_norm": 2.3763952255249023, + "learning_rate": 1e-06, + "loss": 0.8782, + "mean_token_accuracy": 0.720841646194458, + "num_tokens": 387414836.0, + "step": 15514 + }, + { + "epoch": 1.7038216560509554, + "grad_norm": 2.2638931274414062, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7340955138206482, + "num_tokens": 387438539.0, + "step": 15515 + }, + { + "epoch": 1.7039314737535691, + "grad_norm": 2.2170097827911377, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 
0.7075642347335815, + "num_tokens": 387464039.0, + "step": 15516 + }, + { + "epoch": 1.7040412914561829, + "grad_norm": 2.3366847038269043, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.720986008644104, + "num_tokens": 387485730.0, + "step": 15517 + }, + { + "epoch": 1.7041511091587964, + "grad_norm": 2.3098390102386475, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7157552242279053, + "num_tokens": 387510522.0, + "step": 15518 + }, + { + "epoch": 1.70426092686141, + "grad_norm": 2.7686758041381836, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.7309086918830872, + "num_tokens": 387527820.0, + "step": 15519 + }, + { + "epoch": 1.7043707445640237, + "grad_norm": 2.272878885269165, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7128262519836426, + "num_tokens": 387554068.0, + "step": 15520 + }, + { + "epoch": 1.7044805622666375, + "grad_norm": 2.2935149669647217, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7185813188552856, + "num_tokens": 387577857.0, + "step": 15521 + }, + { + "epoch": 1.704590379969251, + "grad_norm": 2.1792874336242676, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7247943878173828, + "num_tokens": 387602886.0, + "step": 15522 + }, + { + "epoch": 1.7047001976718648, + "grad_norm": 2.1501286029815674, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7109571695327759, + "num_tokens": 387632236.0, + "step": 15523 + }, + { + "epoch": 1.7048100153744783, + "grad_norm": 2.432543992996216, + "learning_rate": 1e-06, + "loss": 0.9291, + "mean_token_accuracy": 0.7073656916618347, + "num_tokens": 387656313.0, + "step": 15524 + }, + { + "epoch": 1.704919833077092, + "grad_norm": 2.0051333904266357, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.718481183052063, + "num_tokens": 387688742.0, + "step": 15525 + }, + { + "epoch": 1.7050296507797058, + "grad_norm": 
2.505924940109253, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7247211933135986, + "num_tokens": 387709489.0, + "step": 15526 + }, + { + "epoch": 1.7051394684823193, + "grad_norm": 2.3281445503234863, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7162972092628479, + "num_tokens": 387733921.0, + "step": 15527 + }, + { + "epoch": 1.7052492861849329, + "grad_norm": 2.2916760444641113, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7295676469802856, + "num_tokens": 387756219.0, + "step": 15528 + }, + { + "epoch": 1.7053591038875466, + "grad_norm": 2.3964474201202393, + "learning_rate": 1e-06, + "loss": 0.9618, + "mean_token_accuracy": 0.7134813070297241, + "num_tokens": 387777607.0, + "step": 15529 + }, + { + "epoch": 1.7054689215901604, + "grad_norm": 2.2107107639312744, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7176909446716309, + "num_tokens": 387803998.0, + "step": 15530 + }, + { + "epoch": 1.7055787392927741, + "grad_norm": 2.3524317741394043, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7267792224884033, + "num_tokens": 387827169.0, + "step": 15531 + }, + { + "epoch": 1.7056885569953877, + "grad_norm": 1.988720178604126, + "learning_rate": 1e-06, + "loss": 0.9469, + "mean_token_accuracy": 0.708128809928894, + "num_tokens": 387860543.0, + "step": 15532 + }, + { + "epoch": 1.7057983746980012, + "grad_norm": 2.313817024230957, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7175959944725037, + "num_tokens": 387885122.0, + "step": 15533 + }, + { + "epoch": 1.705908192400615, + "grad_norm": 2.2097327709198, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7138141989707947, + "num_tokens": 387911633.0, + "step": 15534 + }, + { + "epoch": 1.7060180101032287, + "grad_norm": 2.3782434463500977, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7331899404525757, + "num_tokens": 
387936266.0, + "step": 15535 + }, + { + "epoch": 1.7061278278058423, + "grad_norm": 2.088421583175659, + "learning_rate": 1e-06, + "loss": 1.0136, + "mean_token_accuracy": 0.6925373077392578, + "num_tokens": 387968238.0, + "step": 15536 + }, + { + "epoch": 1.7062376455084558, + "grad_norm": 2.682304859161377, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7325625419616699, + "num_tokens": 387986931.0, + "step": 15537 + }, + { + "epoch": 1.7063474632110696, + "grad_norm": 2.3859968185424805, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7218910455703735, + "num_tokens": 388010541.0, + "step": 15538 + }, + { + "epoch": 1.7064572809136833, + "grad_norm": 2.3069074153900146, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7211930155754089, + "num_tokens": 388034469.0, + "step": 15539 + }, + { + "epoch": 1.706567098616297, + "grad_norm": 2.6157100200653076, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7184215784072876, + "num_tokens": 388054515.0, + "step": 15540 + }, + { + "epoch": 1.7066769163189106, + "grad_norm": 2.048153877258301, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7355489134788513, + "num_tokens": 388086126.0, + "step": 15541 + }, + { + "epoch": 1.7067867340215241, + "grad_norm": 2.0921854972839355, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7218631505966187, + "num_tokens": 388116058.0, + "step": 15542 + }, + { + "epoch": 1.706896551724138, + "grad_norm": 2.495166301727295, + "learning_rate": 1e-06, + "loss": 0.8548, + "mean_token_accuracy": 0.7264811396598816, + "num_tokens": 388136826.0, + "step": 15543 + }, + { + "epoch": 1.7070063694267517, + "grad_norm": 2.3290069103240967, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7068823575973511, + "num_tokens": 388161198.0, + "step": 15544 + }, + { + "epoch": 1.7071161871293654, + "grad_norm": 2.596684455871582, + "learning_rate": 
1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7156832218170166, + "num_tokens": 388182077.0, + "step": 15545 + }, + { + "epoch": 1.707226004831979, + "grad_norm": 2.455146312713623, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7238236665725708, + "num_tokens": 388203528.0, + "step": 15546 + }, + { + "epoch": 1.7073358225345925, + "grad_norm": 2.0391685962677, + "learning_rate": 1e-06, + "loss": 0.8125, + "mean_token_accuracy": 0.7419147491455078, + "num_tokens": 388229380.0, + "step": 15547 + }, + { + "epoch": 1.7074456402372062, + "grad_norm": 2.3151943683624268, + "learning_rate": 1e-06, + "loss": 0.8404, + "mean_token_accuracy": 0.728866696357727, + "num_tokens": 388253628.0, + "step": 15548 + }, + { + "epoch": 1.70755545793982, + "grad_norm": 2.4325478076934814, + "learning_rate": 1e-06, + "loss": 0.9685, + "mean_token_accuracy": 0.6947709918022156, + "num_tokens": 388277200.0, + "step": 15549 + }, + { + "epoch": 1.7076652756424335, + "grad_norm": 2.438120126724243, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7342296838760376, + "num_tokens": 388296335.0, + "step": 15550 + }, + { + "epoch": 1.707775093345047, + "grad_norm": 2.6176884174346924, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7395299673080444, + "num_tokens": 388315597.0, + "step": 15551 + }, + { + "epoch": 1.7078849110476608, + "grad_norm": 2.17419695854187, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7187747955322266, + "num_tokens": 388339922.0, + "step": 15552 + }, + { + "epoch": 1.7079947287502746, + "grad_norm": 2.349616765975952, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7166039347648621, + "num_tokens": 388363067.0, + "step": 15553 + }, + { + "epoch": 1.7081045464528883, + "grad_norm": 2.4639556407928467, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7356435060501099, + "num_tokens": 388385174.0, + "step": 15554 + }, + { + 
"epoch": 1.7082143641555019, + "grad_norm": 2.435932159423828, + "learning_rate": 1e-06, + "loss": 0.8289, + "mean_token_accuracy": 0.7363826036453247, + "num_tokens": 388409299.0, + "step": 15555 + }, + { + "epoch": 1.7083241818581154, + "grad_norm": 2.4434168338775635, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7273886203765869, + "num_tokens": 388430729.0, + "step": 15556 + }, + { + "epoch": 1.7084339995607292, + "grad_norm": 2.5686466693878174, + "learning_rate": 1e-06, + "loss": 0.7999, + "mean_token_accuracy": 0.7359145283699036, + "num_tokens": 388449555.0, + "step": 15557 + }, + { + "epoch": 1.708543817263343, + "grad_norm": 2.265836000442505, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.7033628225326538, + "num_tokens": 388474479.0, + "step": 15558 + }, + { + "epoch": 1.7086536349659565, + "grad_norm": 2.2431795597076416, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7181423306465149, + "num_tokens": 388501038.0, + "step": 15559 + }, + { + "epoch": 1.7087634526685702, + "grad_norm": 2.3951592445373535, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7307582497596741, + "num_tokens": 388523768.0, + "step": 15560 + }, + { + "epoch": 1.7088732703711838, + "grad_norm": 2.2860443592071533, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7216401100158691, + "num_tokens": 388548200.0, + "step": 15561 + }, + { + "epoch": 1.7089830880737975, + "grad_norm": 2.557347536087036, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.717827320098877, + "num_tokens": 388569878.0, + "step": 15562 + }, + { + "epoch": 1.7090929057764113, + "grad_norm": 2.1531503200531006, + "learning_rate": 1e-06, + "loss": 0.9095, + "mean_token_accuracy": 0.7158429622650146, + "num_tokens": 388595267.0, + "step": 15563 + }, + { + "epoch": 1.7092027234790248, + "grad_norm": 2.0687243938446045, + "learning_rate": 1e-06, + "loss": 0.8967, + 
"mean_token_accuracy": 0.7227279543876648, + "num_tokens": 388621576.0, + "step": 15564 + }, + { + "epoch": 1.7093125411816383, + "grad_norm": 2.1736538410186768, + "learning_rate": 1e-06, + "loss": 0.8184, + "mean_token_accuracy": 0.7404403686523438, + "num_tokens": 388646652.0, + "step": 15565 + }, + { + "epoch": 1.709422358884252, + "grad_norm": 2.332651138305664, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.7313544750213623, + "num_tokens": 388670889.0, + "step": 15566 + }, + { + "epoch": 1.7095321765868658, + "grad_norm": 2.0109753608703613, + "learning_rate": 1e-06, + "loss": 0.9325, + "mean_token_accuracy": 0.7116585969924927, + "num_tokens": 388702018.0, + "step": 15567 + }, + { + "epoch": 1.7096419942894796, + "grad_norm": 2.33890962600708, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.6954828500747681, + "num_tokens": 388727050.0, + "step": 15568 + }, + { + "epoch": 1.7097518119920931, + "grad_norm": 2.1825881004333496, + "learning_rate": 1e-06, + "loss": 1.0057, + "mean_token_accuracy": 0.6881850957870483, + "num_tokens": 388753480.0, + "step": 15569 + }, + { + "epoch": 1.7098616296947067, + "grad_norm": 2.5084853172302246, + "learning_rate": 1e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7283167839050293, + "num_tokens": 388775658.0, + "step": 15570 + }, + { + "epoch": 1.7099714473973204, + "grad_norm": 2.7948925495147705, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7385915517807007, + "num_tokens": 388792665.0, + "step": 15571 + }, + { + "epoch": 1.7100812650999342, + "grad_norm": 2.1279351711273193, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7039133310317993, + "num_tokens": 388821170.0, + "step": 15572 + }, + { + "epoch": 1.7101910828025477, + "grad_norm": 2.5615954399108887, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7459592819213867, + "num_tokens": 388840197.0, + "step": 15573 + }, + { + "epoch": 
1.7103009005051615, + "grad_norm": 2.338545560836792, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7147115468978882, + "num_tokens": 388864513.0, + "step": 15574 + }, + { + "epoch": 1.710410718207775, + "grad_norm": 2.7985804080963135, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7206250429153442, + "num_tokens": 388882963.0, + "step": 15575 + }, + { + "epoch": 1.7105205359103888, + "grad_norm": 2.527369737625122, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7393515706062317, + "num_tokens": 388905092.0, + "step": 15576 + }, + { + "epoch": 1.7106303536130025, + "grad_norm": 2.160756826400757, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7267174124717712, + "num_tokens": 388932448.0, + "step": 15577 + }, + { + "epoch": 1.710740171315616, + "grad_norm": 2.0795650482177734, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7245914936065674, + "num_tokens": 388961175.0, + "step": 15578 + }, + { + "epoch": 1.7108499890182296, + "grad_norm": 2.0846855640411377, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.736579179763794, + "num_tokens": 388987897.0, + "step": 15579 + }, + { + "epoch": 1.7109598067208434, + "grad_norm": 2.2923998832702637, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7202003598213196, + "num_tokens": 389013422.0, + "step": 15580 + }, + { + "epoch": 1.7110696244234571, + "grad_norm": 2.5457510948181152, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7287396192550659, + "num_tokens": 389035040.0, + "step": 15581 + }, + { + "epoch": 1.7111794421260709, + "grad_norm": 2.4367053508758545, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7312653660774231, + "num_tokens": 389056996.0, + "step": 15582 + }, + { + "epoch": 1.7112892598286844, + "grad_norm": 2.2069475650787354, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 
0.7281288504600525, + "num_tokens": 389082429.0, + "step": 15583 + }, + { + "epoch": 1.711399077531298, + "grad_norm": 2.256659507751465, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7200701236724854, + "num_tokens": 389107825.0, + "step": 15584 + }, + { + "epoch": 1.7115088952339117, + "grad_norm": 2.34250545501709, + "learning_rate": 1e-06, + "loss": 0.9287, + "mean_token_accuracy": 0.7101571559906006, + "num_tokens": 389134595.0, + "step": 15585 + }, + { + "epoch": 1.7116187129365255, + "grad_norm": 1.9974067211151123, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7162542939186096, + "num_tokens": 389164166.0, + "step": 15586 + }, + { + "epoch": 1.711728530639139, + "grad_norm": 2.1224262714385986, + "learning_rate": 1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7370038628578186, + "num_tokens": 389190444.0, + "step": 15587 + }, + { + "epoch": 1.7118383483417525, + "grad_norm": 2.131444215774536, + "learning_rate": 1e-06, + "loss": 0.7757, + "mean_token_accuracy": 0.7537165880203247, + "num_tokens": 389215482.0, + "step": 15588 + }, + { + "epoch": 1.7119481660443663, + "grad_norm": 2.3953850269317627, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.689043402671814, + "num_tokens": 389241419.0, + "step": 15589 + }, + { + "epoch": 1.71205798374698, + "grad_norm": 2.0754122734069824, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7201736569404602, + "num_tokens": 389272711.0, + "step": 15590 + }, + { + "epoch": 1.7121678014495938, + "grad_norm": 2.2690017223358154, + "learning_rate": 1e-06, + "loss": 0.7734, + "mean_token_accuracy": 0.7483225464820862, + "num_tokens": 389296733.0, + "step": 15591 + }, + { + "epoch": 1.7122776191522073, + "grad_norm": 2.1017935276031494, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7172032594680786, + "num_tokens": 389323398.0, + "step": 15592 + }, + { + "epoch": 1.7123874368548209, + "grad_norm": 
2.458495855331421, + "learning_rate": 1e-06, + "loss": 0.9703, + "mean_token_accuracy": 0.715467095375061, + "num_tokens": 389346700.0, + "step": 15593 + }, + { + "epoch": 1.7124972545574346, + "grad_norm": 2.2666525840759277, + "learning_rate": 1e-06, + "loss": 0.8426, + "mean_token_accuracy": 0.7366829514503479, + "num_tokens": 389369783.0, + "step": 15594 + }, + { + "epoch": 1.7126070722600484, + "grad_norm": 2.3739633560180664, + "learning_rate": 1e-06, + "loss": 0.835, + "mean_token_accuracy": 0.7355910539627075, + "num_tokens": 389390741.0, + "step": 15595 + }, + { + "epoch": 1.7127168899626621, + "grad_norm": 2.225147247314453, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7164302468299866, + "num_tokens": 389415516.0, + "step": 15596 + }, + { + "epoch": 1.7128267076652757, + "grad_norm": 2.056845188140869, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7266070246696472, + "num_tokens": 389445215.0, + "step": 15597 + }, + { + "epoch": 1.7129365253678892, + "grad_norm": 2.224944591522217, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.6989423632621765, + "num_tokens": 389473477.0, + "step": 15598 + }, + { + "epoch": 1.713046343070503, + "grad_norm": 2.4023828506469727, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.697502851486206, + "num_tokens": 389496828.0, + "step": 15599 + }, + { + "epoch": 1.7131561607731167, + "grad_norm": 2.1491971015930176, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 0.7377132177352905, + "num_tokens": 389524813.0, + "step": 15600 + }, + { + "epoch": 1.7132659784757303, + "grad_norm": 2.4644768238067627, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7063412666320801, + "num_tokens": 389548257.0, + "step": 15601 + }, + { + "epoch": 1.7133757961783438, + "grad_norm": 2.2457540035247803, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7194039821624756, + "num_tokens": 
389574501.0, + "step": 15602 + }, + { + "epoch": 1.7134856138809575, + "grad_norm": 2.2547483444213867, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7165248990058899, + "num_tokens": 389600630.0, + "step": 15603 + }, + { + "epoch": 1.7135954315835713, + "grad_norm": 2.146733283996582, + "learning_rate": 1e-06, + "loss": 0.8912, + "mean_token_accuracy": 0.726818323135376, + "num_tokens": 389632129.0, + "step": 15604 + }, + { + "epoch": 1.713705249286185, + "grad_norm": 1.8067958354949951, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7002831697463989, + "num_tokens": 389671369.0, + "step": 15605 + }, + { + "epoch": 1.7138150669887986, + "grad_norm": 2.3250539302825928, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7093841433525085, + "num_tokens": 389696573.0, + "step": 15606 + }, + { + "epoch": 1.7139248846914121, + "grad_norm": 2.1434078216552734, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7244194746017456, + "num_tokens": 389722912.0, + "step": 15607 + }, + { + "epoch": 1.7140347023940259, + "grad_norm": 2.1148440837860107, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7275364398956299, + "num_tokens": 389748302.0, + "step": 15608 + }, + { + "epoch": 1.7141445200966396, + "grad_norm": 2.3392210006713867, + "learning_rate": 1e-06, + "loss": 1.0472, + "mean_token_accuracy": 0.6809263825416565, + "num_tokens": 389773600.0, + "step": 15609 + }, + { + "epoch": 1.7142543377992534, + "grad_norm": 2.357745885848999, + "learning_rate": 1e-06, + "loss": 0.9674, + "mean_token_accuracy": 0.7092999219894409, + "num_tokens": 389798161.0, + "step": 15610 + }, + { + "epoch": 1.714364155501867, + "grad_norm": 2.3950397968292236, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7230747938156128, + "num_tokens": 389820574.0, + "step": 15611 + }, + { + "epoch": 1.7144739732044805, + "grad_norm": 2.5198814868927, + "learning_rate": 
1e-06, + "loss": 1.0101, + "mean_token_accuracy": 0.6934118270874023, + "num_tokens": 389842021.0, + "step": 15612 + }, + { + "epoch": 1.7145837909070942, + "grad_norm": 2.240396738052368, + "learning_rate": 1e-06, + "loss": 0.9011, + "mean_token_accuracy": 0.7106353640556335, + "num_tokens": 389868742.0, + "step": 15613 + }, + { + "epoch": 1.714693608609708, + "grad_norm": 2.3305060863494873, + "learning_rate": 1e-06, + "loss": 0.9719, + "mean_token_accuracy": 0.7111091017723083, + "num_tokens": 389895086.0, + "step": 15614 + }, + { + "epoch": 1.7148034263123215, + "grad_norm": 2.2479023933410645, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7154483795166016, + "num_tokens": 389922791.0, + "step": 15615 + }, + { + "epoch": 1.714913244014935, + "grad_norm": 2.4290895462036133, + "learning_rate": 1e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7373671531677246, + "num_tokens": 389943747.0, + "step": 15616 + }, + { + "epoch": 1.7150230617175488, + "grad_norm": 2.109557867050171, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7181721329689026, + "num_tokens": 389973315.0, + "step": 15617 + }, + { + "epoch": 1.7151328794201626, + "grad_norm": 2.0715086460113525, + "learning_rate": 1e-06, + "loss": 0.9737, + "mean_token_accuracy": 0.7013901472091675, + "num_tokens": 390002814.0, + "step": 15618 + }, + { + "epoch": 1.7152426971227763, + "grad_norm": 2.382322072982788, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7344728708267212, + "num_tokens": 390025854.0, + "step": 15619 + }, + { + "epoch": 1.7153525148253899, + "grad_norm": 2.2704432010650635, + "learning_rate": 1e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7359853982925415, + "num_tokens": 390050295.0, + "step": 15620 + }, + { + "epoch": 1.7154623325280034, + "grad_norm": 2.479825496673584, + "learning_rate": 1e-06, + "loss": 0.8881, + "mean_token_accuracy": 0.7248166799545288, + "num_tokens": 390071304.0, + "step": 15621 + }, + { + 
"epoch": 1.7155721502306172, + "grad_norm": 1.9634675979614258, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7032855749130249, + "num_tokens": 390105010.0, + "step": 15622 + }, + { + "epoch": 1.715681967933231, + "grad_norm": 2.1312649250030518, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7295304536819458, + "num_tokens": 390131448.0, + "step": 15623 + }, + { + "epoch": 1.7157917856358444, + "grad_norm": 2.175144672393799, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.7216336131095886, + "num_tokens": 390158375.0, + "step": 15624 + }, + { + "epoch": 1.7159016033384582, + "grad_norm": 1.9878584146499634, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7318378686904907, + "num_tokens": 390186220.0, + "step": 15625 + }, + { + "epoch": 1.7160114210410717, + "grad_norm": 2.271705389022827, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7118839025497437, + "num_tokens": 390210717.0, + "step": 15626 + }, + { + "epoch": 1.7161212387436855, + "grad_norm": 2.6921029090881348, + "learning_rate": 1e-06, + "loss": 0.9117, + "mean_token_accuracy": 0.7247065305709839, + "num_tokens": 390230537.0, + "step": 15627 + }, + { + "epoch": 1.7162310564462993, + "grad_norm": 2.5068180561065674, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7204982042312622, + "num_tokens": 390253156.0, + "step": 15628 + }, + { + "epoch": 1.7163408741489128, + "grad_norm": 2.503192901611328, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7296484112739563, + "num_tokens": 390275552.0, + "step": 15629 + }, + { + "epoch": 1.7164506918515263, + "grad_norm": 2.2360119819641113, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.7140652537345886, + "num_tokens": 390302486.0, + "step": 15630 + }, + { + "epoch": 1.71656050955414, + "grad_norm": 2.6414833068847656, + "learning_rate": 1e-06, + "loss": 0.8624, + 
"mean_token_accuracy": 0.7330113649368286, + "num_tokens": 390323036.0, + "step": 15631 + }, + { + "epoch": 1.7166703272567538, + "grad_norm": 2.11226224899292, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7031775712966919, + "num_tokens": 390349049.0, + "step": 15632 + }, + { + "epoch": 1.7167801449593676, + "grad_norm": 2.155468463897705, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7048927545547485, + "num_tokens": 390378020.0, + "step": 15633 + }, + { + "epoch": 1.7168899626619811, + "grad_norm": 2.3643951416015625, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.7324666380882263, + "num_tokens": 390400425.0, + "step": 15634 + }, + { + "epoch": 1.7169997803645947, + "grad_norm": 2.3645689487457275, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7140389084815979, + "num_tokens": 390423166.0, + "step": 15635 + }, + { + "epoch": 1.7171095980672084, + "grad_norm": 2.1285390853881836, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.6998413801193237, + "num_tokens": 390450692.0, + "step": 15636 + }, + { + "epoch": 1.7172194157698222, + "grad_norm": 2.18402099609375, + "learning_rate": 1e-06, + "loss": 0.9232, + "mean_token_accuracy": 0.7221782207489014, + "num_tokens": 390477793.0, + "step": 15637 + }, + { + "epoch": 1.7173292334724357, + "grad_norm": 2.6213643550872803, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.734835147857666, + "num_tokens": 390496433.0, + "step": 15638 + }, + { + "epoch": 1.7174390511750495, + "grad_norm": 2.2216827869415283, + "learning_rate": 1e-06, + "loss": 0.9855, + "mean_token_accuracy": 0.6967213153839111, + "num_tokens": 390522223.0, + "step": 15639 + }, + { + "epoch": 1.717548868877663, + "grad_norm": 2.031010150909424, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.6907323598861694, + "num_tokens": 390552113.0, + "step": 15640 + }, + { + "epoch": 
1.7176586865802768, + "grad_norm": 2.4243721961975098, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7279407382011414, + "num_tokens": 390575533.0, + "step": 15641 + }, + { + "epoch": 1.7177685042828905, + "grad_norm": 2.123237371444702, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7207176089286804, + "num_tokens": 390603658.0, + "step": 15642 + }, + { + "epoch": 1.717878321985504, + "grad_norm": 2.1931748390197754, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7274661064147949, + "num_tokens": 390631213.0, + "step": 15643 + }, + { + "epoch": 1.7179881396881176, + "grad_norm": 2.6094160079956055, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7150649428367615, + "num_tokens": 390651040.0, + "step": 15644 + }, + { + "epoch": 1.7180979573907313, + "grad_norm": 2.025388240814209, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.6999785900115967, + "num_tokens": 390681730.0, + "step": 15645 + }, + { + "epoch": 1.718207775093345, + "grad_norm": 2.0702953338623047, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7180287837982178, + "num_tokens": 390715342.0, + "step": 15646 + }, + { + "epoch": 1.7183175927959589, + "grad_norm": 2.6543095111846924, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7346110343933105, + "num_tokens": 390734981.0, + "step": 15647 + }, + { + "epoch": 1.7184274104985724, + "grad_norm": 2.3428831100463867, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7144013047218323, + "num_tokens": 390758194.0, + "step": 15648 + }, + { + "epoch": 1.718537228201186, + "grad_norm": 2.6167750358581543, + "learning_rate": 1e-06, + "loss": 0.7933, + "mean_token_accuracy": 0.7503061890602112, + "num_tokens": 390776913.0, + "step": 15649 + }, + { + "epoch": 1.7186470459037997, + "grad_norm": 2.3537704944610596, + "learning_rate": 1e-06, + "loss": 0.8339, + "mean_token_accuracy": 
0.7538097500801086, + "num_tokens": 390799567.0, + "step": 15650 + }, + { + "epoch": 1.7187568636064134, + "grad_norm": 2.353438138961792, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7098360061645508, + "num_tokens": 390824471.0, + "step": 15651 + }, + { + "epoch": 1.718866681309027, + "grad_norm": 2.38028883934021, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7177413105964661, + "num_tokens": 390846652.0, + "step": 15652 + }, + { + "epoch": 1.7189764990116405, + "grad_norm": 2.235799789428711, + "learning_rate": 1e-06, + "loss": 0.9563, + "mean_token_accuracy": 0.7079328298568726, + "num_tokens": 390871606.0, + "step": 15653 + }, + { + "epoch": 1.7190863167142543, + "grad_norm": 2.4141948223114014, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7210354208946228, + "num_tokens": 390893446.0, + "step": 15654 + }, + { + "epoch": 1.719196134416868, + "grad_norm": 2.203704357147217, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7167304754257202, + "num_tokens": 390918064.0, + "step": 15655 + }, + { + "epoch": 1.7193059521194818, + "grad_norm": 2.1752078533172607, + "learning_rate": 1e-06, + "loss": 0.9427, + "mean_token_accuracy": 0.7020139098167419, + "num_tokens": 390946030.0, + "step": 15656 + }, + { + "epoch": 1.7194157698220953, + "grad_norm": 2.335745334625244, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7186222672462463, + "num_tokens": 390969570.0, + "step": 15657 + }, + { + "epoch": 1.7195255875247089, + "grad_norm": 2.3945181369781494, + "learning_rate": 1e-06, + "loss": 0.8846, + "mean_token_accuracy": 0.7263119220733643, + "num_tokens": 390992264.0, + "step": 15658 + }, + { + "epoch": 1.7196354052273226, + "grad_norm": 2.1325197219848633, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.7111619114875793, + "num_tokens": 391021355.0, + "step": 15659 + }, + { + "epoch": 1.7197452229299364, + "grad_norm": 
2.1258513927459717, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7093581557273865, + "num_tokens": 391049241.0, + "step": 15660 + }, + { + "epoch": 1.7198550406325501, + "grad_norm": 2.409693956375122, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7208350896835327, + "num_tokens": 391072343.0, + "step": 15661 + }, + { + "epoch": 1.7199648583351637, + "grad_norm": 2.708346128463745, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7328146696090698, + "num_tokens": 391091252.0, + "step": 15662 + }, + { + "epoch": 1.7200746760377772, + "grad_norm": 2.4599697589874268, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7281749248504639, + "num_tokens": 391113113.0, + "step": 15663 + }, + { + "epoch": 1.720184493740391, + "grad_norm": 2.156489133834839, + "learning_rate": 1e-06, + "loss": 1.0948, + "mean_token_accuracy": 0.6885398626327515, + "num_tokens": 391143082.0, + "step": 15664 + }, + { + "epoch": 1.7202943114430047, + "grad_norm": 2.362813711166382, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7159944176673889, + "num_tokens": 391166406.0, + "step": 15665 + }, + { + "epoch": 1.7204041291456182, + "grad_norm": 2.3083014488220215, + "learning_rate": 1e-06, + "loss": 0.9615, + "mean_token_accuracy": 0.7032802104949951, + "num_tokens": 391191089.0, + "step": 15666 + }, + { + "epoch": 1.7205139468482318, + "grad_norm": 1.9133806228637695, + "learning_rate": 1e-06, + "loss": 0.9927, + "mean_token_accuracy": 0.6927363276481628, + "num_tokens": 391224711.0, + "step": 15667 + }, + { + "epoch": 1.7206237645508455, + "grad_norm": 2.5929930210113525, + "learning_rate": 1e-06, + "loss": 0.8604, + "mean_token_accuracy": 0.7267999649047852, + "num_tokens": 391245816.0, + "step": 15668 + }, + { + "epoch": 1.7207335822534593, + "grad_norm": 2.028578996658325, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.711247980594635, + "num_tokens": 
391279494.0, + "step": 15669 + }, + { + "epoch": 1.720843399956073, + "grad_norm": 2.297468662261963, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7245888710021973, + "num_tokens": 391303829.0, + "step": 15670 + }, + { + "epoch": 1.7209532176586866, + "grad_norm": 2.408052682876587, + "learning_rate": 1e-06, + "loss": 0.8974, + "mean_token_accuracy": 0.712375283241272, + "num_tokens": 391327189.0, + "step": 15671 + }, + { + "epoch": 1.7210630353613001, + "grad_norm": 2.0062475204467773, + "learning_rate": 1e-06, + "loss": 0.9365, + "mean_token_accuracy": 0.706621527671814, + "num_tokens": 391358997.0, + "step": 15672 + }, + { + "epoch": 1.7211728530639139, + "grad_norm": 2.2575066089630127, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7171001434326172, + "num_tokens": 391385744.0, + "step": 15673 + }, + { + "epoch": 1.7212826707665276, + "grad_norm": 2.2472574710845947, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7084581851959229, + "num_tokens": 391412124.0, + "step": 15674 + }, + { + "epoch": 1.7213924884691414, + "grad_norm": 2.1681082248687744, + "learning_rate": 1e-06, + "loss": 0.9589, + "mean_token_accuracy": 0.6987165212631226, + "num_tokens": 391441587.0, + "step": 15675 + }, + { + "epoch": 1.721502306171755, + "grad_norm": 2.1948351860046387, + "learning_rate": 1e-06, + "loss": 0.8648, + "mean_token_accuracy": 0.7307913899421692, + "num_tokens": 391469561.0, + "step": 15676 + }, + { + "epoch": 1.7216121238743685, + "grad_norm": 2.2874538898468018, + "learning_rate": 1e-06, + "loss": 0.8025, + "mean_token_accuracy": 0.7413325309753418, + "num_tokens": 391494584.0, + "step": 15677 + }, + { + "epoch": 1.7217219415769822, + "grad_norm": 2.403106451034546, + "learning_rate": 1e-06, + "loss": 0.8911, + "mean_token_accuracy": 0.7229523062705994, + "num_tokens": 391515277.0, + "step": 15678 + }, + { + "epoch": 1.721831759279596, + "grad_norm": 2.4984452724456787, + "learning_rate": 
1e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7346693277359009, + "num_tokens": 391535383.0, + "step": 15679 + }, + { + "epoch": 1.7219415769822095, + "grad_norm": 2.1757187843322754, + "learning_rate": 1e-06, + "loss": 0.8844, + "mean_token_accuracy": 0.7223530411720276, + "num_tokens": 391561148.0, + "step": 15680 + }, + { + "epoch": 1.722051394684823, + "grad_norm": 2.026806354522705, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6995619535446167, + "num_tokens": 391592373.0, + "step": 15681 + }, + { + "epoch": 1.7221612123874368, + "grad_norm": 2.4683918952941895, + "learning_rate": 1e-06, + "loss": 0.7776, + "mean_token_accuracy": 0.7509448528289795, + "num_tokens": 391612162.0, + "step": 15682 + }, + { + "epoch": 1.7222710300900506, + "grad_norm": 2.069714069366455, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7138042449951172, + "num_tokens": 391640834.0, + "step": 15683 + }, + { + "epoch": 1.7223808477926643, + "grad_norm": 2.188798189163208, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.7142113447189331, + "num_tokens": 391667689.0, + "step": 15684 + }, + { + "epoch": 1.7224906654952779, + "grad_norm": 2.3798410892486572, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7145262956619263, + "num_tokens": 391690581.0, + "step": 15685 + }, + { + "epoch": 1.7226004831978914, + "grad_norm": 2.523831844329834, + "learning_rate": 1e-06, + "loss": 0.9983, + "mean_token_accuracy": 0.6966294050216675, + "num_tokens": 391712511.0, + "step": 15686 + }, + { + "epoch": 1.7227103009005051, + "grad_norm": 2.5918054580688477, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7306495904922485, + "num_tokens": 391733141.0, + "step": 15687 + }, + { + "epoch": 1.722820118603119, + "grad_norm": 2.1667821407318115, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7151254415512085, + "num_tokens": 391758915.0, + "step": 15688 + }, + { + 
"epoch": 1.7229299363057324, + "grad_norm": 2.1426074504852295, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7149125337600708, + "num_tokens": 391786616.0, + "step": 15689 + }, + { + "epoch": 1.7230397540083462, + "grad_norm": 2.249009847640991, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.7160388231277466, + "num_tokens": 391813167.0, + "step": 15690 + }, + { + "epoch": 1.7231495717109597, + "grad_norm": 1.9389959573745728, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7188971638679504, + "num_tokens": 391843048.0, + "step": 15691 + }, + { + "epoch": 1.7232593894135735, + "grad_norm": 2.6295061111450195, + "learning_rate": 1e-06, + "loss": 0.8374, + "mean_token_accuracy": 0.7361913323402405, + "num_tokens": 391863840.0, + "step": 15692 + }, + { + "epoch": 1.7233692071161872, + "grad_norm": 2.5363776683807373, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7348297834396362, + "num_tokens": 391888012.0, + "step": 15693 + }, + { + "epoch": 1.7234790248188008, + "grad_norm": 2.3581550121307373, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7041632533073425, + "num_tokens": 391912332.0, + "step": 15694 + }, + { + "epoch": 1.7235888425214143, + "grad_norm": 1.9692524671554565, + "learning_rate": 1e-06, + "loss": 0.9789, + "mean_token_accuracy": 0.6963983774185181, + "num_tokens": 391943893.0, + "step": 15695 + }, + { + "epoch": 1.723698660224028, + "grad_norm": 2.1978533267974854, + "learning_rate": 1e-06, + "loss": 0.8806, + "mean_token_accuracy": 0.7208409309387207, + "num_tokens": 391969250.0, + "step": 15696 + }, + { + "epoch": 1.7238084779266418, + "grad_norm": 2.2831246852874756, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7109838724136353, + "num_tokens": 391994119.0, + "step": 15697 + }, + { + "epoch": 1.7239182956292556, + "grad_norm": 2.3013079166412354, + "learning_rate": 1e-06, + "loss": 0.8899, + 
"mean_token_accuracy": 0.7217628359794617, + "num_tokens": 392019356.0, + "step": 15698 + }, + { + "epoch": 1.7240281133318691, + "grad_norm": 2.14575457572937, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7306003570556641, + "num_tokens": 392046130.0, + "step": 15699 + }, + { + "epoch": 1.7241379310344827, + "grad_norm": 2.1619369983673096, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7174091935157776, + "num_tokens": 392072851.0, + "step": 15700 + }, + { + "epoch": 1.7242477487370964, + "grad_norm": 2.098578453063965, + "learning_rate": 1e-06, + "loss": 0.9251, + "mean_token_accuracy": 0.7075352668762207, + "num_tokens": 392100850.0, + "step": 15701 + }, + { + "epoch": 1.7243575664397102, + "grad_norm": 2.013305425643921, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7013100385665894, + "num_tokens": 392131261.0, + "step": 15702 + }, + { + "epoch": 1.7244673841423237, + "grad_norm": 2.598432779312134, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7434025406837463, + "num_tokens": 392150265.0, + "step": 15703 + }, + { + "epoch": 1.7245772018449375, + "grad_norm": 2.5087196826934814, + "learning_rate": 1e-06, + "loss": 0.8178, + "mean_token_accuracy": 0.738376796245575, + "num_tokens": 392171172.0, + "step": 15704 + }, + { + "epoch": 1.724687019547551, + "grad_norm": 2.5232937335968018, + "learning_rate": 1e-06, + "loss": 0.8563, + "mean_token_accuracy": 0.7361860275268555, + "num_tokens": 392191228.0, + "step": 15705 + }, + { + "epoch": 1.7247968372501647, + "grad_norm": 2.354212522506714, + "learning_rate": 1e-06, + "loss": 1.0093, + "mean_token_accuracy": 0.7019819021224976, + "num_tokens": 392217132.0, + "step": 15706 + }, + { + "epoch": 1.7249066549527785, + "grad_norm": 2.7085564136505127, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.721289336681366, + "num_tokens": 392236640.0, + "step": 15707 + }, + { + "epoch": 1.725016472655392, + 
"grad_norm": 2.2647151947021484, + "learning_rate": 1e-06, + "loss": 0.8055, + "mean_token_accuracy": 0.7449405789375305, + "num_tokens": 392261293.0, + "step": 15708 + }, + { + "epoch": 1.7251262903580056, + "grad_norm": 2.3916401863098145, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7365388870239258, + "num_tokens": 392284246.0, + "step": 15709 + }, + { + "epoch": 1.7252361080606193, + "grad_norm": 2.4019527435302734, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7105541229248047, + "num_tokens": 392306447.0, + "step": 15710 + }, + { + "epoch": 1.725345925763233, + "grad_norm": 2.3536946773529053, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7270118594169617, + "num_tokens": 392329559.0, + "step": 15711 + }, + { + "epoch": 1.7254557434658468, + "grad_norm": 2.577065944671631, + "learning_rate": 1e-06, + "loss": 0.8281, + "mean_token_accuracy": 0.733848512172699, + "num_tokens": 392348569.0, + "step": 15712 + }, + { + "epoch": 1.7255655611684604, + "grad_norm": 2.712171792984009, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7201606631278992, + "num_tokens": 392367488.0, + "step": 15713 + }, + { + "epoch": 1.725675378871074, + "grad_norm": 2.1671483516693115, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7101428508758545, + "num_tokens": 392393932.0, + "step": 15714 + }, + { + "epoch": 1.7257851965736877, + "grad_norm": 2.611065149307251, + "learning_rate": 1e-06, + "loss": 0.833, + "mean_token_accuracy": 0.7372473478317261, + "num_tokens": 392412920.0, + "step": 15715 + }, + { + "epoch": 1.7258950142763014, + "grad_norm": 2.3902270793914795, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7117730975151062, + "num_tokens": 392436000.0, + "step": 15716 + }, + { + "epoch": 1.726004831978915, + "grad_norm": 2.6064252853393555, + "learning_rate": 1e-06, + "loss": 0.7794, + "mean_token_accuracy": 0.7530712485313416, + 
"num_tokens": 392455241.0, + "step": 15717 + }, + { + "epoch": 1.7261146496815285, + "grad_norm": 2.3574330806732178, + "learning_rate": 1e-06, + "loss": 0.7888, + "mean_token_accuracy": 0.7478642463684082, + "num_tokens": 392476960.0, + "step": 15718 + }, + { + "epoch": 1.7262244673841423, + "grad_norm": 2.276729106903076, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7165016531944275, + "num_tokens": 392500446.0, + "step": 15719 + }, + { + "epoch": 1.726334285086756, + "grad_norm": 2.2036943435668945, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7080333232879639, + "num_tokens": 392526419.0, + "step": 15720 + }, + { + "epoch": 1.7264441027893698, + "grad_norm": 2.097383737564087, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7159997224807739, + "num_tokens": 392555678.0, + "step": 15721 + }, + { + "epoch": 1.7265539204919833, + "grad_norm": 2.565532922744751, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.708064079284668, + "num_tokens": 392577439.0, + "step": 15722 + }, + { + "epoch": 1.7266637381945968, + "grad_norm": 2.1968586444854736, + "learning_rate": 1e-06, + "loss": 0.937, + "mean_token_accuracy": 0.712311863899231, + "num_tokens": 392606262.0, + "step": 15723 + }, + { + "epoch": 1.7267735558972106, + "grad_norm": 2.3557770252227783, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7408536076545715, + "num_tokens": 392628567.0, + "step": 15724 + }, + { + "epoch": 1.7268833735998244, + "grad_norm": 2.375322103500366, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7133070230484009, + "num_tokens": 392652843.0, + "step": 15725 + }, + { + "epoch": 1.7269931913024381, + "grad_norm": 2.121833562850952, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.732619047164917, + "num_tokens": 392678476.0, + "step": 15726 + }, + { + "epoch": 1.7271030090050516, + "grad_norm": 2.5223734378814697, + 
"learning_rate": 1e-06, + "loss": 0.8882, + "mean_token_accuracy": 0.7311669588088989, + "num_tokens": 392699374.0, + "step": 15727 + }, + { + "epoch": 1.7272128267076652, + "grad_norm": 2.1683778762817383, + "learning_rate": 1e-06, + "loss": 0.9233, + "mean_token_accuracy": 0.7189459800720215, + "num_tokens": 392726499.0, + "step": 15728 + }, + { + "epoch": 1.727322644410279, + "grad_norm": 2.3034515380859375, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7206621170043945, + "num_tokens": 392750652.0, + "step": 15729 + }, + { + "epoch": 1.7274324621128927, + "grad_norm": 2.42533278465271, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7189635038375854, + "num_tokens": 392774053.0, + "step": 15730 + }, + { + "epoch": 1.7275422798155062, + "grad_norm": 2.594675064086914, + "learning_rate": 1e-06, + "loss": 0.9268, + "mean_token_accuracy": 0.7160356044769287, + "num_tokens": 392794458.0, + "step": 15731 + }, + { + "epoch": 1.7276520975181198, + "grad_norm": 2.214250326156616, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.7017358541488647, + "num_tokens": 392822508.0, + "step": 15732 + }, + { + "epoch": 1.7277619152207335, + "grad_norm": 2.051945447921753, + "learning_rate": 1e-06, + "loss": 1.0087, + "mean_token_accuracy": 0.6959426403045654, + "num_tokens": 392852203.0, + "step": 15733 + }, + { + "epoch": 1.7278717329233473, + "grad_norm": 2.0542514324188232, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7262861728668213, + "num_tokens": 392881226.0, + "step": 15734 + }, + { + "epoch": 1.727981550625961, + "grad_norm": 2.2754528522491455, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7230846881866455, + "num_tokens": 392905087.0, + "step": 15735 + }, + { + "epoch": 1.7280913683285746, + "grad_norm": 2.150164842605591, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7213829159736633, + "num_tokens": 392932432.0, + "step": 
15736 + }, + { + "epoch": 1.728201186031188, + "grad_norm": 2.100172281265259, + "learning_rate": 1e-06, + "loss": 1.0369, + "mean_token_accuracy": 0.6870132684707642, + "num_tokens": 392963436.0, + "step": 15737 + }, + { + "epoch": 1.7283110037338019, + "grad_norm": 2.129986047744751, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.6998279094696045, + "num_tokens": 392993884.0, + "step": 15738 + }, + { + "epoch": 1.7284208214364156, + "grad_norm": 2.177234649658203, + "learning_rate": 1e-06, + "loss": 0.9792, + "mean_token_accuracy": 0.6956723928451538, + "num_tokens": 393020566.0, + "step": 15739 + }, + { + "epoch": 1.7285306391390292, + "grad_norm": 2.3052971363067627, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7085132598876953, + "num_tokens": 393044928.0, + "step": 15740 + }, + { + "epoch": 1.728640456841643, + "grad_norm": 2.141946315765381, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7141583561897278, + "num_tokens": 393073196.0, + "step": 15741 + }, + { + "epoch": 1.7287502745442564, + "grad_norm": 2.3876466751098633, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6959161758422852, + "num_tokens": 393096852.0, + "step": 15742 + }, + { + "epoch": 1.7288600922468702, + "grad_norm": 1.9108281135559082, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.698847770690918, + "num_tokens": 393129915.0, + "step": 15743 + }, + { + "epoch": 1.728969909949484, + "grad_norm": 1.9680616855621338, + "learning_rate": 1e-06, + "loss": 0.9923, + "mean_token_accuracy": 0.6951420903205872, + "num_tokens": 393163660.0, + "step": 15744 + }, + { + "epoch": 1.7290797276520975, + "grad_norm": 2.1055614948272705, + "learning_rate": 1e-06, + "loss": 0.9049, + "mean_token_accuracy": 0.7163643836975098, + "num_tokens": 393193987.0, + "step": 15745 + }, + { + "epoch": 1.729189545354711, + "grad_norm": 2.202831506729126, + "learning_rate": 1e-06, + "loss": 0.9112, + 
"mean_token_accuracy": 0.7214998602867126, + "num_tokens": 393219791.0, + "step": 15746 + }, + { + "epoch": 1.7292993630573248, + "grad_norm": 2.3782520294189453, + "learning_rate": 1e-06, + "loss": 0.8325, + "mean_token_accuracy": 0.7401036620140076, + "num_tokens": 393240969.0, + "step": 15747 + }, + { + "epoch": 1.7294091807599385, + "grad_norm": 2.073157787322998, + "learning_rate": 1e-06, + "loss": 0.9604, + "mean_token_accuracy": 0.7031875848770142, + "num_tokens": 393268695.0, + "step": 15748 + }, + { + "epoch": 1.7295189984625523, + "grad_norm": 1.9558262825012207, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7052892446517944, + "num_tokens": 393300377.0, + "step": 15749 + }, + { + "epoch": 1.7296288161651658, + "grad_norm": 2.146618366241455, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.718275785446167, + "num_tokens": 393327903.0, + "step": 15750 + }, + { + "epoch": 1.7297386338677794, + "grad_norm": 2.0936479568481445, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7115292549133301, + "num_tokens": 393356882.0, + "step": 15751 + }, + { + "epoch": 1.7298484515703931, + "grad_norm": 1.9998672008514404, + "learning_rate": 1e-06, + "loss": 1.0015, + "mean_token_accuracy": 0.6922541856765747, + "num_tokens": 393389733.0, + "step": 15752 + }, + { + "epoch": 1.7299582692730069, + "grad_norm": 2.161245107650757, + "learning_rate": 1e-06, + "loss": 0.9832, + "mean_token_accuracy": 0.6981015205383301, + "num_tokens": 393418793.0, + "step": 15753 + }, + { + "epoch": 1.7300680869756204, + "grad_norm": 2.0942459106445312, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7326399087905884, + "num_tokens": 393447378.0, + "step": 15754 + }, + { + "epoch": 1.7301779046782342, + "grad_norm": 1.8968573808670044, + "learning_rate": 1e-06, + "loss": 1.0162, + "mean_token_accuracy": 0.6908233165740967, + "num_tokens": 393480625.0, + "step": 15755 + }, + { + "epoch": 
1.7302877223808477, + "grad_norm": 2.1354668140411377, + "learning_rate": 1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7291440963745117, + "num_tokens": 393506960.0, + "step": 15756 + }, + { + "epoch": 1.7303975400834615, + "grad_norm": 2.4470174312591553, + "learning_rate": 1e-06, + "loss": 0.8633, + "mean_token_accuracy": 0.734406590461731, + "num_tokens": 393528586.0, + "step": 15757 + }, + { + "epoch": 1.7305073577860752, + "grad_norm": 2.3713250160217285, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7277870774269104, + "num_tokens": 393553613.0, + "step": 15758 + }, + { + "epoch": 1.7306171754886888, + "grad_norm": 2.076618194580078, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6892229318618774, + "num_tokens": 393584388.0, + "step": 15759 + }, + { + "epoch": 1.7307269931913023, + "grad_norm": 2.367154836654663, + "learning_rate": 1e-06, + "loss": 0.8969, + "mean_token_accuracy": 0.7259595990180969, + "num_tokens": 393607236.0, + "step": 15760 + }, + { + "epoch": 1.730836810893916, + "grad_norm": 2.2152507305145264, + "learning_rate": 1e-06, + "loss": 0.9617, + "mean_token_accuracy": 0.7081284523010254, + "num_tokens": 393635155.0, + "step": 15761 + }, + { + "epoch": 1.7309466285965298, + "grad_norm": 2.1999282836914062, + "learning_rate": 1e-06, + "loss": 0.8815, + "mean_token_accuracy": 0.7230063676834106, + "num_tokens": 393663197.0, + "step": 15762 + }, + { + "epoch": 1.7310564462991436, + "grad_norm": 2.8596978187561035, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7401293516159058, + "num_tokens": 393679873.0, + "step": 15763 + }, + { + "epoch": 1.731166264001757, + "grad_norm": 2.1634764671325684, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7046029567718506, + "num_tokens": 393708095.0, + "step": 15764 + }, + { + "epoch": 1.7312760817043706, + "grad_norm": 2.0762996673583984, + "learning_rate": 1e-06, + "loss": 0.9652, + "mean_token_accuracy": 
0.7000163197517395, + "num_tokens": 393738597.0, + "step": 15765 + }, + { + "epoch": 1.7313858994069844, + "grad_norm": 2.4413816928863525, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7416436672210693, + "num_tokens": 393760764.0, + "step": 15766 + }, + { + "epoch": 1.7314957171095982, + "grad_norm": 2.108201026916504, + "learning_rate": 1e-06, + "loss": 0.8465, + "mean_token_accuracy": 0.7378024458885193, + "num_tokens": 393786891.0, + "step": 15767 + }, + { + "epoch": 1.7316055348122117, + "grad_norm": 2.049323081970215, + "learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.716461181640625, + "num_tokens": 393820546.0, + "step": 15768 + }, + { + "epoch": 1.7317153525148252, + "grad_norm": 1.9693983793258667, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.703373908996582, + "num_tokens": 393854700.0, + "step": 15769 + }, + { + "epoch": 1.731825170217439, + "grad_norm": 2.4999570846557617, + "learning_rate": 1e-06, + "loss": 0.935, + "mean_token_accuracy": 0.708003580570221, + "num_tokens": 393877042.0, + "step": 15770 + }, + { + "epoch": 1.7319349879200527, + "grad_norm": 2.3230156898498535, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7331795692443848, + "num_tokens": 393900935.0, + "step": 15771 + }, + { + "epoch": 1.7320448056226665, + "grad_norm": 2.313964605331421, + "learning_rate": 1e-06, + "loss": 0.7802, + "mean_token_accuracy": 0.7527562379837036, + "num_tokens": 393925402.0, + "step": 15772 + }, + { + "epoch": 1.73215462332528, + "grad_norm": 2.083470106124878, + "learning_rate": 1e-06, + "loss": 0.9064, + "mean_token_accuracy": 0.7129755020141602, + "num_tokens": 393953790.0, + "step": 15773 + }, + { + "epoch": 1.7322644410278936, + "grad_norm": 2.0444226264953613, + "learning_rate": 1e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7272024154663086, + "num_tokens": 393984975.0, + "step": 15774 + }, + { + "epoch": 1.7323742587305073, + "grad_norm": 
2.3295159339904785, + "learning_rate": 1e-06, + "loss": 0.9468, + "mean_token_accuracy": 0.7030268311500549, + "num_tokens": 394009442.0, + "step": 15775 + }, + { + "epoch": 1.732484076433121, + "grad_norm": 2.212130069732666, + "learning_rate": 1e-06, + "loss": 0.8205, + "mean_token_accuracy": 0.7435208559036255, + "num_tokens": 394034746.0, + "step": 15776 + }, + { + "epoch": 1.7325938941357348, + "grad_norm": 2.0237345695495605, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7313699722290039, + "num_tokens": 394063869.0, + "step": 15777 + }, + { + "epoch": 1.7327037118383484, + "grad_norm": 2.405951499938965, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7042449712753296, + "num_tokens": 394087754.0, + "step": 15778 + }, + { + "epoch": 1.732813529540962, + "grad_norm": 2.5994365215301514, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7238360643386841, + "num_tokens": 394108125.0, + "step": 15779 + }, + { + "epoch": 1.7329233472435757, + "grad_norm": 2.4278807640075684, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7129006385803223, + "num_tokens": 394130958.0, + "step": 15780 + }, + { + "epoch": 1.7330331649461894, + "grad_norm": 2.1489052772521973, + "learning_rate": 1e-06, + "loss": 0.8783, + "mean_token_accuracy": 0.7211954593658447, + "num_tokens": 394157603.0, + "step": 15781 + }, + { + "epoch": 1.733142982648803, + "grad_norm": 2.2211215496063232, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7239677906036377, + "num_tokens": 394183086.0, + "step": 15782 + }, + { + "epoch": 1.7332528003514165, + "grad_norm": 2.4147205352783203, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7316941022872925, + "num_tokens": 394204426.0, + "step": 15783 + }, + { + "epoch": 1.7333626180540302, + "grad_norm": 2.5966439247131348, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7436104416847229, + 
"num_tokens": 394222920.0, + "step": 15784 + }, + { + "epoch": 1.733472435756644, + "grad_norm": 2.151522636413574, + "learning_rate": 1e-06, + "loss": 0.8798, + "mean_token_accuracy": 0.7314850687980652, + "num_tokens": 394248640.0, + "step": 15785 + }, + { + "epoch": 1.7335822534592578, + "grad_norm": 2.243889808654785, + "learning_rate": 1e-06, + "loss": 0.7472, + "mean_token_accuracy": 0.758415699005127, + "num_tokens": 394271740.0, + "step": 15786 + }, + { + "epoch": 1.7336920711618713, + "grad_norm": 2.304152727127075, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7113770246505737, + "num_tokens": 394295567.0, + "step": 15787 + }, + { + "epoch": 1.7338018888644848, + "grad_norm": 2.058722496032715, + "learning_rate": 1e-06, + "loss": 1.0116, + "mean_token_accuracy": 0.6898851990699768, + "num_tokens": 394326452.0, + "step": 15788 + }, + { + "epoch": 1.7339117065670986, + "grad_norm": 2.3968923091888428, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7315294146537781, + "num_tokens": 394348279.0, + "step": 15789 + }, + { + "epoch": 1.7340215242697123, + "grad_norm": 2.1339733600616455, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 0.702175498008728, + "num_tokens": 394377585.0, + "step": 15790 + }, + { + "epoch": 1.734131341972326, + "grad_norm": 2.371885299682617, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7164987921714783, + "num_tokens": 394399899.0, + "step": 15791 + }, + { + "epoch": 1.7342411596749396, + "grad_norm": 2.2881975173950195, + "learning_rate": 1e-06, + "loss": 0.9274, + "mean_token_accuracy": 0.7115259170532227, + "num_tokens": 394424083.0, + "step": 15792 + }, + { + "epoch": 1.7343509773775532, + "grad_norm": 2.1565470695495605, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.724716305732727, + "num_tokens": 394451688.0, + "step": 15793 + }, + { + "epoch": 1.734460795080167, + "grad_norm": 2.488614797592163, + 
"learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7190206050872803, + "num_tokens": 394474549.0, + "step": 15794 + }, + { + "epoch": 1.7345706127827807, + "grad_norm": 1.9199604988098145, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.7007837891578674, + "num_tokens": 394509574.0, + "step": 15795 + }, + { + "epoch": 1.7346804304853942, + "grad_norm": 2.2720656394958496, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7205370664596558, + "num_tokens": 394535483.0, + "step": 15796 + }, + { + "epoch": 1.7347902481880078, + "grad_norm": 2.5397887229919434, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7171653509140015, + "num_tokens": 394556276.0, + "step": 15797 + }, + { + "epoch": 1.7349000658906215, + "grad_norm": 2.1596717834472656, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7332037687301636, + "num_tokens": 394583244.0, + "step": 15798 + }, + { + "epoch": 1.7350098835932353, + "grad_norm": 2.7636075019836426, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7127968072891235, + "num_tokens": 394602052.0, + "step": 15799 + }, + { + "epoch": 1.735119701295849, + "grad_norm": 2.4660682678222656, + "learning_rate": 1e-06, + "loss": 0.793, + "mean_token_accuracy": 0.7455536127090454, + "num_tokens": 394622352.0, + "step": 15800 + }, + { + "epoch": 1.7352295189984626, + "grad_norm": 1.9981309175491333, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7316704392433167, + "num_tokens": 394651597.0, + "step": 15801 + }, + { + "epoch": 1.735339336701076, + "grad_norm": 2.12233829498291, + "learning_rate": 1e-06, + "loss": 0.8967, + "mean_token_accuracy": 0.7218191623687744, + "num_tokens": 394679853.0, + "step": 15802 + }, + { + "epoch": 1.7354491544036899, + "grad_norm": 2.262902021408081, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7341294288635254, + "num_tokens": 394706505.0, + 
"step": 15803 + }, + { + "epoch": 1.7355589721063036, + "grad_norm": 2.188171148300171, + "learning_rate": 1e-06, + "loss": 0.8614, + "mean_token_accuracy": 0.7263592481613159, + "num_tokens": 394733636.0, + "step": 15804 + }, + { + "epoch": 1.7356687898089171, + "grad_norm": 2.1798129081726074, + "learning_rate": 1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7045698761940002, + "num_tokens": 394761200.0, + "step": 15805 + }, + { + "epoch": 1.735778607511531, + "grad_norm": 2.3893964290618896, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7155433893203735, + "num_tokens": 394784644.0, + "step": 15806 + }, + { + "epoch": 1.7358884252141444, + "grad_norm": 2.893580198287964, + "learning_rate": 1e-06, + "loss": 0.7988, + "mean_token_accuracy": 0.7399847507476807, + "num_tokens": 394801039.0, + "step": 15807 + }, + { + "epoch": 1.7359982429167582, + "grad_norm": 2.4013314247131348, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7329199314117432, + "num_tokens": 394823423.0, + "step": 15808 + }, + { + "epoch": 1.736108060619372, + "grad_norm": 2.262617826461792, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7117122411727905, + "num_tokens": 394848456.0, + "step": 15809 + }, + { + "epoch": 1.7362178783219855, + "grad_norm": 2.047663450241089, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7210705280303955, + "num_tokens": 394876381.0, + "step": 15810 + }, + { + "epoch": 1.736327696024599, + "grad_norm": 2.024993658065796, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7228127121925354, + "num_tokens": 394907891.0, + "step": 15811 + }, + { + "epoch": 1.7364375137272128, + "grad_norm": 2.0932838916778564, + "learning_rate": 1e-06, + "loss": 0.9058, + "mean_token_accuracy": 0.719346821308136, + "num_tokens": 394936568.0, + "step": 15812 + }, + { + "epoch": 1.7365473314298265, + "grad_norm": 2.107208490371704, + "learning_rate": 1e-06, + "loss": 
0.9271, + "mean_token_accuracy": 0.7118811011314392, + "num_tokens": 394965623.0, + "step": 15813 + }, + { + "epoch": 1.7366571491324403, + "grad_norm": 2.0494225025177, + "learning_rate": 1e-06, + "loss": 0.9659, + "mean_token_accuracy": 0.699008584022522, + "num_tokens": 394995544.0, + "step": 15814 + }, + { + "epoch": 1.7367669668350538, + "grad_norm": 2.0472915172576904, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7262228727340698, + "num_tokens": 395024133.0, + "step": 15815 + }, + { + "epoch": 1.7368767845376674, + "grad_norm": 2.383624792098999, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 0.7173491716384888, + "num_tokens": 395048175.0, + "step": 15816 + }, + { + "epoch": 1.7369866022402811, + "grad_norm": 2.2894248962402344, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7218337059020996, + "num_tokens": 395072887.0, + "step": 15817 + }, + { + "epoch": 1.7370964199428949, + "grad_norm": 2.482346534729004, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7087336182594299, + "num_tokens": 395093915.0, + "step": 15818 + }, + { + "epoch": 1.7372062376455084, + "grad_norm": 2.5650808811187744, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.7314925193786621, + "num_tokens": 395113601.0, + "step": 15819 + }, + { + "epoch": 1.7373160553481222, + "grad_norm": 2.0497024059295654, + "learning_rate": 1e-06, + "loss": 1.0056, + "mean_token_accuracy": 0.6956952214241028, + "num_tokens": 395146570.0, + "step": 15820 + }, + { + "epoch": 1.7374258730507357, + "grad_norm": 2.467902421951294, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7153353691101074, + "num_tokens": 395167677.0, + "step": 15821 + }, + { + "epoch": 1.7375356907533495, + "grad_norm": 2.148608922958374, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7225582599639893, + "num_tokens": 395196958.0, + "step": 15822 + }, + { + "epoch": 
1.7376455084559632, + "grad_norm": 2.254793882369995, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7428325414657593, + "num_tokens": 395222234.0, + "step": 15823 + }, + { + "epoch": 1.7377553261585768, + "grad_norm": 2.505481719970703, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7102899551391602, + "num_tokens": 395244270.0, + "step": 15824 + }, + { + "epoch": 1.7378651438611903, + "grad_norm": 2.467454195022583, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7021037936210632, + "num_tokens": 395269533.0, + "step": 15825 + }, + { + "epoch": 1.737974961563804, + "grad_norm": 2.0988757610321045, + "learning_rate": 1e-06, + "loss": 0.9723, + "mean_token_accuracy": 0.7052516341209412, + "num_tokens": 395299792.0, + "step": 15826 + }, + { + "epoch": 1.7380847792664178, + "grad_norm": 2.2859909534454346, + "learning_rate": 1e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7218482494354248, + "num_tokens": 395322833.0, + "step": 15827 + }, + { + "epoch": 1.7381945969690316, + "grad_norm": 2.66062593460083, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.723393976688385, + "num_tokens": 395343641.0, + "step": 15828 + }, + { + "epoch": 1.738304414671645, + "grad_norm": 2.0627434253692627, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7295248508453369, + "num_tokens": 395370839.0, + "step": 15829 + }, + { + "epoch": 1.7384142323742586, + "grad_norm": 2.0070202350616455, + "learning_rate": 1e-06, + "loss": 0.8896, + "mean_token_accuracy": 0.720492422580719, + "num_tokens": 395400857.0, + "step": 15830 + }, + { + "epoch": 1.7385240500768724, + "grad_norm": 2.266303539276123, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7247622013092041, + "num_tokens": 395427232.0, + "step": 15831 + }, + { + "epoch": 1.7386338677794861, + "grad_norm": 2.2744383811950684, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 
0.7139166593551636, + "num_tokens": 395452914.0, + "step": 15832 + }, + { + "epoch": 1.7387436854820997, + "grad_norm": 2.3896946907043457, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7179626226425171, + "num_tokens": 395475965.0, + "step": 15833 + }, + { + "epoch": 1.7388535031847132, + "grad_norm": 2.521479368209839, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7368383407592773, + "num_tokens": 395497282.0, + "step": 15834 + }, + { + "epoch": 1.738963320887327, + "grad_norm": 2.328648805618286, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7106412649154663, + "num_tokens": 395521459.0, + "step": 15835 + }, + { + "epoch": 1.7390731385899407, + "grad_norm": 2.214810371398926, + "learning_rate": 1e-06, + "loss": 0.9449, + "mean_token_accuracy": 0.7103855013847351, + "num_tokens": 395548911.0, + "step": 15836 + }, + { + "epoch": 1.7391829562925545, + "grad_norm": 2.1504974365234375, + "learning_rate": 1e-06, + "loss": 0.9229, + "mean_token_accuracy": 0.7097121477127075, + "num_tokens": 395576945.0, + "step": 15837 + }, + { + "epoch": 1.739292773995168, + "grad_norm": 2.343137264251709, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.725856363773346, + "num_tokens": 395602070.0, + "step": 15838 + }, + { + "epoch": 1.7394025916977816, + "grad_norm": 2.428290367126465, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7223958969116211, + "num_tokens": 395624754.0, + "step": 15839 + }, + { + "epoch": 1.7395124094003953, + "grad_norm": 2.1267294883728027, + "learning_rate": 1e-06, + "loss": 0.9988, + "mean_token_accuracy": 0.6932407021522522, + "num_tokens": 395652575.0, + "step": 15840 + }, + { + "epoch": 1.739622227103009, + "grad_norm": 2.810269355773926, + "learning_rate": 1e-06, + "loss": 0.8094, + "mean_token_accuracy": 0.7398849725723267, + "num_tokens": 395670820.0, + "step": 15841 + }, + { + "epoch": 1.7397320448056228, + "grad_norm": 
2.1785876750946045, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7266058921813965, + "num_tokens": 395696583.0, + "step": 15842 + }, + { + "epoch": 1.7398418625082364, + "grad_norm": 2.1409971714019775, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7117725610733032, + "num_tokens": 395724682.0, + "step": 15843 + }, + { + "epoch": 1.73995168021085, + "grad_norm": 2.3112542629241943, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7156805992126465, + "num_tokens": 395751831.0, + "step": 15844 + }, + { + "epoch": 1.7400614979134637, + "grad_norm": 1.9159826040267944, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7090704441070557, + "num_tokens": 395783175.0, + "step": 15845 + }, + { + "epoch": 1.7401713156160774, + "grad_norm": 2.3664615154266357, + "learning_rate": 1e-06, + "loss": 0.8443, + "mean_token_accuracy": 0.7419246435165405, + "num_tokens": 395805381.0, + "step": 15846 + }, + { + "epoch": 1.740281133318691, + "grad_norm": 2.28118896484375, + "learning_rate": 1e-06, + "loss": 1.002, + "mean_token_accuracy": 0.6917190551757812, + "num_tokens": 395830230.0, + "step": 15847 + }, + { + "epoch": 1.7403909510213045, + "grad_norm": 2.2902655601501465, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7185957431793213, + "num_tokens": 395855616.0, + "step": 15848 + }, + { + "epoch": 1.7405007687239182, + "grad_norm": 2.0012567043304443, + "learning_rate": 1e-06, + "loss": 0.7476, + "mean_token_accuracy": 0.762706995010376, + "num_tokens": 395883300.0, + "step": 15849 + }, + { + "epoch": 1.740610586426532, + "grad_norm": 2.1864237785339355, + "learning_rate": 1e-06, + "loss": 0.8596, + "mean_token_accuracy": 0.7247052788734436, + "num_tokens": 395907728.0, + "step": 15850 + }, + { + "epoch": 1.7407204041291457, + "grad_norm": 2.1336169242858887, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7176468372344971, + "num_tokens": 
395936637.0, + "step": 15851 + }, + { + "epoch": 1.7408302218317593, + "grad_norm": 2.1150989532470703, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7119964361190796, + "num_tokens": 395962760.0, + "step": 15852 + }, + { + "epoch": 1.7409400395343728, + "grad_norm": 2.6681504249572754, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7187966704368591, + "num_tokens": 395983322.0, + "step": 15853 + }, + { + "epoch": 1.7410498572369866, + "grad_norm": 2.0746099948883057, + "learning_rate": 1e-06, + "loss": 1.0074, + "mean_token_accuracy": 0.69687819480896, + "num_tokens": 396011050.0, + "step": 15854 + }, + { + "epoch": 1.7411596749396003, + "grad_norm": 2.369062900543213, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.741492509841919, + "num_tokens": 396032970.0, + "step": 15855 + }, + { + "epoch": 1.741269492642214, + "grad_norm": 2.4332382678985596, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7375534772872925, + "num_tokens": 396054576.0, + "step": 15856 + }, + { + "epoch": 1.7413793103448276, + "grad_norm": 2.187967300415039, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7159721851348877, + "num_tokens": 396080934.0, + "step": 15857 + }, + { + "epoch": 1.7414891280474412, + "grad_norm": 2.147038698196411, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7024115324020386, + "num_tokens": 396109677.0, + "step": 15858 + }, + { + "epoch": 1.741598945750055, + "grad_norm": 2.4746761322021484, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7266442179679871, + "num_tokens": 396131506.0, + "step": 15859 + }, + { + "epoch": 1.7417087634526687, + "grad_norm": 2.304327964782715, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7460619211196899, + "num_tokens": 396153860.0, + "step": 15860 + }, + { + "epoch": 1.7418185811552822, + "grad_norm": 2.37027907371521, + "learning_rate": 1e-06, + 
"loss": 0.898, + "mean_token_accuracy": 0.7201458215713501, + "num_tokens": 396176828.0, + "step": 15861 + }, + { + "epoch": 1.7419283988578957, + "grad_norm": 2.978646993637085, + "learning_rate": 1e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7372192144393921, + "num_tokens": 396193090.0, + "step": 15862 + }, + { + "epoch": 1.7420382165605095, + "grad_norm": 2.1259007453918457, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7213163375854492, + "num_tokens": 396220145.0, + "step": 15863 + }, + { + "epoch": 1.7421480342631233, + "grad_norm": 1.9954761266708374, + "learning_rate": 1e-06, + "loss": 0.963, + "mean_token_accuracy": 0.7151625156402588, + "num_tokens": 396252095.0, + "step": 15864 + }, + { + "epoch": 1.742257851965737, + "grad_norm": 2.4164628982543945, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.714576244354248, + "num_tokens": 396274236.0, + "step": 15865 + }, + { + "epoch": 1.7423676696683505, + "grad_norm": 2.2575571537017822, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7179290056228638, + "num_tokens": 396299573.0, + "step": 15866 + }, + { + "epoch": 1.742477487370964, + "grad_norm": 1.9952675104141235, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7141955494880676, + "num_tokens": 396331221.0, + "step": 15867 + }, + { + "epoch": 1.7425873050735778, + "grad_norm": 2.2118635177612305, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7259238958358765, + "num_tokens": 396355379.0, + "step": 15868 + }, + { + "epoch": 1.7426971227761916, + "grad_norm": 2.158970594406128, + "learning_rate": 1e-06, + "loss": 1.0398, + "mean_token_accuracy": 0.6766636967658997, + "num_tokens": 396386057.0, + "step": 15869 + }, + { + "epoch": 1.7428069404788051, + "grad_norm": 2.576805353164673, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7298052906990051, + "num_tokens": 396407335.0, + "step": 15870 + }, + { + "epoch": 
1.742916758181419, + "grad_norm": 2.0846409797668457, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.7409930229187012, + "num_tokens": 396434799.0, + "step": 15871 + }, + { + "epoch": 1.7430265758840324, + "grad_norm": 2.303194284439087, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.7205607891082764, + "num_tokens": 396458445.0, + "step": 15872 + }, + { + "epoch": 1.7431363935866462, + "grad_norm": 2.5143649578094482, + "learning_rate": 1e-06, + "loss": 0.9398, + "mean_token_accuracy": 0.7090311646461487, + "num_tokens": 396480577.0, + "step": 15873 + }, + { + "epoch": 1.74324621128926, + "grad_norm": 2.5635061264038086, + "learning_rate": 1e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.7428946495056152, + "num_tokens": 396499886.0, + "step": 15874 + }, + { + "epoch": 1.7433560289918735, + "grad_norm": 2.2048678398132324, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7437711954116821, + "num_tokens": 396524125.0, + "step": 15875 + }, + { + "epoch": 1.743465846694487, + "grad_norm": 2.3909122943878174, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7282990217208862, + "num_tokens": 396545434.0, + "step": 15876 + }, + { + "epoch": 1.7435756643971008, + "grad_norm": 2.1897714138031006, + "learning_rate": 1e-06, + "loss": 0.9581, + "mean_token_accuracy": 0.7063916921615601, + "num_tokens": 396573981.0, + "step": 15877 + }, + { + "epoch": 1.7436854820997145, + "grad_norm": 2.4395346641540527, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7224022150039673, + "num_tokens": 396596161.0, + "step": 15878 + }, + { + "epoch": 1.7437952998023283, + "grad_norm": 2.4106619358062744, + "learning_rate": 1e-06, + "loss": 0.8956, + "mean_token_accuracy": 0.717776894569397, + "num_tokens": 396620025.0, + "step": 15879 + }, + { + "epoch": 1.7439051175049418, + "grad_norm": 2.2349793910980225, + "learning_rate": 1e-06, + "loss": 0.9299, + "mean_token_accuracy": 
0.7142845988273621, + "num_tokens": 396646519.0, + "step": 15880 + }, + { + "epoch": 1.7440149352075554, + "grad_norm": 2.0774919986724854, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7123613953590393, + "num_tokens": 396676602.0, + "step": 15881 + }, + { + "epoch": 1.744124752910169, + "grad_norm": 2.6225621700286865, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7145458459854126, + "num_tokens": 396696968.0, + "step": 15882 + }, + { + "epoch": 1.7442345706127829, + "grad_norm": 2.5039806365966797, + "learning_rate": 1e-06, + "loss": 0.8994, + "mean_token_accuracy": 0.7182079553604126, + "num_tokens": 396717985.0, + "step": 15883 + }, + { + "epoch": 1.7443443883153964, + "grad_norm": 2.1203761100769043, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7082695960998535, + "num_tokens": 396745526.0, + "step": 15884 + }, + { + "epoch": 1.7444542060180102, + "grad_norm": 2.6039278507232666, + "learning_rate": 1e-06, + "loss": 0.772, + "mean_token_accuracy": 0.7438379526138306, + "num_tokens": 396763697.0, + "step": 15885 + }, + { + "epoch": 1.7445640237206237, + "grad_norm": 2.7141802310943604, + "learning_rate": 1e-06, + "loss": 0.8945, + "mean_token_accuracy": 0.7325342893600464, + "num_tokens": 396782414.0, + "step": 15886 + }, + { + "epoch": 1.7446738414232374, + "grad_norm": 2.3297338485717773, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7080507278442383, + "num_tokens": 396807268.0, + "step": 15887 + }, + { + "epoch": 1.7447836591258512, + "grad_norm": 2.1897530555725098, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.733546793460846, + "num_tokens": 396835206.0, + "step": 15888 + }, + { + "epoch": 1.7448934768284647, + "grad_norm": 2.6091110706329346, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7312406301498413, + "num_tokens": 396855482.0, + "step": 15889 + }, + { + "epoch": 1.7450032945310783, + "grad_norm": 
2.2416434288024902, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7440711259841919, + "num_tokens": 396879710.0, + "step": 15890 + }, + { + "epoch": 1.745113112233692, + "grad_norm": 2.303615093231201, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7157601118087769, + "num_tokens": 396906401.0, + "step": 15891 + }, + { + "epoch": 1.7452229299363058, + "grad_norm": 2.4081742763519287, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7109836339950562, + "num_tokens": 396930295.0, + "step": 15892 + }, + { + "epoch": 1.7453327476389195, + "grad_norm": 2.087862730026245, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.6980788707733154, + "num_tokens": 396960760.0, + "step": 15893 + }, + { + "epoch": 1.745442565341533, + "grad_norm": 2.318305253982544, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7089238166809082, + "num_tokens": 396986361.0, + "step": 15894 + }, + { + "epoch": 1.7455523830441466, + "grad_norm": 2.2093725204467773, + "learning_rate": 1e-06, + "loss": 0.8951, + "mean_token_accuracy": 0.7220278382301331, + "num_tokens": 397012848.0, + "step": 15895 + }, + { + "epoch": 1.7456622007467604, + "grad_norm": 2.182292938232422, + "learning_rate": 1e-06, + "loss": 0.942, + "mean_token_accuracy": 0.7060561180114746, + "num_tokens": 397041351.0, + "step": 15896 + }, + { + "epoch": 1.7457720184493741, + "grad_norm": 2.1271843910217285, + "learning_rate": 1e-06, + "loss": 0.7857, + "mean_token_accuracy": 0.7465459704399109, + "num_tokens": 397066470.0, + "step": 15897 + }, + { + "epoch": 1.7458818361519877, + "grad_norm": 2.2729604244232178, + "learning_rate": 1e-06, + "loss": 0.9544, + "mean_token_accuracy": 0.7010242938995361, + "num_tokens": 397090035.0, + "step": 15898 + }, + { + "epoch": 1.7459916538546012, + "grad_norm": 2.167715072631836, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7106269598007202, + "num_tokens": 
397118992.0, + "step": 15899 + }, + { + "epoch": 1.746101471557215, + "grad_norm": 2.126788377761841, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7015706300735474, + "num_tokens": 397151052.0, + "step": 15900 + }, + { + "epoch": 1.7462112892598287, + "grad_norm": 2.327214002609253, + "learning_rate": 1e-06, + "loss": 0.8835, + "mean_token_accuracy": 0.7328605651855469, + "num_tokens": 397173795.0, + "step": 15901 + }, + { + "epoch": 1.7463211069624425, + "grad_norm": 2.1659443378448486, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7106762528419495, + "num_tokens": 397200542.0, + "step": 15902 + }, + { + "epoch": 1.746430924665056, + "grad_norm": 2.051511526107788, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7090080380439758, + "num_tokens": 397230122.0, + "step": 15903 + }, + { + "epoch": 1.7465407423676695, + "grad_norm": 2.2463483810424805, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.721764326095581, + "num_tokens": 397256421.0, + "step": 15904 + }, + { + "epoch": 1.7466505600702833, + "grad_norm": 2.678077220916748, + "learning_rate": 1e-06, + "loss": 0.7537, + "mean_token_accuracy": 0.7574399709701538, + "num_tokens": 397273631.0, + "step": 15905 + }, + { + "epoch": 1.746760377772897, + "grad_norm": 2.271897077560425, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7096101641654968, + "num_tokens": 397300747.0, + "step": 15906 + }, + { + "epoch": 1.7468701954755108, + "grad_norm": 2.389833688735962, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7124649882316589, + "num_tokens": 397325636.0, + "step": 15907 + }, + { + "epoch": 1.7469800131781243, + "grad_norm": 2.3202898502349854, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7270432710647583, + "num_tokens": 397348612.0, + "step": 15908 + }, + { + "epoch": 1.7470898308807379, + "grad_norm": 2.656399965286255, + "learning_rate": 1e-06, 
+ "loss": 0.856, + "mean_token_accuracy": 0.7337120771408081, + "num_tokens": 397367005.0, + "step": 15909 + }, + { + "epoch": 1.7471996485833516, + "grad_norm": 2.6682281494140625, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7399791479110718, + "num_tokens": 397384923.0, + "step": 15910 + }, + { + "epoch": 1.7473094662859654, + "grad_norm": 1.9672356843948364, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7100580930709839, + "num_tokens": 397417425.0, + "step": 15911 + }, + { + "epoch": 1.747419283988579, + "grad_norm": 2.105729579925537, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7273427844047546, + "num_tokens": 397443128.0, + "step": 15912 + }, + { + "epoch": 1.7475291016911925, + "grad_norm": 2.2225496768951416, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7004457712173462, + "num_tokens": 397469225.0, + "step": 15913 + }, + { + "epoch": 1.7476389193938062, + "grad_norm": 2.2345759868621826, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7000815272331238, + "num_tokens": 397494921.0, + "step": 15914 + }, + { + "epoch": 1.74774873709642, + "grad_norm": 2.0000057220458984, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7004343867301941, + "num_tokens": 397528242.0, + "step": 15915 + }, + { + "epoch": 1.7478585547990337, + "grad_norm": 2.0369176864624023, + "learning_rate": 1e-06, + "loss": 0.9541, + "mean_token_accuracy": 0.7041701078414917, + "num_tokens": 397557151.0, + "step": 15916 + }, + { + "epoch": 1.7479683725016473, + "grad_norm": 2.195781946182251, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7251702547073364, + "num_tokens": 397581753.0, + "step": 15917 + }, + { + "epoch": 1.7480781902042608, + "grad_norm": 2.2879586219787598, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.722519040107727, + "num_tokens": 397607776.0, + "step": 15918 + }, + { + 
"epoch": 1.7481880079068746, + "grad_norm": 2.0225656032562256, + "learning_rate": 1e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7258769273757935, + "num_tokens": 397637881.0, + "step": 15919 + }, + { + "epoch": 1.7482978256094883, + "grad_norm": 2.027735471725464, + "learning_rate": 1e-06, + "loss": 0.8955, + "mean_token_accuracy": 0.7173670530319214, + "num_tokens": 397665936.0, + "step": 15920 + }, + { + "epoch": 1.7484076433121019, + "grad_norm": 2.378336191177368, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7340161204338074, + "num_tokens": 397688462.0, + "step": 15921 + }, + { + "epoch": 1.7485174610147156, + "grad_norm": 2.180368423461914, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7264689803123474, + "num_tokens": 397712610.0, + "step": 15922 + }, + { + "epoch": 1.7486272787173291, + "grad_norm": 2.578819513320923, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7124696969985962, + "num_tokens": 397733702.0, + "step": 15923 + }, + { + "epoch": 1.748737096419943, + "grad_norm": 2.576275110244751, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.7054343223571777, + "num_tokens": 397753904.0, + "step": 15924 + }, + { + "epoch": 1.7488469141225567, + "grad_norm": 2.226044178009033, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.697181224822998, + "num_tokens": 397782152.0, + "step": 15925 + }, + { + "epoch": 1.7489567318251702, + "grad_norm": 2.1968891620635986, + "learning_rate": 1e-06, + "loss": 0.9629, + "mean_token_accuracy": 0.7131451368331909, + "num_tokens": 397808058.0, + "step": 15926 + }, + { + "epoch": 1.7490665495277837, + "grad_norm": 2.513723850250244, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7243542075157166, + "num_tokens": 397830178.0, + "step": 15927 + }, + { + "epoch": 1.7491763672303975, + "grad_norm": 2.2901663780212402, + "learning_rate": 1e-06, + "loss": 0.9436, + 
"mean_token_accuracy": 0.7121683955192566, + "num_tokens": 397854107.0, + "step": 15928 + }, + { + "epoch": 1.7492861849330112, + "grad_norm": 2.2959001064300537, + "learning_rate": 1e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7229137420654297, + "num_tokens": 397879454.0, + "step": 15929 + }, + { + "epoch": 1.749396002635625, + "grad_norm": 2.3366968631744385, + "learning_rate": 1e-06, + "loss": 0.9357, + "mean_token_accuracy": 0.710313618183136, + "num_tokens": 397904021.0, + "step": 15930 + }, + { + "epoch": 1.7495058203382385, + "grad_norm": 2.299811601638794, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7177441120147705, + "num_tokens": 397929323.0, + "step": 15931 + }, + { + "epoch": 1.749615638040852, + "grad_norm": 2.1841461658477783, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.713752269744873, + "num_tokens": 397955697.0, + "step": 15932 + }, + { + "epoch": 1.7497254557434658, + "grad_norm": 2.1887476444244385, + "learning_rate": 1e-06, + "loss": 1.0225, + "mean_token_accuracy": 0.7002643942832947, + "num_tokens": 397983623.0, + "step": 15933 + }, + { + "epoch": 1.7498352734460796, + "grad_norm": 2.0369017124176025, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.7032169103622437, + "num_tokens": 398015185.0, + "step": 15934 + }, + { + "epoch": 1.7499450911486931, + "grad_norm": 2.1058292388916016, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7533869743347168, + "num_tokens": 398041637.0, + "step": 15935 + }, + { + "epoch": 1.7500549088513069, + "grad_norm": 2.134406089782715, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7145366668701172, + "num_tokens": 398070184.0, + "step": 15936 + }, + { + "epoch": 1.7501647265539204, + "grad_norm": 2.5112531185150146, + "learning_rate": 1e-06, + "loss": 0.922, + "mean_token_accuracy": 0.731030285358429, + "num_tokens": 398091982.0, + "step": 15937 + }, + { + "epoch": 
1.7502745442565342, + "grad_norm": 2.0354771614074707, + "learning_rate": 1e-06, + "loss": 0.9572, + "mean_token_accuracy": 0.7017271518707275, + "num_tokens": 398122176.0, + "step": 15938 + }, + { + "epoch": 1.750384361959148, + "grad_norm": 2.032400608062744, + "learning_rate": 1e-06, + "loss": 0.7934, + "mean_token_accuracy": 0.7480239272117615, + "num_tokens": 398149546.0, + "step": 15939 + }, + { + "epoch": 1.7504941796617615, + "grad_norm": 2.2724850177764893, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7294055819511414, + "num_tokens": 398174551.0, + "step": 15940 + }, + { + "epoch": 1.750603997364375, + "grad_norm": 2.205197811126709, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7248974442481995, + "num_tokens": 398200219.0, + "step": 15941 + }, + { + "epoch": 1.7507138150669888, + "grad_norm": 2.5125999450683594, + "learning_rate": 1e-06, + "loss": 0.7143, + "mean_token_accuracy": 0.7583415508270264, + "num_tokens": 398218876.0, + "step": 15942 + }, + { + "epoch": 1.7508236327696025, + "grad_norm": 2.352220058441162, + "learning_rate": 1e-06, + "loss": 0.9372, + "mean_token_accuracy": 0.7049844861030579, + "num_tokens": 398241870.0, + "step": 15943 + }, + { + "epoch": 1.7509334504722163, + "grad_norm": 2.1278374195098877, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7125483751296997, + "num_tokens": 398269068.0, + "step": 15944 + }, + { + "epoch": 1.7510432681748298, + "grad_norm": 1.9951521158218384, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.7061387896537781, + "num_tokens": 398298987.0, + "step": 15945 + }, + { + "epoch": 1.7511530858774433, + "grad_norm": 2.5038390159606934, + "learning_rate": 1e-06, + "loss": 0.9137, + "mean_token_accuracy": 0.7282351851463318, + "num_tokens": 398319881.0, + "step": 15946 + }, + { + "epoch": 1.751262903580057, + "grad_norm": 2.530733823776245, + "learning_rate": 1e-06, + "loss": 0.8431, + "mean_token_accuracy": 
0.7428625822067261, + "num_tokens": 398339730.0, + "step": 15947 + }, + { + "epoch": 1.7513727212826709, + "grad_norm": 2.0165421962738037, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.6994877457618713, + "num_tokens": 398368353.0, + "step": 15948 + }, + { + "epoch": 1.7514825389852844, + "grad_norm": 2.265021324157715, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7137938141822815, + "num_tokens": 398394144.0, + "step": 15949 + }, + { + "epoch": 1.7515923566878981, + "grad_norm": 2.304532766342163, + "learning_rate": 1e-06, + "loss": 0.9656, + "mean_token_accuracy": 0.7022235989570618, + "num_tokens": 398421197.0, + "step": 15950 + }, + { + "epoch": 1.7517021743905117, + "grad_norm": 2.433502435684204, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7150119543075562, + "num_tokens": 398444240.0, + "step": 15951 + }, + { + "epoch": 1.7518119920931254, + "grad_norm": 2.2378990650177, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7346850633621216, + "num_tokens": 398467980.0, + "step": 15952 + }, + { + "epoch": 1.7519218097957392, + "grad_norm": 2.2565159797668457, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.6993149518966675, + "num_tokens": 398494944.0, + "step": 15953 + }, + { + "epoch": 1.7520316274983527, + "grad_norm": 2.3895888328552246, + "learning_rate": 1e-06, + "loss": 0.8874, + "mean_token_accuracy": 0.7258000373840332, + "num_tokens": 398517446.0, + "step": 15954 + }, + { + "epoch": 1.7521414452009663, + "grad_norm": 2.436488389968872, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7177231311798096, + "num_tokens": 398539207.0, + "step": 15955 + }, + { + "epoch": 1.75225126290358, + "grad_norm": 2.3365793228149414, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7231038212776184, + "num_tokens": 398563225.0, + "step": 15956 + }, + { + "epoch": 1.7523610806061938, + "grad_norm": 
2.107372522354126, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7278417348861694, + "num_tokens": 398589858.0, + "step": 15957 + }, + { + "epoch": 1.7524708983088075, + "grad_norm": 2.1960291862487793, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.7296392321586609, + "num_tokens": 398614823.0, + "step": 15958 + }, + { + "epoch": 1.752580716011421, + "grad_norm": 2.036591053009033, + "learning_rate": 1e-06, + "loss": 1.024, + "mean_token_accuracy": 0.6877175569534302, + "num_tokens": 398647186.0, + "step": 15959 + }, + { + "epoch": 1.7526905337140346, + "grad_norm": 2.2151384353637695, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7268965840339661, + "num_tokens": 398673380.0, + "step": 15960 + }, + { + "epoch": 1.7528003514166484, + "grad_norm": 2.428128719329834, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7244913578033447, + "num_tokens": 398694422.0, + "step": 15961 + }, + { + "epoch": 1.7529101691192621, + "grad_norm": 2.380162477493286, + "learning_rate": 1e-06, + "loss": 0.9594, + "mean_token_accuracy": 0.6974234580993652, + "num_tokens": 398718245.0, + "step": 15962 + }, + { + "epoch": 1.7530199868218757, + "grad_norm": 1.926571011543274, + "learning_rate": 1e-06, + "loss": 1.0174, + "mean_token_accuracy": 0.6862139105796814, + "num_tokens": 398750515.0, + "step": 15963 + }, + { + "epoch": 1.7531298045244892, + "grad_norm": 1.9652968645095825, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.7000330686569214, + "num_tokens": 398784115.0, + "step": 15964 + }, + { + "epoch": 1.753239622227103, + "grad_norm": 2.506147861480713, + "learning_rate": 1e-06, + "loss": 0.9127, + "mean_token_accuracy": 0.716529369354248, + "num_tokens": 398805119.0, + "step": 15965 + }, + { + "epoch": 1.7533494399297167, + "grad_norm": 2.2430007457733154, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.718892514705658, + "num_tokens": 
398829849.0, + "step": 15966 + }, + { + "epoch": 1.7534592576323305, + "grad_norm": 2.0359179973602295, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7222362756729126, + "num_tokens": 398860385.0, + "step": 15967 + }, + { + "epoch": 1.753569075334944, + "grad_norm": 2.2628374099731445, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7207565307617188, + "num_tokens": 398886573.0, + "step": 15968 + }, + { + "epoch": 1.7536788930375575, + "grad_norm": 2.177835702896118, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7084996700286865, + "num_tokens": 398913664.0, + "step": 15969 + }, + { + "epoch": 1.7537887107401713, + "grad_norm": 2.2594683170318604, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7232808470726013, + "num_tokens": 398938484.0, + "step": 15970 + }, + { + "epoch": 1.753898528442785, + "grad_norm": 2.3753602504730225, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7257736921310425, + "num_tokens": 398961999.0, + "step": 15971 + }, + { + "epoch": 1.7540083461453988, + "grad_norm": 2.4313809871673584, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7204892635345459, + "num_tokens": 398984563.0, + "step": 15972 + }, + { + "epoch": 1.7541181638480123, + "grad_norm": 2.4048526287078857, + "learning_rate": 1e-06, + "loss": 0.9363, + "mean_token_accuracy": 0.7148160338401794, + "num_tokens": 399009097.0, + "step": 15973 + }, + { + "epoch": 1.7542279815506259, + "grad_norm": 2.550706386566162, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7127339839935303, + "num_tokens": 399031074.0, + "step": 15974 + }, + { + "epoch": 1.7543377992532396, + "grad_norm": 2.0259246826171875, + "learning_rate": 1e-06, + "loss": 0.8322, + "mean_token_accuracy": 0.7392014861106873, + "num_tokens": 399059616.0, + "step": 15975 + }, + { + "epoch": 1.7544476169558534, + "grad_norm": 2.407771587371826, + "learning_rate": 
1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7063533067703247, + "num_tokens": 399083831.0, + "step": 15976 + }, + { + "epoch": 1.754557434658467, + "grad_norm": 2.313176155090332, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7265865802764893, + "num_tokens": 399106593.0, + "step": 15977 + }, + { + "epoch": 1.7546672523610805, + "grad_norm": 1.9893701076507568, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7020639181137085, + "num_tokens": 399137845.0, + "step": 15978 + }, + { + "epoch": 1.7547770700636942, + "grad_norm": 2.1576404571533203, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7140008807182312, + "num_tokens": 399164711.0, + "step": 15979 + }, + { + "epoch": 1.754886887766308, + "grad_norm": 2.402599334716797, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7260541915893555, + "num_tokens": 399185839.0, + "step": 15980 + }, + { + "epoch": 1.7549967054689217, + "grad_norm": 2.5322303771972656, + "learning_rate": 1e-06, + "loss": 0.8303, + "mean_token_accuracy": 0.7409108877182007, + "num_tokens": 399206738.0, + "step": 15981 + }, + { + "epoch": 1.7551065231715353, + "grad_norm": 2.4734883308410645, + "learning_rate": 1e-06, + "loss": 0.853, + "mean_token_accuracy": 0.7273063659667969, + "num_tokens": 399230688.0, + "step": 15982 + }, + { + "epoch": 1.7552163408741488, + "grad_norm": 2.501868486404419, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7292734384536743, + "num_tokens": 399251494.0, + "step": 15983 + }, + { + "epoch": 1.7553261585767626, + "grad_norm": 2.0015227794647217, + "learning_rate": 1e-06, + "loss": 0.8423, + "mean_token_accuracy": 0.7334369421005249, + "num_tokens": 399280558.0, + "step": 15984 + }, + { + "epoch": 1.7554359762793763, + "grad_norm": 2.068408966064453, + "learning_rate": 1e-06, + "loss": 0.9919, + "mean_token_accuracy": 0.696757972240448, + "num_tokens": 399312662.0, + "step": 15985 + }, + { + 
"epoch": 1.7555457939819898, + "grad_norm": 2.779700517654419, + "learning_rate": 1e-06, + "loss": 0.8489, + "mean_token_accuracy": 0.7292402982711792, + "num_tokens": 399331521.0, + "step": 15986 + }, + { + "epoch": 1.7556556116846036, + "grad_norm": 1.878043293952942, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7045401930809021, + "num_tokens": 399363696.0, + "step": 15987 + }, + { + "epoch": 1.7557654293872171, + "grad_norm": 1.9774593114852905, + "learning_rate": 1e-06, + "loss": 0.9278, + "mean_token_accuracy": 0.7064228057861328, + "num_tokens": 399396029.0, + "step": 15988 + }, + { + "epoch": 1.755875247089831, + "grad_norm": 2.2275893688201904, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7242758870124817, + "num_tokens": 399421755.0, + "step": 15989 + }, + { + "epoch": 1.7559850647924446, + "grad_norm": 2.253941535949707, + "learning_rate": 1e-06, + "loss": 1.0134, + "mean_token_accuracy": 0.689608097076416, + "num_tokens": 399449817.0, + "step": 15990 + }, + { + "epoch": 1.7560948824950582, + "grad_norm": 2.6373836994171143, + "learning_rate": 1e-06, + "loss": 0.8982, + "mean_token_accuracy": 0.7257446050643921, + "num_tokens": 399470121.0, + "step": 15991 + }, + { + "epoch": 1.7562047001976717, + "grad_norm": 2.205235242843628, + "learning_rate": 1e-06, + "loss": 0.9128, + "mean_token_accuracy": 0.7116581201553345, + "num_tokens": 399496950.0, + "step": 15992 + }, + { + "epoch": 1.7563145179002855, + "grad_norm": 2.388963460922241, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7297418713569641, + "num_tokens": 399518770.0, + "step": 15993 + }, + { + "epoch": 1.7564243356028992, + "grad_norm": 2.405724287033081, + "learning_rate": 1e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7354284524917603, + "num_tokens": 399541828.0, + "step": 15994 + }, + { + "epoch": 1.756534153305513, + "grad_norm": 2.3258113861083984, + "learning_rate": 1e-06, + "loss": 0.8026, + 
"mean_token_accuracy": 0.7412492036819458, + "num_tokens": 399565791.0, + "step": 15995 + }, + { + "epoch": 1.7566439710081265, + "grad_norm": 2.1809263229370117, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.711833119392395, + "num_tokens": 399590242.0, + "step": 15996 + }, + { + "epoch": 1.75675378871074, + "grad_norm": 2.462984085083008, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7282552719116211, + "num_tokens": 399611578.0, + "step": 15997 + }, + { + "epoch": 1.7568636064133538, + "grad_norm": 2.36434268951416, + "learning_rate": 1e-06, + "loss": 0.8387, + "mean_token_accuracy": 0.7381361722946167, + "num_tokens": 399633950.0, + "step": 15998 + }, + { + "epoch": 1.7569734241159676, + "grad_norm": 2.5833771228790283, + "learning_rate": 1e-06, + "loss": 0.7699, + "mean_token_accuracy": 0.7449509501457214, + "num_tokens": 399651988.0, + "step": 15999 + }, + { + "epoch": 1.757083241818581, + "grad_norm": 2.4284746646881104, + "learning_rate": 1e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7362931966781616, + "num_tokens": 399674304.0, + "step": 16000 + }, + { + "epoch": 1.7571930595211949, + "grad_norm": 2.0602190494537354, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6885766983032227, + "num_tokens": 399705238.0, + "step": 16001 + }, + { + "epoch": 1.7573028772238084, + "grad_norm": 2.4707930088043213, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7221267223358154, + "num_tokens": 399727213.0, + "step": 16002 + }, + { + "epoch": 1.7574126949264222, + "grad_norm": 2.290339708328247, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7219030857086182, + "num_tokens": 399753326.0, + "step": 16003 + }, + { + "epoch": 1.757522512629036, + "grad_norm": 2.1083438396453857, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7084184885025024, + "num_tokens": 399781445.0, + "step": 16004 + }, + { + "epoch": 1.7576323303316495, 
+ "grad_norm": 2.1386702060699463, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.7203178405761719, + "num_tokens": 399809713.0, + "step": 16005 + }, + { + "epoch": 1.757742148034263, + "grad_norm": 2.2464277744293213, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7225486040115356, + "num_tokens": 399836013.0, + "step": 16006 + }, + { + "epoch": 1.7578519657368767, + "grad_norm": 2.4999735355377197, + "learning_rate": 1e-06, + "loss": 0.8728, + "mean_token_accuracy": 0.7237340211868286, + "num_tokens": 399858916.0, + "step": 16007 + }, + { + "epoch": 1.7579617834394905, + "grad_norm": 2.3567214012145996, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7121206521987915, + "num_tokens": 399882637.0, + "step": 16008 + }, + { + "epoch": 1.7580716011421043, + "grad_norm": 2.3287689685821533, + "learning_rate": 1e-06, + "loss": 0.9496, + "mean_token_accuracy": 0.7028039693832397, + "num_tokens": 399907596.0, + "step": 16009 + }, + { + "epoch": 1.7581814188447178, + "grad_norm": 2.5672879219055176, + "learning_rate": 1e-06, + "loss": 0.9285, + "mean_token_accuracy": 0.7082090377807617, + "num_tokens": 399928669.0, + "step": 16010 + }, + { + "epoch": 1.7582912365473313, + "grad_norm": 2.558187961578369, + "learning_rate": 1e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7293217182159424, + "num_tokens": 399948739.0, + "step": 16011 + }, + { + "epoch": 1.758401054249945, + "grad_norm": 2.1076831817626953, + "learning_rate": 1e-06, + "loss": 1.0094, + "mean_token_accuracy": 0.6954359412193298, + "num_tokens": 399978892.0, + "step": 16012 + }, + { + "epoch": 1.7585108719525588, + "grad_norm": 2.4111385345458984, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7297264337539673, + "num_tokens": 400001022.0, + "step": 16013 + }, + { + "epoch": 1.7586206896551724, + "grad_norm": 2.094052791595459, + "learning_rate": 1e-06, + "loss": 0.811, + "mean_token_accuracy": 0.7541297078132629, 
+ "num_tokens": 400027938.0, + "step": 16014 + }, + { + "epoch": 1.758730507357786, + "grad_norm": 2.048736095428467, + "learning_rate": 1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7087987661361694, + "num_tokens": 400056968.0, + "step": 16015 + }, + { + "epoch": 1.7588403250603997, + "grad_norm": 2.159907341003418, + "learning_rate": 1e-06, + "loss": 0.9625, + "mean_token_accuracy": 0.7068028450012207, + "num_tokens": 400084240.0, + "step": 16016 + }, + { + "epoch": 1.7589501427630134, + "grad_norm": 2.0643842220306396, + "learning_rate": 1e-06, + "loss": 0.9721, + "mean_token_accuracy": 0.7137041687965393, + "num_tokens": 400115188.0, + "step": 16017 + }, + { + "epoch": 1.7590599604656272, + "grad_norm": 2.148664951324463, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.7000439167022705, + "num_tokens": 400143516.0, + "step": 16018 + }, + { + "epoch": 1.7591697781682407, + "grad_norm": 2.6146228313446045, + "learning_rate": 1e-06, + "loss": 0.828, + "mean_token_accuracy": 0.7380788326263428, + "num_tokens": 400162840.0, + "step": 16019 + }, + { + "epoch": 1.7592795958708543, + "grad_norm": 2.5460729598999023, + "learning_rate": 1e-06, + "loss": 0.865, + "mean_token_accuracy": 0.7189639806747437, + "num_tokens": 400183599.0, + "step": 16020 + }, + { + "epoch": 1.759389413573468, + "grad_norm": 2.526746988296509, + "learning_rate": 1e-06, + "loss": 0.8174, + "mean_token_accuracy": 0.7380257844924927, + "num_tokens": 400201887.0, + "step": 16021 + }, + { + "epoch": 1.7594992312760818, + "grad_norm": 2.2446930408477783, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7116962671279907, + "num_tokens": 400225659.0, + "step": 16022 + }, + { + "epoch": 1.7596090489786955, + "grad_norm": 2.067586660385132, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7134050130844116, + "num_tokens": 400255810.0, + "step": 16023 + }, + { + "epoch": 1.759718866681309, + "grad_norm": 2.1698906421661377, + 
"learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7425145506858826, + "num_tokens": 400281190.0, + "step": 16024 + }, + { + "epoch": 1.7598286843839226, + "grad_norm": 2.1800479888916016, + "learning_rate": 1e-06, + "loss": 0.9705, + "mean_token_accuracy": 0.7031772136688232, + "num_tokens": 400309002.0, + "step": 16025 + }, + { + "epoch": 1.7599385020865363, + "grad_norm": 2.20977783203125, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7113775610923767, + "num_tokens": 400335263.0, + "step": 16026 + }, + { + "epoch": 1.76004831978915, + "grad_norm": 2.6740238666534424, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.718996524810791, + "num_tokens": 400353627.0, + "step": 16027 + }, + { + "epoch": 1.7601581374917636, + "grad_norm": 1.8707430362701416, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.6970561742782593, + "num_tokens": 400388441.0, + "step": 16028 + }, + { + "epoch": 1.7602679551943772, + "grad_norm": 2.100717306137085, + "learning_rate": 1e-06, + "loss": 0.9252, + "mean_token_accuracy": 0.715484619140625, + "num_tokens": 400418826.0, + "step": 16029 + }, + { + "epoch": 1.760377772896991, + "grad_norm": 2.2488789558410645, + "learning_rate": 1e-06, + "loss": 0.8836, + "mean_token_accuracy": 0.7262026071548462, + "num_tokens": 400444102.0, + "step": 16030 + }, + { + "epoch": 1.7604875905996047, + "grad_norm": 2.158766031265259, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7384536266326904, + "num_tokens": 400470193.0, + "step": 16031 + }, + { + "epoch": 1.7605974083022184, + "grad_norm": 2.2868504524230957, + "learning_rate": 1e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7311147451400757, + "num_tokens": 400494653.0, + "step": 16032 + }, + { + "epoch": 1.760707226004832, + "grad_norm": 2.220940351486206, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7239719033241272, + "num_tokens": 400520320.0, + "step": 
16033 + }, + { + "epoch": 1.7608170437074455, + "grad_norm": 2.146941661834717, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.714307963848114, + "num_tokens": 400549064.0, + "step": 16034 + }, + { + "epoch": 1.7609268614100593, + "grad_norm": 2.0888805389404297, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7382361888885498, + "num_tokens": 400577094.0, + "step": 16035 + }, + { + "epoch": 1.761036679112673, + "grad_norm": 2.259390115737915, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7216737866401672, + "num_tokens": 400603060.0, + "step": 16036 + }, + { + "epoch": 1.7611464968152868, + "grad_norm": 2.143266201019287, + "learning_rate": 1e-06, + "loss": 0.9948, + "mean_token_accuracy": 0.698932945728302, + "num_tokens": 400632150.0, + "step": 16037 + }, + { + "epoch": 1.7612563145179003, + "grad_norm": 2.236539125442505, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7217625379562378, + "num_tokens": 400658450.0, + "step": 16038 + }, + { + "epoch": 1.7613661322205139, + "grad_norm": 2.137615442276001, + "learning_rate": 1e-06, + "loss": 0.948, + "mean_token_accuracy": 0.7088721990585327, + "num_tokens": 400685507.0, + "step": 16039 + }, + { + "epoch": 1.7614759499231276, + "grad_norm": 2.4431726932525635, + "learning_rate": 1e-06, + "loss": 0.9939, + "mean_token_accuracy": 0.6925134658813477, + "num_tokens": 400706805.0, + "step": 16040 + }, + { + "epoch": 1.7615857676257414, + "grad_norm": 2.11200213432312, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7229625582695007, + "num_tokens": 400733706.0, + "step": 16041 + }, + { + "epoch": 1.761695585328355, + "grad_norm": 2.2515103816986084, + "learning_rate": 1e-06, + "loss": 0.8556, + "mean_token_accuracy": 0.7329849004745483, + "num_tokens": 400757396.0, + "step": 16042 + }, + { + "epoch": 1.7618054030309684, + "grad_norm": 2.275920867919922, + "learning_rate": 1e-06, + "loss": 0.8957, + 
"mean_token_accuracy": 0.7155754566192627, + "num_tokens": 400783814.0, + "step": 16043 + }, + { + "epoch": 1.7619152207335822, + "grad_norm": 2.110250473022461, + "learning_rate": 1e-06, + "loss": 0.8712, + "mean_token_accuracy": 0.7331516146659851, + "num_tokens": 400811832.0, + "step": 16044 + }, + { + "epoch": 1.762025038436196, + "grad_norm": 2.2270169258117676, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 0.7056642770767212, + "num_tokens": 400838288.0, + "step": 16045 + }, + { + "epoch": 1.7621348561388097, + "grad_norm": 2.4963769912719727, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.739372730255127, + "num_tokens": 400859775.0, + "step": 16046 + }, + { + "epoch": 1.7622446738414232, + "grad_norm": 2.296800374984741, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7177002429962158, + "num_tokens": 400884569.0, + "step": 16047 + }, + { + "epoch": 1.7623544915440368, + "grad_norm": 2.371511220932007, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7019400596618652, + "num_tokens": 400905885.0, + "step": 16048 + }, + { + "epoch": 1.7624643092466505, + "grad_norm": 2.6722261905670166, + "learning_rate": 1e-06, + "loss": 0.8484, + "mean_token_accuracy": 0.7259034514427185, + "num_tokens": 400925782.0, + "step": 16049 + }, + { + "epoch": 1.7625741269492643, + "grad_norm": 2.93888783454895, + "learning_rate": 1e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.7409638166427612, + "num_tokens": 400941818.0, + "step": 16050 + }, + { + "epoch": 1.7626839446518778, + "grad_norm": 2.391592502593994, + "learning_rate": 1e-06, + "loss": 0.9061, + "mean_token_accuracy": 0.7219272255897522, + "num_tokens": 400963978.0, + "step": 16051 + }, + { + "epoch": 1.7627937623544916, + "grad_norm": 2.245089054107666, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7245069742202759, + "num_tokens": 400987531.0, + "step": 16052 + }, + { + "epoch": 1.7629035800571051, 
+ "grad_norm": 1.9081523418426514, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.745749831199646, + "num_tokens": 401016519.0, + "step": 16053 + }, + { + "epoch": 1.7630133977597189, + "grad_norm": 2.483652114868164, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.7024667859077454, + "num_tokens": 401038705.0, + "step": 16054 + }, + { + "epoch": 1.7631232154623326, + "grad_norm": 2.3107028007507324, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7192193269729614, + "num_tokens": 401061869.0, + "step": 16055 + }, + { + "epoch": 1.7632330331649462, + "grad_norm": 2.1697139739990234, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7065841555595398, + "num_tokens": 401089348.0, + "step": 16056 + }, + { + "epoch": 1.7633428508675597, + "grad_norm": 2.21528959274292, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7290976047515869, + "num_tokens": 401114880.0, + "step": 16057 + }, + { + "epoch": 1.7634526685701735, + "grad_norm": 2.0617380142211914, + "learning_rate": 1e-06, + "loss": 0.8276, + "mean_token_accuracy": 0.738945484161377, + "num_tokens": 401144128.0, + "step": 16058 + }, + { + "epoch": 1.7635624862727872, + "grad_norm": 2.123368740081787, + "learning_rate": 1e-06, + "loss": 0.9072, + "mean_token_accuracy": 0.717220664024353, + "num_tokens": 401170995.0, + "step": 16059 + }, + { + "epoch": 1.763672303975401, + "grad_norm": 2.226886749267578, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7288568019866943, + "num_tokens": 401196516.0, + "step": 16060 + }, + { + "epoch": 1.7637821216780145, + "grad_norm": 2.3652350902557373, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7248393297195435, + "num_tokens": 401220177.0, + "step": 16061 + }, + { + "epoch": 1.763891939380628, + "grad_norm": 2.362715005874634, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7104987502098083, + 
"num_tokens": 401244179.0, + "step": 16062 + }, + { + "epoch": 1.7640017570832418, + "grad_norm": 2.378197431564331, + "learning_rate": 1e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7362506985664368, + "num_tokens": 401266497.0, + "step": 16063 + }, + { + "epoch": 1.7641115747858556, + "grad_norm": 2.433323383331299, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7156574130058289, + "num_tokens": 401287405.0, + "step": 16064 + }, + { + "epoch": 1.764221392488469, + "grad_norm": 2.3161261081695557, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7293824553489685, + "num_tokens": 401310929.0, + "step": 16065 + }, + { + "epoch": 1.7643312101910829, + "grad_norm": 2.3740808963775635, + "learning_rate": 1e-06, + "loss": 0.8853, + "mean_token_accuracy": 0.7183904647827148, + "num_tokens": 401333696.0, + "step": 16066 + }, + { + "epoch": 1.7644410278936964, + "grad_norm": 2.440619945526123, + "learning_rate": 1e-06, + "loss": 0.846, + "mean_token_accuracy": 0.7334874868392944, + "num_tokens": 401356192.0, + "step": 16067 + }, + { + "epoch": 1.7645508455963101, + "grad_norm": 2.3990297317504883, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7211061716079712, + "num_tokens": 401379718.0, + "step": 16068 + }, + { + "epoch": 1.764660663298924, + "grad_norm": 2.0121941566467285, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.6970400810241699, + "num_tokens": 401410778.0, + "step": 16069 + }, + { + "epoch": 1.7647704810015374, + "grad_norm": 2.5375771522521973, + "learning_rate": 1e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7365182638168335, + "num_tokens": 401431002.0, + "step": 16070 + }, + { + "epoch": 1.764880298704151, + "grad_norm": 2.3886585235595703, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7154910564422607, + "num_tokens": 401454075.0, + "step": 16071 + }, + { + "epoch": 1.7649901164067647, + "grad_norm": 2.2675578594207764, + 
"learning_rate": 1e-06, + "loss": 0.8799, + "mean_token_accuracy": 0.7226426005363464, + "num_tokens": 401482200.0, + "step": 16072 + }, + { + "epoch": 1.7650999341093785, + "grad_norm": 2.79974365234375, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7171435952186584, + "num_tokens": 401498818.0, + "step": 16073 + }, + { + "epoch": 1.7652097518119922, + "grad_norm": 2.405841827392578, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7218695878982544, + "num_tokens": 401522435.0, + "step": 16074 + }, + { + "epoch": 1.7653195695146058, + "grad_norm": 2.1485702991485596, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7049965262413025, + "num_tokens": 401550461.0, + "step": 16075 + }, + { + "epoch": 1.7654293872172193, + "grad_norm": 2.564929723739624, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7379179000854492, + "num_tokens": 401571129.0, + "step": 16076 + }, + { + "epoch": 1.765539204919833, + "grad_norm": 2.1464383602142334, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7343404293060303, + "num_tokens": 401598090.0, + "step": 16077 + }, + { + "epoch": 1.7656490226224468, + "grad_norm": 2.1754727363586426, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7355303168296814, + "num_tokens": 401625224.0, + "step": 16078 + }, + { + "epoch": 1.7657588403250604, + "grad_norm": 2.3375155925750732, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7401551008224487, + "num_tokens": 401647228.0, + "step": 16079 + }, + { + "epoch": 1.765868658027674, + "grad_norm": 2.417222499847412, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.7252703905105591, + "num_tokens": 401670024.0, + "step": 16080 + }, + { + "epoch": 1.7659784757302877, + "grad_norm": 2.3162872791290283, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7066275477409363, + "num_tokens": 401695268.0, + "step": 
16081 + }, + { + "epoch": 1.7660882934329014, + "grad_norm": 2.9529709815979004, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7166460752487183, + "num_tokens": 401710832.0, + "step": 16082 + }, + { + "epoch": 1.7661981111355152, + "grad_norm": 2.1613705158233643, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7057884931564331, + "num_tokens": 401736878.0, + "step": 16083 + }, + { + "epoch": 1.7663079288381287, + "grad_norm": 2.4028351306915283, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7333310842514038, + "num_tokens": 401758118.0, + "step": 16084 + }, + { + "epoch": 1.7664177465407422, + "grad_norm": 2.3678719997406006, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7290722131729126, + "num_tokens": 401780190.0, + "step": 16085 + }, + { + "epoch": 1.766527564243356, + "grad_norm": 2.07438063621521, + "learning_rate": 1e-06, + "loss": 0.9592, + "mean_token_accuracy": 0.6993187665939331, + "num_tokens": 401810955.0, + "step": 16086 + }, + { + "epoch": 1.7666373819459698, + "grad_norm": 2.330735206604004, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7066342234611511, + "num_tokens": 401835157.0, + "step": 16087 + }, + { + "epoch": 1.7667471996485835, + "grad_norm": 2.025358200073242, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6891566514968872, + "num_tokens": 401866528.0, + "step": 16088 + }, + { + "epoch": 1.766857017351197, + "grad_norm": 2.1760048866271973, + "learning_rate": 1e-06, + "loss": 0.8873, + "mean_token_accuracy": 0.7167102098464966, + "num_tokens": 401891773.0, + "step": 16089 + }, + { + "epoch": 1.7669668350538106, + "grad_norm": 2.185209274291992, + "learning_rate": 1e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7285831570625305, + "num_tokens": 401917425.0, + "step": 16090 + }, + { + "epoch": 1.7670766527564243, + "grad_norm": 2.4185900688171387, + "learning_rate": 1e-06, + "loss": 0.9689, 
+ "mean_token_accuracy": 0.7074519395828247, + "num_tokens": 401939796.0, + "step": 16091 + }, + { + "epoch": 1.767186470459038, + "grad_norm": 2.120875358581543, + "learning_rate": 1e-06, + "loss": 0.9483, + "mean_token_accuracy": 0.7127749919891357, + "num_tokens": 401966810.0, + "step": 16092 + }, + { + "epoch": 1.7672962881616516, + "grad_norm": 2.1438276767730713, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7091766595840454, + "num_tokens": 401994202.0, + "step": 16093 + }, + { + "epoch": 1.7674061058642652, + "grad_norm": 2.184037446975708, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.727229654788971, + "num_tokens": 402020191.0, + "step": 16094 + }, + { + "epoch": 1.767515923566879, + "grad_norm": 2.3201606273651123, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7191574573516846, + "num_tokens": 402044239.0, + "step": 16095 + }, + { + "epoch": 1.7676257412694927, + "grad_norm": 2.2645349502563477, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7125096917152405, + "num_tokens": 402068579.0, + "step": 16096 + }, + { + "epoch": 1.7677355589721064, + "grad_norm": 2.051027297973633, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7161363363265991, + "num_tokens": 402097196.0, + "step": 16097 + }, + { + "epoch": 1.76784537667472, + "grad_norm": 2.200862407684326, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7338505983352661, + "num_tokens": 402121603.0, + "step": 16098 + }, + { + "epoch": 1.7679551943773335, + "grad_norm": 2.2595386505126953, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7097705006599426, + "num_tokens": 402146274.0, + "step": 16099 + }, + { + "epoch": 1.7680650120799473, + "grad_norm": 2.246619701385498, + "learning_rate": 1e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7336275577545166, + "num_tokens": 402172292.0, + "step": 16100 + }, + { + "epoch": 1.768174829782561, 
+ "grad_norm": 2.47967267036438, + "learning_rate": 1e-06, + "loss": 0.9189, + "mean_token_accuracy": 0.7192922234535217, + "num_tokens": 402193292.0, + "step": 16101 + }, + { + "epoch": 1.7682846474851748, + "grad_norm": 2.1774277687072754, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7249246835708618, + "num_tokens": 402219637.0, + "step": 16102 + }, + { + "epoch": 1.7683944651877883, + "grad_norm": 2.065962314605713, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7052827477455139, + "num_tokens": 402249382.0, + "step": 16103 + }, + { + "epoch": 1.7685042828904018, + "grad_norm": 2.311570644378662, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7254713177680969, + "num_tokens": 402272983.0, + "step": 16104 + }, + { + "epoch": 1.7686141005930156, + "grad_norm": 2.326477527618408, + "learning_rate": 1e-06, + "loss": 0.8741, + "mean_token_accuracy": 0.7263548970222473, + "num_tokens": 402296868.0, + "step": 16105 + }, + { + "epoch": 1.7687239182956294, + "grad_norm": 2.2635207176208496, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7239256501197815, + "num_tokens": 402322318.0, + "step": 16106 + }, + { + "epoch": 1.768833735998243, + "grad_norm": 2.3647170066833496, + "learning_rate": 1e-06, + "loss": 0.9972, + "mean_token_accuracy": 0.695905327796936, + "num_tokens": 402347131.0, + "step": 16107 + }, + { + "epoch": 1.7689435537008564, + "grad_norm": 2.2451915740966797, + "learning_rate": 1e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7270306944847107, + "num_tokens": 402372610.0, + "step": 16108 + }, + { + "epoch": 1.7690533714034702, + "grad_norm": 2.701258659362793, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7309840321540833, + "num_tokens": 402391600.0, + "step": 16109 + }, + { + "epoch": 1.769163189106084, + "grad_norm": 2.1966190338134766, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7003262639045715, + 
"num_tokens": 402418350.0, + "step": 16110 + }, + { + "epoch": 1.7692730068086977, + "grad_norm": 2.215829372406006, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7153220176696777, + "num_tokens": 402444667.0, + "step": 16111 + }, + { + "epoch": 1.7693828245113112, + "grad_norm": 2.0730373859405518, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7235056757926941, + "num_tokens": 402471397.0, + "step": 16112 + }, + { + "epoch": 1.7694926422139248, + "grad_norm": 2.4865951538085938, + "learning_rate": 1e-06, + "loss": 0.9447, + "mean_token_accuracy": 0.7069169282913208, + "num_tokens": 402492815.0, + "step": 16113 + }, + { + "epoch": 1.7696024599165385, + "grad_norm": 2.5538082122802734, + "learning_rate": 1e-06, + "loss": 0.944, + "mean_token_accuracy": 0.7086218595504761, + "num_tokens": 402512812.0, + "step": 16114 + }, + { + "epoch": 1.7697122776191523, + "grad_norm": 2.296722888946533, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7073590159416199, + "num_tokens": 402538905.0, + "step": 16115 + }, + { + "epoch": 1.7698220953217658, + "grad_norm": 2.6393654346466064, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7284356355667114, + "num_tokens": 402559151.0, + "step": 16116 + }, + { + "epoch": 1.7699319130243796, + "grad_norm": 2.4099950790405273, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7301024198532104, + "num_tokens": 402581095.0, + "step": 16117 + }, + { + "epoch": 1.7700417307269931, + "grad_norm": 2.3255763053894043, + "learning_rate": 1e-06, + "loss": 0.9326, + "mean_token_accuracy": 0.7142519950866699, + "num_tokens": 402604294.0, + "step": 16118 + }, + { + "epoch": 1.7701515484296069, + "grad_norm": 2.189091444015503, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7161998748779297, + "num_tokens": 402631376.0, + "step": 16119 + }, + { + "epoch": 1.7702613661322206, + "grad_norm": 2.3374972343444824, + 
"learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7173193693161011, + "num_tokens": 402655865.0, + "step": 16120 + }, + { + "epoch": 1.7703711838348342, + "grad_norm": 2.398319959640503, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7234832048416138, + "num_tokens": 402678502.0, + "step": 16121 + }, + { + "epoch": 1.7704810015374477, + "grad_norm": 2.2929961681365967, + "learning_rate": 1e-06, + "loss": 0.8797, + "mean_token_accuracy": 0.7413365840911865, + "num_tokens": 402703222.0, + "step": 16122 + }, + { + "epoch": 1.7705908192400615, + "grad_norm": 2.6983020305633545, + "learning_rate": 1e-06, + "loss": 0.8037, + "mean_token_accuracy": 0.7454226016998291, + "num_tokens": 402721789.0, + "step": 16123 + }, + { + "epoch": 1.7707006369426752, + "grad_norm": 2.5055441856384277, + "learning_rate": 1e-06, + "loss": 0.8537, + "mean_token_accuracy": 0.7320540547370911, + "num_tokens": 402742402.0, + "step": 16124 + }, + { + "epoch": 1.770810454645289, + "grad_norm": 2.1566779613494873, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7199087142944336, + "num_tokens": 402771075.0, + "step": 16125 + }, + { + "epoch": 1.7709202723479025, + "grad_norm": 1.907917857170105, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7173364758491516, + "num_tokens": 402805558.0, + "step": 16126 + }, + { + "epoch": 1.771030090050516, + "grad_norm": 2.058246374130249, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7157207727432251, + "num_tokens": 402834699.0, + "step": 16127 + }, + { + "epoch": 1.7711399077531298, + "grad_norm": 2.0407941341400146, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6950064897537231, + "num_tokens": 402863961.0, + "step": 16128 + }, + { + "epoch": 1.7712497254557436, + "grad_norm": 2.308229923248291, + "learning_rate": 1e-06, + "loss": 0.8181, + "mean_token_accuracy": 0.7382908463478088, + "num_tokens": 402887624.0, + 
"step": 16129 + }, + { + "epoch": 1.771359543158357, + "grad_norm": 2.069490671157837, + "learning_rate": 1e-06, + "loss": 0.9016, + "mean_token_accuracy": 0.7171536087989807, + "num_tokens": 402917463.0, + "step": 16130 + }, + { + "epoch": 1.7714693608609708, + "grad_norm": 2.291407823562622, + "learning_rate": 1e-06, + "loss": 0.8669, + "mean_token_accuracy": 0.7302127480506897, + "num_tokens": 402941829.0, + "step": 16131 + }, + { + "epoch": 1.7715791785635844, + "grad_norm": 2.2291171550750732, + "learning_rate": 1e-06, + "loss": 0.9267, + "mean_token_accuracy": 0.7124307155609131, + "num_tokens": 402968102.0, + "step": 16132 + }, + { + "epoch": 1.7716889962661981, + "grad_norm": 2.4802136421203613, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7385101318359375, + "num_tokens": 402987816.0, + "step": 16133 + }, + { + "epoch": 1.771798813968812, + "grad_norm": 2.2361233234405518, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7218741178512573, + "num_tokens": 403014359.0, + "step": 16134 + }, + { + "epoch": 1.7719086316714254, + "grad_norm": 2.3096182346343994, + "learning_rate": 1e-06, + "loss": 0.8237, + "mean_token_accuracy": 0.7387140989303589, + "num_tokens": 403039292.0, + "step": 16135 + }, + { + "epoch": 1.772018449374039, + "grad_norm": 2.396596670150757, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7173734903335571, + "num_tokens": 403061586.0, + "step": 16136 + }, + { + "epoch": 1.7721282670766527, + "grad_norm": 2.5886356830596924, + "learning_rate": 1e-06, + "loss": 0.8576, + "mean_token_accuracy": 0.7299282550811768, + "num_tokens": 403082147.0, + "step": 16137 + }, + { + "epoch": 1.7722380847792665, + "grad_norm": 2.2206921577453613, + "learning_rate": 1e-06, + "loss": 0.9262, + "mean_token_accuracy": 0.7143505811691284, + "num_tokens": 403108824.0, + "step": 16138 + }, + { + "epoch": 1.7723479024818802, + "grad_norm": 2.361997127532959, + "learning_rate": 1e-06, + "loss": 
0.8072, + "mean_token_accuracy": 0.7414601445198059, + "num_tokens": 403131584.0, + "step": 16139 + }, + { + "epoch": 1.7724577201844938, + "grad_norm": 2.378087282180786, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.7281827926635742, + "num_tokens": 403153943.0, + "step": 16140 + }, + { + "epoch": 1.7725675378871073, + "grad_norm": 2.5462806224823, + "learning_rate": 1e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7399427890777588, + "num_tokens": 403173304.0, + "step": 16141 + }, + { + "epoch": 1.772677355589721, + "grad_norm": 2.7466468811035156, + "learning_rate": 1e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7292674779891968, + "num_tokens": 403189989.0, + "step": 16142 + }, + { + "epoch": 1.7727871732923348, + "grad_norm": 2.0770232677459717, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7055366039276123, + "num_tokens": 403217296.0, + "step": 16143 + }, + { + "epoch": 1.7728969909949484, + "grad_norm": 2.359938859939575, + "learning_rate": 1e-06, + "loss": 0.9036, + "mean_token_accuracy": 0.7129936814308167, + "num_tokens": 403240286.0, + "step": 16144 + }, + { + "epoch": 1.7730068086975619, + "grad_norm": 2.1107146739959717, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.730613112449646, + "num_tokens": 403267992.0, + "step": 16145 + }, + { + "epoch": 1.7731166264001756, + "grad_norm": 2.4951772689819336, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7362803220748901, + "num_tokens": 403288719.0, + "step": 16146 + }, + { + "epoch": 1.7732264441027894, + "grad_norm": 2.2930004596710205, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.7205671072006226, + "num_tokens": 403314125.0, + "step": 16147 + }, + { + "epoch": 1.7733362618054032, + "grad_norm": 2.0129902362823486, + "learning_rate": 1e-06, + "loss": 1.0039, + "mean_token_accuracy": 0.6985329389572144, + "num_tokens": 403345559.0, + "step": 16148 + }, + { + "epoch": 
1.7734460795080167, + "grad_norm": 1.9994544982910156, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7104518413543701, + "num_tokens": 403377473.0, + "step": 16149 + }, + { + "epoch": 1.7735558972106302, + "grad_norm": 2.607846975326538, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7350556254386902, + "num_tokens": 403397854.0, + "step": 16150 + }, + { + "epoch": 1.773665714913244, + "grad_norm": 2.0623881816864014, + "learning_rate": 1e-06, + "loss": 0.8507, + "mean_token_accuracy": 0.7343922853469849, + "num_tokens": 403426769.0, + "step": 16151 + }, + { + "epoch": 1.7737755326158577, + "grad_norm": 2.0345356464385986, + "learning_rate": 1e-06, + "loss": 0.8428, + "mean_token_accuracy": 0.7346099615097046, + "num_tokens": 403455054.0, + "step": 16152 + }, + { + "epoch": 1.7738853503184715, + "grad_norm": 2.42091965675354, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7120522260665894, + "num_tokens": 403479075.0, + "step": 16153 + }, + { + "epoch": 1.773995168021085, + "grad_norm": 2.3554325103759766, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7208739519119263, + "num_tokens": 403500408.0, + "step": 16154 + }, + { + "epoch": 1.7741049857236986, + "grad_norm": 2.034722328186035, + "learning_rate": 1e-06, + "loss": 1.0104, + "mean_token_accuracy": 0.688788890838623, + "num_tokens": 403533462.0, + "step": 16155 + }, + { + "epoch": 1.7742148034263123, + "grad_norm": 2.280320167541504, + "learning_rate": 1e-06, + "loss": 0.9467, + "mean_token_accuracy": 0.7050839066505432, + "num_tokens": 403558803.0, + "step": 16156 + }, + { + "epoch": 1.774324621128926, + "grad_norm": 2.2556936740875244, + "learning_rate": 1e-06, + "loss": 0.9406, + "mean_token_accuracy": 0.7104192972183228, + "num_tokens": 403582962.0, + "step": 16157 + }, + { + "epoch": 1.7744344388315396, + "grad_norm": 1.9970011711120605, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 
0.7231732606887817, + "num_tokens": 403614169.0, + "step": 16158 + }, + { + "epoch": 1.7745442565341532, + "grad_norm": 2.1431803703308105, + "learning_rate": 1e-06, + "loss": 0.8624, + "mean_token_accuracy": 0.7301427721977234, + "num_tokens": 403641510.0, + "step": 16159 + }, + { + "epoch": 1.774654074236767, + "grad_norm": 2.390899896621704, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7348452806472778, + "num_tokens": 403663557.0, + "step": 16160 + }, + { + "epoch": 1.7747638919393807, + "grad_norm": 1.9269306659698486, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7176225185394287, + "num_tokens": 403697035.0, + "step": 16161 + }, + { + "epoch": 1.7748737096419944, + "grad_norm": 2.6222593784332275, + "learning_rate": 1e-06, + "loss": 0.8124, + "mean_token_accuracy": 0.7441312074661255, + "num_tokens": 403715645.0, + "step": 16162 + }, + { + "epoch": 1.774983527344608, + "grad_norm": 2.1121346950531006, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7191216945648193, + "num_tokens": 403744372.0, + "step": 16163 + }, + { + "epoch": 1.7750933450472215, + "grad_norm": 2.2957518100738525, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7305374145507812, + "num_tokens": 403769080.0, + "step": 16164 + }, + { + "epoch": 1.7752031627498353, + "grad_norm": 1.7553256750106812, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7134448885917664, + "num_tokens": 403804932.0, + "step": 16165 + }, + { + "epoch": 1.775312980452449, + "grad_norm": 2.3777644634246826, + "learning_rate": 1e-06, + "loss": 0.954, + "mean_token_accuracy": 0.7103387117385864, + "num_tokens": 403829869.0, + "step": 16166 + }, + { + "epoch": 1.7754227981550625, + "grad_norm": 2.5779306888580322, + "learning_rate": 1e-06, + "loss": 0.8636, + "mean_token_accuracy": 0.7343209981918335, + "num_tokens": 403849857.0, + "step": 16167 + }, + { + "epoch": 1.7755326158576763, + "grad_norm": 
2.379016160964966, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7095841765403748, + "num_tokens": 403874741.0, + "step": 16168 + }, + { + "epoch": 1.7756424335602898, + "grad_norm": 2.2823545932769775, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7465848922729492, + "num_tokens": 403898435.0, + "step": 16169 + }, + { + "epoch": 1.7757522512629036, + "grad_norm": 2.285229444503784, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7226423621177673, + "num_tokens": 403922858.0, + "step": 16170 + }, + { + "epoch": 1.7758620689655173, + "grad_norm": 2.1335256099700928, + "learning_rate": 1e-06, + "loss": 0.9555, + "mean_token_accuracy": 0.7016336917877197, + "num_tokens": 403952570.0, + "step": 16171 + }, + { + "epoch": 1.7759718866681309, + "grad_norm": 2.0644664764404297, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7237175703048706, + "num_tokens": 403981062.0, + "step": 16172 + }, + { + "epoch": 1.7760817043707444, + "grad_norm": 2.3559231758117676, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7302777767181396, + "num_tokens": 404003037.0, + "step": 16173 + }, + { + "epoch": 1.7761915220733582, + "grad_norm": 2.1460511684417725, + "learning_rate": 1e-06, + "loss": 0.9562, + "mean_token_accuracy": 0.7019377946853638, + "num_tokens": 404030183.0, + "step": 16174 + }, + { + "epoch": 1.776301339775972, + "grad_norm": 2.608837127685547, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7256736755371094, + "num_tokens": 404049500.0, + "step": 16175 + }, + { + "epoch": 1.7764111574785857, + "grad_norm": 2.291050434112549, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7284774780273438, + "num_tokens": 404073527.0, + "step": 16176 + }, + { + "epoch": 1.7765209751811992, + "grad_norm": 2.3136792182922363, + "learning_rate": 1e-06, + "loss": 0.9714, + "mean_token_accuracy": 0.6962187886238098, + 
"num_tokens": 404098209.0, + "step": 16177 + }, + { + "epoch": 1.7766307928838128, + "grad_norm": 2.395949125289917, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7259392738342285, + "num_tokens": 404119778.0, + "step": 16178 + }, + { + "epoch": 1.7767406105864265, + "grad_norm": 2.1887502670288086, + "learning_rate": 1e-06, + "loss": 0.9935, + "mean_token_accuracy": 0.6960961222648621, + "num_tokens": 404147180.0, + "step": 16179 + }, + { + "epoch": 1.7768504282890403, + "grad_norm": 2.1622540950775146, + "learning_rate": 1e-06, + "loss": 0.8274, + "mean_token_accuracy": 0.7408255934715271, + "num_tokens": 404172971.0, + "step": 16180 + }, + { + "epoch": 1.7769602459916538, + "grad_norm": 2.276850938796997, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7159071564674377, + "num_tokens": 404198146.0, + "step": 16181 + }, + { + "epoch": 1.7770700636942676, + "grad_norm": 2.1884217262268066, + "learning_rate": 1e-06, + "loss": 0.8883, + "mean_token_accuracy": 0.7235970497131348, + "num_tokens": 404224281.0, + "step": 16182 + }, + { + "epoch": 1.777179881396881, + "grad_norm": 2.0713956356048584, + "learning_rate": 1e-06, + "loss": 0.9485, + "mean_token_accuracy": 0.7102551460266113, + "num_tokens": 404252262.0, + "step": 16183 + }, + { + "epoch": 1.7772896990994949, + "grad_norm": 2.361586570739746, + "learning_rate": 1e-06, + "loss": 0.8587, + "mean_token_accuracy": 0.7330085039138794, + "num_tokens": 404273978.0, + "step": 16184 + }, + { + "epoch": 1.7773995168021086, + "grad_norm": 1.8846769332885742, + "learning_rate": 1e-06, + "loss": 0.9532, + "mean_token_accuracy": 0.7013700008392334, + "num_tokens": 404309734.0, + "step": 16185 + }, + { + "epoch": 1.7775093345047221, + "grad_norm": 2.0438976287841797, + "learning_rate": 1e-06, + "loss": 0.9067, + "mean_token_accuracy": 0.7176195383071899, + "num_tokens": 404338009.0, + "step": 16186 + }, + { + "epoch": 1.7776191522073357, + "grad_norm": 2.075744390487671, + 
"learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7319313287734985, + "num_tokens": 404365241.0, + "step": 16187 + }, + { + "epoch": 1.7777289699099494, + "grad_norm": 2.7256550788879395, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7453906536102295, + "num_tokens": 404382408.0, + "step": 16188 + }, + { + "epoch": 1.7778387876125632, + "grad_norm": 2.4752020835876465, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7232859134674072, + "num_tokens": 404402829.0, + "step": 16189 + }, + { + "epoch": 1.777948605315177, + "grad_norm": 2.36584210395813, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.7209891676902771, + "num_tokens": 404424972.0, + "step": 16190 + }, + { + "epoch": 1.7780584230177905, + "grad_norm": 2.2138187885284424, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7335659265518188, + "num_tokens": 404453472.0, + "step": 16191 + }, + { + "epoch": 1.778168240720404, + "grad_norm": 2.424139976501465, + "learning_rate": 1e-06, + "loss": 0.88, + "mean_token_accuracy": 0.7272980809211731, + "num_tokens": 404475939.0, + "step": 16192 + }, + { + "epoch": 1.7782780584230178, + "grad_norm": 2.2869343757629395, + "learning_rate": 1e-06, + "loss": 1.0041, + "mean_token_accuracy": 0.6942980289459229, + "num_tokens": 404502184.0, + "step": 16193 + }, + { + "epoch": 1.7783878761256315, + "grad_norm": 2.219028949737549, + "learning_rate": 1e-06, + "loss": 0.9362, + "mean_token_accuracy": 0.7092559337615967, + "num_tokens": 404527486.0, + "step": 16194 + }, + { + "epoch": 1.778497693828245, + "grad_norm": 2.1633706092834473, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7063524723052979, + "num_tokens": 404553932.0, + "step": 16195 + }, + { + "epoch": 1.7786075115308586, + "grad_norm": 2.116112470626831, + "learning_rate": 1e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.735257625579834, + "num_tokens": 404579190.0, + "step": 
16196 + }, + { + "epoch": 1.7787173292334724, + "grad_norm": 2.341717004776001, + "learning_rate": 1e-06, + "loss": 0.9515, + "mean_token_accuracy": 0.702072262763977, + "num_tokens": 404604234.0, + "step": 16197 + }, + { + "epoch": 1.7788271469360861, + "grad_norm": 2.3222153186798096, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.726270318031311, + "num_tokens": 404628078.0, + "step": 16198 + }, + { + "epoch": 1.7789369646386999, + "grad_norm": 2.170210599899292, + "learning_rate": 1e-06, + "loss": 0.9711, + "mean_token_accuracy": 0.699874222278595, + "num_tokens": 404656613.0, + "step": 16199 + }, + { + "epoch": 1.7790467823413134, + "grad_norm": 2.234358787536621, + "learning_rate": 1e-06, + "loss": 0.9245, + "mean_token_accuracy": 0.7090544700622559, + "num_tokens": 404680835.0, + "step": 16200 + }, + { + "epoch": 1.779156600043927, + "grad_norm": 2.5919151306152344, + "learning_rate": 1e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7379840612411499, + "num_tokens": 404699433.0, + "step": 16201 + }, + { + "epoch": 1.7792664177465407, + "grad_norm": 2.4350717067718506, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7332088947296143, + "num_tokens": 404720519.0, + "step": 16202 + }, + { + "epoch": 1.7793762354491545, + "grad_norm": 2.2658486366271973, + "learning_rate": 1e-06, + "loss": 0.9014, + "mean_token_accuracy": 0.7146151065826416, + "num_tokens": 404746278.0, + "step": 16203 + }, + { + "epoch": 1.7794860531517682, + "grad_norm": 2.2159018516540527, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7216619253158569, + "num_tokens": 404770900.0, + "step": 16204 + }, + { + "epoch": 1.7795958708543818, + "grad_norm": 1.9943331480026245, + "learning_rate": 1e-06, + "loss": 0.8811, + "mean_token_accuracy": 0.7266294956207275, + "num_tokens": 404801965.0, + "step": 16205 + }, + { + "epoch": 1.7797056885569953, + "grad_norm": 2.2722485065460205, + "learning_rate": 1e-06, + "loss": 0.898, + 
"mean_token_accuracy": 0.729968786239624, + "num_tokens": 404826660.0, + "step": 16206 + }, + { + "epoch": 1.779815506259609, + "grad_norm": 2.0920636653900146, + "learning_rate": 1e-06, + "loss": 0.92, + "mean_token_accuracy": 0.7072811722755432, + "num_tokens": 404856411.0, + "step": 16207 + }, + { + "epoch": 1.7799253239622228, + "grad_norm": 2.4673235416412354, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7192530632019043, + "num_tokens": 404878861.0, + "step": 16208 + }, + { + "epoch": 1.7800351416648363, + "grad_norm": 2.3868000507354736, + "learning_rate": 1e-06, + "loss": 0.7814, + "mean_token_accuracy": 0.7565778493881226, + "num_tokens": 404901528.0, + "step": 16209 + }, + { + "epoch": 1.7801449593674499, + "grad_norm": 2.334261178970337, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7166427969932556, + "num_tokens": 404925792.0, + "step": 16210 + }, + { + "epoch": 1.7802547770700636, + "grad_norm": 2.2244598865509033, + "learning_rate": 1e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7242786884307861, + "num_tokens": 404952448.0, + "step": 16211 + }, + { + "epoch": 1.7803645947726774, + "grad_norm": 2.3330607414245605, + "learning_rate": 1e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7547763586044312, + "num_tokens": 404973582.0, + "step": 16212 + }, + { + "epoch": 1.7804744124752911, + "grad_norm": 2.2924516201019287, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7145806550979614, + "num_tokens": 404997329.0, + "step": 16213 + }, + { + "epoch": 1.7805842301779047, + "grad_norm": 2.057922601699829, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.7005869746208191, + "num_tokens": 405028167.0, + "step": 16214 + }, + { + "epoch": 1.7806940478805182, + "grad_norm": 2.4716057777404785, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7130218148231506, + "num_tokens": 405052303.0, + "step": 16215 + }, + { + "epoch": 
1.780803865583132, + "grad_norm": 2.8254356384277344, + "learning_rate": 1e-06, + "loss": 0.8245, + "mean_token_accuracy": 0.7380000352859497, + "num_tokens": 405070304.0, + "step": 16216 + }, + { + "epoch": 1.7809136832857457, + "grad_norm": 2.462972640991211, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7058287262916565, + "num_tokens": 405091843.0, + "step": 16217 + }, + { + "epoch": 1.7810235009883595, + "grad_norm": 2.2870113849639893, + "learning_rate": 1e-06, + "loss": 0.8579, + "mean_token_accuracy": 0.7322436571121216, + "num_tokens": 405115635.0, + "step": 16218 + }, + { + "epoch": 1.781133318690973, + "grad_norm": 2.342193365097046, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7150497436523438, + "num_tokens": 405145148.0, + "step": 16219 + }, + { + "epoch": 1.7812431363935866, + "grad_norm": 2.1180763244628906, + "learning_rate": 1e-06, + "loss": 0.9914, + "mean_token_accuracy": 0.699455738067627, + "num_tokens": 405174574.0, + "step": 16220 + }, + { + "epoch": 1.7813529540962003, + "grad_norm": 2.5174965858459473, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7133275866508484, + "num_tokens": 405195898.0, + "step": 16221 + }, + { + "epoch": 1.781462771798814, + "grad_norm": 2.0644867420196533, + "learning_rate": 1e-06, + "loss": 1.0183, + "mean_token_accuracy": 0.690960705280304, + "num_tokens": 405225151.0, + "step": 16222 + }, + { + "epoch": 1.7815725895014276, + "grad_norm": 2.355086326599121, + "learning_rate": 1e-06, + "loss": 0.9019, + "mean_token_accuracy": 0.7147060036659241, + "num_tokens": 405247712.0, + "step": 16223 + }, + { + "epoch": 1.7816824072040411, + "grad_norm": 2.1677637100219727, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7333499193191528, + "num_tokens": 405272711.0, + "step": 16224 + }, + { + "epoch": 1.781792224906655, + "grad_norm": 2.240346908569336, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 
0.7007275819778442, + "num_tokens": 405298416.0, + "step": 16225 + }, + { + "epoch": 1.7819020426092687, + "grad_norm": 2.7768514156341553, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7292152047157288, + "num_tokens": 405316008.0, + "step": 16226 + }, + { + "epoch": 1.7820118603118824, + "grad_norm": 2.1572766304016113, + "learning_rate": 1e-06, + "loss": 0.9087, + "mean_token_accuracy": 0.7275673151016235, + "num_tokens": 405342783.0, + "step": 16227 + }, + { + "epoch": 1.782121678014496, + "grad_norm": 2.213263511657715, + "learning_rate": 1e-06, + "loss": 0.9198, + "mean_token_accuracy": 0.7124088406562805, + "num_tokens": 405369476.0, + "step": 16228 + }, + { + "epoch": 1.7822314957171095, + "grad_norm": 2.138972282409668, + "learning_rate": 1e-06, + "loss": 0.8787, + "mean_token_accuracy": 0.7325263023376465, + "num_tokens": 405395337.0, + "step": 16229 + }, + { + "epoch": 1.7823413134197232, + "grad_norm": 2.397663116455078, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7325887084007263, + "num_tokens": 405418577.0, + "step": 16230 + }, + { + "epoch": 1.782451131122337, + "grad_norm": 2.324781894683838, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.7190988063812256, + "num_tokens": 405441354.0, + "step": 16231 + }, + { + "epoch": 1.7825609488249505, + "grad_norm": 2.512673854827881, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.7045869827270508, + "num_tokens": 405463120.0, + "step": 16232 + }, + { + "epoch": 1.7826707665275643, + "grad_norm": 2.178344964981079, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.718461275100708, + "num_tokens": 405488883.0, + "step": 16233 + }, + { + "epoch": 1.7827805842301778, + "grad_norm": 2.5306808948516846, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7307220697402954, + "num_tokens": 405511443.0, + "step": 16234 + }, + { + "epoch": 1.7828904019327916, + "grad_norm": 
2.185497760772705, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.72176194190979, + "num_tokens": 405537774.0, + "step": 16235 + }, + { + "epoch": 1.7830002196354053, + "grad_norm": 2.1336629390716553, + "learning_rate": 1e-06, + "loss": 0.9639, + "mean_token_accuracy": 0.6999403834342957, + "num_tokens": 405565882.0, + "step": 16236 + }, + { + "epoch": 1.7831100373380189, + "grad_norm": 2.1090474128723145, + "learning_rate": 1e-06, + "loss": 0.8986, + "mean_token_accuracy": 0.7142816781997681, + "num_tokens": 405593248.0, + "step": 16237 + }, + { + "epoch": 1.7832198550406324, + "grad_norm": 2.159987211227417, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7230809926986694, + "num_tokens": 405618295.0, + "step": 16238 + }, + { + "epoch": 1.7833296727432462, + "grad_norm": 2.379376173019409, + "learning_rate": 1e-06, + "loss": 0.9571, + "mean_token_accuracy": 0.6985776424407959, + "num_tokens": 405641782.0, + "step": 16239 + }, + { + "epoch": 1.78343949044586, + "grad_norm": 2.2252230644226074, + "learning_rate": 1e-06, + "loss": 1.0222, + "mean_token_accuracy": 0.6998226642608643, + "num_tokens": 405668404.0, + "step": 16240 + }, + { + "epoch": 1.7835493081484737, + "grad_norm": 2.5241520404815674, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7158297896385193, + "num_tokens": 405691190.0, + "step": 16241 + }, + { + "epoch": 1.7836591258510872, + "grad_norm": 2.226222276687622, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7329195737838745, + "num_tokens": 405716526.0, + "step": 16242 + }, + { + "epoch": 1.7837689435537007, + "grad_norm": 2.3104794025421143, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7359877228736877, + "num_tokens": 405741511.0, + "step": 16243 + }, + { + "epoch": 1.7838787612563145, + "grad_norm": 2.5007994174957275, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7203829884529114, + "num_tokens": 
405763849.0, + "step": 16244 + }, + { + "epoch": 1.7839885789589283, + "grad_norm": 2.3033363819122314, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.739973783493042, + "num_tokens": 405787046.0, + "step": 16245 + }, + { + "epoch": 1.7840983966615418, + "grad_norm": 2.048438310623169, + "learning_rate": 1e-06, + "loss": 1.0155, + "mean_token_accuracy": 0.7032554745674133, + "num_tokens": 405816281.0, + "step": 16246 + }, + { + "epoch": 1.7842082143641556, + "grad_norm": 2.0556209087371826, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7169762253761292, + "num_tokens": 405845004.0, + "step": 16247 + }, + { + "epoch": 1.784318032066769, + "grad_norm": 2.0553667545318604, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7098891735076904, + "num_tokens": 405876272.0, + "step": 16248 + }, + { + "epoch": 1.7844278497693828, + "grad_norm": 2.1975841522216797, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7310347557067871, + "num_tokens": 405905182.0, + "step": 16249 + }, + { + "epoch": 1.7845376674719966, + "grad_norm": 2.555328845977783, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.727283239364624, + "num_tokens": 405925949.0, + "step": 16250 + }, + { + "epoch": 1.7846474851746101, + "grad_norm": 2.251870632171631, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7339736223220825, + "num_tokens": 405952381.0, + "step": 16251 + }, + { + "epoch": 1.7847573028772237, + "grad_norm": 2.224604606628418, + "learning_rate": 1e-06, + "loss": 0.8942, + "mean_token_accuracy": 0.724128007888794, + "num_tokens": 405977805.0, + "step": 16252 + }, + { + "epoch": 1.7848671205798374, + "grad_norm": 2.4756245613098145, + "learning_rate": 1e-06, + "loss": 0.8407, + "mean_token_accuracy": 0.7389711141586304, + "num_tokens": 405998896.0, + "step": 16253 + }, + { + "epoch": 1.7849769382824512, + "grad_norm": 2.436495542526245, + "learning_rate": 
1e-06, + "loss": 0.827, + "mean_token_accuracy": 0.7361489534378052, + "num_tokens": 406019640.0, + "step": 16254 + }, + { + "epoch": 1.785086755985065, + "grad_norm": 2.382143020629883, + "learning_rate": 1e-06, + "loss": 0.7867, + "mean_token_accuracy": 0.7515061497688293, + "num_tokens": 406039949.0, + "step": 16255 + }, + { + "epoch": 1.7851965736876785, + "grad_norm": 2.3140101432800293, + "learning_rate": 1e-06, + "loss": 0.9364, + "mean_token_accuracy": 0.7073583006858826, + "num_tokens": 406065272.0, + "step": 16256 + }, + { + "epoch": 1.785306391390292, + "grad_norm": 2.1931121349334717, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7321316003799438, + "num_tokens": 406090421.0, + "step": 16257 + }, + { + "epoch": 1.7854162090929058, + "grad_norm": 2.058939218521118, + "learning_rate": 1e-06, + "loss": 0.8619, + "mean_token_accuracy": 0.7335896492004395, + "num_tokens": 406119096.0, + "step": 16258 + }, + { + "epoch": 1.7855260267955195, + "grad_norm": 2.213275671005249, + "learning_rate": 1e-06, + "loss": 0.9337, + "mean_token_accuracy": 0.7210260629653931, + "num_tokens": 406147067.0, + "step": 16259 + }, + { + "epoch": 1.785635844498133, + "grad_norm": 2.2685084342956543, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7230743169784546, + "num_tokens": 406171564.0, + "step": 16260 + }, + { + "epoch": 1.7857456622007466, + "grad_norm": 2.2889013290405273, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7253949046134949, + "num_tokens": 406196843.0, + "step": 16261 + }, + { + "epoch": 1.7858554799033604, + "grad_norm": 2.319512367248535, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7108681201934814, + "num_tokens": 406222712.0, + "step": 16262 + }, + { + "epoch": 1.7859652976059741, + "grad_norm": 2.510167121887207, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7268657684326172, + "num_tokens": 406242554.0, + "step": 16263 + }, + { + 
"epoch": 1.7860751153085879, + "grad_norm": 2.4161570072174072, + "learning_rate": 1e-06, + "loss": 0.7746, + "mean_token_accuracy": 0.7527417540550232, + "num_tokens": 406264344.0, + "step": 16264 + }, + { + "epoch": 1.7861849330112014, + "grad_norm": 2.0628228187561035, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7319082021713257, + "num_tokens": 406292441.0, + "step": 16265 + }, + { + "epoch": 1.786294750713815, + "grad_norm": 2.5494911670684814, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.734076738357544, + "num_tokens": 406314488.0, + "step": 16266 + }, + { + "epoch": 1.7864045684164287, + "grad_norm": 2.6047890186309814, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7253863215446472, + "num_tokens": 406334941.0, + "step": 16267 + }, + { + "epoch": 1.7865143861190425, + "grad_norm": 2.234757661819458, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7124911546707153, + "num_tokens": 406360400.0, + "step": 16268 + }, + { + "epoch": 1.7866242038216562, + "grad_norm": 2.205996513366699, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7118806838989258, + "num_tokens": 406387346.0, + "step": 16269 + }, + { + "epoch": 1.7867340215242697, + "grad_norm": 2.778385639190674, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.7365632653236389, + "num_tokens": 406404410.0, + "step": 16270 + }, + { + "epoch": 1.7868438392268833, + "grad_norm": 2.7135307788848877, + "learning_rate": 1e-06, + "loss": 0.8719, + "mean_token_accuracy": 0.727802038192749, + "num_tokens": 406423629.0, + "step": 16271 + }, + { + "epoch": 1.786953656929497, + "grad_norm": 2.2045364379882812, + "learning_rate": 1e-06, + "loss": 0.9877, + "mean_token_accuracy": 0.6971297264099121, + "num_tokens": 406448517.0, + "step": 16272 + }, + { + "epoch": 1.7870634746321108, + "grad_norm": 2.146245241165161, + "learning_rate": 1e-06, + "loss": 0.8964, + 
"mean_token_accuracy": 0.717555820941925, + "num_tokens": 406474997.0, + "step": 16273 + }, + { + "epoch": 1.7871732923347243, + "grad_norm": 2.448498487472534, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7281489372253418, + "num_tokens": 406495935.0, + "step": 16274 + }, + { + "epoch": 1.7872831100373379, + "grad_norm": 2.2813737392425537, + "learning_rate": 1e-06, + "loss": 0.95, + "mean_token_accuracy": 0.7065509557723999, + "num_tokens": 406522091.0, + "step": 16275 + }, + { + "epoch": 1.7873929277399516, + "grad_norm": 2.170971155166626, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7189673185348511, + "num_tokens": 406549432.0, + "step": 16276 + }, + { + "epoch": 1.7875027454425654, + "grad_norm": 2.323730707168579, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.7110424041748047, + "num_tokens": 406572699.0, + "step": 16277 + }, + { + "epoch": 1.7876125631451791, + "grad_norm": 2.3285186290740967, + "learning_rate": 1e-06, + "loss": 0.8572, + "mean_token_accuracy": 0.7303028702735901, + "num_tokens": 406592570.0, + "step": 16278 + }, + { + "epoch": 1.7877223808477927, + "grad_norm": 1.988700270652771, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7281204462051392, + "num_tokens": 406621668.0, + "step": 16279 + }, + { + "epoch": 1.7878321985504062, + "grad_norm": 2.100217342376709, + "learning_rate": 1e-06, + "loss": 0.986, + "mean_token_accuracy": 0.6945288181304932, + "num_tokens": 406653006.0, + "step": 16280 + }, + { + "epoch": 1.78794201625302, + "grad_norm": 2.4976730346679688, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6974791288375854, + "num_tokens": 406676895.0, + "step": 16281 + }, + { + "epoch": 1.7880518339556337, + "grad_norm": 2.37577486038208, + "learning_rate": 1e-06, + "loss": 0.7718, + "mean_token_accuracy": 0.7531540393829346, + "num_tokens": 406697766.0, + "step": 16282 + }, + { + "epoch": 1.7881616516582475, + 
"grad_norm": 2.1299655437469482, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7156911492347717, + "num_tokens": 406725397.0, + "step": 16283 + }, + { + "epoch": 1.788271469360861, + "grad_norm": 2.6219255924224854, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7333111763000488, + "num_tokens": 406743528.0, + "step": 16284 + }, + { + "epoch": 1.7883812870634745, + "grad_norm": 2.311067819595337, + "learning_rate": 1e-06, + "loss": 0.9598, + "mean_token_accuracy": 0.7004788517951965, + "num_tokens": 406770642.0, + "step": 16285 + }, + { + "epoch": 1.7884911047660883, + "grad_norm": 2.3033580780029297, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7233518362045288, + "num_tokens": 406796500.0, + "step": 16286 + }, + { + "epoch": 1.788600922468702, + "grad_norm": 1.9339827299118042, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7222380638122559, + "num_tokens": 406827801.0, + "step": 16287 + }, + { + "epoch": 1.7887107401713156, + "grad_norm": 2.5560948848724365, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7353789806365967, + "num_tokens": 406847824.0, + "step": 16288 + }, + { + "epoch": 1.7888205578739291, + "grad_norm": 2.1333084106445312, + "learning_rate": 1e-06, + "loss": 0.8984, + "mean_token_accuracy": 0.7127713561058044, + "num_tokens": 406874616.0, + "step": 16289 + }, + { + "epoch": 1.7889303755765429, + "grad_norm": 2.2944529056549072, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7109843492507935, + "num_tokens": 406899546.0, + "step": 16290 + }, + { + "epoch": 1.7890401932791566, + "grad_norm": 2.0377495288848877, + "learning_rate": 1e-06, + "loss": 0.987, + "mean_token_accuracy": 0.6935986280441284, + "num_tokens": 406932699.0, + "step": 16291 + }, + { + "epoch": 1.7891500109817704, + "grad_norm": 2.36590313911438, + "learning_rate": 1e-06, + "loss": 0.9179, + "mean_token_accuracy": 0.7164923548698425, + 
"num_tokens": 406954989.0, + "step": 16292 + }, + { + "epoch": 1.789259828684384, + "grad_norm": 2.379395008087158, + "learning_rate": 1e-06, + "loss": 0.9403, + "mean_token_accuracy": 0.7078795433044434, + "num_tokens": 406979089.0, + "step": 16293 + }, + { + "epoch": 1.7893696463869975, + "grad_norm": 2.4526169300079346, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7277913093566895, + "num_tokens": 406999708.0, + "step": 16294 + }, + { + "epoch": 1.7894794640896112, + "grad_norm": 2.6108646392822266, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7061799764633179, + "num_tokens": 407020958.0, + "step": 16295 + }, + { + "epoch": 1.789589281792225, + "grad_norm": 2.0707712173461914, + "learning_rate": 1e-06, + "loss": 0.9787, + "mean_token_accuracy": 0.7034084796905518, + "num_tokens": 407049701.0, + "step": 16296 + }, + { + "epoch": 1.7896990994948385, + "grad_norm": 2.298532485961914, + "learning_rate": 1e-06, + "loss": 0.841, + "mean_token_accuracy": 0.730687141418457, + "num_tokens": 407074177.0, + "step": 16297 + }, + { + "epoch": 1.7898089171974523, + "grad_norm": 2.2590088844299316, + "learning_rate": 1e-06, + "loss": 1.0186, + "mean_token_accuracy": 0.6903116703033447, + "num_tokens": 407102653.0, + "step": 16298 + }, + { + "epoch": 1.7899187349000658, + "grad_norm": 2.132564067840576, + "learning_rate": 1e-06, + "loss": 0.9731, + "mean_token_accuracy": 0.7186708450317383, + "num_tokens": 407129470.0, + "step": 16299 + }, + { + "epoch": 1.7900285526026796, + "grad_norm": 2.2963807582855225, + "learning_rate": 1e-06, + "loss": 0.928, + "mean_token_accuracy": 0.7094902992248535, + "num_tokens": 407154077.0, + "step": 16300 + }, + { + "epoch": 1.7901383703052933, + "grad_norm": 2.1185851097106934, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7267806529998779, + "num_tokens": 407180690.0, + "step": 16301 + }, + { + "epoch": 1.7902481880079069, + "grad_norm": 2.102396011352539, + 
"learning_rate": 1e-06, + "loss": 0.8626, + "mean_token_accuracy": 0.7269942760467529, + "num_tokens": 407209404.0, + "step": 16302 + }, + { + "epoch": 1.7903580057105204, + "grad_norm": 2.6871695518493652, + "learning_rate": 1e-06, + "loss": 0.8337, + "mean_token_accuracy": 0.7365866899490356, + "num_tokens": 407227077.0, + "step": 16303 + }, + { + "epoch": 1.7904678234131342, + "grad_norm": 2.1347179412841797, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7040529251098633, + "num_tokens": 407256986.0, + "step": 16304 + }, + { + "epoch": 1.790577641115748, + "grad_norm": 2.2089099884033203, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7213724851608276, + "num_tokens": 407282686.0, + "step": 16305 + }, + { + "epoch": 1.7906874588183617, + "grad_norm": 2.402221202850342, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7369139194488525, + "num_tokens": 407304982.0, + "step": 16306 + }, + { + "epoch": 1.7907972765209752, + "grad_norm": 2.044741153717041, + "learning_rate": 1e-06, + "loss": 0.9542, + "mean_token_accuracy": 0.7139909863471985, + "num_tokens": 407334318.0, + "step": 16307 + }, + { + "epoch": 1.7909070942235887, + "grad_norm": 2.1766433715820312, + "learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.7097448110580444, + "num_tokens": 407364537.0, + "step": 16308 + }, + { + "epoch": 1.7910169119262025, + "grad_norm": 2.3476297855377197, + "learning_rate": 1e-06, + "loss": 0.8778, + "mean_token_accuracy": 0.7214338183403015, + "num_tokens": 407387788.0, + "step": 16309 + }, + { + "epoch": 1.7911267296288162, + "grad_norm": 2.3054378032684326, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7085981369018555, + "num_tokens": 407412411.0, + "step": 16310 + }, + { + "epoch": 1.7912365473314298, + "grad_norm": 2.6250228881835938, + "learning_rate": 1e-06, + "loss": 0.8228, + "mean_token_accuracy": 0.7351354360580444, + "num_tokens": 407431811.0, + 
"step": 16311 + }, + { + "epoch": 1.7913463650340435, + "grad_norm": 2.274662971496582, + "learning_rate": 1e-06, + "loss": 0.8623, + "mean_token_accuracy": 0.7288472056388855, + "num_tokens": 407456230.0, + "step": 16312 + }, + { + "epoch": 1.791456182736657, + "grad_norm": 2.282367706298828, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7053623199462891, + "num_tokens": 407479754.0, + "step": 16313 + }, + { + "epoch": 1.7915660004392708, + "grad_norm": 2.422046184539795, + "learning_rate": 1e-06, + "loss": 0.8961, + "mean_token_accuracy": 0.7274060845375061, + "num_tokens": 407502276.0, + "step": 16314 + }, + { + "epoch": 1.7916758181418846, + "grad_norm": 2.333650588989258, + "learning_rate": 1e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7275118827819824, + "num_tokens": 407525378.0, + "step": 16315 + }, + { + "epoch": 1.7917856358444981, + "grad_norm": 2.957512140274048, + "learning_rate": 1e-06, + "loss": 0.8479, + "mean_token_accuracy": 0.7292945981025696, + "num_tokens": 407541145.0, + "step": 16316 + }, + { + "epoch": 1.7918954535471117, + "grad_norm": 2.4529645442962646, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7222146987915039, + "num_tokens": 407562581.0, + "step": 16317 + }, + { + "epoch": 1.7920052712497254, + "grad_norm": 2.1629369258880615, + "learning_rate": 1e-06, + "loss": 0.9747, + "mean_token_accuracy": 0.6955300569534302, + "num_tokens": 407589548.0, + "step": 16318 + }, + { + "epoch": 1.7921150889523392, + "grad_norm": 1.9662870168685913, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7164521217346191, + "num_tokens": 407621209.0, + "step": 16319 + }, + { + "epoch": 1.792224906654953, + "grad_norm": 2.2868285179138184, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7203485369682312, + "num_tokens": 407646420.0, + "step": 16320 + }, + { + "epoch": 1.7923347243575665, + "grad_norm": 2.0466504096984863, + "learning_rate": 1e-06, + "loss": 
0.9191, + "mean_token_accuracy": 0.7118701934814453, + "num_tokens": 407674525.0, + "step": 16321 + }, + { + "epoch": 1.79244454206018, + "grad_norm": 2.2663629055023193, + "learning_rate": 1e-06, + "loss": 1.0002, + "mean_token_accuracy": 0.6994369626045227, + "num_tokens": 407702124.0, + "step": 16322 + }, + { + "epoch": 1.7925543597627938, + "grad_norm": 2.42496395111084, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7294579148292542, + "num_tokens": 407725162.0, + "step": 16323 + }, + { + "epoch": 1.7926641774654075, + "grad_norm": 2.2138819694519043, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7371870875358582, + "num_tokens": 407751051.0, + "step": 16324 + }, + { + "epoch": 1.792773995168021, + "grad_norm": 2.2377071380615234, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7265716791152954, + "num_tokens": 407777731.0, + "step": 16325 + }, + { + "epoch": 1.7928838128706346, + "grad_norm": 2.3544256687164307, + "learning_rate": 1e-06, + "loss": 0.9958, + "mean_token_accuracy": 0.6948086023330688, + "num_tokens": 407804635.0, + "step": 16326 + }, + { + "epoch": 1.7929936305732483, + "grad_norm": 2.239532709121704, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.712376058101654, + "num_tokens": 407830229.0, + "step": 16327 + }, + { + "epoch": 1.793103448275862, + "grad_norm": 2.038383960723877, + "learning_rate": 1e-06, + "loss": 0.8508, + "mean_token_accuracy": 0.7341060042381287, + "num_tokens": 407858258.0, + "step": 16328 + }, + { + "epoch": 1.7932132659784759, + "grad_norm": 2.656123638153076, + "learning_rate": 1e-06, + "loss": 0.8005, + "mean_token_accuracy": 0.7411171793937683, + "num_tokens": 407877001.0, + "step": 16329 + }, + { + "epoch": 1.7933230836810894, + "grad_norm": 2.2645151615142822, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7126697897911072, + "num_tokens": 407901534.0, + "step": 16330 + }, + { + "epoch": 
1.793432901383703, + "grad_norm": 1.8740999698638916, + "learning_rate": 1e-06, + "loss": 0.9766, + "mean_token_accuracy": 0.7015539407730103, + "num_tokens": 407937422.0, + "step": 16331 + }, + { + "epoch": 1.7935427190863167, + "grad_norm": 2.550708532333374, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7244025468826294, + "num_tokens": 407959972.0, + "step": 16332 + }, + { + "epoch": 1.7936525367889304, + "grad_norm": 2.3946969509124756, + "learning_rate": 1e-06, + "loss": 0.8607, + "mean_token_accuracy": 0.7311661839485168, + "num_tokens": 407983169.0, + "step": 16333 + }, + { + "epoch": 1.7937623544915442, + "grad_norm": 2.278423547744751, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7249464988708496, + "num_tokens": 408009218.0, + "step": 16334 + }, + { + "epoch": 1.7938721721941577, + "grad_norm": 2.285276412963867, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7292850017547607, + "num_tokens": 408032387.0, + "step": 16335 + }, + { + "epoch": 1.7939819898967713, + "grad_norm": 2.621277093887329, + "learning_rate": 1e-06, + "loss": 0.8075, + "mean_token_accuracy": 0.7403042316436768, + "num_tokens": 408050764.0, + "step": 16336 + }, + { + "epoch": 1.794091807599385, + "grad_norm": 2.3833889961242676, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7329138517379761, + "num_tokens": 408072882.0, + "step": 16337 + }, + { + "epoch": 1.7942016253019988, + "grad_norm": 2.0594420433044434, + "learning_rate": 1e-06, + "loss": 0.8927, + "mean_token_accuracy": 0.7228631377220154, + "num_tokens": 408100495.0, + "step": 16338 + }, + { + "epoch": 1.7943114430046123, + "grad_norm": 2.0225090980529785, + "learning_rate": 1e-06, + "loss": 1.0117, + "mean_token_accuracy": 0.6965000033378601, + "num_tokens": 408130629.0, + "step": 16339 + }, + { + "epoch": 1.7944212607072259, + "grad_norm": 2.3815853595733643, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 
0.7276914119720459, + "num_tokens": 408153460.0, + "step": 16340 + }, + { + "epoch": 1.7945310784098396, + "grad_norm": 2.60446834564209, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7176492214202881, + "num_tokens": 408173760.0, + "step": 16341 + }, + { + "epoch": 1.7946408961124534, + "grad_norm": 2.2510182857513428, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7163141965866089, + "num_tokens": 408200520.0, + "step": 16342 + }, + { + "epoch": 1.7947507138150671, + "grad_norm": 2.0983617305755615, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7256083488464355, + "num_tokens": 408229918.0, + "step": 16343 + }, + { + "epoch": 1.7948605315176807, + "grad_norm": 2.093100070953369, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7249135971069336, + "num_tokens": 408259298.0, + "step": 16344 + }, + { + "epoch": 1.7949703492202942, + "grad_norm": 2.0919389724731445, + "learning_rate": 1e-06, + "loss": 0.9146, + "mean_token_accuracy": 0.7156164050102234, + "num_tokens": 408291668.0, + "step": 16345 + }, + { + "epoch": 1.795080166922908, + "grad_norm": 3.020892381668091, + "learning_rate": 1e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.7320476174354553, + "num_tokens": 408307256.0, + "step": 16346 + }, + { + "epoch": 1.7951899846255217, + "grad_norm": 2.2940428256988525, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.7345689535140991, + "num_tokens": 408331912.0, + "step": 16347 + }, + { + "epoch": 1.7952998023281352, + "grad_norm": 2.1718971729278564, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6933072209358215, + "num_tokens": 408358272.0, + "step": 16348 + }, + { + "epoch": 1.795409620030749, + "grad_norm": 2.3567111492156982, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7195656895637512, + "num_tokens": 408382336.0, + "step": 16349 + }, + { + "epoch": 1.7955194377333625, + "grad_norm": 
2.4641687870025635, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7121703624725342, + "num_tokens": 408405392.0, + "step": 16350 + }, + { + "epoch": 1.7956292554359763, + "grad_norm": 2.340052366256714, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6956878900527954, + "num_tokens": 408431410.0, + "step": 16351 + }, + { + "epoch": 1.79573907313859, + "grad_norm": 2.2048094272613525, + "learning_rate": 1e-06, + "loss": 0.7835, + "mean_token_accuracy": 0.7547624111175537, + "num_tokens": 408455780.0, + "step": 16352 + }, + { + "epoch": 1.7958488908412036, + "grad_norm": 2.5210120677948, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.732799232006073, + "num_tokens": 408479304.0, + "step": 16353 + }, + { + "epoch": 1.7959587085438171, + "grad_norm": 2.4788174629211426, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7165502905845642, + "num_tokens": 408502638.0, + "step": 16354 + }, + { + "epoch": 1.7960685262464309, + "grad_norm": 2.081674575805664, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7372444272041321, + "num_tokens": 408531961.0, + "step": 16355 + }, + { + "epoch": 1.7961783439490446, + "grad_norm": 2.0676281452178955, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.7133538722991943, + "num_tokens": 408561457.0, + "step": 16356 + }, + { + "epoch": 1.7962881616516584, + "grad_norm": 2.617112636566162, + "learning_rate": 1e-06, + "loss": 0.9012, + "mean_token_accuracy": 0.7156550288200378, + "num_tokens": 408582499.0, + "step": 16357 + }, + { + "epoch": 1.796397979354272, + "grad_norm": 1.9267019033432007, + "learning_rate": 1e-06, + "loss": 0.9538, + "mean_token_accuracy": 0.7092005014419556, + "num_tokens": 408614415.0, + "step": 16358 + }, + { + "epoch": 1.7965077970568855, + "grad_norm": 2.1005771160125732, + "learning_rate": 1e-06, + "loss": 0.992, + "mean_token_accuracy": 0.6967233419418335, + "num_tokens": 
408643849.0, + "step": 16359 + }, + { + "epoch": 1.7966176147594992, + "grad_norm": 2.19010066986084, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7109543681144714, + "num_tokens": 408669181.0, + "step": 16360 + }, + { + "epoch": 1.796727432462113, + "grad_norm": 2.2997727394104004, + "learning_rate": 1e-06, + "loss": 0.8191, + "mean_token_accuracy": 0.7448616623878479, + "num_tokens": 408692714.0, + "step": 16361 + }, + { + "epoch": 1.7968372501647265, + "grad_norm": 2.3932411670684814, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.7123121023178101, + "num_tokens": 408715231.0, + "step": 16362 + }, + { + "epoch": 1.7969470678673403, + "grad_norm": 2.447903871536255, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.718224287033081, + "num_tokens": 408738531.0, + "step": 16363 + }, + { + "epoch": 1.7970568855699538, + "grad_norm": 2.2358996868133545, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7354403734207153, + "num_tokens": 408763753.0, + "step": 16364 + }, + { + "epoch": 1.7971667032725676, + "grad_norm": 1.9360789060592651, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7062726020812988, + "num_tokens": 408795842.0, + "step": 16365 + }, + { + "epoch": 1.7972765209751813, + "grad_norm": 2.3286828994750977, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7316566705703735, + "num_tokens": 408820249.0, + "step": 16366 + }, + { + "epoch": 1.7973863386777948, + "grad_norm": 2.4046196937561035, + "learning_rate": 1e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7226306796073914, + "num_tokens": 408841325.0, + "step": 16367 + }, + { + "epoch": 1.7974961563804084, + "grad_norm": 2.358924150466919, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7019422054290771, + "num_tokens": 408867895.0, + "step": 16368 + }, + { + "epoch": 1.7976059740830221, + "grad_norm": 2.158888578414917, + "learning_rate": 
1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7304518222808838, + "num_tokens": 408894626.0, + "step": 16369 + }, + { + "epoch": 1.797715791785636, + "grad_norm": 2.555091142654419, + "learning_rate": 1e-06, + "loss": 0.8802, + "mean_token_accuracy": 0.7274008989334106, + "num_tokens": 408913665.0, + "step": 16370 + }, + { + "epoch": 1.7978256094882497, + "grad_norm": 2.2594375610351562, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7228473424911499, + "num_tokens": 408937730.0, + "step": 16371 + }, + { + "epoch": 1.7979354271908632, + "grad_norm": 2.581282377243042, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7102513313293457, + "num_tokens": 408957634.0, + "step": 16372 + }, + { + "epoch": 1.7980452448934767, + "grad_norm": 1.9537978172302246, + "learning_rate": 1e-06, + "loss": 0.8557, + "mean_token_accuracy": 0.7261191606521606, + "num_tokens": 408989020.0, + "step": 16373 + }, + { + "epoch": 1.7981550625960905, + "grad_norm": 2.7248036861419678, + "learning_rate": 1e-06, + "loss": 0.9453, + "mean_token_accuracy": 0.7178279161453247, + "num_tokens": 409008714.0, + "step": 16374 + }, + { + "epoch": 1.7982648802987042, + "grad_norm": 2.1800923347473145, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7516690492630005, + "num_tokens": 409032930.0, + "step": 16375 + }, + { + "epoch": 1.7983746980013178, + "grad_norm": 2.152529239654541, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7396960258483887, + "num_tokens": 409059127.0, + "step": 16376 + }, + { + "epoch": 1.7984845157039313, + "grad_norm": 2.3144731521606445, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7285983562469482, + "num_tokens": 409082385.0, + "step": 16377 + }, + { + "epoch": 1.798594333406545, + "grad_norm": 2.40335750579834, + "learning_rate": 1e-06, + "loss": 0.8499, + "mean_token_accuracy": 0.7272733449935913, + "num_tokens": 409105373.0, + "step": 16378 + }, + { + 
"epoch": 1.7987041511091588, + "grad_norm": 2.089940309524536, + "learning_rate": 1e-06, + "loss": 0.9846, + "mean_token_accuracy": 0.6956115961074829, + "num_tokens": 409134103.0, + "step": 16379 + }, + { + "epoch": 1.7988139688117726, + "grad_norm": 2.116408109664917, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7117307782173157, + "num_tokens": 409161918.0, + "step": 16380 + }, + { + "epoch": 1.7989237865143861, + "grad_norm": 1.993544101715088, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7251465320587158, + "num_tokens": 409194754.0, + "step": 16381 + }, + { + "epoch": 1.7990336042169996, + "grad_norm": 2.119997262954712, + "learning_rate": 1e-06, + "loss": 0.8526, + "mean_token_accuracy": 0.7296404838562012, + "num_tokens": 409221995.0, + "step": 16382 + }, + { + "epoch": 1.7991434219196134, + "grad_norm": 2.0765388011932373, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7106121778488159, + "num_tokens": 409253057.0, + "step": 16383 + }, + { + "epoch": 1.7992532396222272, + "grad_norm": 2.3767454624176025, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.743772029876709, + "num_tokens": 409275947.0, + "step": 16384 + }, + { + "epoch": 1.799363057324841, + "grad_norm": 2.3959267139434814, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7087374925613403, + "num_tokens": 409298281.0, + "step": 16385 + }, + { + "epoch": 1.7994728750274545, + "grad_norm": 2.046478271484375, + "learning_rate": 1e-06, + "loss": 0.9303, + "mean_token_accuracy": 0.7161818146705627, + "num_tokens": 409328298.0, + "step": 16386 + }, + { + "epoch": 1.799582692730068, + "grad_norm": 2.4945075511932373, + "learning_rate": 1e-06, + "loss": 0.9833, + "mean_token_accuracy": 0.6970306634902954, + "num_tokens": 409352164.0, + "step": 16387 + }, + { + "epoch": 1.7996925104326817, + "grad_norm": 2.6017119884490967, + "learning_rate": 1e-06, + "loss": 0.7756, + 
"mean_token_accuracy": 0.750718355178833, + "num_tokens": 409370827.0, + "step": 16388 + }, + { + "epoch": 1.7998023281352955, + "grad_norm": 2.5075883865356445, + "learning_rate": 1e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7267007231712341, + "num_tokens": 409390984.0, + "step": 16389 + }, + { + "epoch": 1.799912145837909, + "grad_norm": 2.1578445434570312, + "learning_rate": 1e-06, + "loss": 0.9382, + "mean_token_accuracy": 0.706109344959259, + "num_tokens": 409417187.0, + "step": 16390 + }, + { + "epoch": 1.8000219635405226, + "grad_norm": 2.1186132431030273, + "learning_rate": 1e-06, + "loss": 0.8009, + "mean_token_accuracy": 0.7429194450378418, + "num_tokens": 409442435.0, + "step": 16391 + }, + { + "epoch": 1.8001317812431363, + "grad_norm": 2.225389003753662, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7131310701370239, + "num_tokens": 409468475.0, + "step": 16392 + }, + { + "epoch": 1.80024159894575, + "grad_norm": 2.3288044929504395, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6918097138404846, + "num_tokens": 409490914.0, + "step": 16393 + }, + { + "epoch": 1.8003514166483638, + "grad_norm": 1.931736707687378, + "learning_rate": 1e-06, + "loss": 0.9574, + "mean_token_accuracy": 0.6989347338676453, + "num_tokens": 409522646.0, + "step": 16394 + }, + { + "epoch": 1.8004612343509774, + "grad_norm": 2.0366742610931396, + "learning_rate": 1e-06, + "loss": 0.8964, + "mean_token_accuracy": 0.7192187309265137, + "num_tokens": 409552067.0, + "step": 16395 + }, + { + "epoch": 1.800571052053591, + "grad_norm": 2.399811029434204, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7186800241470337, + "num_tokens": 409577154.0, + "step": 16396 + }, + { + "epoch": 1.8006808697562047, + "grad_norm": 2.2047224044799805, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7207609415054321, + "num_tokens": 409603961.0, + "step": 16397 + }, + { + "epoch": 1.8007906874588184, 
+ "grad_norm": 2.422234535217285, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.731067419052124, + "num_tokens": 409627033.0, + "step": 16398 + }, + { + "epoch": 1.8009005051614322, + "grad_norm": 2.2654058933258057, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7393957376480103, + "num_tokens": 409650819.0, + "step": 16399 + }, + { + "epoch": 1.8010103228640457, + "grad_norm": 2.129040241241455, + "learning_rate": 1e-06, + "loss": 0.9237, + "mean_token_accuracy": 0.7100061774253845, + "num_tokens": 409677508.0, + "step": 16400 + }, + { + "epoch": 1.8011201405666593, + "grad_norm": 2.7069661617279053, + "learning_rate": 1e-06, + "loss": 0.7788, + "mean_token_accuracy": 0.7590349316596985, + "num_tokens": 409695425.0, + "step": 16401 + }, + { + "epoch": 1.801229958269273, + "grad_norm": 2.2405805587768555, + "learning_rate": 1e-06, + "loss": 0.875, + "mean_token_accuracy": 0.7228211760520935, + "num_tokens": 409722095.0, + "step": 16402 + }, + { + "epoch": 1.8013397759718868, + "grad_norm": 2.2572944164276123, + "learning_rate": 1e-06, + "loss": 0.8519, + "mean_token_accuracy": 0.7354782819747925, + "num_tokens": 409746442.0, + "step": 16403 + }, + { + "epoch": 1.8014495936745003, + "grad_norm": 2.5421879291534424, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.726372241973877, + "num_tokens": 409766563.0, + "step": 16404 + }, + { + "epoch": 1.8015594113771138, + "grad_norm": 2.3951501846313477, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.7195234894752502, + "num_tokens": 409789648.0, + "step": 16405 + }, + { + "epoch": 1.8016692290797276, + "grad_norm": 2.05983567237854, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7239223122596741, + "num_tokens": 409818649.0, + "step": 16406 + }, + { + "epoch": 1.8017790467823414, + "grad_norm": 2.2622711658477783, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7086291909217834, + 
"num_tokens": 409846914.0, + "step": 16407 + }, + { + "epoch": 1.801888864484955, + "grad_norm": 2.621934652328491, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7242732048034668, + "num_tokens": 409868540.0, + "step": 16408 + }, + { + "epoch": 1.8019986821875686, + "grad_norm": 2.265531063079834, + "learning_rate": 1e-06, + "loss": 0.9755, + "mean_token_accuracy": 0.7015381455421448, + "num_tokens": 409897036.0, + "step": 16409 + }, + { + "epoch": 1.8021084998901822, + "grad_norm": 2.11873459815979, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7290861010551453, + "num_tokens": 409925570.0, + "step": 16410 + }, + { + "epoch": 1.802218317592796, + "grad_norm": 2.3302197456359863, + "learning_rate": 1e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7339284420013428, + "num_tokens": 409950634.0, + "step": 16411 + }, + { + "epoch": 1.8023281352954097, + "grad_norm": 2.041107177734375, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7306862473487854, + "num_tokens": 409979251.0, + "step": 16412 + }, + { + "epoch": 1.8024379529980232, + "grad_norm": 2.1716065406799316, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7250462174415588, + "num_tokens": 410007793.0, + "step": 16413 + }, + { + "epoch": 1.802547770700637, + "grad_norm": 2.347200870513916, + "learning_rate": 1e-06, + "loss": 0.9692, + "mean_token_accuracy": 0.6976269483566284, + "num_tokens": 410032747.0, + "step": 16414 + }, + { + "epoch": 1.8026575884032505, + "grad_norm": 1.9325324296951294, + "learning_rate": 1e-06, + "loss": 1.0347, + "mean_token_accuracy": 0.6851599216461182, + "num_tokens": 410069975.0, + "step": 16415 + }, + { + "epoch": 1.8027674061058643, + "grad_norm": 2.0443997383117676, + "learning_rate": 1e-06, + "loss": 0.8939, + "mean_token_accuracy": 0.7206578254699707, + "num_tokens": 410098998.0, + "step": 16416 + }, + { + "epoch": 1.802877223808478, + "grad_norm": 1.9242384433746338, + 
"learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7127506136894226, + "num_tokens": 410133123.0, + "step": 16417 + }, + { + "epoch": 1.8029870415110916, + "grad_norm": 2.418179512023926, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7254254817962646, + "num_tokens": 410154412.0, + "step": 16418 + }, + { + "epoch": 1.803096859213705, + "grad_norm": 2.111776113510132, + "learning_rate": 1e-06, + "loss": 0.8491, + "mean_token_accuracy": 0.73460453748703, + "num_tokens": 410180496.0, + "step": 16419 + }, + { + "epoch": 1.8032066769163189, + "grad_norm": 2.3463380336761475, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6958203315734863, + "num_tokens": 410203755.0, + "step": 16420 + }, + { + "epoch": 1.8033164946189326, + "grad_norm": 2.565499782562256, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.6986007690429688, + "num_tokens": 410225337.0, + "step": 16421 + }, + { + "epoch": 1.8034263123215464, + "grad_norm": 2.3709635734558105, + "learning_rate": 1e-06, + "loss": 0.9724, + "mean_token_accuracy": 0.7061609625816345, + "num_tokens": 410248601.0, + "step": 16422 + }, + { + "epoch": 1.80353613002416, + "grad_norm": 2.3267767429351807, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7168352603912354, + "num_tokens": 410271834.0, + "step": 16423 + }, + { + "epoch": 1.8036459477267734, + "grad_norm": 2.142660140991211, + "learning_rate": 1e-06, + "loss": 1.0563, + "mean_token_accuracy": 0.6793458461761475, + "num_tokens": 410301436.0, + "step": 16424 + }, + { + "epoch": 1.8037557654293872, + "grad_norm": 2.213376522064209, + "learning_rate": 1e-06, + "loss": 0.9781, + "mean_token_accuracy": 0.7006560564041138, + "num_tokens": 410329536.0, + "step": 16425 + }, + { + "epoch": 1.803865583132001, + "grad_norm": 2.3987655639648438, + "learning_rate": 1e-06, + "loss": 0.933, + "mean_token_accuracy": 0.7115381956100464, + "num_tokens": 410352580.0, + "step": 
16426 + }, + { + "epoch": 1.8039754008346145, + "grad_norm": 2.2879745960235596, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7303962707519531, + "num_tokens": 410376835.0, + "step": 16427 + }, + { + "epoch": 1.8040852185372283, + "grad_norm": 2.0046980381011963, + "learning_rate": 1e-06, + "loss": 0.822, + "mean_token_accuracy": 0.7421441674232483, + "num_tokens": 410404123.0, + "step": 16428 + }, + { + "epoch": 1.8041950362398418, + "grad_norm": 2.553708076477051, + "learning_rate": 1e-06, + "loss": 0.7252, + "mean_token_accuracy": 0.7658294439315796, + "num_tokens": 410422946.0, + "step": 16429 + }, + { + "epoch": 1.8043048539424555, + "grad_norm": 2.2066659927368164, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.7193008661270142, + "num_tokens": 410449302.0, + "step": 16430 + }, + { + "epoch": 1.8044146716450693, + "grad_norm": 1.9534660577774048, + "learning_rate": 1e-06, + "loss": 0.9601, + "mean_token_accuracy": 0.7019591331481934, + "num_tokens": 410482356.0, + "step": 16431 + }, + { + "epoch": 1.8045244893476828, + "grad_norm": 2.4025049209594727, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7284691333770752, + "num_tokens": 410504254.0, + "step": 16432 + }, + { + "epoch": 1.8046343070502964, + "grad_norm": 1.9955958127975464, + "learning_rate": 1e-06, + "loss": 0.8087, + "mean_token_accuracy": 0.7337311506271362, + "num_tokens": 410533612.0, + "step": 16433 + }, + { + "epoch": 1.8047441247529101, + "grad_norm": 2.1541175842285156, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7221874594688416, + "num_tokens": 410560278.0, + "step": 16434 + }, + { + "epoch": 1.8048539424555239, + "grad_norm": 2.0661582946777344, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7135722637176514, + "num_tokens": 410590660.0, + "step": 16435 + }, + { + "epoch": 1.8049637601581376, + "grad_norm": 2.865408420562744, + "learning_rate": 1e-06, + "loss": 
0.7885, + "mean_token_accuracy": 0.751129686832428, + "num_tokens": 410607304.0, + "step": 16436 + }, + { + "epoch": 1.8050735778607512, + "grad_norm": 2.2208001613616943, + "learning_rate": 1e-06, + "loss": 0.8906, + "mean_token_accuracy": 0.7206243872642517, + "num_tokens": 410631428.0, + "step": 16437 + }, + { + "epoch": 1.8051833955633647, + "grad_norm": 2.014861583709717, + "learning_rate": 1e-06, + "loss": 0.9732, + "mean_token_accuracy": 0.6980919241905212, + "num_tokens": 410663295.0, + "step": 16438 + }, + { + "epoch": 1.8052932132659785, + "grad_norm": 2.3179473876953125, + "learning_rate": 1e-06, + "loss": 0.8908, + "mean_token_accuracy": 0.7280737161636353, + "num_tokens": 410687076.0, + "step": 16439 + }, + { + "epoch": 1.8054030309685922, + "grad_norm": 2.2245240211486816, + "learning_rate": 1e-06, + "loss": 1.0481, + "mean_token_accuracy": 0.6813703179359436, + "num_tokens": 410715556.0, + "step": 16440 + }, + { + "epoch": 1.8055128486712058, + "grad_norm": 2.3476319313049316, + "learning_rate": 1e-06, + "loss": 0.9827, + "mean_token_accuracy": 0.6972161531448364, + "num_tokens": 410740392.0, + "step": 16441 + }, + { + "epoch": 1.8056226663738193, + "grad_norm": 2.3238840103149414, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7258152961730957, + "num_tokens": 410765499.0, + "step": 16442 + }, + { + "epoch": 1.805732484076433, + "grad_norm": 2.0583105087280273, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7045373320579529, + "num_tokens": 410797957.0, + "step": 16443 + }, + { + "epoch": 1.8058423017790468, + "grad_norm": 2.244546890258789, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7137500643730164, + "num_tokens": 410825287.0, + "step": 16444 + }, + { + "epoch": 1.8059521194816606, + "grad_norm": 2.634643077850342, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.7290076613426208, + "num_tokens": 410845820.0, + "step": 16445 + }, + { + "epoch": 
1.806061937184274, + "grad_norm": 2.3780429363250732, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7291310429573059, + "num_tokens": 410868365.0, + "step": 16446 + }, + { + "epoch": 1.8061717548868876, + "grad_norm": 2.1896770000457764, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7274700403213501, + "num_tokens": 410895060.0, + "step": 16447 + }, + { + "epoch": 1.8062815725895014, + "grad_norm": 2.4912750720977783, + "learning_rate": 1e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7405425310134888, + "num_tokens": 410917542.0, + "step": 16448 + }, + { + "epoch": 1.8063913902921152, + "grad_norm": 2.18701171875, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7219453454017639, + "num_tokens": 410943891.0, + "step": 16449 + }, + { + "epoch": 1.806501207994729, + "grad_norm": 2.375912666320801, + "learning_rate": 1e-06, + "loss": 0.8197, + "mean_token_accuracy": 0.7398779988288879, + "num_tokens": 410967503.0, + "step": 16450 + }, + { + "epoch": 1.8066110256973424, + "grad_norm": 2.432345390319824, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7079644799232483, + "num_tokens": 410990356.0, + "step": 16451 + }, + { + "epoch": 1.806720843399956, + "grad_norm": 2.2719924449920654, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7215018272399902, + "num_tokens": 411015058.0, + "step": 16452 + }, + { + "epoch": 1.8068306611025697, + "grad_norm": 2.028317928314209, + "learning_rate": 1e-06, + "loss": 0.8336, + "mean_token_accuracy": 0.7392898797988892, + "num_tokens": 411043374.0, + "step": 16453 + }, + { + "epoch": 1.8069404788051835, + "grad_norm": 2.2070472240448, + "learning_rate": 1e-06, + "loss": 0.9096, + "mean_token_accuracy": 0.7271606922149658, + "num_tokens": 411068445.0, + "step": 16454 + }, + { + "epoch": 1.807050296507797, + "grad_norm": 2.573059320449829, + "learning_rate": 1e-06, + "loss": 0.8452, + "mean_token_accuracy": 
0.7351892590522766, + "num_tokens": 411087559.0, + "step": 16455 + }, + { + "epoch": 1.8071601142104106, + "grad_norm": 2.070244073867798, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7140112519264221, + "num_tokens": 411117097.0, + "step": 16456 + }, + { + "epoch": 1.8072699319130243, + "grad_norm": 2.2455475330352783, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7144689559936523, + "num_tokens": 411142238.0, + "step": 16457 + }, + { + "epoch": 1.807379749615638, + "grad_norm": 2.1516358852386475, + "learning_rate": 1e-06, + "loss": 0.9359, + "mean_token_accuracy": 0.7038846015930176, + "num_tokens": 411171030.0, + "step": 16458 + }, + { + "epoch": 1.8074895673182518, + "grad_norm": 1.9695950746536255, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7429265975952148, + "num_tokens": 411200933.0, + "step": 16459 + }, + { + "epoch": 1.8075993850208654, + "grad_norm": 2.594393253326416, + "learning_rate": 1e-06, + "loss": 0.7721, + "mean_token_accuracy": 0.7520483732223511, + "num_tokens": 411220205.0, + "step": 16460 + }, + { + "epoch": 1.807709202723479, + "grad_norm": 2.3249919414520264, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7243266105651855, + "num_tokens": 411246351.0, + "step": 16461 + }, + { + "epoch": 1.8078190204260927, + "grad_norm": 2.390313148498535, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7427923083305359, + "num_tokens": 411268457.0, + "step": 16462 + }, + { + "epoch": 1.8079288381287064, + "grad_norm": 1.9295871257781982, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7213874459266663, + "num_tokens": 411300966.0, + "step": 16463 + }, + { + "epoch": 1.8080386558313202, + "grad_norm": 2.3216934204101562, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7179738283157349, + "num_tokens": 411323382.0, + "step": 16464 + }, + { + "epoch": 1.8081484735339337, + "grad_norm": 
2.1224820613861084, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7233603596687317, + "num_tokens": 411349124.0, + "step": 16465 + }, + { + "epoch": 1.8082582912365472, + "grad_norm": 2.163710355758667, + "learning_rate": 1e-06, + "loss": 0.9638, + "mean_token_accuracy": 0.7074885368347168, + "num_tokens": 411376387.0, + "step": 16466 + }, + { + "epoch": 1.808368108939161, + "grad_norm": 1.907863974571228, + "learning_rate": 1e-06, + "loss": 0.9162, + "mean_token_accuracy": 0.7165883779525757, + "num_tokens": 411410995.0, + "step": 16467 + }, + { + "epoch": 1.8084779266417748, + "grad_norm": 2.0457310676574707, + "learning_rate": 1e-06, + "loss": 0.966, + "mean_token_accuracy": 0.7027485966682434, + "num_tokens": 411442573.0, + "step": 16468 + }, + { + "epoch": 1.8085877443443883, + "grad_norm": 2.118696689605713, + "learning_rate": 1e-06, + "loss": 0.8888, + "mean_token_accuracy": 0.7219158411026001, + "num_tokens": 411470776.0, + "step": 16469 + }, + { + "epoch": 1.8086975620470018, + "grad_norm": 2.3899242877960205, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7250623106956482, + "num_tokens": 411495768.0, + "step": 16470 + }, + { + "epoch": 1.8088073797496156, + "grad_norm": 1.9147557020187378, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.7105473875999451, + "num_tokens": 411528759.0, + "step": 16471 + }, + { + "epoch": 1.8089171974522293, + "grad_norm": 2.3621020317077637, + "learning_rate": 1e-06, + "loss": 0.9643, + "mean_token_accuracy": 0.6998108625411987, + "num_tokens": 411552652.0, + "step": 16472 + }, + { + "epoch": 1.809027015154843, + "grad_norm": 2.4337759017944336, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7239216566085815, + "num_tokens": 411577407.0, + "step": 16473 + }, + { + "epoch": 1.8091368328574566, + "grad_norm": 2.453502893447876, + "learning_rate": 1e-06, + "loss": 0.9934, + "mean_token_accuracy": 0.6977216005325317, + "num_tokens": 
411601920.0, + "step": 16474 + }, + { + "epoch": 1.8092466505600702, + "grad_norm": 2.2293076515197754, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.719762921333313, + "num_tokens": 411625909.0, + "step": 16475 + }, + { + "epoch": 1.809356468262684, + "grad_norm": 2.2578930854797363, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.705697774887085, + "num_tokens": 411650256.0, + "step": 16476 + }, + { + "epoch": 1.8094662859652977, + "grad_norm": 2.003664970397949, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.6903656125068665, + "num_tokens": 411680857.0, + "step": 16477 + }, + { + "epoch": 1.8095761036679112, + "grad_norm": 2.7483291625976562, + "learning_rate": 1e-06, + "loss": 0.8768, + "mean_token_accuracy": 0.7327210307121277, + "num_tokens": 411699689.0, + "step": 16478 + }, + { + "epoch": 1.809685921370525, + "grad_norm": 2.2758822441101074, + "learning_rate": 1e-06, + "loss": 0.8983, + "mean_token_accuracy": 0.7215802073478699, + "num_tokens": 411722262.0, + "step": 16479 + }, + { + "epoch": 1.8097957390731385, + "grad_norm": 2.4730310440063477, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7390002012252808, + "num_tokens": 411744825.0, + "step": 16480 + }, + { + "epoch": 1.8099055567757523, + "grad_norm": 2.4319918155670166, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7198818922042847, + "num_tokens": 411767665.0, + "step": 16481 + }, + { + "epoch": 1.810015374478366, + "grad_norm": 2.648256540298462, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7249371409416199, + "num_tokens": 411786599.0, + "step": 16482 + }, + { + "epoch": 1.8101251921809796, + "grad_norm": 2.097013235092163, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7298973202705383, + "num_tokens": 411816129.0, + "step": 16483 + }, + { + "epoch": 1.810235009883593, + "grad_norm": 2.506849527359009, + "learning_rate": 1e-06, 
+ "loss": 0.8854, + "mean_token_accuracy": 0.7303119897842407, + "num_tokens": 411838568.0, + "step": 16484 + }, + { + "epoch": 1.8103448275862069, + "grad_norm": 2.2697434425354004, + "learning_rate": 1e-06, + "loss": 0.8859, + "mean_token_accuracy": 0.7167674899101257, + "num_tokens": 411863943.0, + "step": 16485 + }, + { + "epoch": 1.8104546452888206, + "grad_norm": 2.503131151199341, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.730434000492096, + "num_tokens": 411885478.0, + "step": 16486 + }, + { + "epoch": 1.8105644629914344, + "grad_norm": 2.309046983718872, + "learning_rate": 1e-06, + "loss": 0.9273, + "mean_token_accuracy": 0.7095725536346436, + "num_tokens": 411910102.0, + "step": 16487 + }, + { + "epoch": 1.810674280694048, + "grad_norm": 2.638523578643799, + "learning_rate": 1e-06, + "loss": 0.9088, + "mean_token_accuracy": 0.7228308916091919, + "num_tokens": 411930028.0, + "step": 16488 + }, + { + "epoch": 1.8107840983966614, + "grad_norm": 1.908426284790039, + "learning_rate": 1e-06, + "loss": 0.968, + "mean_token_accuracy": 0.7206381559371948, + "num_tokens": 411963802.0, + "step": 16489 + }, + { + "epoch": 1.8108939160992752, + "grad_norm": 2.0983805656433105, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.710132360458374, + "num_tokens": 411992044.0, + "step": 16490 + }, + { + "epoch": 1.811003733801889, + "grad_norm": 2.0217177867889404, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.7163447141647339, + "num_tokens": 412020450.0, + "step": 16491 + }, + { + "epoch": 1.8111135515045025, + "grad_norm": 2.265512466430664, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7115480303764343, + "num_tokens": 412044856.0, + "step": 16492 + }, + { + "epoch": 1.8112233692071162, + "grad_norm": 2.2127554416656494, + "learning_rate": 1e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.72416752576828, + "num_tokens": 412070863.0, + "step": 16493 + }, + { + "epoch": 
1.8113331869097298, + "grad_norm": 2.7295732498168945, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.736254096031189, + "num_tokens": 412088766.0, + "step": 16494 + }, + { + "epoch": 1.8114430046123435, + "grad_norm": 2.416466236114502, + "learning_rate": 1e-06, + "loss": 0.9431, + "mean_token_accuracy": 0.7114483714103699, + "num_tokens": 412112018.0, + "step": 16495 + }, + { + "epoch": 1.8115528223149573, + "grad_norm": 2.057302713394165, + "learning_rate": 1e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7300346493721008, + "num_tokens": 412142195.0, + "step": 16496 + }, + { + "epoch": 1.8116626400175708, + "grad_norm": 2.508516311645508, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7217909097671509, + "num_tokens": 412162371.0, + "step": 16497 + }, + { + "epoch": 1.8117724577201844, + "grad_norm": 2.5807197093963623, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7511944770812988, + "num_tokens": 412180690.0, + "step": 16498 + }, + { + "epoch": 1.8118822754227981, + "grad_norm": 2.0107004642486572, + "learning_rate": 1e-06, + "loss": 0.8813, + "mean_token_accuracy": 0.7228856682777405, + "num_tokens": 412211514.0, + "step": 16499 + }, + { + "epoch": 1.8119920931254119, + "grad_norm": 2.269263744354248, + "learning_rate": 1e-06, + "loss": 0.8903, + "mean_token_accuracy": 0.727952778339386, + "num_tokens": 412236080.0, + "step": 16500 + }, + { + "epoch": 1.8121019108280256, + "grad_norm": 2.1844425201416016, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.714214563369751, + "num_tokens": 412262547.0, + "step": 16501 + }, + { + "epoch": 1.8122117285306392, + "grad_norm": 2.529863119125366, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7153433561325073, + "num_tokens": 412286259.0, + "step": 16502 + }, + { + "epoch": 1.8123215462332527, + "grad_norm": 1.88788902759552, + "learning_rate": 1e-06, + "loss": 0.9455, + "mean_token_accuracy": 
0.7078408002853394, + "num_tokens": 412321066.0, + "step": 16503 + }, + { + "epoch": 1.8124313639358665, + "grad_norm": 2.512693405151367, + "learning_rate": 1e-06, + "loss": 0.8831, + "mean_token_accuracy": 0.7248761057853699, + "num_tokens": 412344642.0, + "step": 16504 + }, + { + "epoch": 1.8125411816384802, + "grad_norm": 2.280827045440674, + "learning_rate": 1e-06, + "loss": 0.9547, + "mean_token_accuracy": 0.7009605765342712, + "num_tokens": 412372291.0, + "step": 16505 + }, + { + "epoch": 1.8126509993410937, + "grad_norm": 2.5095930099487305, + "learning_rate": 1e-06, + "loss": 0.9673, + "mean_token_accuracy": 0.7014864683151245, + "num_tokens": 412396374.0, + "step": 16506 + }, + { + "epoch": 1.8127608170437073, + "grad_norm": 2.3669469356536865, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.7032088041305542, + "num_tokens": 412420873.0, + "step": 16507 + }, + { + "epoch": 1.812870634746321, + "grad_norm": 2.656184434890747, + "learning_rate": 1e-06, + "loss": 0.8843, + "mean_token_accuracy": 0.7197697758674622, + "num_tokens": 412441039.0, + "step": 16508 + }, + { + "epoch": 1.8129804524489348, + "grad_norm": 2.2520790100097656, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7193306684494019, + "num_tokens": 412466124.0, + "step": 16509 + }, + { + "epoch": 1.8130902701515486, + "grad_norm": 1.9477770328521729, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7196155786514282, + "num_tokens": 412500695.0, + "step": 16510 + }, + { + "epoch": 1.813200087854162, + "grad_norm": 2.450178384780884, + "learning_rate": 1e-06, + "loss": 0.9056, + "mean_token_accuracy": 0.7182667851448059, + "num_tokens": 412523108.0, + "step": 16511 + }, + { + "epoch": 1.8133099055567756, + "grad_norm": 2.3890035152435303, + "learning_rate": 1e-06, + "loss": 0.7172, + "mean_token_accuracy": 0.7736622095108032, + "num_tokens": 412543076.0, + "step": 16512 + }, + { + "epoch": 1.8134197232593894, + "grad_norm": 
2.4431514739990234, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7139590978622437, + "num_tokens": 412566145.0, + "step": 16513 + }, + { + "epoch": 1.8135295409620031, + "grad_norm": 2.415665864944458, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7154621481895447, + "num_tokens": 412589860.0, + "step": 16514 + }, + { + "epoch": 1.813639358664617, + "grad_norm": 2.0545833110809326, + "learning_rate": 1e-06, + "loss": 0.8366, + "mean_token_accuracy": 0.7337183356285095, + "num_tokens": 412621059.0, + "step": 16515 + }, + { + "epoch": 1.8137491763672304, + "grad_norm": 2.2882003784179688, + "learning_rate": 1e-06, + "loss": 0.9045, + "mean_token_accuracy": 0.7225296497344971, + "num_tokens": 412645054.0, + "step": 16516 + }, + { + "epoch": 1.813858994069844, + "grad_norm": 2.771376132965088, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7262114882469177, + "num_tokens": 412663833.0, + "step": 16517 + }, + { + "epoch": 1.8139688117724577, + "grad_norm": 2.137810468673706, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7275452017784119, + "num_tokens": 412689805.0, + "step": 16518 + }, + { + "epoch": 1.8140786294750715, + "grad_norm": 2.204045534133911, + "learning_rate": 1e-06, + "loss": 0.9157, + "mean_token_accuracy": 0.7093963027000427, + "num_tokens": 412716361.0, + "step": 16519 + }, + { + "epoch": 1.814188447177685, + "grad_norm": 2.4272994995117188, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.731574296951294, + "num_tokens": 412738363.0, + "step": 16520 + }, + { + "epoch": 1.8142982648802986, + "grad_norm": 2.222649097442627, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7437739372253418, + "num_tokens": 412761924.0, + "step": 16521 + }, + { + "epoch": 1.8144080825829123, + "grad_norm": 2.127882242202759, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7139321565628052, + "num_tokens": 
412790089.0, + "step": 16522 + }, + { + "epoch": 1.814517900285526, + "grad_norm": 2.4324557781219482, + "learning_rate": 1e-06, + "loss": 0.8114, + "mean_token_accuracy": 0.74045729637146, + "num_tokens": 412811533.0, + "step": 16523 + }, + { + "epoch": 1.8146277179881398, + "grad_norm": 2.5927672386169434, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7229208946228027, + "num_tokens": 412832756.0, + "step": 16524 + }, + { + "epoch": 1.8147375356907534, + "grad_norm": 2.4849472045898438, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7293504476547241, + "num_tokens": 412852479.0, + "step": 16525 + }, + { + "epoch": 1.814847353393367, + "grad_norm": 2.5738818645477295, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7288145422935486, + "num_tokens": 412873305.0, + "step": 16526 + }, + { + "epoch": 1.8149571710959806, + "grad_norm": 2.2265431880950928, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7017726898193359, + "num_tokens": 412901351.0, + "step": 16527 + }, + { + "epoch": 1.8150669887985944, + "grad_norm": 2.550800085067749, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7208136916160583, + "num_tokens": 412921623.0, + "step": 16528 + }, + { + "epoch": 1.815176806501208, + "grad_norm": 2.148334503173828, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7159367203712463, + "num_tokens": 412948507.0, + "step": 16529 + }, + { + "epoch": 1.8152866242038217, + "grad_norm": 2.3648693561553955, + "learning_rate": 1e-06, + "loss": 0.8461, + "mean_token_accuracy": 0.7299429178237915, + "num_tokens": 412972386.0, + "step": 16530 + }, + { + "epoch": 1.8153964419064352, + "grad_norm": 2.5595333576202393, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7103065252304077, + "num_tokens": 412994008.0, + "step": 16531 + }, + { + "epoch": 1.815506259609049, + "grad_norm": 1.9814194440841675, + "learning_rate": 
1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7113034129142761, + "num_tokens": 413025376.0, + "step": 16532 + }, + { + "epoch": 1.8156160773116627, + "grad_norm": 2.226649761199951, + "learning_rate": 1e-06, + "loss": 0.8924, + "mean_token_accuracy": 0.7235186100006104, + "num_tokens": 413050510.0, + "step": 16533 + }, + { + "epoch": 1.8157258950142763, + "grad_norm": 2.058432102203369, + "learning_rate": 1e-06, + "loss": 0.9253, + "mean_token_accuracy": 0.7077383995056152, + "num_tokens": 413079081.0, + "step": 16534 + }, + { + "epoch": 1.8158357127168898, + "grad_norm": 2.116535186767578, + "learning_rate": 1e-06, + "loss": 0.9076, + "mean_token_accuracy": 0.7202593088150024, + "num_tokens": 413106951.0, + "step": 16535 + }, + { + "epoch": 1.8159455304195036, + "grad_norm": 2.365676164627075, + "learning_rate": 1e-06, + "loss": 1.0184, + "mean_token_accuracy": 0.6855801343917847, + "num_tokens": 413131630.0, + "step": 16536 + }, + { + "epoch": 1.8160553481221173, + "grad_norm": 2.2586278915405273, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7134803533554077, + "num_tokens": 413158081.0, + "step": 16537 + }, + { + "epoch": 1.816165165824731, + "grad_norm": 2.57660174369812, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7208773493766785, + "num_tokens": 413178244.0, + "step": 16538 + }, + { + "epoch": 1.8162749835273446, + "grad_norm": 2.099951982498169, + "learning_rate": 1e-06, + "loss": 0.936, + "mean_token_accuracy": 0.7133986949920654, + "num_tokens": 413208358.0, + "step": 16539 + }, + { + "epoch": 1.8163848012299582, + "grad_norm": 2.429520606994629, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.727028489112854, + "num_tokens": 413230153.0, + "step": 16540 + }, + { + "epoch": 1.816494618932572, + "grad_norm": 2.0707337856292725, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7159827351570129, + "num_tokens": 413263615.0, + "step": 16541 + }, + { + 
"epoch": 1.8166044366351857, + "grad_norm": 2.5314605236053467, + "learning_rate": 1e-06, + "loss": 0.838, + "mean_token_accuracy": 0.7287352681159973, + "num_tokens": 413284324.0, + "step": 16542 + }, + { + "epoch": 1.8167142543377992, + "grad_norm": 2.3209950923919678, + "learning_rate": 1e-06, + "loss": 0.8977, + "mean_token_accuracy": 0.7195093631744385, + "num_tokens": 413309596.0, + "step": 16543 + }, + { + "epoch": 1.816824072040413, + "grad_norm": 1.9099870920181274, + "learning_rate": 1e-06, + "loss": 0.9928, + "mean_token_accuracy": 0.6943020820617676, + "num_tokens": 413343884.0, + "step": 16544 + }, + { + "epoch": 1.8169338897430265, + "grad_norm": 2.1293070316314697, + "learning_rate": 1e-06, + "loss": 0.9442, + "mean_token_accuracy": 0.7113863229751587, + "num_tokens": 413371191.0, + "step": 16545 + }, + { + "epoch": 1.8170437074456403, + "grad_norm": 2.492297649383545, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7295451760292053, + "num_tokens": 413392533.0, + "step": 16546 + }, + { + "epoch": 1.817153525148254, + "grad_norm": 2.1125881671905518, + "learning_rate": 1e-06, + "loss": 0.9436, + "mean_token_accuracy": 0.7143920660018921, + "num_tokens": 413420416.0, + "step": 16547 + }, + { + "epoch": 1.8172633428508675, + "grad_norm": 2.5448689460754395, + "learning_rate": 1e-06, + "loss": 0.9482, + "mean_token_accuracy": 0.7032241821289062, + "num_tokens": 413442802.0, + "step": 16548 + }, + { + "epoch": 1.817373160553481, + "grad_norm": 2.2816948890686035, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7153404355049133, + "num_tokens": 413466560.0, + "step": 16549 + }, + { + "epoch": 1.8174829782560948, + "grad_norm": 2.509061813354492, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.725250780582428, + "num_tokens": 413488900.0, + "step": 16550 + }, + { + "epoch": 1.8175927959587086, + "grad_norm": 2.1010661125183105, + "learning_rate": 1e-06, + "loss": 0.9649, + 
"mean_token_accuracy": 0.7078844308853149, + "num_tokens": 413518804.0, + "step": 16551 + }, + { + "epoch": 1.8177026136613224, + "grad_norm": 2.3929085731506348, + "learning_rate": 1e-06, + "loss": 0.8668, + "mean_token_accuracy": 0.7293815612792969, + "num_tokens": 413542087.0, + "step": 16552 + }, + { + "epoch": 1.8178124313639359, + "grad_norm": 2.015070915222168, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7317027449607849, + "num_tokens": 413572544.0, + "step": 16553 + }, + { + "epoch": 1.8179222490665494, + "grad_norm": 2.195175886154175, + "learning_rate": 1e-06, + "loss": 0.7913, + "mean_token_accuracy": 0.7542867064476013, + "num_tokens": 413595642.0, + "step": 16554 + }, + { + "epoch": 1.8180320667691632, + "grad_norm": 2.5462911128997803, + "learning_rate": 1e-06, + "loss": 0.9478, + "mean_token_accuracy": 0.7014115452766418, + "num_tokens": 413620899.0, + "step": 16555 + }, + { + "epoch": 1.818141884471777, + "grad_norm": 2.327676773071289, + "learning_rate": 1e-06, + "loss": 0.9434, + "mean_token_accuracy": 0.7068018317222595, + "num_tokens": 413645915.0, + "step": 16556 + }, + { + "epoch": 1.8182517021743905, + "grad_norm": 2.4529311656951904, + "learning_rate": 1e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7269173860549927, + "num_tokens": 413665974.0, + "step": 16557 + }, + { + "epoch": 1.818361519877004, + "grad_norm": 2.139125347137451, + "learning_rate": 1e-06, + "loss": 0.9897, + "mean_token_accuracy": 0.691887378692627, + "num_tokens": 413694463.0, + "step": 16558 + }, + { + "epoch": 1.8184713375796178, + "grad_norm": 2.2175774574279785, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7282206416130066, + "num_tokens": 413720022.0, + "step": 16559 + }, + { + "epoch": 1.8185811552822315, + "grad_norm": 2.217132091522217, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7053142786026001, + "num_tokens": 413746294.0, + "step": 16560 + }, + { + "epoch": 
1.8186909729848453, + "grad_norm": 2.255610942840576, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7237390279769897, + "num_tokens": 413769677.0, + "step": 16561 + }, + { + "epoch": 1.8188007906874588, + "grad_norm": 2.364405870437622, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7125301361083984, + "num_tokens": 413793051.0, + "step": 16562 + }, + { + "epoch": 1.8189106083900723, + "grad_norm": 2.5619008541107178, + "learning_rate": 1e-06, + "loss": 0.9399, + "mean_token_accuracy": 0.7081438302993774, + "num_tokens": 413815937.0, + "step": 16563 + }, + { + "epoch": 1.819020426092686, + "grad_norm": 2.06728196144104, + "learning_rate": 1e-06, + "loss": 0.9513, + "mean_token_accuracy": 0.7146661281585693, + "num_tokens": 413845348.0, + "step": 16564 + }, + { + "epoch": 1.8191302437952999, + "grad_norm": 2.2700178623199463, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7265214323997498, + "num_tokens": 413868824.0, + "step": 16565 + }, + { + "epoch": 1.8192400614979136, + "grad_norm": 2.4878127574920654, + "learning_rate": 1e-06, + "loss": 0.7976, + "mean_token_accuracy": 0.7494839429855347, + "num_tokens": 413890295.0, + "step": 16566 + }, + { + "epoch": 1.8193498792005272, + "grad_norm": 2.170131206512451, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.733350396156311, + "num_tokens": 413915391.0, + "step": 16567 + }, + { + "epoch": 1.8194596969031407, + "grad_norm": 2.416703462600708, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7187705039978027, + "num_tokens": 413935147.0, + "step": 16568 + }, + { + "epoch": 1.8195695146057544, + "grad_norm": 2.3077685832977295, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7259065508842468, + "num_tokens": 413957225.0, + "step": 16569 + }, + { + "epoch": 1.8196793323083682, + "grad_norm": 2.5536410808563232, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 
0.7018603682518005, + "num_tokens": 413978815.0, + "step": 16570 + }, + { + "epoch": 1.8197891500109817, + "grad_norm": 2.9786911010742188, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7192459106445312, + "num_tokens": 413995226.0, + "step": 16571 + }, + { + "epoch": 1.8198989677135953, + "grad_norm": 2.3336338996887207, + "learning_rate": 1e-06, + "loss": 0.9889, + "mean_token_accuracy": 0.6932629346847534, + "num_tokens": 414020412.0, + "step": 16572 + }, + { + "epoch": 1.820008785416209, + "grad_norm": 2.454596996307373, + "learning_rate": 1e-06, + "loss": 0.8417, + "mean_token_accuracy": 0.7319450378417969, + "num_tokens": 414042065.0, + "step": 16573 + }, + { + "epoch": 1.8201186031188228, + "grad_norm": 2.240628719329834, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7087267637252808, + "num_tokens": 414068301.0, + "step": 16574 + }, + { + "epoch": 1.8202284208214365, + "grad_norm": 2.164459466934204, + "learning_rate": 1e-06, + "loss": 0.8877, + "mean_token_accuracy": 0.7275554537773132, + "num_tokens": 414093753.0, + "step": 16575 + }, + { + "epoch": 1.82033823852405, + "grad_norm": 2.721693277359009, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.7130571007728577, + "num_tokens": 414111637.0, + "step": 16576 + }, + { + "epoch": 1.8204480562266636, + "grad_norm": 2.462437391281128, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7190923690795898, + "num_tokens": 414136067.0, + "step": 16577 + }, + { + "epoch": 1.8205578739292774, + "grad_norm": 2.406325340270996, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.704041600227356, + "num_tokens": 414159066.0, + "step": 16578 + }, + { + "epoch": 1.8206676916318911, + "grad_norm": 2.1694579124450684, + "learning_rate": 1e-06, + "loss": 0.9793, + "mean_token_accuracy": 0.6943273544311523, + "num_tokens": 414185310.0, + "step": 16579 + }, + { + "epoch": 1.8207775093345049, + "grad_norm": 
2.2872161865234375, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7332635521888733, + "num_tokens": 414208470.0, + "step": 16580 + }, + { + "epoch": 1.8208873270371184, + "grad_norm": 2.5291802883148193, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7497298717498779, + "num_tokens": 414228280.0, + "step": 16581 + }, + { + "epoch": 1.820997144739732, + "grad_norm": 2.5830914974212646, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7274819016456604, + "num_tokens": 414248579.0, + "step": 16582 + }, + { + "epoch": 1.8211069624423457, + "grad_norm": 2.1503405570983887, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7327933311462402, + "num_tokens": 414276349.0, + "step": 16583 + }, + { + "epoch": 1.8212167801449595, + "grad_norm": 2.130685567855835, + "learning_rate": 1e-06, + "loss": 1.0684, + "mean_token_accuracy": 0.6868331432342529, + "num_tokens": 414304812.0, + "step": 16584 + }, + { + "epoch": 1.821326597847573, + "grad_norm": 2.4344615936279297, + "learning_rate": 1e-06, + "loss": 0.7628, + "mean_token_accuracy": 0.7526373267173767, + "num_tokens": 414324906.0, + "step": 16585 + }, + { + "epoch": 1.8214364155501865, + "grad_norm": 2.1747889518737793, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7122722864151001, + "num_tokens": 414352424.0, + "step": 16586 + }, + { + "epoch": 1.8215462332528003, + "grad_norm": 2.272393226623535, + "learning_rate": 1e-06, + "loss": 0.8152, + "mean_token_accuracy": 0.7389588952064514, + "num_tokens": 414377051.0, + "step": 16587 + }, + { + "epoch": 1.821656050955414, + "grad_norm": 1.9370330572128296, + "learning_rate": 1e-06, + "loss": 0.9709, + "mean_token_accuracy": 0.7066267728805542, + "num_tokens": 414411504.0, + "step": 16588 + }, + { + "epoch": 1.8217658686580278, + "grad_norm": 2.2770092487335205, + "learning_rate": 1e-06, + "loss": 0.8688, + "mean_token_accuracy": 0.7213941216468811, + 
"num_tokens": 414438597.0, + "step": 16589 + }, + { + "epoch": 1.8218756863606413, + "grad_norm": 2.2313649654388428, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7262132167816162, + "num_tokens": 414463026.0, + "step": 16590 + }, + { + "epoch": 1.8219855040632549, + "grad_norm": 1.9950931072235107, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.7083338499069214, + "num_tokens": 414491312.0, + "step": 16591 + }, + { + "epoch": 1.8220953217658686, + "grad_norm": 2.1959822177886963, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.721039354801178, + "num_tokens": 414517778.0, + "step": 16592 + }, + { + "epoch": 1.8222051394684824, + "grad_norm": 2.20822811126709, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7210479974746704, + "num_tokens": 414543359.0, + "step": 16593 + }, + { + "epoch": 1.822314957171096, + "grad_norm": 2.1522507667541504, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7147476673126221, + "num_tokens": 414571326.0, + "step": 16594 + }, + { + "epoch": 1.8224247748737097, + "grad_norm": 2.381437063217163, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7272434234619141, + "num_tokens": 414594112.0, + "step": 16595 + }, + { + "epoch": 1.8225345925763232, + "grad_norm": 2.2415382862091064, + "learning_rate": 1e-06, + "loss": 0.9633, + "mean_token_accuracy": 0.7157372236251831, + "num_tokens": 414620473.0, + "step": 16596 + }, + { + "epoch": 1.822644410278937, + "grad_norm": 2.214048147201538, + "learning_rate": 1e-06, + "loss": 1.095, + "mean_token_accuracy": 0.6837342977523804, + "num_tokens": 414649269.0, + "step": 16597 + }, + { + "epoch": 1.8227542279815507, + "grad_norm": 2.189554452896118, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7141586542129517, + "num_tokens": 414675179.0, + "step": 16598 + }, + { + "epoch": 1.8228640456841643, + "grad_norm": 2.0775609016418457, + 
"learning_rate": 1e-06, + "loss": 0.9221, + "mean_token_accuracy": 0.7239353060722351, + "num_tokens": 414704706.0, + "step": 16599 + }, + { + "epoch": 1.8229738633867778, + "grad_norm": 2.194640636444092, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7155733108520508, + "num_tokens": 414729837.0, + "step": 16600 + }, + { + "epoch": 1.8230836810893916, + "grad_norm": 2.2008562088012695, + "learning_rate": 1e-06, + "loss": 0.7948, + "mean_token_accuracy": 0.751979649066925, + "num_tokens": 414757299.0, + "step": 16601 + }, + { + "epoch": 1.8231934987920053, + "grad_norm": 2.7171847820281982, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7107723951339722, + "num_tokens": 414776876.0, + "step": 16602 + }, + { + "epoch": 1.823303316494619, + "grad_norm": 2.10113525390625, + "learning_rate": 1e-06, + "loss": 0.8953, + "mean_token_accuracy": 0.726494550704956, + "num_tokens": 414804444.0, + "step": 16603 + }, + { + "epoch": 1.8234131341972326, + "grad_norm": 2.184142589569092, + "learning_rate": 1e-06, + "loss": 0.8886, + "mean_token_accuracy": 0.7266494035720825, + "num_tokens": 414829940.0, + "step": 16604 + }, + { + "epoch": 1.8235229518998461, + "grad_norm": 2.142461061477661, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7087419033050537, + "num_tokens": 414856900.0, + "step": 16605 + }, + { + "epoch": 1.82363276960246, + "grad_norm": 2.234467029571533, + "learning_rate": 1e-06, + "loss": 0.8725, + "mean_token_accuracy": 0.7280528545379639, + "num_tokens": 414881459.0, + "step": 16606 + }, + { + "epoch": 1.8237425873050737, + "grad_norm": 2.4615232944488525, + "learning_rate": 1e-06, + "loss": 0.9366, + "mean_token_accuracy": 0.7077444195747375, + "num_tokens": 414903458.0, + "step": 16607 + }, + { + "epoch": 1.8238524050076872, + "grad_norm": 2.1437299251556396, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.705764651298523, + "num_tokens": 414929548.0, + "step": 
16608 + }, + { + "epoch": 1.823962222710301, + "grad_norm": 2.484203815460205, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7154302597045898, + "num_tokens": 414950873.0, + "step": 16609 + }, + { + "epoch": 1.8240720404129145, + "grad_norm": 2.236924886703491, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7195167541503906, + "num_tokens": 414976284.0, + "step": 16610 + }, + { + "epoch": 1.8241818581155282, + "grad_norm": 2.585158109664917, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6935364007949829, + "num_tokens": 414997048.0, + "step": 16611 + }, + { + "epoch": 1.824291675818142, + "grad_norm": 2.2773399353027344, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7263211011886597, + "num_tokens": 415020796.0, + "step": 16612 + }, + { + "epoch": 1.8244014935207555, + "grad_norm": 2.160351276397705, + "learning_rate": 1e-06, + "loss": 0.8689, + "mean_token_accuracy": 0.7274006009101868, + "num_tokens": 415048858.0, + "step": 16613 + }, + { + "epoch": 1.824511311223369, + "grad_norm": 2.0682382583618164, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7203249335289001, + "num_tokens": 415076668.0, + "step": 16614 + }, + { + "epoch": 1.8246211289259828, + "grad_norm": 2.2950427532196045, + "learning_rate": 1e-06, + "loss": 0.7312, + "mean_token_accuracy": 0.7566773891448975, + "num_tokens": 415100211.0, + "step": 16615 + }, + { + "epoch": 1.8247309466285966, + "grad_norm": 2.261683464050293, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.733631432056427, + "num_tokens": 415123316.0, + "step": 16616 + }, + { + "epoch": 1.8248407643312103, + "grad_norm": 2.184558629989624, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7168059945106506, + "num_tokens": 415150971.0, + "step": 16617 + }, + { + "epoch": 1.8249505820338239, + "grad_norm": 2.0446462631225586, + "learning_rate": 1e-06, + "loss": 0.9876, + 
"mean_token_accuracy": 0.7021675109863281, + "num_tokens": 415183042.0, + "step": 16618 + }, + { + "epoch": 1.8250603997364374, + "grad_norm": 2.123061180114746, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7166191935539246, + "num_tokens": 415210281.0, + "step": 16619 + }, + { + "epoch": 1.8251702174390512, + "grad_norm": 2.391387701034546, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.7257164716720581, + "num_tokens": 415234715.0, + "step": 16620 + }, + { + "epoch": 1.825280035141665, + "grad_norm": 2.3435328006744385, + "learning_rate": 1e-06, + "loss": 0.9186, + "mean_token_accuracy": 0.7100802659988403, + "num_tokens": 415258171.0, + "step": 16621 + }, + { + "epoch": 1.8253898528442785, + "grad_norm": 2.2196874618530273, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7328425645828247, + "num_tokens": 415283976.0, + "step": 16622 + }, + { + "epoch": 1.825499670546892, + "grad_norm": 2.4981842041015625, + "learning_rate": 1e-06, + "loss": 0.9103, + "mean_token_accuracy": 0.7174793481826782, + "num_tokens": 415307324.0, + "step": 16623 + }, + { + "epoch": 1.8256094882495058, + "grad_norm": 2.5438969135284424, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7615811824798584, + "num_tokens": 415325481.0, + "step": 16624 + }, + { + "epoch": 1.8257193059521195, + "grad_norm": 2.219106912612915, + "learning_rate": 1e-06, + "loss": 0.9025, + "mean_token_accuracy": 0.7235424518585205, + "num_tokens": 415352198.0, + "step": 16625 + }, + { + "epoch": 1.8258291236547333, + "grad_norm": 2.3354461193084717, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7067986726760864, + "num_tokens": 415377758.0, + "step": 16626 + }, + { + "epoch": 1.8259389413573468, + "grad_norm": 2.1334357261657715, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7100386619567871, + "num_tokens": 415405521.0, + "step": 16627 + }, + { + "epoch": 
1.8260487590599603, + "grad_norm": 2.2261171340942383, + "learning_rate": 1e-06, + "loss": 0.8506, + "mean_token_accuracy": 0.7320845127105713, + "num_tokens": 415432335.0, + "step": 16628 + }, + { + "epoch": 1.826158576762574, + "grad_norm": 2.2107224464416504, + "learning_rate": 1e-06, + "loss": 0.9114, + "mean_token_accuracy": 0.7129606008529663, + "num_tokens": 415457669.0, + "step": 16629 + }, + { + "epoch": 1.8262683944651878, + "grad_norm": 2.186910390853882, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7167027592658997, + "num_tokens": 415482667.0, + "step": 16630 + }, + { + "epoch": 1.8263782121678016, + "grad_norm": 2.2004408836364746, + "learning_rate": 1e-06, + "loss": 0.9557, + "mean_token_accuracy": 0.70369952917099, + "num_tokens": 415511906.0, + "step": 16631 + }, + { + "epoch": 1.8264880298704151, + "grad_norm": 2.3408873081207275, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7059280276298523, + "num_tokens": 415536144.0, + "step": 16632 + }, + { + "epoch": 1.8265978475730287, + "grad_norm": 2.0680644512176514, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7158287167549133, + "num_tokens": 415564055.0, + "step": 16633 + }, + { + "epoch": 1.8267076652756424, + "grad_norm": 2.0664432048797607, + "learning_rate": 1e-06, + "loss": 0.8849, + "mean_token_accuracy": 0.7214882373809814, + "num_tokens": 415593975.0, + "step": 16634 + }, + { + "epoch": 1.8268174829782562, + "grad_norm": 2.4402430057525635, + "learning_rate": 1e-06, + "loss": 0.9297, + "mean_token_accuracy": 0.7178452014923096, + "num_tokens": 415617645.0, + "step": 16635 + }, + { + "epoch": 1.8269273006808697, + "grad_norm": 2.5690715312957764, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7230033278465271, + "num_tokens": 415638257.0, + "step": 16636 + }, + { + "epoch": 1.8270371183834833, + "grad_norm": 2.298194408416748, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 
0.7036932706832886, + "num_tokens": 415662524.0, + "step": 16637 + }, + { + "epoch": 1.827146936086097, + "grad_norm": 2.0144855976104736, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7079559564590454, + "num_tokens": 415693012.0, + "step": 16638 + }, + { + "epoch": 1.8272567537887108, + "grad_norm": 2.078747272491455, + "learning_rate": 1e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7308293581008911, + "num_tokens": 415720535.0, + "step": 16639 + }, + { + "epoch": 1.8273665714913245, + "grad_norm": 2.0017549991607666, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7054034471511841, + "num_tokens": 415751866.0, + "step": 16640 + }, + { + "epoch": 1.827476389193938, + "grad_norm": 2.7130720615386963, + "learning_rate": 1e-06, + "loss": 0.7483, + "mean_token_accuracy": 0.7634530067443848, + "num_tokens": 415767023.0, + "step": 16641 + }, + { + "epoch": 1.8275862068965516, + "grad_norm": 2.2874014377593994, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7344058752059937, + "num_tokens": 415791137.0, + "step": 16642 + }, + { + "epoch": 1.8276960245991654, + "grad_norm": 2.308396339416504, + "learning_rate": 1e-06, + "loss": 0.9446, + "mean_token_accuracy": 0.7042902708053589, + "num_tokens": 415816723.0, + "step": 16643 + }, + { + "epoch": 1.8278058423017791, + "grad_norm": 1.9321287870407104, + "learning_rate": 1e-06, + "loss": 0.9893, + "mean_token_accuracy": 0.6905325651168823, + "num_tokens": 415850165.0, + "step": 16644 + }, + { + "epoch": 1.8279156600043929, + "grad_norm": 2.45281982421875, + "learning_rate": 1e-06, + "loss": 0.7512, + "mean_token_accuracy": 0.7580615282058716, + "num_tokens": 415871071.0, + "step": 16645 + }, + { + "epoch": 1.8280254777070064, + "grad_norm": 2.0875701904296875, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7392676472663879, + "num_tokens": 415898209.0, + "step": 16646 + }, + { + "epoch": 1.82813529540962, + "grad_norm": 
2.39616322517395, + "learning_rate": 1e-06, + "loss": 0.8943, + "mean_token_accuracy": 0.7117598652839661, + "num_tokens": 415920103.0, + "step": 16647 + }, + { + "epoch": 1.8282451131122337, + "grad_norm": 2.3541197776794434, + "learning_rate": 1e-06, + "loss": 0.9825, + "mean_token_accuracy": 0.7090235948562622, + "num_tokens": 415940068.0, + "step": 16648 + }, + { + "epoch": 1.8283549308148475, + "grad_norm": 2.175407648086548, + "learning_rate": 1e-06, + "loss": 0.8745, + "mean_token_accuracy": 0.7278590798377991, + "num_tokens": 415965317.0, + "step": 16649 + }, + { + "epoch": 1.828464748517461, + "grad_norm": 2.6249358654022217, + "learning_rate": 1e-06, + "loss": 0.9841, + "mean_token_accuracy": 0.6986966133117676, + "num_tokens": 415985112.0, + "step": 16650 + }, + { + "epoch": 1.8285745662200745, + "grad_norm": 2.137650728225708, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7170352935791016, + "num_tokens": 416013192.0, + "step": 16651 + }, + { + "epoch": 1.8286843839226883, + "grad_norm": 2.0305678844451904, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7189624309539795, + "num_tokens": 416041764.0, + "step": 16652 + }, + { + "epoch": 1.828794201625302, + "grad_norm": 2.2059884071350098, + "learning_rate": 1e-06, + "loss": 0.8485, + "mean_token_accuracy": 0.7337023615837097, + "num_tokens": 416067037.0, + "step": 16653 + }, + { + "epoch": 1.8289040193279158, + "grad_norm": 2.3895888328552246, + "learning_rate": 1e-06, + "loss": 0.9313, + "mean_token_accuracy": 0.7132201790809631, + "num_tokens": 416091333.0, + "step": 16654 + }, + { + "epoch": 1.8290138370305293, + "grad_norm": 2.150372266769409, + "learning_rate": 1e-06, + "loss": 0.9347, + "mean_token_accuracy": 0.719279408454895, + "num_tokens": 416119884.0, + "step": 16655 + }, + { + "epoch": 1.8291236547331429, + "grad_norm": 2.0386898517608643, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.7202996015548706, + "num_tokens": 
416148522.0, + "step": 16656 + }, + { + "epoch": 1.8292334724357566, + "grad_norm": 2.514744997024536, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7208313345909119, + "num_tokens": 416168695.0, + "step": 16657 + }, + { + "epoch": 1.8293432901383704, + "grad_norm": 2.4269237518310547, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.7039886713027954, + "num_tokens": 416194454.0, + "step": 16658 + }, + { + "epoch": 1.829453107840984, + "grad_norm": 2.894847869873047, + "learning_rate": 1e-06, + "loss": 0.8784, + "mean_token_accuracy": 0.7200455665588379, + "num_tokens": 416212396.0, + "step": 16659 + }, + { + "epoch": 1.8295629255435977, + "grad_norm": 2.3615102767944336, + "learning_rate": 1e-06, + "loss": 0.9115, + "mean_token_accuracy": 0.7216411828994751, + "num_tokens": 416239944.0, + "step": 16660 + }, + { + "epoch": 1.8296727432462112, + "grad_norm": 2.4799554347991943, + "learning_rate": 1e-06, + "loss": 0.9092, + "mean_token_accuracy": 0.7295861840248108, + "num_tokens": 416262454.0, + "step": 16661 + }, + { + "epoch": 1.829782560948825, + "grad_norm": 2.0627357959747314, + "learning_rate": 1e-06, + "loss": 0.921, + "mean_token_accuracy": 0.716206967830658, + "num_tokens": 416290039.0, + "step": 16662 + }, + { + "epoch": 1.8298923786514387, + "grad_norm": 2.243100643157959, + "learning_rate": 1e-06, + "loss": 0.8714, + "mean_token_accuracy": 0.72819983959198, + "num_tokens": 416312991.0, + "step": 16663 + }, + { + "epoch": 1.8300021963540523, + "grad_norm": 2.314066171646118, + "learning_rate": 1e-06, + "loss": 0.9306, + "mean_token_accuracy": 0.7078720331192017, + "num_tokens": 416336423.0, + "step": 16664 + }, + { + "epoch": 1.8301120140566658, + "grad_norm": 2.5730834007263184, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7158207893371582, + "num_tokens": 416357136.0, + "step": 16665 + }, + { + "epoch": 1.8302218317592795, + "grad_norm": 2.2506818771362305, + "learning_rate": 
1e-06, + "loss": 0.9079, + "mean_token_accuracy": 0.7133339643478394, + "num_tokens": 416382141.0, + "step": 16666 + }, + { + "epoch": 1.8303316494618933, + "grad_norm": 2.154144287109375, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7422385215759277, + "num_tokens": 416410151.0, + "step": 16667 + }, + { + "epoch": 1.830441467164507, + "grad_norm": 2.3392488956451416, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7548595666885376, + "num_tokens": 416433088.0, + "step": 16668 + }, + { + "epoch": 1.8305512848671206, + "grad_norm": 2.293025493621826, + "learning_rate": 1e-06, + "loss": 0.8663, + "mean_token_accuracy": 0.7258403301239014, + "num_tokens": 416455294.0, + "step": 16669 + }, + { + "epoch": 1.8306611025697341, + "grad_norm": 2.5374069213867188, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.724800705909729, + "num_tokens": 416475738.0, + "step": 16670 + }, + { + "epoch": 1.830770920272348, + "grad_norm": 2.0232975482940674, + "learning_rate": 1e-06, + "loss": 0.8941, + "mean_token_accuracy": 0.7187755703926086, + "num_tokens": 416503461.0, + "step": 16671 + }, + { + "epoch": 1.8308807379749616, + "grad_norm": 2.1947081089019775, + "learning_rate": 1e-06, + "loss": 0.9131, + "mean_token_accuracy": 0.7137806415557861, + "num_tokens": 416532403.0, + "step": 16672 + }, + { + "epoch": 1.8309905556775752, + "grad_norm": 1.9948266744613647, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.6958061456680298, + "num_tokens": 416563874.0, + "step": 16673 + }, + { + "epoch": 1.831100373380189, + "grad_norm": 2.576589822769165, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7303588390350342, + "num_tokens": 416585323.0, + "step": 16674 + }, + { + "epoch": 1.8312101910828025, + "grad_norm": 2.388125419616699, + "learning_rate": 1e-06, + "loss": 0.7779, + "mean_token_accuracy": 0.7489006519317627, + "num_tokens": 416607951.0, + "step": 16675 + }, + { + 
"epoch": 1.8313200087854162, + "grad_norm": 2.56118106842041, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.710289716720581, + "num_tokens": 416632067.0, + "step": 16676 + }, + { + "epoch": 1.83142982648803, + "grad_norm": 2.295074939727783, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7235943078994751, + "num_tokens": 416657415.0, + "step": 16677 + }, + { + "epoch": 1.8315396441906435, + "grad_norm": 2.737497329711914, + "learning_rate": 1e-06, + "loss": 0.9113, + "mean_token_accuracy": 0.7152479887008667, + "num_tokens": 416677045.0, + "step": 16678 + }, + { + "epoch": 1.831649461893257, + "grad_norm": 2.3632118701934814, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7142793536186218, + "num_tokens": 416700602.0, + "step": 16679 + }, + { + "epoch": 1.8317592795958708, + "grad_norm": 2.0628232955932617, + "learning_rate": 1e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7329819202423096, + "num_tokens": 416729971.0, + "step": 16680 + }, + { + "epoch": 1.8318690972984846, + "grad_norm": 2.213712692260742, + "learning_rate": 1e-06, + "loss": 0.9205, + "mean_token_accuracy": 0.7141141891479492, + "num_tokens": 416755290.0, + "step": 16681 + }, + { + "epoch": 1.8319789150010983, + "grad_norm": 2.3347678184509277, + "learning_rate": 1e-06, + "loss": 0.7952, + "mean_token_accuracy": 0.739832878112793, + "num_tokens": 416779016.0, + "step": 16682 + }, + { + "epoch": 1.8320887327037119, + "grad_norm": 2.373697519302368, + "learning_rate": 1e-06, + "loss": 0.8575, + "mean_token_accuracy": 0.7246928811073303, + "num_tokens": 416799528.0, + "step": 16683 + }, + { + "epoch": 1.8321985504063254, + "grad_norm": 2.1568007469177246, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6957319378852844, + "num_tokens": 416827731.0, + "step": 16684 + }, + { + "epoch": 1.8323083681089392, + "grad_norm": 2.4773001670837402, + "learning_rate": 1e-06, + "loss": 0.9056, + 
"mean_token_accuracy": 0.7165718078613281, + "num_tokens": 416849356.0, + "step": 16685 + }, + { + "epoch": 1.832418185811553, + "grad_norm": 2.2893214225769043, + "learning_rate": 1e-06, + "loss": 0.8907, + "mean_token_accuracy": 0.7242900729179382, + "num_tokens": 416872328.0, + "step": 16686 + }, + { + "epoch": 1.8325280035141664, + "grad_norm": 2.1036224365234375, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.710304856300354, + "num_tokens": 416900768.0, + "step": 16687 + }, + { + "epoch": 1.83263782121678, + "grad_norm": 2.4696128368377686, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7405465245246887, + "num_tokens": 416921871.0, + "step": 16688 + }, + { + "epoch": 1.8327476389193937, + "grad_norm": 2.3414664268493652, + "learning_rate": 1e-06, + "loss": 0.8716, + "mean_token_accuracy": 0.7285079956054688, + "num_tokens": 416943689.0, + "step": 16689 + }, + { + "epoch": 1.8328574566220075, + "grad_norm": 2.334568977355957, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7178225517272949, + "num_tokens": 416970126.0, + "step": 16690 + }, + { + "epoch": 1.8329672743246213, + "grad_norm": 2.2547526359558105, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7049561142921448, + "num_tokens": 416995272.0, + "step": 16691 + }, + { + "epoch": 1.8330770920272348, + "grad_norm": 2.025631904602051, + "learning_rate": 1e-06, + "loss": 0.9596, + "mean_token_accuracy": 0.7127325534820557, + "num_tokens": 417023780.0, + "step": 16692 + }, + { + "epoch": 1.8331869097298483, + "grad_norm": 2.2239723205566406, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7315352559089661, + "num_tokens": 417049647.0, + "step": 16693 + }, + { + "epoch": 1.833296727432462, + "grad_norm": 1.9253164529800415, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7091012597084045, + "num_tokens": 417082616.0, + "step": 16694 + }, + { + "epoch": 
1.8334065451350758, + "grad_norm": 2.109062910079956, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7327309846878052, + "num_tokens": 417108490.0, + "step": 16695 + }, + { + "epoch": 1.8335163628376896, + "grad_norm": 2.610389232635498, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7280696630477905, + "num_tokens": 417128068.0, + "step": 16696 + }, + { + "epoch": 1.8336261805403031, + "grad_norm": 2.0283031463623047, + "learning_rate": 1e-06, + "loss": 0.9516, + "mean_token_accuracy": 0.7006653547286987, + "num_tokens": 417158451.0, + "step": 16697 + }, + { + "epoch": 1.8337359982429167, + "grad_norm": 2.383903741836548, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.7417657375335693, + "num_tokens": 417181234.0, + "step": 16698 + }, + { + "epoch": 1.8338458159455304, + "grad_norm": 2.175494432449341, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7222462892532349, + "num_tokens": 417210401.0, + "step": 16699 + }, + { + "epoch": 1.8339556336481442, + "grad_norm": 2.2754807472229004, + "learning_rate": 1e-06, + "loss": 0.9242, + "mean_token_accuracy": 0.7256852984428406, + "num_tokens": 417236548.0, + "step": 16700 + }, + { + "epoch": 1.8340654513507577, + "grad_norm": 2.1734235286712646, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 0.7167829871177673, + "num_tokens": 417261875.0, + "step": 16701 + }, + { + "epoch": 1.8341752690533712, + "grad_norm": 2.635756731033325, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7230807542800903, + "num_tokens": 417282512.0, + "step": 16702 + }, + { + "epoch": 1.834285086755985, + "grad_norm": 2.262958288192749, + "learning_rate": 1e-06, + "loss": 0.8988, + "mean_token_accuracy": 0.714202344417572, + "num_tokens": 417307677.0, + "step": 16703 + }, + { + "epoch": 1.8343949044585988, + "grad_norm": 2.0638599395751953, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 
0.7276755571365356, + "num_tokens": 417335500.0, + "step": 16704 + }, + { + "epoch": 1.8345047221612125, + "grad_norm": 2.030397653579712, + "learning_rate": 1e-06, + "loss": 0.9159, + "mean_token_accuracy": 0.7118878364562988, + "num_tokens": 417367305.0, + "step": 16705 + }, + { + "epoch": 1.834614539863826, + "grad_norm": 2.2095625400543213, + "learning_rate": 1e-06, + "loss": 0.8887, + "mean_token_accuracy": 0.7202982902526855, + "num_tokens": 417390538.0, + "step": 16706 + }, + { + "epoch": 1.8347243575664396, + "grad_norm": 2.3857455253601074, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7330271005630493, + "num_tokens": 417413334.0, + "step": 16707 + }, + { + "epoch": 1.8348341752690533, + "grad_norm": 2.2371268272399902, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7092478275299072, + "num_tokens": 417439893.0, + "step": 16708 + }, + { + "epoch": 1.834943992971667, + "grad_norm": 2.2043702602386475, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7052128314971924, + "num_tokens": 417466490.0, + "step": 16709 + }, + { + "epoch": 1.8350538106742806, + "grad_norm": 2.255885362625122, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7128780484199524, + "num_tokens": 417491383.0, + "step": 16710 + }, + { + "epoch": 1.8351636283768944, + "grad_norm": 2.4829084873199463, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7375297546386719, + "num_tokens": 417512265.0, + "step": 16711 + }, + { + "epoch": 1.835273446079508, + "grad_norm": 2.386695146560669, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7247262001037598, + "num_tokens": 417535523.0, + "step": 16712 + }, + { + "epoch": 1.8353832637821217, + "grad_norm": 2.5253994464874268, + "learning_rate": 1e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7283287048339844, + "num_tokens": 417556715.0, + "step": 16713 + }, + { + "epoch": 1.8354930814847354, + "grad_norm": 
2.490144729614258, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7165422439575195, + "num_tokens": 417578712.0, + "step": 16714 + }, + { + "epoch": 1.835602899187349, + "grad_norm": 2.07955265045166, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7204543352127075, + "num_tokens": 417606460.0, + "step": 16715 + }, + { + "epoch": 1.8357127168899625, + "grad_norm": 2.364935874938965, + "learning_rate": 1e-06, + "loss": 0.9323, + "mean_token_accuracy": 0.7155090570449829, + "num_tokens": 417631959.0, + "step": 16716 + }, + { + "epoch": 1.8358225345925763, + "grad_norm": 2.089047908782959, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7219265699386597, + "num_tokens": 417661623.0, + "step": 16717 + }, + { + "epoch": 1.83593235229519, + "grad_norm": 2.1942312717437744, + "learning_rate": 1e-06, + "loss": 0.8524, + "mean_token_accuracy": 0.7330144047737122, + "num_tokens": 417689502.0, + "step": 16718 + }, + { + "epoch": 1.8360421699978038, + "grad_norm": 2.0528879165649414, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7152354717254639, + "num_tokens": 417721617.0, + "step": 16719 + }, + { + "epoch": 1.8361519877004173, + "grad_norm": 2.21260929107666, + "learning_rate": 1e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7374165654182434, + "num_tokens": 417745751.0, + "step": 16720 + }, + { + "epoch": 1.8362618054030309, + "grad_norm": 2.217404842376709, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7366733551025391, + "num_tokens": 417771242.0, + "step": 16721 + }, + { + "epoch": 1.8363716231056446, + "grad_norm": 2.082834005355835, + "learning_rate": 1e-06, + "loss": 0.9386, + "mean_token_accuracy": 0.7060753107070923, + "num_tokens": 417799583.0, + "step": 16722 + }, + { + "epoch": 1.8364814408082584, + "grad_norm": 2.1769092082977295, + "learning_rate": 1e-06, + "loss": 0.9734, + "mean_token_accuracy": 0.6983543038368225, + "num_tokens": 
417826536.0, + "step": 16723 + }, + { + "epoch": 1.836591258510872, + "grad_norm": 2.1775062084198, + "learning_rate": 1e-06, + "loss": 0.9089, + "mean_token_accuracy": 0.7148153781890869, + "num_tokens": 417855974.0, + "step": 16724 + }, + { + "epoch": 1.8367010762134857, + "grad_norm": 2.4274556636810303, + "learning_rate": 1e-06, + "loss": 0.8238, + "mean_token_accuracy": 0.7449828386306763, + "num_tokens": 417877856.0, + "step": 16725 + }, + { + "epoch": 1.8368108939160992, + "grad_norm": 2.672903299331665, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7335561513900757, + "num_tokens": 417897343.0, + "step": 16726 + }, + { + "epoch": 1.836920711618713, + "grad_norm": 2.2923614978790283, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.727738618850708, + "num_tokens": 417921177.0, + "step": 16727 + }, + { + "epoch": 1.8370305293213267, + "grad_norm": 2.5826783180236816, + "learning_rate": 1e-06, + "loss": 0.7563, + "mean_token_accuracy": 0.7547814846038818, + "num_tokens": 417940649.0, + "step": 16728 + }, + { + "epoch": 1.8371403470239402, + "grad_norm": 2.379713296890259, + "learning_rate": 1e-06, + "loss": 0.8309, + "mean_token_accuracy": 0.7417095899581909, + "num_tokens": 417964252.0, + "step": 16729 + }, + { + "epoch": 1.8372501647265538, + "grad_norm": 2.053784132003784, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.724749743938446, + "num_tokens": 417996174.0, + "step": 16730 + }, + { + "epoch": 1.8373599824291675, + "grad_norm": 2.548670530319214, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7275665998458862, + "num_tokens": 418017156.0, + "step": 16731 + }, + { + "epoch": 1.8374698001317813, + "grad_norm": 2.1361546516418457, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.710680365562439, + "num_tokens": 418044187.0, + "step": 16732 + }, + { + "epoch": 1.837579617834395, + "grad_norm": 2.1821672916412354, + "learning_rate": 1e-06, + 
"loss": 0.8764, + "mean_token_accuracy": 0.7197442650794983, + "num_tokens": 418068998.0, + "step": 16733 + }, + { + "epoch": 1.8376894355370086, + "grad_norm": 2.5444066524505615, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7086405754089355, + "num_tokens": 418090907.0, + "step": 16734 + }, + { + "epoch": 1.8377992532396221, + "grad_norm": 2.3115298748016357, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7248027324676514, + "num_tokens": 418112666.0, + "step": 16735 + }, + { + "epoch": 1.8379090709422359, + "grad_norm": 2.269242286682129, + "learning_rate": 1e-06, + "loss": 0.8661, + "mean_token_accuracy": 0.7265443801879883, + "num_tokens": 418137557.0, + "step": 16736 + }, + { + "epoch": 1.8380188886448496, + "grad_norm": 2.067040205001831, + "learning_rate": 1e-06, + "loss": 0.9161, + "mean_token_accuracy": 0.715230405330658, + "num_tokens": 418167488.0, + "step": 16737 + }, + { + "epoch": 1.8381287063474632, + "grad_norm": 2.6893508434295654, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7305647134780884, + "num_tokens": 418187596.0, + "step": 16738 + }, + { + "epoch": 1.838238524050077, + "grad_norm": 2.0700719356536865, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7198686003684998, + "num_tokens": 418217152.0, + "step": 16739 + }, + { + "epoch": 1.8383483417526905, + "grad_norm": 2.196277618408203, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.71296226978302, + "num_tokens": 418244451.0, + "step": 16740 + }, + { + "epoch": 1.8384581594553042, + "grad_norm": 2.13909912109375, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7249542474746704, + "num_tokens": 418272291.0, + "step": 16741 + }, + { + "epoch": 1.838567977157918, + "grad_norm": 2.367076873779297, + "learning_rate": 1e-06, + "loss": 0.8044, + "mean_token_accuracy": 0.7400981783866882, + "num_tokens": 418293556.0, + "step": 16742 + }, + { + "epoch": 
1.8386777948605315, + "grad_norm": 2.4402997493743896, + "learning_rate": 1e-06, + "loss": 0.931, + "mean_token_accuracy": 0.7053579092025757, + "num_tokens": 418316435.0, + "step": 16743 + }, + { + "epoch": 1.838787612563145, + "grad_norm": 2.4445502758026123, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7234382629394531, + "num_tokens": 418338291.0, + "step": 16744 + }, + { + "epoch": 1.8388974302657588, + "grad_norm": 3.074232339859009, + "learning_rate": 1e-06, + "loss": 0.7657, + "mean_token_accuracy": 0.7553004026412964, + "num_tokens": 418353552.0, + "step": 16745 + }, + { + "epoch": 1.8390072479683726, + "grad_norm": 2.268754720687866, + "learning_rate": 1e-06, + "loss": 1.0007, + "mean_token_accuracy": 0.6958616971969604, + "num_tokens": 418382221.0, + "step": 16746 + }, + { + "epoch": 1.8391170656709863, + "grad_norm": 2.1555051803588867, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7256969809532166, + "num_tokens": 418408133.0, + "step": 16747 + }, + { + "epoch": 1.8392268833735999, + "grad_norm": 2.0882768630981445, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.727407693862915, + "num_tokens": 418438567.0, + "step": 16748 + }, + { + "epoch": 1.8393367010762134, + "grad_norm": 2.1794891357421875, + "learning_rate": 1e-06, + "loss": 0.848, + "mean_token_accuracy": 0.7307035326957703, + "num_tokens": 418464259.0, + "step": 16749 + }, + { + "epoch": 1.8394465187788271, + "grad_norm": 2.787766933441162, + "learning_rate": 1e-06, + "loss": 0.7841, + "mean_token_accuracy": 0.7461594343185425, + "num_tokens": 418480294.0, + "step": 16750 + }, + { + "epoch": 1.839556336481441, + "grad_norm": 2.1085822582244873, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6958789825439453, + "num_tokens": 418510360.0, + "step": 16751 + }, + { + "epoch": 1.8396661541840544, + "grad_norm": 2.1522045135498047, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 
0.7070614099502563, + "num_tokens": 418536958.0, + "step": 16752 + }, + { + "epoch": 1.839775971886668, + "grad_norm": 2.278315305709839, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.7138408422470093, + "num_tokens": 418562837.0, + "step": 16753 + }, + { + "epoch": 1.8398857895892817, + "grad_norm": 2.351116418838501, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7376604080200195, + "num_tokens": 418585960.0, + "step": 16754 + }, + { + "epoch": 1.8399956072918955, + "grad_norm": 2.607239007949829, + "learning_rate": 1e-06, + "loss": 0.8217, + "mean_token_accuracy": 0.7394102811813354, + "num_tokens": 418604978.0, + "step": 16755 + }, + { + "epoch": 1.8401054249945092, + "grad_norm": 2.29375958442688, + "learning_rate": 1e-06, + "loss": 0.8408, + "mean_token_accuracy": 0.7339178323745728, + "num_tokens": 418627600.0, + "step": 16756 + }, + { + "epoch": 1.8402152426971228, + "grad_norm": 2.080197811126709, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7246040105819702, + "num_tokens": 418653696.0, + "step": 16757 + }, + { + "epoch": 1.8403250603997363, + "grad_norm": 2.2622690200805664, + "learning_rate": 1e-06, + "loss": 0.9384, + "mean_token_accuracy": 0.7153698205947876, + "num_tokens": 418680870.0, + "step": 16758 + }, + { + "epoch": 1.84043487810235, + "grad_norm": 2.349238157272339, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7212194204330444, + "num_tokens": 418705049.0, + "step": 16759 + }, + { + "epoch": 1.8405446958049638, + "grad_norm": 2.3399600982666016, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7092249393463135, + "num_tokens": 418728208.0, + "step": 16760 + }, + { + "epoch": 1.8406545135075776, + "grad_norm": 1.907486915588379, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.713168740272522, + "num_tokens": 418761073.0, + "step": 16761 + }, + { + "epoch": 1.8407643312101911, + "grad_norm": 
2.260529041290283, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7145851850509644, + "num_tokens": 418786844.0, + "step": 16762 + }, + { + "epoch": 1.8408741489128047, + "grad_norm": 2.165930986404419, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7214186191558838, + "num_tokens": 418813935.0, + "step": 16763 + }, + { + "epoch": 1.8409839666154184, + "grad_norm": 2.369745969772339, + "learning_rate": 1e-06, + "loss": 0.9277, + "mean_token_accuracy": 0.7167664170265198, + "num_tokens": 418840340.0, + "step": 16764 + }, + { + "epoch": 1.8410937843180322, + "grad_norm": 2.5153701305389404, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7287166118621826, + "num_tokens": 418861949.0, + "step": 16765 + }, + { + "epoch": 1.8412036020206457, + "grad_norm": 2.126523971557617, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.722978949546814, + "num_tokens": 418888342.0, + "step": 16766 + }, + { + "epoch": 1.8413134197232592, + "grad_norm": 1.9680650234222412, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.733564555644989, + "num_tokens": 418920478.0, + "step": 16767 + }, + { + "epoch": 1.841423237425873, + "grad_norm": 2.433302879333496, + "learning_rate": 1e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7261011600494385, + "num_tokens": 418940965.0, + "step": 16768 + }, + { + "epoch": 1.8415330551284868, + "grad_norm": 2.176250696182251, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7130552530288696, + "num_tokens": 418969095.0, + "step": 16769 + }, + { + "epoch": 1.8416428728311005, + "grad_norm": 2.060002088546753, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7090641260147095, + "num_tokens": 419000511.0, + "step": 16770 + }, + { + "epoch": 1.841752690533714, + "grad_norm": 2.2832608222961426, + "learning_rate": 1e-06, + "loss": 0.8966, + "mean_token_accuracy": 0.7257407307624817, + "num_tokens": 
419025470.0, + "step": 16771 + }, + { + "epoch": 1.8418625082363276, + "grad_norm": 2.4904377460479736, + "learning_rate": 1e-06, + "loss": 0.9026, + "mean_token_accuracy": 0.7204397320747375, + "num_tokens": 419047434.0, + "step": 16772 + }, + { + "epoch": 1.8419723259389413, + "grad_norm": 2.3610942363739014, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7070022821426392, + "num_tokens": 419073848.0, + "step": 16773 + }, + { + "epoch": 1.842082143641555, + "grad_norm": 2.135014772415161, + "learning_rate": 1e-06, + "loss": 0.7494, + "mean_token_accuracy": 0.7516315579414368, + "num_tokens": 419097935.0, + "step": 16774 + }, + { + "epoch": 1.8421919613441686, + "grad_norm": 1.9951612949371338, + "learning_rate": 1e-06, + "loss": 0.9217, + "mean_token_accuracy": 0.7183661460876465, + "num_tokens": 419128111.0, + "step": 16775 + }, + { + "epoch": 1.8423017790467824, + "grad_norm": 2.434135675430298, + "learning_rate": 1e-06, + "loss": 0.7469, + "mean_token_accuracy": 0.7582050561904907, + "num_tokens": 419150620.0, + "step": 16776 + }, + { + "epoch": 1.842411596749396, + "grad_norm": 2.065847396850586, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.715736448764801, + "num_tokens": 419178190.0, + "step": 16777 + }, + { + "epoch": 1.8425214144520097, + "grad_norm": 2.210756301879883, + "learning_rate": 1e-06, + "loss": 0.8354, + "mean_token_accuracy": 0.7313656210899353, + "num_tokens": 419203264.0, + "step": 16778 + }, + { + "epoch": 1.8426312321546234, + "grad_norm": 2.3657822608947754, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7129783630371094, + "num_tokens": 419229094.0, + "step": 16779 + }, + { + "epoch": 1.842741049857237, + "grad_norm": 2.250046730041504, + "learning_rate": 1e-06, + "loss": 0.9211, + "mean_token_accuracy": 0.7255409955978394, + "num_tokens": 419253949.0, + "step": 16780 + }, + { + "epoch": 1.8428508675598505, + "grad_norm": 2.0498857498168945, + "learning_rate": 
1e-06, + "loss": 0.856, + "mean_token_accuracy": 0.7281197309494019, + "num_tokens": 419281738.0, + "step": 16781 + }, + { + "epoch": 1.8429606852624643, + "grad_norm": 2.0916738510131836, + "learning_rate": 1e-06, + "loss": 0.9376, + "mean_token_accuracy": 0.7044644951820374, + "num_tokens": 419309320.0, + "step": 16782 + }, + { + "epoch": 1.843070502965078, + "grad_norm": 2.2273333072662354, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7183674573898315, + "num_tokens": 419335778.0, + "step": 16783 + }, + { + "epoch": 1.8431803206676918, + "grad_norm": 2.439345359802246, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7360507249832153, + "num_tokens": 419357458.0, + "step": 16784 + }, + { + "epoch": 1.8432901383703053, + "grad_norm": 2.517298460006714, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7370623350143433, + "num_tokens": 419379672.0, + "step": 16785 + }, + { + "epoch": 1.8433999560729188, + "grad_norm": 2.130126714706421, + "learning_rate": 1e-06, + "loss": 0.8875, + "mean_token_accuracy": 0.7229548692703247, + "num_tokens": 419406670.0, + "step": 16786 + }, + { + "epoch": 1.8435097737755326, + "grad_norm": 2.1489453315734863, + "learning_rate": 1e-06, + "loss": 0.9658, + "mean_token_accuracy": 0.6965687870979309, + "num_tokens": 419433880.0, + "step": 16787 + }, + { + "epoch": 1.8436195914781464, + "grad_norm": 2.2791097164154053, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7284274101257324, + "num_tokens": 419458506.0, + "step": 16788 + }, + { + "epoch": 1.84372940918076, + "grad_norm": 2.222609519958496, + "learning_rate": 1e-06, + "loss": 0.861, + "mean_token_accuracy": 0.7342192530632019, + "num_tokens": 419483211.0, + "step": 16789 + }, + { + "epoch": 1.8438392268833736, + "grad_norm": 2.3105928897857666, + "learning_rate": 1e-06, + "loss": 0.8403, + "mean_token_accuracy": 0.7353952527046204, + "num_tokens": 419507313.0, + "step": 16790 + }, + { + 
"epoch": 1.8439490445859872, + "grad_norm": 2.838865280151367, + "learning_rate": 1e-06, + "loss": 0.8024, + "mean_token_accuracy": 0.7444714307785034, + "num_tokens": 419529895.0, + "step": 16791 + }, + { + "epoch": 1.844058862288601, + "grad_norm": 2.3513550758361816, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7194342613220215, + "num_tokens": 419551997.0, + "step": 16792 + }, + { + "epoch": 1.8441686799912147, + "grad_norm": 2.388286828994751, + "learning_rate": 1e-06, + "loss": 0.9312, + "mean_token_accuracy": 0.7098474502563477, + "num_tokens": 419575538.0, + "step": 16793 + }, + { + "epoch": 1.8442784976938282, + "grad_norm": 2.3760647773742676, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7321677803993225, + "num_tokens": 419598955.0, + "step": 16794 + }, + { + "epoch": 1.8443883153964418, + "grad_norm": 2.0572564601898193, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7089474201202393, + "num_tokens": 419628933.0, + "step": 16795 + }, + { + "epoch": 1.8444981330990555, + "grad_norm": 2.1415371894836426, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.7003717422485352, + "num_tokens": 419655551.0, + "step": 16796 + }, + { + "epoch": 1.8446079508016693, + "grad_norm": 2.007216215133667, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7133219242095947, + "num_tokens": 419684926.0, + "step": 16797 + }, + { + "epoch": 1.844717768504283, + "grad_norm": 2.1531929969787598, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.7006811499595642, + "num_tokens": 419715227.0, + "step": 16798 + }, + { + "epoch": 1.8448275862068966, + "grad_norm": 2.4594314098358154, + "learning_rate": 1e-06, + "loss": 0.9319, + "mean_token_accuracy": 0.7191198468208313, + "num_tokens": 419736001.0, + "step": 16799 + }, + { + "epoch": 1.84493740390951, + "grad_norm": 2.1756844520568848, + "learning_rate": 1e-06, + "loss": 1.0134, + 
"mean_token_accuracy": 0.6947744488716125, + "num_tokens": 419764444.0, + "step": 16800 + }, + { + "epoch": 1.8450472216121239, + "grad_norm": 2.673797130584717, + "learning_rate": 1e-06, + "loss": 0.9922, + "mean_token_accuracy": 0.6949993371963501, + "num_tokens": 419787094.0, + "step": 16801 + }, + { + "epoch": 1.8451570393147376, + "grad_norm": 2.4356706142425537, + "learning_rate": 1e-06, + "loss": 0.9334, + "mean_token_accuracy": 0.7196095585823059, + "num_tokens": 419807983.0, + "step": 16802 + }, + { + "epoch": 1.8452668570173512, + "grad_norm": 2.450908899307251, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.716217041015625, + "num_tokens": 419830205.0, + "step": 16803 + }, + { + "epoch": 1.8453766747199647, + "grad_norm": 2.2223055362701416, + "learning_rate": 1e-06, + "loss": 0.9328, + "mean_token_accuracy": 0.720851719379425, + "num_tokens": 419856515.0, + "step": 16804 + }, + { + "epoch": 1.8454864924225785, + "grad_norm": 2.1530001163482666, + "learning_rate": 1e-06, + "loss": 0.9294, + "mean_token_accuracy": 0.7123889923095703, + "num_tokens": 419882729.0, + "step": 16805 + }, + { + "epoch": 1.8455963101251922, + "grad_norm": 2.0281805992126465, + "learning_rate": 1e-06, + "loss": 0.8997, + "mean_token_accuracy": 0.7162066698074341, + "num_tokens": 419913230.0, + "step": 16806 + }, + { + "epoch": 1.845706127827806, + "grad_norm": 2.3751449584960938, + "learning_rate": 1e-06, + "loss": 0.8718, + "mean_token_accuracy": 0.7228281497955322, + "num_tokens": 419936026.0, + "step": 16807 + }, + { + "epoch": 1.8458159455304195, + "grad_norm": 2.2760555744171143, + "learning_rate": 1e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7314806580543518, + "num_tokens": 419961861.0, + "step": 16808 + }, + { + "epoch": 1.845925763233033, + "grad_norm": 2.228959560394287, + "learning_rate": 1e-06, + "loss": 0.8842, + "mean_token_accuracy": 0.7249740362167358, + "num_tokens": 419989937.0, + "step": 16809 + }, + { + "epoch": 
1.8460355809356468, + "grad_norm": 2.074504852294922, + "learning_rate": 1e-06, + "loss": 0.9348, + "mean_token_accuracy": 0.7080017328262329, + "num_tokens": 420019014.0, + "step": 16810 + }, + { + "epoch": 1.8461453986382605, + "grad_norm": 2.8271708488464355, + "learning_rate": 1e-06, + "loss": 0.7798, + "mean_token_accuracy": 0.7481924295425415, + "num_tokens": 420036073.0, + "step": 16811 + }, + { + "epoch": 1.8462552163408743, + "grad_norm": 2.111133337020874, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7092788815498352, + "num_tokens": 420063410.0, + "step": 16812 + }, + { + "epoch": 1.8463650340434878, + "grad_norm": 2.3140385150909424, + "learning_rate": 1e-06, + "loss": 0.8438, + "mean_token_accuracy": 0.7313153743743896, + "num_tokens": 420084599.0, + "step": 16813 + }, + { + "epoch": 1.8464748517461014, + "grad_norm": 2.3181166648864746, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7128458023071289, + "num_tokens": 420109856.0, + "step": 16814 + }, + { + "epoch": 1.8465846694487151, + "grad_norm": 2.2810521125793457, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7105379104614258, + "num_tokens": 420135778.0, + "step": 16815 + }, + { + "epoch": 1.846694487151329, + "grad_norm": 2.1184489727020264, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7269469499588013, + "num_tokens": 420161886.0, + "step": 16816 + }, + { + "epoch": 1.8468043048539424, + "grad_norm": 2.129735231399536, + "learning_rate": 1e-06, + "loss": 0.8598, + "mean_token_accuracy": 0.7304040193557739, + "num_tokens": 420191572.0, + "step": 16817 + }, + { + "epoch": 1.846914122556556, + "grad_norm": 2.7934861183166504, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7282004952430725, + "num_tokens": 420209055.0, + "step": 16818 + }, + { + "epoch": 1.8470239402591697, + "grad_norm": 2.1901590824127197, + "learning_rate": 1e-06, + "loss": 0.8932, + "mean_token_accuracy": 
0.7230304479598999, + "num_tokens": 420234560.0, + "step": 16819 + }, + { + "epoch": 1.8471337579617835, + "grad_norm": 2.548888921737671, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7308498620986938, + "num_tokens": 420255423.0, + "step": 16820 + }, + { + "epoch": 1.8472435756643972, + "grad_norm": 2.317392349243164, + "learning_rate": 1e-06, + "loss": 0.9346, + "mean_token_accuracy": 0.7082567811012268, + "num_tokens": 420281103.0, + "step": 16821 + }, + { + "epoch": 1.8473533933670108, + "grad_norm": 2.189643144607544, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7401422262191772, + "num_tokens": 420305511.0, + "step": 16822 + }, + { + "epoch": 1.8474632110696243, + "grad_norm": 1.9856457710266113, + "learning_rate": 1e-06, + "loss": 0.8638, + "mean_token_accuracy": 0.721742570400238, + "num_tokens": 420334896.0, + "step": 16823 + }, + { + "epoch": 1.847573028772238, + "grad_norm": 2.1682581901550293, + "learning_rate": 1e-06, + "loss": 0.806, + "mean_token_accuracy": 0.7341204881668091, + "num_tokens": 420360849.0, + "step": 16824 + }, + { + "epoch": 1.8476828464748518, + "grad_norm": 1.9636399745941162, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7044076919555664, + "num_tokens": 420392798.0, + "step": 16825 + }, + { + "epoch": 1.8477926641774656, + "grad_norm": 2.048142194747925, + "learning_rate": 1e-06, + "loss": 0.8451, + "mean_token_accuracy": 0.7394782304763794, + "num_tokens": 420421435.0, + "step": 16826 + }, + { + "epoch": 1.847902481880079, + "grad_norm": 2.196507453918457, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7180108428001404, + "num_tokens": 420446664.0, + "step": 16827 + }, + { + "epoch": 1.8480122995826926, + "grad_norm": 2.645200490951538, + "learning_rate": 1e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7289844751358032, + "num_tokens": 420465672.0, + "step": 16828 + }, + { + "epoch": 1.8481221172853064, + "grad_norm": 
2.0870699882507324, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7343171834945679, + "num_tokens": 420491972.0, + "step": 16829 + }, + { + "epoch": 1.8482319349879202, + "grad_norm": 2.3448638916015625, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7388060688972473, + "num_tokens": 420514837.0, + "step": 16830 + }, + { + "epoch": 1.8483417526905337, + "grad_norm": 1.9926059246063232, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7104958295822144, + "num_tokens": 420546876.0, + "step": 16831 + }, + { + "epoch": 1.8484515703931472, + "grad_norm": 2.2130324840545654, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7194282412528992, + "num_tokens": 420571923.0, + "step": 16832 + }, + { + "epoch": 1.848561388095761, + "grad_norm": 2.111168622970581, + "learning_rate": 1e-06, + "loss": 1.0621, + "mean_token_accuracy": 0.6780363917350769, + "num_tokens": 420602048.0, + "step": 16833 + }, + { + "epoch": 1.8486712057983747, + "grad_norm": 2.2153825759887695, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7213513851165771, + "num_tokens": 420626777.0, + "step": 16834 + }, + { + "epoch": 1.8487810235009885, + "grad_norm": 2.2480485439300537, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.7195698022842407, + "num_tokens": 420651962.0, + "step": 16835 + }, + { + "epoch": 1.848890841203602, + "grad_norm": 2.0604701042175293, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6973825693130493, + "num_tokens": 420683177.0, + "step": 16836 + }, + { + "epoch": 1.8490006589062156, + "grad_norm": 2.214160203933716, + "learning_rate": 1e-06, + "loss": 0.9559, + "mean_token_accuracy": 0.700164794921875, + "num_tokens": 420710797.0, + "step": 16837 + }, + { + "epoch": 1.8491104766088293, + "grad_norm": 2.5602340698242188, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.731002926826477, + "num_tokens": 
420731354.0, + "step": 16838 + }, + { + "epoch": 1.849220294311443, + "grad_norm": 2.3390324115753174, + "learning_rate": 1e-06, + "loss": 0.8671, + "mean_token_accuracy": 0.7250494360923767, + "num_tokens": 420756838.0, + "step": 16839 + }, + { + "epoch": 1.8493301120140566, + "grad_norm": 1.8856903314590454, + "learning_rate": 1e-06, + "loss": 0.9743, + "mean_token_accuracy": 0.7031873464584351, + "num_tokens": 420791501.0, + "step": 16840 + }, + { + "epoch": 1.8494399297166704, + "grad_norm": 2.562666416168213, + "learning_rate": 1e-06, + "loss": 0.789, + "mean_token_accuracy": 0.7491483688354492, + "num_tokens": 420810840.0, + "step": 16841 + }, + { + "epoch": 1.849549747419284, + "grad_norm": 1.9048844575881958, + "learning_rate": 1e-06, + "loss": 0.9813, + "mean_token_accuracy": 0.6988011598587036, + "num_tokens": 420847462.0, + "step": 16842 + }, + { + "epoch": 1.8496595651218977, + "grad_norm": 2.3723371028900146, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7211711406707764, + "num_tokens": 420869186.0, + "step": 16843 + }, + { + "epoch": 1.8497693828245114, + "grad_norm": 2.6069912910461426, + "learning_rate": 1e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7340871691703796, + "num_tokens": 420889018.0, + "step": 16844 + }, + { + "epoch": 1.849879200527125, + "grad_norm": 1.9689459800720215, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7271193265914917, + "num_tokens": 420919435.0, + "step": 16845 + }, + { + "epoch": 1.8499890182297385, + "grad_norm": 2.1402149200439453, + "learning_rate": 1e-06, + "loss": 0.9886, + "mean_token_accuracy": 0.6973937749862671, + "num_tokens": 420947060.0, + "step": 16846 + }, + { + "epoch": 1.8500988359323522, + "grad_norm": 2.1765432357788086, + "learning_rate": 1e-06, + "loss": 0.9706, + "mean_token_accuracy": 0.701041579246521, + "num_tokens": 420976750.0, + "step": 16847 + }, + { + "epoch": 1.850208653634966, + "grad_norm": 2.2461845874786377, + "learning_rate": 
1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7175289988517761, + "num_tokens": 421003053.0, + "step": 16848 + }, + { + "epoch": 1.8503184713375798, + "grad_norm": 2.466550827026367, + "learning_rate": 1e-06, + "loss": 0.8675, + "mean_token_accuracy": 0.7278671264648438, + "num_tokens": 421024364.0, + "step": 16849 + }, + { + "epoch": 1.8504282890401933, + "grad_norm": 2.2449355125427246, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7210583090782166, + "num_tokens": 421048760.0, + "step": 16850 + }, + { + "epoch": 1.8505381067428068, + "grad_norm": 2.0467710494995117, + "learning_rate": 1e-06, + "loss": 0.8807, + "mean_token_accuracy": 0.7223243713378906, + "num_tokens": 421076853.0, + "step": 16851 + }, + { + "epoch": 1.8506479244454206, + "grad_norm": 2.488133430480957, + "learning_rate": 1e-06, + "loss": 0.7753, + "mean_token_accuracy": 0.7525379657745361, + "num_tokens": 421096946.0, + "step": 16852 + }, + { + "epoch": 1.8507577421480343, + "grad_norm": 2.3254811763763428, + "learning_rate": 1e-06, + "loss": 0.8878, + "mean_token_accuracy": 0.7229543924331665, + "num_tokens": 421119768.0, + "step": 16853 + }, + { + "epoch": 1.8508675598506479, + "grad_norm": 2.2959258556365967, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7219088077545166, + "num_tokens": 421145024.0, + "step": 16854 + }, + { + "epoch": 1.8509773775532616, + "grad_norm": 2.389918565750122, + "learning_rate": 1e-06, + "loss": 0.9514, + "mean_token_accuracy": 0.7081372141838074, + "num_tokens": 421169569.0, + "step": 16855 + }, + { + "epoch": 1.8510871952558752, + "grad_norm": 2.1627614498138428, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7181854844093323, + "num_tokens": 421197505.0, + "step": 16856 + }, + { + "epoch": 1.851197012958489, + "grad_norm": 2.151630401611328, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7259423732757568, + "num_tokens": 421223469.0, + "step": 16857 + }, + { 
+ "epoch": 1.8513068306611027, + "grad_norm": 2.2854628562927246, + "learning_rate": 1e-06, + "loss": 0.9763, + "mean_token_accuracy": 0.6990363597869873, + "num_tokens": 421248416.0, + "step": 16858 + }, + { + "epoch": 1.8514166483637162, + "grad_norm": 2.6813292503356934, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.72126704454422, + "num_tokens": 421267752.0, + "step": 16859 + }, + { + "epoch": 1.8515264660663298, + "grad_norm": 2.353384494781494, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7369767427444458, + "num_tokens": 421290824.0, + "step": 16860 + }, + { + "epoch": 1.8516362837689435, + "grad_norm": 2.3138983249664307, + "learning_rate": 1e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7368052005767822, + "num_tokens": 421313838.0, + "step": 16861 + }, + { + "epoch": 1.8517461014715573, + "grad_norm": 2.3474783897399902, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7285007834434509, + "num_tokens": 421337473.0, + "step": 16862 + }, + { + "epoch": 1.851855919174171, + "grad_norm": 2.1515753269195557, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7166836261749268, + "num_tokens": 421363292.0, + "step": 16863 + }, + { + "epoch": 1.8519657368767846, + "grad_norm": 2.366570472717285, + "learning_rate": 1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7428786754608154, + "num_tokens": 421384678.0, + "step": 16864 + }, + { + "epoch": 1.852075554579398, + "grad_norm": 1.9295177459716797, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7106813788414001, + "num_tokens": 421415406.0, + "step": 16865 + }, + { + "epoch": 1.8521853722820119, + "grad_norm": 2.2529399394989014, + "learning_rate": 1e-06, + "loss": 0.9375, + "mean_token_accuracy": 0.7072798609733582, + "num_tokens": 421440996.0, + "step": 16866 + }, + { + "epoch": 1.8522951899846256, + "grad_norm": 2.0232574939727783, + "learning_rate": 1e-06, + "loss": 0.9308, + 
"mean_token_accuracy": 0.7163022756576538, + "num_tokens": 421472401.0, + "step": 16867 + }, + { + "epoch": 1.8524050076872391, + "grad_norm": 2.21224308013916, + "learning_rate": 1e-06, + "loss": 0.9234, + "mean_token_accuracy": 0.7243372797966003, + "num_tokens": 421498168.0, + "step": 16868 + }, + { + "epoch": 1.8525148253898527, + "grad_norm": 2.2392168045043945, + "learning_rate": 1e-06, + "loss": 0.9052, + "mean_token_accuracy": 0.7238725423812866, + "num_tokens": 421524347.0, + "step": 16869 + }, + { + "epoch": 1.8526246430924664, + "grad_norm": 2.040538787841797, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7359879016876221, + "num_tokens": 421552101.0, + "step": 16870 + }, + { + "epoch": 1.8527344607950802, + "grad_norm": 2.297222137451172, + "learning_rate": 1e-06, + "loss": 0.9119, + "mean_token_accuracy": 0.717250645160675, + "num_tokens": 421575907.0, + "step": 16871 + }, + { + "epoch": 1.852844278497694, + "grad_norm": 2.123168468475342, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.7220853567123413, + "num_tokens": 421605303.0, + "step": 16872 + }, + { + "epoch": 1.8529540962003075, + "grad_norm": 2.425884246826172, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7169723510742188, + "num_tokens": 421626537.0, + "step": 16873 + }, + { + "epoch": 1.853063913902921, + "grad_norm": 2.3616466522216797, + "learning_rate": 1e-06, + "loss": 0.906, + "mean_token_accuracy": 0.7167633771896362, + "num_tokens": 421651424.0, + "step": 16874 + }, + { + "epoch": 1.8531737316055348, + "grad_norm": 2.234790325164795, + "learning_rate": 1e-06, + "loss": 0.9246, + "mean_token_accuracy": 0.7169734835624695, + "num_tokens": 421677891.0, + "step": 16875 + }, + { + "epoch": 1.8532835493081485, + "grad_norm": 2.315012216567993, + "learning_rate": 1e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7266135215759277, + "num_tokens": 421700512.0, + "step": 16876 + }, + { + "epoch": 1.8533933670107623, + 
"grad_norm": 2.381239175796509, + "learning_rate": 1e-06, + "loss": 0.9918, + "mean_token_accuracy": 0.7055844068527222, + "num_tokens": 421725265.0, + "step": 16877 + }, + { + "epoch": 1.8535031847133758, + "grad_norm": 2.299804210662842, + "learning_rate": 1e-06, + "loss": 0.7715, + "mean_token_accuracy": 0.7541710138320923, + "num_tokens": 421748769.0, + "step": 16878 + }, + { + "epoch": 1.8536130024159894, + "grad_norm": 2.0388951301574707, + "learning_rate": 1e-06, + "loss": 0.8522, + "mean_token_accuracy": 0.7333084344863892, + "num_tokens": 421778052.0, + "step": 16879 + }, + { + "epoch": 1.8537228201186031, + "grad_norm": 2.450427532196045, + "learning_rate": 1e-06, + "loss": 0.8147, + "mean_token_accuracy": 0.7461851835250854, + "num_tokens": 421800499.0, + "step": 16880 + }, + { + "epoch": 1.8538326378212169, + "grad_norm": 2.290985345840454, + "learning_rate": 1e-06, + "loss": 0.9822, + "mean_token_accuracy": 0.6932024955749512, + "num_tokens": 421824997.0, + "step": 16881 + }, + { + "epoch": 1.8539424555238304, + "grad_norm": 2.1706812381744385, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7167336344718933, + "num_tokens": 421852443.0, + "step": 16882 + }, + { + "epoch": 1.854052273226444, + "grad_norm": 2.168928384780884, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.718203604221344, + "num_tokens": 421879003.0, + "step": 16883 + }, + { + "epoch": 1.8541620909290577, + "grad_norm": 2.457437515258789, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7396003603935242, + "num_tokens": 421899700.0, + "step": 16884 + }, + { + "epoch": 1.8542719086316715, + "grad_norm": 2.4853360652923584, + "learning_rate": 1e-06, + "loss": 0.7892, + "mean_token_accuracy": 0.7417093515396118, + "num_tokens": 421919578.0, + "step": 16885 + }, + { + "epoch": 1.8543817263342852, + "grad_norm": 2.5087783336639404, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7394126653671265, + 
"num_tokens": 421940556.0, + "step": 16886 + }, + { + "epoch": 1.8544915440368988, + "grad_norm": 2.2941975593566895, + "learning_rate": 1e-06, + "loss": 0.91, + "mean_token_accuracy": 0.716961145401001, + "num_tokens": 421963878.0, + "step": 16887 + }, + { + "epoch": 1.8546013617395123, + "grad_norm": 2.2165839672088623, + "learning_rate": 1e-06, + "loss": 0.9266, + "mean_token_accuracy": 0.7098185420036316, + "num_tokens": 421992247.0, + "step": 16888 + }, + { + "epoch": 1.854711179442126, + "grad_norm": 2.166727304458618, + "learning_rate": 1e-06, + "loss": 0.8968, + "mean_token_accuracy": 0.7184647917747498, + "num_tokens": 422017998.0, + "step": 16889 + }, + { + "epoch": 1.8548209971447398, + "grad_norm": 2.51926326751709, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7405828833580017, + "num_tokens": 422038242.0, + "step": 16890 + }, + { + "epoch": 1.8549308148473536, + "grad_norm": 2.1586673259735107, + "learning_rate": 1e-06, + "loss": 1.0224, + "mean_token_accuracy": 0.696225643157959, + "num_tokens": 422067700.0, + "step": 16891 + }, + { + "epoch": 1.855040632549967, + "grad_norm": 2.6124603748321533, + "learning_rate": 1e-06, + "loss": 0.8653, + "mean_token_accuracy": 0.7348917126655579, + "num_tokens": 422085240.0, + "step": 16892 + }, + { + "epoch": 1.8551504502525806, + "grad_norm": 2.525947093963623, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7113230228424072, + "num_tokens": 422106919.0, + "step": 16893 + }, + { + "epoch": 1.8552602679551944, + "grad_norm": 2.1951985359191895, + "learning_rate": 1e-06, + "loss": 0.9989, + "mean_token_accuracy": 0.6898971796035767, + "num_tokens": 422135215.0, + "step": 16894 + }, + { + "epoch": 1.8553700856578081, + "grad_norm": 2.462641716003418, + "learning_rate": 1e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.728895902633667, + "num_tokens": 422157119.0, + "step": 16895 + }, + { + "epoch": 1.8554799033604217, + "grad_norm": 2.2800652980804443, + 
"learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7274045944213867, + "num_tokens": 422181482.0, + "step": 16896 + }, + { + "epoch": 1.8555897210630352, + "grad_norm": 2.2604730129241943, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7100304961204529, + "num_tokens": 422207656.0, + "step": 16897 + }, + { + "epoch": 1.855699538765649, + "grad_norm": 2.425562858581543, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7328947186470032, + "num_tokens": 422229695.0, + "step": 16898 + }, + { + "epoch": 1.8558093564682627, + "grad_norm": 2.329432725906372, + "learning_rate": 1e-06, + "loss": 1.0142, + "mean_token_accuracy": 0.6892772912979126, + "num_tokens": 422257859.0, + "step": 16899 + }, + { + "epoch": 1.8559191741708765, + "grad_norm": 2.407177448272705, + "learning_rate": 1e-06, + "loss": 0.9425, + "mean_token_accuracy": 0.710140585899353, + "num_tokens": 422279414.0, + "step": 16900 + }, + { + "epoch": 1.85602899187349, + "grad_norm": 2.2378602027893066, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7254239320755005, + "num_tokens": 422306457.0, + "step": 16901 + }, + { + "epoch": 1.8561388095761036, + "grad_norm": 2.3111326694488525, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.721990168094635, + "num_tokens": 422329113.0, + "step": 16902 + }, + { + "epoch": 1.8562486272787173, + "grad_norm": 2.496213674545288, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.7019565105438232, + "num_tokens": 422353044.0, + "step": 16903 + }, + { + "epoch": 1.856358444981331, + "grad_norm": 2.1061079502105713, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7121080160140991, + "num_tokens": 422381565.0, + "step": 16904 + }, + { + "epoch": 1.8564682626839446, + "grad_norm": 2.206386089324951, + "learning_rate": 1e-06, + "loss": 0.8547, + "mean_token_accuracy": 0.7272540330886841, + "num_tokens": 422408161.0, + "step": 
16905 + }, + { + "epoch": 1.8565780803865584, + "grad_norm": 2.421931028366089, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7187110781669617, + "num_tokens": 422431548.0, + "step": 16906 + }, + { + "epoch": 1.856687898089172, + "grad_norm": 2.3392722606658936, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7308700084686279, + "num_tokens": 422455878.0, + "step": 16907 + }, + { + "epoch": 1.8567977157917857, + "grad_norm": 2.241555690765381, + "learning_rate": 1e-06, + "loss": 0.8501, + "mean_token_accuracy": 0.7277476787567139, + "num_tokens": 422480813.0, + "step": 16908 + }, + { + "epoch": 1.8569075334943994, + "grad_norm": 2.760517120361328, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7001418471336365, + "num_tokens": 422501282.0, + "step": 16909 + }, + { + "epoch": 1.857017351197013, + "grad_norm": 1.9897477626800537, + "learning_rate": 1e-06, + "loss": 0.901, + "mean_token_accuracy": 0.7146238088607788, + "num_tokens": 422533273.0, + "step": 16910 + }, + { + "epoch": 1.8571271688996265, + "grad_norm": 2.5367014408111572, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.717725396156311, + "num_tokens": 422553900.0, + "step": 16911 + }, + { + "epoch": 1.8572369866022402, + "grad_norm": 2.337294816970825, + "learning_rate": 1e-06, + "loss": 0.815, + "mean_token_accuracy": 0.7382726669311523, + "num_tokens": 422575625.0, + "step": 16912 + }, + { + "epoch": 1.857346804304854, + "grad_norm": 2.2312018871307373, + "learning_rate": 1e-06, + "loss": 0.9671, + "mean_token_accuracy": 0.7062438130378723, + "num_tokens": 422601634.0, + "step": 16913 + }, + { + "epoch": 1.8574566220074678, + "grad_norm": 2.510761260986328, + "learning_rate": 1e-06, + "loss": 0.9187, + "mean_token_accuracy": 0.7132723331451416, + "num_tokens": 422622196.0, + "step": 16914 + }, + { + "epoch": 1.8575664397100813, + "grad_norm": 2.215240478515625, + "learning_rate": 1e-06, + "loss": 0.8159, + 
"mean_token_accuracy": 0.7445937395095825, + "num_tokens": 422647075.0, + "step": 16915 + }, + { + "epoch": 1.8576762574126948, + "grad_norm": 2.507490634918213, + "learning_rate": 1e-06, + "loss": 0.9209, + "mean_token_accuracy": 0.7163571119308472, + "num_tokens": 422669616.0, + "step": 16916 + }, + { + "epoch": 1.8577860751153086, + "grad_norm": 2.1052186489105225, + "learning_rate": 1e-06, + "loss": 0.998, + "mean_token_accuracy": 0.6930471658706665, + "num_tokens": 422698591.0, + "step": 16917 + }, + { + "epoch": 1.8578958928179223, + "grad_norm": 2.1549384593963623, + "learning_rate": 1e-06, + "loss": 0.9568, + "mean_token_accuracy": 0.7039268612861633, + "num_tokens": 422728352.0, + "step": 16918 + }, + { + "epoch": 1.8580057105205359, + "grad_norm": 2.3395895957946777, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7029787302017212, + "num_tokens": 422751534.0, + "step": 16919 + }, + { + "epoch": 1.8581155282231496, + "grad_norm": 2.596770763397217, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7372714281082153, + "num_tokens": 422771603.0, + "step": 16920 + }, + { + "epoch": 1.8582253459257632, + "grad_norm": 2.2487263679504395, + "learning_rate": 1e-06, + "loss": 0.999, + "mean_token_accuracy": 0.7021613717079163, + "num_tokens": 422797793.0, + "step": 16921 + }, + { + "epoch": 1.858335163628377, + "grad_norm": 2.1852638721466064, + "learning_rate": 1e-06, + "loss": 0.9492, + "mean_token_accuracy": 0.7034435272216797, + "num_tokens": 422826546.0, + "step": 16922 + }, + { + "epoch": 1.8584449813309907, + "grad_norm": 2.8724288940429688, + "learning_rate": 1e-06, + "loss": 0.8279, + "mean_token_accuracy": 0.7360263466835022, + "num_tokens": 422843982.0, + "step": 16923 + }, + { + "epoch": 1.8585547990336042, + "grad_norm": 2.296633243560791, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7242922782897949, + "num_tokens": 422868341.0, + "step": 16924 + }, + { + "epoch": 
1.8586646167362177, + "grad_norm": 2.098022222518921, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.73642897605896, + "num_tokens": 422894891.0, + "step": 16925 + }, + { + "epoch": 1.8587744344388315, + "grad_norm": 2.0123565196990967, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7248761057853699, + "num_tokens": 422923568.0, + "step": 16926 + }, + { + "epoch": 1.8588842521414453, + "grad_norm": 2.1986083984375, + "learning_rate": 1e-06, + "loss": 0.9533, + "mean_token_accuracy": 0.7118582129478455, + "num_tokens": 422949035.0, + "step": 16927 + }, + { + "epoch": 1.858994069844059, + "grad_norm": 2.39467716217041, + "learning_rate": 1e-06, + "loss": 0.8386, + "mean_token_accuracy": 0.7319933772087097, + "num_tokens": 422971929.0, + "step": 16928 + }, + { + "epoch": 1.8591038875466726, + "grad_norm": 2.1305644512176514, + "learning_rate": 1e-06, + "loss": 1.0038, + "mean_token_accuracy": 0.6871107816696167, + "num_tokens": 423002186.0, + "step": 16929 + }, + { + "epoch": 1.859213705249286, + "grad_norm": 2.312220573425293, + "learning_rate": 1e-06, + "loss": 0.9473, + "mean_token_accuracy": 0.7054402232170105, + "num_tokens": 423026897.0, + "step": 16930 + }, + { + "epoch": 1.8593235229518998, + "grad_norm": 2.768947124481201, + "learning_rate": 1e-06, + "loss": 0.8151, + "mean_token_accuracy": 0.7472380995750427, + "num_tokens": 423046379.0, + "step": 16931 + }, + { + "epoch": 1.8594333406545136, + "grad_norm": 2.5765371322631836, + "learning_rate": 1e-06, + "loss": 0.8715, + "mean_token_accuracy": 0.7254114151000977, + "num_tokens": 423067855.0, + "step": 16932 + }, + { + "epoch": 1.8595431583571271, + "grad_norm": 2.2318356037139893, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7196165323257446, + "num_tokens": 423093911.0, + "step": 16933 + }, + { + "epoch": 1.8596529760597407, + "grad_norm": 2.4326298236846924, + "learning_rate": 1e-06, + "loss": 0.908, + "mean_token_accuracy": 
0.7351105213165283, + "num_tokens": 423116137.0, + "step": 16934 + }, + { + "epoch": 1.8597627937623544, + "grad_norm": 2.250063896179199, + "learning_rate": 1e-06, + "loss": 0.8177, + "mean_token_accuracy": 0.7385991811752319, + "num_tokens": 423139752.0, + "step": 16935 + }, + { + "epoch": 1.8598726114649682, + "grad_norm": 2.2216317653656006, + "learning_rate": 1e-06, + "loss": 0.9591, + "mean_token_accuracy": 0.6975374221801758, + "num_tokens": 423165530.0, + "step": 16936 + }, + { + "epoch": 1.859982429167582, + "grad_norm": 2.4355852603912354, + "learning_rate": 1e-06, + "loss": 0.8954, + "mean_token_accuracy": 0.720312774181366, + "num_tokens": 423188283.0, + "step": 16937 + }, + { + "epoch": 1.8600922468701955, + "grad_norm": 2.527050733566284, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7213932275772095, + "num_tokens": 423210168.0, + "step": 16938 + }, + { + "epoch": 1.860202064572809, + "grad_norm": 2.5270886421203613, + "learning_rate": 1e-06, + "loss": 0.781, + "mean_token_accuracy": 0.750686526298523, + "num_tokens": 423229689.0, + "step": 16939 + }, + { + "epoch": 1.8603118822754228, + "grad_norm": 2.0542542934417725, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7185131907463074, + "num_tokens": 423258114.0, + "step": 16940 + }, + { + "epoch": 1.8604216999780365, + "grad_norm": 2.1750526428222656, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7180390357971191, + "num_tokens": 423283532.0, + "step": 16941 + }, + { + "epoch": 1.8605315176806503, + "grad_norm": 2.341496467590332, + "learning_rate": 1e-06, + "loss": 0.8602, + "mean_token_accuracy": 0.7373711466789246, + "num_tokens": 423305915.0, + "step": 16942 + }, + { + "epoch": 1.8606413353832638, + "grad_norm": 1.9563041925430298, + "learning_rate": 1e-06, + "loss": 0.8801, + "mean_token_accuracy": 0.7234201431274414, + "num_tokens": 423337558.0, + "step": 16943 + }, + { + "epoch": 1.8607511530858774, + "grad_norm": 
2.344963550567627, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7304085493087769, + "num_tokens": 423361205.0, + "step": 16944 + }, + { + "epoch": 1.860860970788491, + "grad_norm": 2.230725049972534, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.7184275984764099, + "num_tokens": 423388152.0, + "step": 16945 + }, + { + "epoch": 1.8609707884911049, + "grad_norm": 2.2462265491485596, + "learning_rate": 1e-06, + "loss": 0.9167, + "mean_token_accuracy": 0.7179995775222778, + "num_tokens": 423413730.0, + "step": 16946 + }, + { + "epoch": 1.8610806061937184, + "grad_norm": 2.212324619293213, + "learning_rate": 1e-06, + "loss": 0.9936, + "mean_token_accuracy": 0.7027032375335693, + "num_tokens": 423441502.0, + "step": 16947 + }, + { + "epoch": 1.861190423896332, + "grad_norm": 2.25105619430542, + "learning_rate": 1e-06, + "loss": 0.8816, + "mean_token_accuracy": 0.7241989374160767, + "num_tokens": 423466518.0, + "step": 16948 + }, + { + "epoch": 1.8613002415989457, + "grad_norm": 1.853614091873169, + "learning_rate": 1e-06, + "loss": 0.9133, + "mean_token_accuracy": 0.7193728685379028, + "num_tokens": 423501398.0, + "step": 16949 + }, + { + "epoch": 1.8614100593015594, + "grad_norm": 2.0865633487701416, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.695056676864624, + "num_tokens": 423531452.0, + "step": 16950 + }, + { + "epoch": 1.8615198770041732, + "grad_norm": 2.243541717529297, + "learning_rate": 1e-06, + "loss": 0.957, + "mean_token_accuracy": 0.7201727628707886, + "num_tokens": 423556328.0, + "step": 16951 + }, + { + "epoch": 1.8616296947067867, + "grad_norm": 2.4631388187408447, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7253719568252563, + "num_tokens": 423577979.0, + "step": 16952 + }, + { + "epoch": 1.8617395124094003, + "grad_norm": 2.3537845611572266, + "learning_rate": 1e-06, + "loss": 0.9461, + "mean_token_accuracy": 0.704688549041748, + "num_tokens": 
423601182.0, + "step": 16953 + }, + { + "epoch": 1.861849330112014, + "grad_norm": 2.2103686332702637, + "learning_rate": 1e-06, + "loss": 0.9208, + "mean_token_accuracy": 0.7136225700378418, + "num_tokens": 423627179.0, + "step": 16954 + }, + { + "epoch": 1.8619591478146278, + "grad_norm": 2.4547066688537598, + "learning_rate": 1e-06, + "loss": 0.8373, + "mean_token_accuracy": 0.7380220293998718, + "num_tokens": 423647939.0, + "step": 16955 + }, + { + "epoch": 1.8620689655172413, + "grad_norm": 2.124616861343384, + "learning_rate": 1e-06, + "loss": 0.9575, + "mean_token_accuracy": 0.6964921951293945, + "num_tokens": 423675977.0, + "step": 16956 + }, + { + "epoch": 1.862178783219855, + "grad_norm": 2.057830572128296, + "learning_rate": 1e-06, + "loss": 0.9636, + "mean_token_accuracy": 0.7023717164993286, + "num_tokens": 423703851.0, + "step": 16957 + }, + { + "epoch": 1.8622886009224686, + "grad_norm": 2.204624891281128, + "learning_rate": 1e-06, + "loss": 1.0299, + "mean_token_accuracy": 0.6977723240852356, + "num_tokens": 423733265.0, + "step": 16958 + }, + { + "epoch": 1.8623984186250824, + "grad_norm": 2.123077392578125, + "learning_rate": 1e-06, + "loss": 0.9522, + "mean_token_accuracy": 0.7090420722961426, + "num_tokens": 423762836.0, + "step": 16959 + }, + { + "epoch": 1.8625082363276961, + "grad_norm": 1.958453893661499, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7126550674438477, + "num_tokens": 423795573.0, + "step": 16960 + }, + { + "epoch": 1.8626180540303097, + "grad_norm": 2.182020902633667, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.721603512763977, + "num_tokens": 423820331.0, + "step": 16961 + }, + { + "epoch": 1.8627278717329232, + "grad_norm": 2.5134215354919434, + "learning_rate": 1e-06, + "loss": 0.9024, + "mean_token_accuracy": 0.7134842276573181, + "num_tokens": 423841634.0, + "step": 16962 + }, + { + "epoch": 1.862837689435537, + "grad_norm": 2.2049238681793213, + "learning_rate": 
1e-06, + "loss": 1.0001, + "mean_token_accuracy": 0.6916244626045227, + "num_tokens": 423868711.0, + "step": 16963 + }, + { + "epoch": 1.8629475071381507, + "grad_norm": 2.469325065612793, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7053244113922119, + "num_tokens": 423891646.0, + "step": 16964 + }, + { + "epoch": 1.8630573248407645, + "grad_norm": 2.08129620552063, + "learning_rate": 1e-06, + "loss": 0.9566, + "mean_token_accuracy": 0.7074594497680664, + "num_tokens": 423922448.0, + "step": 16965 + }, + { + "epoch": 1.863167142543378, + "grad_norm": 2.11856746673584, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7336399555206299, + "num_tokens": 423949688.0, + "step": 16966 + }, + { + "epoch": 1.8632769602459915, + "grad_norm": 2.720329523086548, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7270058393478394, + "num_tokens": 423968820.0, + "step": 16967 + }, + { + "epoch": 1.8633867779486053, + "grad_norm": 2.2399914264678955, + "learning_rate": 1e-06, + "loss": 0.98, + "mean_token_accuracy": 0.7093034982681274, + "num_tokens": 423996543.0, + "step": 16968 + }, + { + "epoch": 1.863496595651219, + "grad_norm": 2.0975944995880127, + "learning_rate": 1e-06, + "loss": 0.9838, + "mean_token_accuracy": 0.6963925361633301, + "num_tokens": 424025476.0, + "step": 16969 + }, + { + "epoch": 1.8636064133538326, + "grad_norm": 2.0783588886260986, + "learning_rate": 1e-06, + "loss": 0.9898, + "mean_token_accuracy": 0.6955951452255249, + "num_tokens": 424055796.0, + "step": 16970 + }, + { + "epoch": 1.8637162310564463, + "grad_norm": 2.4365594387054443, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7180529832839966, + "num_tokens": 424077933.0, + "step": 16971 + }, + { + "epoch": 1.8638260487590599, + "grad_norm": 2.3261637687683105, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7261658906936646, + "num_tokens": 424101345.0, + "step": 16972 + }, + { + 
"epoch": 1.8639358664616736, + "grad_norm": 2.330609083175659, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7309005856513977, + "num_tokens": 424124316.0, + "step": 16973 + }, + { + "epoch": 1.8640456841642874, + "grad_norm": 2.119511842727661, + "learning_rate": 1e-06, + "loss": 0.9503, + "mean_token_accuracy": 0.708044171333313, + "num_tokens": 424153853.0, + "step": 16974 + }, + { + "epoch": 1.864155501866901, + "grad_norm": 2.083491802215576, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7184805274009705, + "num_tokens": 424182767.0, + "step": 16975 + }, + { + "epoch": 1.8642653195695145, + "grad_norm": 2.0300731658935547, + "learning_rate": 1e-06, + "loss": 0.8777, + "mean_token_accuracy": 0.7199109196662903, + "num_tokens": 424210484.0, + "step": 16976 + }, + { + "epoch": 1.8643751372721282, + "grad_norm": 2.2733500003814697, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7245035171508789, + "num_tokens": 424233701.0, + "step": 16977 + }, + { + "epoch": 1.864484954974742, + "grad_norm": 2.359262704849243, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7263609170913696, + "num_tokens": 424257562.0, + "step": 16978 + }, + { + "epoch": 1.8645947726773557, + "grad_norm": 2.809162139892578, + "learning_rate": 1e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7445456981658936, + "num_tokens": 424274545.0, + "step": 16979 + }, + { + "epoch": 1.8647045903799693, + "grad_norm": 2.2304205894470215, + "learning_rate": 1e-06, + "loss": 0.9439, + "mean_token_accuracy": 0.703126847743988, + "num_tokens": 424300362.0, + "step": 16980 + }, + { + "epoch": 1.8648144080825828, + "grad_norm": 2.0978212356567383, + "learning_rate": 1e-06, + "loss": 0.8641, + "mean_token_accuracy": 0.7258060574531555, + "num_tokens": 424327002.0, + "step": 16981 + }, + { + "epoch": 1.8649242257851966, + "grad_norm": 2.3468496799468994, + "learning_rate": 1e-06, + "loss": 0.9115, + 
"mean_token_accuracy": 0.7075914144515991, + "num_tokens": 424351957.0, + "step": 16982 + }, + { + "epoch": 1.8650340434878103, + "grad_norm": 2.4487733840942383, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.7086857557296753, + "num_tokens": 424374355.0, + "step": 16983 + }, + { + "epoch": 1.8651438611904239, + "grad_norm": 2.249267816543579, + "learning_rate": 1e-06, + "loss": 0.8601, + "mean_token_accuracy": 0.7273313403129578, + "num_tokens": 424399088.0, + "step": 16984 + }, + { + "epoch": 1.8652536788930374, + "grad_norm": 2.2862114906311035, + "learning_rate": 1e-06, + "loss": 0.8678, + "mean_token_accuracy": 0.7266085743904114, + "num_tokens": 424422362.0, + "step": 16985 + }, + { + "epoch": 1.8653634965956511, + "grad_norm": 2.171537160873413, + "learning_rate": 1e-06, + "loss": 0.9623, + "mean_token_accuracy": 0.6981760263442993, + "num_tokens": 424451758.0, + "step": 16986 + }, + { + "epoch": 1.865473314298265, + "grad_norm": 2.1579980850219727, + "learning_rate": 1e-06, + "loss": 0.9195, + "mean_token_accuracy": 0.7187150716781616, + "num_tokens": 424480213.0, + "step": 16987 + }, + { + "epoch": 1.8655831320008787, + "grad_norm": 2.3202102184295654, + "learning_rate": 1e-06, + "loss": 0.9677, + "mean_token_accuracy": 0.7073355913162231, + "num_tokens": 424506047.0, + "step": 16988 + }, + { + "epoch": 1.8656929497034922, + "grad_norm": 2.348501205444336, + "learning_rate": 1e-06, + "loss": 0.8938, + "mean_token_accuracy": 0.7199770212173462, + "num_tokens": 424530189.0, + "step": 16989 + }, + { + "epoch": 1.8658027674061057, + "grad_norm": 2.486988067626953, + "learning_rate": 1e-06, + "loss": 0.8771, + "mean_token_accuracy": 0.7209768891334534, + "num_tokens": 424551858.0, + "step": 16990 + }, + { + "epoch": 1.8659125851087195, + "grad_norm": 2.3308563232421875, + "learning_rate": 1e-06, + "loss": 0.8869, + "mean_token_accuracy": 0.7204763889312744, + "num_tokens": 424577487.0, + "step": 16991 + }, + { + "epoch": 
1.8660224028113332, + "grad_norm": 2.2180895805358887, + "learning_rate": 1e-06, + "loss": 0.9929, + "mean_token_accuracy": 0.702022910118103, + "num_tokens": 424605849.0, + "step": 16992 + }, + { + "epoch": 1.866132220513947, + "grad_norm": 2.514115810394287, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.737045168876648, + "num_tokens": 424627201.0, + "step": 16993 + }, + { + "epoch": 1.8662420382165605, + "grad_norm": 2.156458854675293, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7278506755828857, + "num_tokens": 424653978.0, + "step": 16994 + }, + { + "epoch": 1.866351855919174, + "grad_norm": 2.590928792953491, + "learning_rate": 1e-06, + "loss": 0.8014, + "mean_token_accuracy": 0.7490622997283936, + "num_tokens": 424673376.0, + "step": 16995 + }, + { + "epoch": 1.8664616736217878, + "grad_norm": 2.4597134590148926, + "learning_rate": 1e-06, + "loss": 0.9122, + "mean_token_accuracy": 0.7267520427703857, + "num_tokens": 424695867.0, + "step": 16996 + }, + { + "epoch": 1.8665714913244016, + "grad_norm": 2.468761682510376, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7385760545730591, + "num_tokens": 424716872.0, + "step": 16997 + }, + { + "epoch": 1.8666813090270151, + "grad_norm": 2.208298444747925, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7187998294830322, + "num_tokens": 424744091.0, + "step": 16998 + }, + { + "epoch": 1.8667911267296287, + "grad_norm": 2.567166566848755, + "learning_rate": 1e-06, + "loss": 0.8099, + "mean_token_accuracy": 0.7375512719154358, + "num_tokens": 424764070.0, + "step": 16999 + }, + { + "epoch": 1.8669009444322424, + "grad_norm": 2.33414888381958, + "learning_rate": 1e-06, + "loss": 0.8194, + "mean_token_accuracy": 0.7492227554321289, + "num_tokens": 424786009.0, + "step": 17000 + }, + { + "epoch": 1.8670107621348562, + "grad_norm": 2.164153575897217, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 
0.7038823962211609, + "num_tokens": 424811599.0, + "step": 17001 + }, + { + "epoch": 1.86712057983747, + "grad_norm": 2.386441707611084, + "learning_rate": 1e-06, + "loss": 0.797, + "mean_token_accuracy": 0.7488256692886353, + "num_tokens": 424832492.0, + "step": 17002 + }, + { + "epoch": 1.8672303975400835, + "grad_norm": 2.257373809814453, + "learning_rate": 1e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.740520715713501, + "num_tokens": 424856403.0, + "step": 17003 + }, + { + "epoch": 1.867340215242697, + "grad_norm": 2.237238883972168, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7135804891586304, + "num_tokens": 424880661.0, + "step": 17004 + }, + { + "epoch": 1.8674500329453108, + "grad_norm": 2.2538228034973145, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7339103817939758, + "num_tokens": 424903286.0, + "step": 17005 + }, + { + "epoch": 1.8675598506479245, + "grad_norm": 2.211611032485962, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7156223058700562, + "num_tokens": 424927143.0, + "step": 17006 + }, + { + "epoch": 1.8676696683505383, + "grad_norm": 2.399341106414795, + "learning_rate": 1e-06, + "loss": 0.9003, + "mean_token_accuracy": 0.7204519510269165, + "num_tokens": 424951504.0, + "step": 17007 + }, + { + "epoch": 1.8677794860531518, + "grad_norm": 2.3999884128570557, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7214626669883728, + "num_tokens": 424974308.0, + "step": 17008 + }, + { + "epoch": 1.8678893037557653, + "grad_norm": 2.5707850456237793, + "learning_rate": 1e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7177243232727051, + "num_tokens": 424994568.0, + "step": 17009 + }, + { + "epoch": 1.867999121458379, + "grad_norm": 2.350853681564331, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.7139196395874023, + "num_tokens": 425018627.0, + "step": 17010 + }, + { + "epoch": 1.8681089391609929, + "grad_norm": 
2.3703887462615967, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7348614931106567, + "num_tokens": 425041256.0, + "step": 17011 + }, + { + "epoch": 1.8682187568636064, + "grad_norm": 2.2454981803894043, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7277371883392334, + "num_tokens": 425067524.0, + "step": 17012 + }, + { + "epoch": 1.86832857456622, + "grad_norm": 2.2822628021240234, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7213916778564453, + "num_tokens": 425093021.0, + "step": 17013 + }, + { + "epoch": 1.8684383922688337, + "grad_norm": 2.363703489303589, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7190698981285095, + "num_tokens": 425116876.0, + "step": 17014 + }, + { + "epoch": 1.8685482099714474, + "grad_norm": 2.5053560733795166, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7204070091247559, + "num_tokens": 425138225.0, + "step": 17015 + }, + { + "epoch": 1.8686580276740612, + "grad_norm": 2.2400693893432617, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7071173787117004, + "num_tokens": 425164450.0, + "step": 17016 + }, + { + "epoch": 1.8687678453766747, + "grad_norm": 2.0073678493499756, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7130439281463623, + "num_tokens": 425194834.0, + "step": 17017 + }, + { + "epoch": 1.8688776630792883, + "grad_norm": 2.1279456615448, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7207393050193787, + "num_tokens": 425223113.0, + "step": 17018 + }, + { + "epoch": 1.868987480781902, + "grad_norm": 1.9343969821929932, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7040334343910217, + "num_tokens": 425256562.0, + "step": 17019 + }, + { + "epoch": 1.8690972984845158, + "grad_norm": 2.3586742877960205, + "learning_rate": 1e-06, + "loss": 0.8029, + "mean_token_accuracy": 0.7518816590309143, + "num_tokens": 
425278802.0, + "step": 17020 + }, + { + "epoch": 1.8692071161871293, + "grad_norm": 2.238057851791382, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7247364521026611, + "num_tokens": 425302437.0, + "step": 17021 + }, + { + "epoch": 1.869316933889743, + "grad_norm": 2.0847296714782715, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7262539267539978, + "num_tokens": 425328115.0, + "step": 17022 + }, + { + "epoch": 1.8694267515923566, + "grad_norm": 2.1604905128479004, + "learning_rate": 1e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.7276933193206787, + "num_tokens": 425353657.0, + "step": 17023 + }, + { + "epoch": 1.8695365692949704, + "grad_norm": 2.210937261581421, + "learning_rate": 1e-06, + "loss": 0.8861, + "mean_token_accuracy": 0.7161774039268494, + "num_tokens": 425380101.0, + "step": 17024 + }, + { + "epoch": 1.8696463869975841, + "grad_norm": 2.4753098487854004, + "learning_rate": 1e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7450885772705078, + "num_tokens": 425402236.0, + "step": 17025 + }, + { + "epoch": 1.8697562047001977, + "grad_norm": 2.2415308952331543, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7357337474822998, + "num_tokens": 425425197.0, + "step": 17026 + }, + { + "epoch": 1.8698660224028112, + "grad_norm": 2.2906527519226074, + "learning_rate": 1e-06, + "loss": 0.9657, + "mean_token_accuracy": 0.6992014646530151, + "num_tokens": 425450453.0, + "step": 17027 + }, + { + "epoch": 1.869975840105425, + "grad_norm": 2.376429557800293, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7271482944488525, + "num_tokens": 425472648.0, + "step": 17028 + }, + { + "epoch": 1.8700856578080387, + "grad_norm": 2.4683403968811035, + "learning_rate": 1e-06, + "loss": 0.9534, + "mean_token_accuracy": 0.7020113468170166, + "num_tokens": 425494880.0, + "step": 17029 + }, + { + "epoch": 1.8701954755106525, + "grad_norm": 2.250358819961548, + "learning_rate": 
1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7143399715423584, + "num_tokens": 425521963.0, + "step": 17030 + }, + { + "epoch": 1.870305293213266, + "grad_norm": 3.436063528060913, + "learning_rate": 1e-06, + "loss": 0.7417, + "mean_token_accuracy": 0.7516854405403137, + "num_tokens": 425537056.0, + "step": 17031 + }, + { + "epoch": 1.8704151109158795, + "grad_norm": 2.1999242305755615, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7261039614677429, + "num_tokens": 425561697.0, + "step": 17032 + }, + { + "epoch": 1.8705249286184933, + "grad_norm": 2.4891676902770996, + "learning_rate": 1e-06, + "loss": 0.8709, + "mean_token_accuracy": 0.7315492033958435, + "num_tokens": 425583419.0, + "step": 17033 + }, + { + "epoch": 1.870634746321107, + "grad_norm": 2.088932991027832, + "learning_rate": 1e-06, + "loss": 0.899, + "mean_token_accuracy": 0.7206073999404907, + "num_tokens": 425612556.0, + "step": 17034 + }, + { + "epoch": 1.8707445640237206, + "grad_norm": 2.317692518234253, + "learning_rate": 1e-06, + "loss": 0.9666, + "mean_token_accuracy": 0.7159915566444397, + "num_tokens": 425636801.0, + "step": 17035 + }, + { + "epoch": 1.8708543817263343, + "grad_norm": 2.170844793319702, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7070187330245972, + "num_tokens": 425664607.0, + "step": 17036 + }, + { + "epoch": 1.8709641994289479, + "grad_norm": 2.2683844566345215, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7065112590789795, + "num_tokens": 425688279.0, + "step": 17037 + }, + { + "epoch": 1.8710740171315616, + "grad_norm": 2.4912636280059814, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7407963275909424, + "num_tokens": 425710860.0, + "step": 17038 + }, + { + "epoch": 1.8711838348341754, + "grad_norm": 2.132838249206543, + "learning_rate": 1e-06, + "loss": 0.8972, + "mean_token_accuracy": 0.7251365184783936, + "num_tokens": 425736303.0, + "step": 17039 + }, + { + 
"epoch": 1.871293652536789, + "grad_norm": 2.231372594833374, + "learning_rate": 1e-06, + "loss": 0.9523, + "mean_token_accuracy": 0.7012166976928711, + "num_tokens": 425762442.0, + "step": 17040 + }, + { + "epoch": 1.8714034702394025, + "grad_norm": 2.654637336730957, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.746062159538269, + "num_tokens": 425780827.0, + "step": 17041 + }, + { + "epoch": 1.8715132879420162, + "grad_norm": 2.245445728302002, + "learning_rate": 1e-06, + "loss": 0.8393, + "mean_token_accuracy": 0.7359991073608398, + "num_tokens": 425805741.0, + "step": 17042 + }, + { + "epoch": 1.87162310564463, + "grad_norm": 2.1300830841064453, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7401658892631531, + "num_tokens": 425831332.0, + "step": 17043 + }, + { + "epoch": 1.8717329233472437, + "grad_norm": 2.139718532562256, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7180739641189575, + "num_tokens": 425857258.0, + "step": 17044 + }, + { + "epoch": 1.8718427410498573, + "grad_norm": 2.4223358631134033, + "learning_rate": 1e-06, + "loss": 0.7639, + "mean_token_accuracy": 0.7588120698928833, + "num_tokens": 425878357.0, + "step": 17045 + }, + { + "epoch": 1.8719525587524708, + "grad_norm": 2.060502767562866, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.7040696144104004, + "num_tokens": 425912646.0, + "step": 17046 + }, + { + "epoch": 1.8720623764550846, + "grad_norm": 2.4135119915008545, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.7196186184883118, + "num_tokens": 425933208.0, + "step": 17047 + }, + { + "epoch": 1.8721721941576983, + "grad_norm": 2.1598472595214844, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7256457805633545, + "num_tokens": 425958658.0, + "step": 17048 + }, + { + "epoch": 1.8722820118603118, + "grad_norm": 2.2535510063171387, + "learning_rate": 1e-06, + "loss": 0.9471, + 
"mean_token_accuracy": 0.7018030881881714, + "num_tokens": 425985276.0, + "step": 17049 + }, + { + "epoch": 1.8723918295629254, + "grad_norm": 2.582083225250244, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.7123878598213196, + "num_tokens": 426007279.0, + "step": 17050 + }, + { + "epoch": 1.8725016472655391, + "grad_norm": 2.292215347290039, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7216060161590576, + "num_tokens": 426031952.0, + "step": 17051 + }, + { + "epoch": 1.872611464968153, + "grad_norm": 2.181213140487671, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7009469866752625, + "num_tokens": 426059990.0, + "step": 17052 + }, + { + "epoch": 1.8727212826707667, + "grad_norm": 2.1769556999206543, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.7214915156364441, + "num_tokens": 426085143.0, + "step": 17053 + }, + { + "epoch": 1.8728311003733802, + "grad_norm": 2.174806594848633, + "learning_rate": 1e-06, + "loss": 0.9497, + "mean_token_accuracy": 0.7042864561080933, + "num_tokens": 426112813.0, + "step": 17054 + }, + { + "epoch": 1.8729409180759937, + "grad_norm": 2.1999971866607666, + "learning_rate": 1e-06, + "loss": 0.9701, + "mean_token_accuracy": 0.7056387662887573, + "num_tokens": 426140437.0, + "step": 17055 + }, + { + "epoch": 1.8730507357786075, + "grad_norm": 2.3201229572296143, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7180432677268982, + "num_tokens": 426164401.0, + "step": 17056 + }, + { + "epoch": 1.8731605534812212, + "grad_norm": 2.204580545425415, + "learning_rate": 1e-06, + "loss": 0.9265, + "mean_token_accuracy": 0.7232406139373779, + "num_tokens": 426191299.0, + "step": 17057 + }, + { + "epoch": 1.873270371183835, + "grad_norm": 2.8215560913085938, + "learning_rate": 1e-06, + "loss": 0.7979, + "mean_token_accuracy": 0.7456988096237183, + "num_tokens": 426207782.0, + "step": 17058 + }, + { + "epoch": 
1.8733801888864485, + "grad_norm": 2.7844150066375732, + "learning_rate": 1e-06, + "loss": 0.8355, + "mean_token_accuracy": 0.732859194278717, + "num_tokens": 426225674.0, + "step": 17059 + }, + { + "epoch": 1.873490006589062, + "grad_norm": 2.0184524059295654, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7142782211303711, + "num_tokens": 426257699.0, + "step": 17060 + }, + { + "epoch": 1.8735998242916758, + "grad_norm": 2.0041685104370117, + "learning_rate": 1e-06, + "loss": 1.0241, + "mean_token_accuracy": 0.6878381967544556, + "num_tokens": 426292619.0, + "step": 17061 + }, + { + "epoch": 1.8737096419942896, + "grad_norm": 2.3778560161590576, + "learning_rate": 1e-06, + "loss": 0.8341, + "mean_token_accuracy": 0.7360769510269165, + "num_tokens": 426314534.0, + "step": 17062 + }, + { + "epoch": 1.8738194596969031, + "grad_norm": 2.365981340408325, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.71177738904953, + "num_tokens": 426338783.0, + "step": 17063 + }, + { + "epoch": 1.8739292773995166, + "grad_norm": 2.104475975036621, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7076547741889954, + "num_tokens": 426368406.0, + "step": 17064 + }, + { + "epoch": 1.8740390951021304, + "grad_norm": 2.2025694847106934, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7451024055480957, + "num_tokens": 426393119.0, + "step": 17065 + }, + { + "epoch": 1.8741489128047442, + "grad_norm": 2.6490914821624756, + "learning_rate": 1e-06, + "loss": 0.8378, + "mean_token_accuracy": 0.7357795834541321, + "num_tokens": 426411647.0, + "step": 17066 + }, + { + "epoch": 1.874258730507358, + "grad_norm": 2.1626110076904297, + "learning_rate": 1e-06, + "loss": 0.9696, + "mean_token_accuracy": 0.6966633200645447, + "num_tokens": 426438247.0, + "step": 17067 + }, + { + "epoch": 1.8743685482099715, + "grad_norm": 2.1023480892181396, + "learning_rate": 1e-06, + "loss": 0.8928, + "mean_token_accuracy": 
0.7285687923431396, + "num_tokens": 426467546.0, + "step": 17068 + }, + { + "epoch": 1.874478365912585, + "grad_norm": 2.1860592365264893, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7271173000335693, + "num_tokens": 426493633.0, + "step": 17069 + }, + { + "epoch": 1.8745881836151987, + "grad_norm": 2.1783390045166016, + "learning_rate": 1e-06, + "loss": 0.8732, + "mean_token_accuracy": 0.7312703132629395, + "num_tokens": 426519487.0, + "step": 17070 + }, + { + "epoch": 1.8746980013178125, + "grad_norm": 2.308969020843506, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7208444476127625, + "num_tokens": 426545701.0, + "step": 17071 + }, + { + "epoch": 1.8748078190204263, + "grad_norm": 2.0553038120269775, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7072532773017883, + "num_tokens": 426576813.0, + "step": 17072 + }, + { + "epoch": 1.8749176367230398, + "grad_norm": 2.2340145111083984, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7212371230125427, + "num_tokens": 426601813.0, + "step": 17073 + }, + { + "epoch": 1.8750274544256533, + "grad_norm": 2.8048465251922607, + "learning_rate": 1e-06, + "loss": 0.9105, + "mean_token_accuracy": 0.7146493196487427, + "num_tokens": 426620409.0, + "step": 17074 + }, + { + "epoch": 1.875137272128267, + "grad_norm": 2.345252513885498, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7268608808517456, + "num_tokens": 426643464.0, + "step": 17075 + }, + { + "epoch": 1.8752470898308808, + "grad_norm": 2.257052421569824, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7143991589546204, + "num_tokens": 426668515.0, + "step": 17076 + }, + { + "epoch": 1.8753569075334944, + "grad_norm": 2.352324962615967, + "learning_rate": 1e-06, + "loss": 0.9553, + "mean_token_accuracy": 0.7131549119949341, + "num_tokens": 426691834.0, + "step": 17077 + }, + { + "epoch": 1.875466725236108, + "grad_norm": 
2.2946548461914062, + "learning_rate": 1e-06, + "loss": 0.9549, + "mean_token_accuracy": 0.715120792388916, + "num_tokens": 426716567.0, + "step": 17078 + }, + { + "epoch": 1.8755765429387217, + "grad_norm": 2.033568859100342, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.7289280891418457, + "num_tokens": 426748012.0, + "step": 17079 + }, + { + "epoch": 1.8756863606413354, + "grad_norm": 2.415148973464966, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7396687269210815, + "num_tokens": 426771999.0, + "step": 17080 + }, + { + "epoch": 1.8757961783439492, + "grad_norm": 2.450176239013672, + "learning_rate": 1e-06, + "loss": 0.9197, + "mean_token_accuracy": 0.718859076499939, + "num_tokens": 426797774.0, + "step": 17081 + }, + { + "epoch": 1.8759059960465627, + "grad_norm": 2.2836036682128906, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7321790456771851, + "num_tokens": 426823527.0, + "step": 17082 + }, + { + "epoch": 1.8760158137491763, + "grad_norm": 2.581232786178589, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7239222526550293, + "num_tokens": 426844239.0, + "step": 17083 + }, + { + "epoch": 1.87612563145179, + "grad_norm": 2.186666250228882, + "learning_rate": 1e-06, + "loss": 0.8937, + "mean_token_accuracy": 0.7259451746940613, + "num_tokens": 426869467.0, + "step": 17084 + }, + { + "epoch": 1.8762354491544038, + "grad_norm": 2.201340436935425, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7300026416778564, + "num_tokens": 426896785.0, + "step": 17085 + }, + { + "epoch": 1.8763452668570173, + "grad_norm": 2.031325578689575, + "learning_rate": 1e-06, + "loss": 0.9567, + "mean_token_accuracy": 0.7088053822517395, + "num_tokens": 426927943.0, + "step": 17086 + }, + { + "epoch": 1.876455084559631, + "grad_norm": 2.309751510620117, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7285666465759277, + "num_tokens": 
426951857.0, + "step": 17087 + }, + { + "epoch": 1.8765649022622446, + "grad_norm": 2.473375082015991, + "learning_rate": 1e-06, + "loss": 0.9188, + "mean_token_accuracy": 0.724623441696167, + "num_tokens": 426972559.0, + "step": 17088 + }, + { + "epoch": 1.8766747199648584, + "grad_norm": 2.435227632522583, + "learning_rate": 1e-06, + "loss": 0.9397, + "mean_token_accuracy": 0.7094674706459045, + "num_tokens": 426995548.0, + "step": 17089 + }, + { + "epoch": 1.876784537667472, + "grad_norm": 2.2104992866516113, + "learning_rate": 1e-06, + "loss": 0.9202, + "mean_token_accuracy": 0.7164901494979858, + "num_tokens": 427020195.0, + "step": 17090 + }, + { + "epoch": 1.8768943553700856, + "grad_norm": 2.290419340133667, + "learning_rate": 1e-06, + "loss": 0.8352, + "mean_token_accuracy": 0.7422890663146973, + "num_tokens": 427043236.0, + "step": 17091 + }, + { + "epoch": 1.8770041730726992, + "grad_norm": 2.2497153282165527, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7106362581253052, + "num_tokens": 427068486.0, + "step": 17092 + }, + { + "epoch": 1.877113990775313, + "grad_norm": 2.1777873039245605, + "learning_rate": 1e-06, + "loss": 1.007, + "mean_token_accuracy": 0.6862208247184753, + "num_tokens": 427094420.0, + "step": 17093 + }, + { + "epoch": 1.8772238084779267, + "grad_norm": 2.8121230602264404, + "learning_rate": 1e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7318940758705139, + "num_tokens": 427113030.0, + "step": 17094 + }, + { + "epoch": 1.8773336261805404, + "grad_norm": 2.2021279335021973, + "learning_rate": 1e-06, + "loss": 0.8421, + "mean_token_accuracy": 0.7507927417755127, + "num_tokens": 427136344.0, + "step": 17095 + }, + { + "epoch": 1.877443443883154, + "grad_norm": 2.2263526916503906, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7230167984962463, + "num_tokens": 427161147.0, + "step": 17096 + }, + { + "epoch": 1.8775532615857675, + "grad_norm": 2.097282648086548, + "learning_rate": 
1e-06, + "loss": 0.8574, + "mean_token_accuracy": 0.7302277088165283, + "num_tokens": 427187833.0, + "step": 17097 + }, + { + "epoch": 1.8776630792883813, + "grad_norm": 2.4584391117095947, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.729804277420044, + "num_tokens": 427207891.0, + "step": 17098 + }, + { + "epoch": 1.877772896990995, + "grad_norm": 2.407905101776123, + "learning_rate": 1e-06, + "loss": 0.8168, + "mean_token_accuracy": 0.744880199432373, + "num_tokens": 427228803.0, + "step": 17099 + }, + { + "epoch": 1.8778827146936086, + "grad_norm": 1.908611536026001, + "learning_rate": 1e-06, + "loss": 0.9599, + "mean_token_accuracy": 0.7072963714599609, + "num_tokens": 427263326.0, + "step": 17100 + }, + { + "epoch": 1.8779925323962223, + "grad_norm": 2.369649648666382, + "learning_rate": 1e-06, + "loss": 0.9204, + "mean_token_accuracy": 0.7079060077667236, + "num_tokens": 427288159.0, + "step": 17101 + }, + { + "epoch": 1.8781023500988359, + "grad_norm": 2.380156993865967, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7168070077896118, + "num_tokens": 427311160.0, + "step": 17102 + }, + { + "epoch": 1.8782121678014496, + "grad_norm": 2.2309131622314453, + "learning_rate": 1e-06, + "loss": 0.9798, + "mean_token_accuracy": 0.6971310377120972, + "num_tokens": 427337413.0, + "step": 17103 + }, + { + "epoch": 1.8783219855040634, + "grad_norm": 2.6226139068603516, + "learning_rate": 1e-06, + "loss": 0.7966, + "mean_token_accuracy": 0.7444430589675903, + "num_tokens": 427356980.0, + "step": 17104 + }, + { + "epoch": 1.878431803206677, + "grad_norm": 2.1373209953308105, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7051872611045837, + "num_tokens": 427385408.0, + "step": 17105 + }, + { + "epoch": 1.8785416209092904, + "grad_norm": 1.9247095584869385, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7208826541900635, + "num_tokens": 427419709.0, + "step": 17106 + }, + { + 
"epoch": 1.8786514386119042, + "grad_norm": 2.4179513454437256, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7223951816558838, + "num_tokens": 427444222.0, + "step": 17107 + }, + { + "epoch": 1.878761256314518, + "grad_norm": 2.5394105911254883, + "learning_rate": 1e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7265129089355469, + "num_tokens": 427463643.0, + "step": 17108 + }, + { + "epoch": 1.8788710740171317, + "grad_norm": 2.1599185466766357, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7205412983894348, + "num_tokens": 427491322.0, + "step": 17109 + }, + { + "epoch": 1.8789808917197452, + "grad_norm": 1.908378005027771, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7127373218536377, + "num_tokens": 427524968.0, + "step": 17110 + }, + { + "epoch": 1.8790907094223588, + "grad_norm": 2.026362657546997, + "learning_rate": 1e-06, + "loss": 0.9314, + "mean_token_accuracy": 0.7058080434799194, + "num_tokens": 427553934.0, + "step": 17111 + }, + { + "epoch": 1.8792005271249725, + "grad_norm": 2.393723487854004, + "learning_rate": 1e-06, + "loss": 0.9084, + "mean_token_accuracy": 0.7265956401824951, + "num_tokens": 427577545.0, + "step": 17112 + }, + { + "epoch": 1.8793103448275863, + "grad_norm": 2.2829504013061523, + "learning_rate": 1e-06, + "loss": 0.8978, + "mean_token_accuracy": 0.7275830507278442, + "num_tokens": 427601396.0, + "step": 17113 + }, + { + "epoch": 1.8794201625301998, + "grad_norm": 2.2989132404327393, + "learning_rate": 1e-06, + "loss": 0.9776, + "mean_token_accuracy": 0.7005569338798523, + "num_tokens": 427627828.0, + "step": 17114 + }, + { + "epoch": 1.8795299802328134, + "grad_norm": 2.5685887336730957, + "learning_rate": 1e-06, + "loss": 0.8568, + "mean_token_accuracy": 0.7225111722946167, + "num_tokens": 427647611.0, + "step": 17115 + }, + { + "epoch": 1.8796397979354271, + "grad_norm": 2.070950508117676, + "learning_rate": 1e-06, + "loss": 0.881, + 
"mean_token_accuracy": 0.7212966680526733, + "num_tokens": 427675430.0, + "step": 17116 + }, + { + "epoch": 1.8797496156380409, + "grad_norm": 2.4609110355377197, + "learning_rate": 1e-06, + "loss": 0.9584, + "mean_token_accuracy": 0.7054592370986938, + "num_tokens": 427697371.0, + "step": 17117 + }, + { + "epoch": 1.8798594333406546, + "grad_norm": 2.376410961151123, + "learning_rate": 1e-06, + "loss": 0.8057, + "mean_token_accuracy": 0.7491271495819092, + "num_tokens": 427718681.0, + "step": 17118 + }, + { + "epoch": 1.8799692510432682, + "grad_norm": 2.2301106452941895, + "learning_rate": 1e-06, + "loss": 0.9687, + "mean_token_accuracy": 0.7039657831192017, + "num_tokens": 427745420.0, + "step": 17119 + }, + { + "epoch": 1.8800790687458817, + "grad_norm": 2.4085946083068848, + "learning_rate": 1e-06, + "loss": 0.886, + "mean_token_accuracy": 0.7232196927070618, + "num_tokens": 427768052.0, + "step": 17120 + }, + { + "epoch": 1.8801888864484955, + "grad_norm": 2.5742580890655518, + "learning_rate": 1e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7445570230484009, + "num_tokens": 427786576.0, + "step": 17121 + }, + { + "epoch": 1.8802987041511092, + "grad_norm": 1.9907914400100708, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7275265455245972, + "num_tokens": 427816013.0, + "step": 17122 + }, + { + "epoch": 1.880408521853723, + "grad_norm": 2.1870009899139404, + "learning_rate": 1e-06, + "loss": 0.9561, + "mean_token_accuracy": 0.7125148773193359, + "num_tokens": 427842746.0, + "step": 17123 + }, + { + "epoch": 1.8805183395563365, + "grad_norm": 2.183727264404297, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7129524946212769, + "num_tokens": 427868441.0, + "step": 17124 + }, + { + "epoch": 1.88062815725895, + "grad_norm": 2.032503366470337, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7076528668403625, + "num_tokens": 427897735.0, + "step": 17125 + }, + { + "epoch": 
1.8807379749615638, + "grad_norm": 1.972582459449768, + "learning_rate": 1e-06, + "loss": 0.9173, + "mean_token_accuracy": 0.7159439921379089, + "num_tokens": 427928771.0, + "step": 17126 + }, + { + "epoch": 1.8808477926641776, + "grad_norm": 2.400616407394409, + "learning_rate": 1e-06, + "loss": 0.8856, + "mean_token_accuracy": 0.7240958213806152, + "num_tokens": 427951866.0, + "step": 17127 + }, + { + "epoch": 1.880957610366791, + "grad_norm": 2.2454493045806885, + "learning_rate": 1e-06, + "loss": 0.9361, + "mean_token_accuracy": 0.7081474661827087, + "num_tokens": 427979442.0, + "step": 17128 + }, + { + "epoch": 1.8810674280694046, + "grad_norm": 2.313045024871826, + "learning_rate": 1e-06, + "loss": 0.8682, + "mean_token_accuracy": 0.7284696102142334, + "num_tokens": 428003946.0, + "step": 17129 + }, + { + "epoch": 1.8811772457720184, + "grad_norm": 2.3510220050811768, + "learning_rate": 1e-06, + "loss": 0.808, + "mean_token_accuracy": 0.7418708801269531, + "num_tokens": 428024928.0, + "step": 17130 + }, + { + "epoch": 1.8812870634746321, + "grad_norm": 2.2996160984039307, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7189295291900635, + "num_tokens": 428051222.0, + "step": 17131 + }, + { + "epoch": 1.881396881177246, + "grad_norm": 2.3739981651306152, + "learning_rate": 1e-06, + "loss": 0.9238, + "mean_token_accuracy": 0.7124846577644348, + "num_tokens": 428075349.0, + "step": 17132 + }, + { + "epoch": 1.8815066988798594, + "grad_norm": 2.371457099914551, + "learning_rate": 1e-06, + "loss": 0.8076, + "mean_token_accuracy": 0.752036452293396, + "num_tokens": 428097121.0, + "step": 17133 + }, + { + "epoch": 1.881616516582473, + "grad_norm": 2.1659083366394043, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7150698304176331, + "num_tokens": 428124953.0, + "step": 17134 + }, + { + "epoch": 1.8817263342850867, + "grad_norm": 2.31117582321167, + "learning_rate": 1e-06, + "loss": 0.8432, + "mean_token_accuracy": 
0.7366608381271362, + "num_tokens": 428148980.0, + "step": 17135 + }, + { + "epoch": 1.8818361519877005, + "grad_norm": 2.449930429458618, + "learning_rate": 1e-06, + "loss": 0.8449, + "mean_token_accuracy": 0.7398337125778198, + "num_tokens": 428170086.0, + "step": 17136 + }, + { + "epoch": 1.881945969690314, + "grad_norm": 2.4309072494506836, + "learning_rate": 1e-06, + "loss": 0.9257, + "mean_token_accuracy": 0.7242944836616516, + "num_tokens": 428194089.0, + "step": 17137 + }, + { + "epoch": 1.8820557873929278, + "grad_norm": 2.5462427139282227, + "learning_rate": 1e-06, + "loss": 0.775, + "mean_token_accuracy": 0.7487205266952515, + "num_tokens": 428214122.0, + "step": 17138 + }, + { + "epoch": 1.8821656050955413, + "grad_norm": 2.320918321609497, + "learning_rate": 1e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7403900623321533, + "num_tokens": 428236782.0, + "step": 17139 + }, + { + "epoch": 1.882275422798155, + "grad_norm": 2.3840677738189697, + "learning_rate": 1e-06, + "loss": 0.9839, + "mean_token_accuracy": 0.700634241104126, + "num_tokens": 428262611.0, + "step": 17140 + }, + { + "epoch": 1.8823852405007688, + "grad_norm": 2.346741199493408, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7282286882400513, + "num_tokens": 428287344.0, + "step": 17141 + }, + { + "epoch": 1.8824950582033824, + "grad_norm": 1.8970284461975098, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7417581677436829, + "num_tokens": 428318767.0, + "step": 17142 + }, + { + "epoch": 1.882604875905996, + "grad_norm": 2.2373602390289307, + "learning_rate": 1e-06, + "loss": 0.8681, + "mean_token_accuracy": 0.719369113445282, + "num_tokens": 428344207.0, + "step": 17143 + }, + { + "epoch": 1.8827146936086097, + "grad_norm": 2.393556594848633, + "learning_rate": 1e-06, + "loss": 0.8904, + "mean_token_accuracy": 0.7321352958679199, + "num_tokens": 428367937.0, + "step": 17144 + }, + { + "epoch": 1.8828245113112234, + "grad_norm": 
2.4199094772338867, + "learning_rate": 1e-06, + "loss": 0.9123, + "mean_token_accuracy": 0.708503246307373, + "num_tokens": 428390473.0, + "step": 17145 + }, + { + "epoch": 1.8829343290138372, + "grad_norm": 2.246946096420288, + "learning_rate": 1e-06, + "loss": 0.8758, + "mean_token_accuracy": 0.7259957194328308, + "num_tokens": 428415721.0, + "step": 17146 + }, + { + "epoch": 1.8830441467164507, + "grad_norm": 2.357410430908203, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6982243061065674, + "num_tokens": 428440799.0, + "step": 17147 + }, + { + "epoch": 1.8831539644190642, + "grad_norm": 2.03509521484375, + "learning_rate": 1e-06, + "loss": 0.9388, + "mean_token_accuracy": 0.7056505680084229, + "num_tokens": 428471323.0, + "step": 17148 + }, + { + "epoch": 1.883263782121678, + "grad_norm": 2.1980788707733154, + "learning_rate": 1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.738283634185791, + "num_tokens": 428494267.0, + "step": 17149 + }, + { + "epoch": 1.8833735998242918, + "grad_norm": 2.1339972019195557, + "learning_rate": 1e-06, + "loss": 0.9283, + "mean_token_accuracy": 0.7154859304428101, + "num_tokens": 428521948.0, + "step": 17150 + }, + { + "epoch": 1.8834834175269053, + "grad_norm": 2.2978291511535645, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7289822101593018, + "num_tokens": 428545971.0, + "step": 17151 + }, + { + "epoch": 1.883593235229519, + "grad_norm": 2.228445291519165, + "learning_rate": 1e-06, + "loss": 0.8207, + "mean_token_accuracy": 0.7402751445770264, + "num_tokens": 428570037.0, + "step": 17152 + }, + { + "epoch": 1.8837030529321326, + "grad_norm": 2.643040418624878, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.7233313322067261, + "num_tokens": 428589630.0, + "step": 17153 + }, + { + "epoch": 1.8838128706347463, + "grad_norm": 2.2761452198028564, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7282865047454834, + "num_tokens": 
428613022.0, + "step": 17154 + }, + { + "epoch": 1.88392268833736, + "grad_norm": 2.247610092163086, + "learning_rate": 1e-06, + "loss": 0.919, + "mean_token_accuracy": 0.7227447628974915, + "num_tokens": 428637311.0, + "step": 17155 + }, + { + "epoch": 1.8840325060399736, + "grad_norm": 2.281766653060913, + "learning_rate": 1e-06, + "loss": 0.8091, + "mean_token_accuracy": 0.7355741858482361, + "num_tokens": 428659542.0, + "step": 17156 + }, + { + "epoch": 1.8841423237425872, + "grad_norm": 2.43259859085083, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7182420492172241, + "num_tokens": 428681711.0, + "step": 17157 + }, + { + "epoch": 1.884252141445201, + "grad_norm": 2.221179723739624, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7254814505577087, + "num_tokens": 428707037.0, + "step": 17158 + }, + { + "epoch": 1.8843619591478147, + "grad_norm": 2.680838108062744, + "learning_rate": 1e-06, + "loss": 0.8487, + "mean_token_accuracy": 0.7315658330917358, + "num_tokens": 428726057.0, + "step": 17159 + }, + { + "epoch": 1.8844717768504284, + "grad_norm": 2.2243423461914062, + "learning_rate": 1e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.734416127204895, + "num_tokens": 428751842.0, + "step": 17160 + }, + { + "epoch": 1.884581594553042, + "grad_norm": 2.446035385131836, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.719865083694458, + "num_tokens": 428772891.0, + "step": 17161 + }, + { + "epoch": 1.8846914122556555, + "grad_norm": 2.3224973678588867, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7333056330680847, + "num_tokens": 428795214.0, + "step": 17162 + }, + { + "epoch": 1.8848012299582693, + "grad_norm": 2.1827287673950195, + "learning_rate": 1e-06, + "loss": 0.8457, + "mean_token_accuracy": 0.7365477681159973, + "num_tokens": 428821497.0, + "step": 17163 + }, + { + "epoch": 1.884911047660883, + "grad_norm": 2.1272778511047363, + "learning_rate": 1e-06, + 
"loss": 0.8837, + "mean_token_accuracy": 0.723534345626831, + "num_tokens": 428849436.0, + "step": 17164 + }, + { + "epoch": 1.8850208653634966, + "grad_norm": 2.019619941711426, + "learning_rate": 1e-06, + "loss": 0.9488, + "mean_token_accuracy": 0.7086580991744995, + "num_tokens": 428881247.0, + "step": 17165 + }, + { + "epoch": 1.88513068306611, + "grad_norm": 2.188387870788574, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7167322635650635, + "num_tokens": 428907948.0, + "step": 17166 + }, + { + "epoch": 1.8852405007687238, + "grad_norm": 2.3189055919647217, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7250528335571289, + "num_tokens": 428932343.0, + "step": 17167 + }, + { + "epoch": 1.8853503184713376, + "grad_norm": 2.2668778896331787, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7381277084350586, + "num_tokens": 428956756.0, + "step": 17168 + }, + { + "epoch": 1.8854601361739514, + "grad_norm": 2.2369112968444824, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7279380559921265, + "num_tokens": 428982673.0, + "step": 17169 + }, + { + "epoch": 1.885569953876565, + "grad_norm": 2.063192129135132, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7118465304374695, + "num_tokens": 429010585.0, + "step": 17170 + }, + { + "epoch": 1.8856797715791784, + "grad_norm": 2.042320966720581, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 0.7104288339614868, + "num_tokens": 429040929.0, + "step": 17171 + }, + { + "epoch": 1.8857895892817922, + "grad_norm": 2.353332281112671, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7342326045036316, + "num_tokens": 429063302.0, + "step": 17172 + }, + { + "epoch": 1.885899406984406, + "grad_norm": 2.2414445877075195, + "learning_rate": 1e-06, + "loss": 0.93, + "mean_token_accuracy": 0.7053661346435547, + "num_tokens": 429086936.0, + "step": 17173 + }, + { + "epoch": 
1.8860092246870197, + "grad_norm": 2.688232660293579, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7233788371086121, + "num_tokens": 429107698.0, + "step": 17174 + }, + { + "epoch": 1.8861190423896332, + "grad_norm": 2.0624184608459473, + "learning_rate": 1e-06, + "loss": 1.0003, + "mean_token_accuracy": 0.6899646520614624, + "num_tokens": 429138202.0, + "step": 17175 + }, + { + "epoch": 1.8862288600922468, + "grad_norm": 2.2451088428497314, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7049901485443115, + "num_tokens": 429164124.0, + "step": 17176 + }, + { + "epoch": 1.8863386777948605, + "grad_norm": 2.164829730987549, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7124358415603638, + "num_tokens": 429190199.0, + "step": 17177 + }, + { + "epoch": 1.8864484954974743, + "grad_norm": 2.254091501235962, + "learning_rate": 1e-06, + "loss": 0.9134, + "mean_token_accuracy": 0.7138127088546753, + "num_tokens": 429215585.0, + "step": 17178 + }, + { + "epoch": 1.8865583132000878, + "grad_norm": 2.2707178592681885, + "learning_rate": 1e-06, + "loss": 0.888, + "mean_token_accuracy": 0.7228903770446777, + "num_tokens": 429239755.0, + "step": 17179 + }, + { + "epoch": 1.8866681309027014, + "grad_norm": 1.932745099067688, + "learning_rate": 1e-06, + "loss": 0.8742, + "mean_token_accuracy": 0.7264537811279297, + "num_tokens": 429270235.0, + "step": 17180 + }, + { + "epoch": 1.8867779486053151, + "grad_norm": 2.062964916229248, + "learning_rate": 1e-06, + "loss": 0.9997, + "mean_token_accuracy": 0.6908861398696899, + "num_tokens": 429302667.0, + "step": 17181 + }, + { + "epoch": 1.8868877663079289, + "grad_norm": 1.9954599142074585, + "learning_rate": 1e-06, + "loss": 0.9678, + "mean_token_accuracy": 0.7023350596427917, + "num_tokens": 429335579.0, + "step": 17182 + }, + { + "epoch": 1.8869975840105426, + "grad_norm": 2.639310598373413, + "learning_rate": 1e-06, + "loss": 0.8372, + "mean_token_accuracy": 
0.7325727343559265, + "num_tokens": 429355704.0, + "step": 17183 + }, + { + "epoch": 1.8871074017131562, + "grad_norm": 2.16971492767334, + "learning_rate": 1e-06, + "loss": 0.8453, + "mean_token_accuracy": 0.7410237789154053, + "num_tokens": 429382298.0, + "step": 17184 + }, + { + "epoch": 1.8872172194157697, + "grad_norm": 2.3037455081939697, + "learning_rate": 1e-06, + "loss": 1.0062, + "mean_token_accuracy": 0.6961977481842041, + "num_tokens": 429408483.0, + "step": 17185 + }, + { + "epoch": 1.8873270371183835, + "grad_norm": 2.6330068111419678, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7248002886772156, + "num_tokens": 429428356.0, + "step": 17186 + }, + { + "epoch": 1.8874368548209972, + "grad_norm": 2.231924295425415, + "learning_rate": 1e-06, + "loss": 0.8756, + "mean_token_accuracy": 0.7187322974205017, + "num_tokens": 429453993.0, + "step": 17187 + }, + { + "epoch": 1.887546672523611, + "grad_norm": 2.106968402862549, + "learning_rate": 1e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7300065159797668, + "num_tokens": 429481000.0, + "step": 17188 + }, + { + "epoch": 1.8876564902262245, + "grad_norm": 2.714085578918457, + "learning_rate": 1e-06, + "loss": 0.7914, + "mean_token_accuracy": 0.7471617460250854, + "num_tokens": 429499289.0, + "step": 17189 + }, + { + "epoch": 1.887766307928838, + "grad_norm": 2.4340572357177734, + "learning_rate": 1e-06, + "loss": 1.0128, + "mean_token_accuracy": 0.6817590594291687, + "num_tokens": 429522930.0, + "step": 17190 + }, + { + "epoch": 1.8878761256314518, + "grad_norm": 2.183213949203491, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7290581464767456, + "num_tokens": 429549331.0, + "step": 17191 + }, + { + "epoch": 1.8879859433340656, + "grad_norm": 2.032007932662964, + "learning_rate": 1e-06, + "loss": 0.9225, + "mean_token_accuracy": 0.7107365131378174, + "num_tokens": 429579722.0, + "step": 17192 + }, + { + "epoch": 1.888095761036679, + "grad_norm": 
2.2849032878875732, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7331525087356567, + "num_tokens": 429604628.0, + "step": 17193 + }, + { + "epoch": 1.8882055787392926, + "grad_norm": 2.138707160949707, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7193903923034668, + "num_tokens": 429632302.0, + "step": 17194 + }, + { + "epoch": 1.8883153964419064, + "grad_norm": 2.3046109676361084, + "learning_rate": 1e-06, + "loss": 1.0119, + "mean_token_accuracy": 0.7086106538772583, + "num_tokens": 429656108.0, + "step": 17195 + }, + { + "epoch": 1.8884252141445201, + "grad_norm": 2.122918128967285, + "learning_rate": 1e-06, + "loss": 0.8529, + "mean_token_accuracy": 0.7268787026405334, + "num_tokens": 429682458.0, + "step": 17196 + }, + { + "epoch": 1.888535031847134, + "grad_norm": 2.434009313583374, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7337755560874939, + "num_tokens": 429702541.0, + "step": 17197 + }, + { + "epoch": 1.8886448495497474, + "grad_norm": 2.313974380493164, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7104364633560181, + "num_tokens": 429727491.0, + "step": 17198 + }, + { + "epoch": 1.888754667252361, + "grad_norm": 2.2970521450042725, + "learning_rate": 1e-06, + "loss": 0.9998, + "mean_token_accuracy": 0.6912063956260681, + "num_tokens": 429756943.0, + "step": 17199 + }, + { + "epoch": 1.8888644849549747, + "grad_norm": 2.1510491371154785, + "learning_rate": 1e-06, + "loss": 0.9298, + "mean_token_accuracy": 0.7253373265266418, + "num_tokens": 429783692.0, + "step": 17200 + }, + { + "epoch": 1.8889743026575885, + "grad_norm": 2.4060351848602295, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.710445761680603, + "num_tokens": 429807404.0, + "step": 17201 + }, + { + "epoch": 1.889084120360202, + "grad_norm": 2.434272289276123, + "learning_rate": 1e-06, + "loss": 0.8317, + "mean_token_accuracy": 0.7424119710922241, + "num_tokens": 
429828349.0, + "step": 17202 + }, + { + "epoch": 1.8891939380628158, + "grad_norm": 2.153567314147949, + "learning_rate": 1e-06, + "loss": 1.0066, + "mean_token_accuracy": 0.6896915435791016, + "num_tokens": 429856800.0, + "step": 17203 + }, + { + "epoch": 1.8893037557654293, + "grad_norm": 2.3494937419891357, + "learning_rate": 1e-06, + "loss": 0.8314, + "mean_token_accuracy": 0.7386450171470642, + "num_tokens": 429879830.0, + "step": 17204 + }, + { + "epoch": 1.889413573468043, + "grad_norm": 2.058079242706299, + "learning_rate": 1e-06, + "loss": 0.9135, + "mean_token_accuracy": 0.7149062156677246, + "num_tokens": 429910878.0, + "step": 17205 + }, + { + "epoch": 1.8895233911706568, + "grad_norm": 2.5321431159973145, + "learning_rate": 1e-06, + "loss": 0.945, + "mean_token_accuracy": 0.7110605835914612, + "num_tokens": 429932173.0, + "step": 17206 + }, + { + "epoch": 1.8896332088732704, + "grad_norm": 2.1270973682403564, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7421764731407166, + "num_tokens": 429957660.0, + "step": 17207 + }, + { + "epoch": 1.8897430265758839, + "grad_norm": 2.204247236251831, + "learning_rate": 1e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7292903661727905, + "num_tokens": 429983277.0, + "step": 17208 + }, + { + "epoch": 1.8898528442784976, + "grad_norm": 2.380478858947754, + "learning_rate": 1e-06, + "loss": 0.8313, + "mean_token_accuracy": 0.7367701530456543, + "num_tokens": 430006350.0, + "step": 17209 + }, + { + "epoch": 1.8899626619811114, + "grad_norm": 1.9489790201187134, + "learning_rate": 1e-06, + "loss": 0.9281, + "mean_token_accuracy": 0.708249032497406, + "num_tokens": 430038380.0, + "step": 17210 + }, + { + "epoch": 1.8900724796837252, + "grad_norm": 2.358675003051758, + "learning_rate": 1e-06, + "loss": 0.9031, + "mean_token_accuracy": 0.7265526652336121, + "num_tokens": 430063280.0, + "step": 17211 + }, + { + "epoch": 1.8901822973863387, + "grad_norm": 2.150463819503784, + "learning_rate": 
1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7340333461761475, + "num_tokens": 430089023.0, + "step": 17212 + }, + { + "epoch": 1.8902921150889522, + "grad_norm": 2.6838176250457764, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7303346395492554, + "num_tokens": 430109078.0, + "step": 17213 + }, + { + "epoch": 1.890401932791566, + "grad_norm": 2.749518871307373, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7348707914352417, + "num_tokens": 430127471.0, + "step": 17214 + }, + { + "epoch": 1.8905117504941797, + "grad_norm": 2.5206151008605957, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7292850613594055, + "num_tokens": 430147189.0, + "step": 17215 + }, + { + "epoch": 1.8906215681967933, + "grad_norm": 2.3641490936279297, + "learning_rate": 1e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7375575304031372, + "num_tokens": 430169590.0, + "step": 17216 + }, + { + "epoch": 1.890731385899407, + "grad_norm": 2.3175017833709717, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7228753566741943, + "num_tokens": 430194930.0, + "step": 17217 + }, + { + "epoch": 1.8908412036020206, + "grad_norm": 2.350250482559204, + "learning_rate": 1e-06, + "loss": 0.8476, + "mean_token_accuracy": 0.7294634580612183, + "num_tokens": 430216397.0, + "step": 17218 + }, + { + "epoch": 1.8909510213046343, + "grad_norm": 2.2687106132507324, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7234383821487427, + "num_tokens": 430240014.0, + "step": 17219 + }, + { + "epoch": 1.891060839007248, + "grad_norm": 2.2822279930114746, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7249642610549927, + "num_tokens": 430263330.0, + "step": 17220 + }, + { + "epoch": 1.8911706567098616, + "grad_norm": 2.114232063293457, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7213679552078247, + "num_tokens": 430290663.0, + "step": 17221 + }, + { + 
"epoch": 1.8912804744124752, + "grad_norm": 2.21444034576416, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7280852794647217, + "num_tokens": 430314851.0, + "step": 17222 + }, + { + "epoch": 1.891390292115089, + "grad_norm": 2.330751419067383, + "learning_rate": 1e-06, + "loss": 0.9392, + "mean_token_accuracy": 0.7225951552391052, + "num_tokens": 430339855.0, + "step": 17223 + }, + { + "epoch": 1.8915001098177027, + "grad_norm": 2.055516242980957, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7017254829406738, + "num_tokens": 430368841.0, + "step": 17224 + }, + { + "epoch": 1.8916099275203164, + "grad_norm": 2.280137538909912, + "learning_rate": 1e-06, + "loss": 0.956, + "mean_token_accuracy": 0.7095556855201721, + "num_tokens": 430397266.0, + "step": 17225 + }, + { + "epoch": 1.89171974522293, + "grad_norm": 2.450584888458252, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.715814471244812, + "num_tokens": 430418667.0, + "step": 17226 + }, + { + "epoch": 1.8918295629255435, + "grad_norm": 2.130089521408081, + "learning_rate": 1e-06, + "loss": 0.9459, + "mean_token_accuracy": 0.7071632146835327, + "num_tokens": 430446278.0, + "step": 17227 + }, + { + "epoch": 1.8919393806281573, + "grad_norm": 2.289259672164917, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7140382528305054, + "num_tokens": 430469439.0, + "step": 17228 + }, + { + "epoch": 1.892049198330771, + "grad_norm": 2.2050275802612305, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7129228115081787, + "num_tokens": 430494787.0, + "step": 17229 + }, + { + "epoch": 1.8921590160333845, + "grad_norm": 2.4476916790008545, + "learning_rate": 1e-06, + "loss": 0.8677, + "mean_token_accuracy": 0.7358611226081848, + "num_tokens": 430516686.0, + "step": 17230 + }, + { + "epoch": 1.892268833735998, + "grad_norm": 2.5176002979278564, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 
0.725824236869812, + "num_tokens": 430537757.0, + "step": 17231 + }, + { + "epoch": 1.8923786514386118, + "grad_norm": 2.1123273372650146, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7240604162216187, + "num_tokens": 430565600.0, + "step": 17232 + }, + { + "epoch": 1.8924884691412256, + "grad_norm": 2.2667694091796875, + "learning_rate": 1e-06, + "loss": 0.8744, + "mean_token_accuracy": 0.7409656643867493, + "num_tokens": 430588787.0, + "step": 17233 + }, + { + "epoch": 1.8925982868438394, + "grad_norm": 2.05570387840271, + "learning_rate": 1e-06, + "loss": 0.7903, + "mean_token_accuracy": 0.7475219964981079, + "num_tokens": 430615354.0, + "step": 17234 + }, + { + "epoch": 1.8927081045464529, + "grad_norm": 2.247145891189575, + "learning_rate": 1e-06, + "loss": 0.9042, + "mean_token_accuracy": 0.7217889428138733, + "num_tokens": 430644826.0, + "step": 17235 + }, + { + "epoch": 1.8928179222490664, + "grad_norm": 2.2063496112823486, + "learning_rate": 1e-06, + "loss": 0.8071, + "mean_token_accuracy": 0.7389892935752869, + "num_tokens": 430669217.0, + "step": 17236 + }, + { + "epoch": 1.8929277399516802, + "grad_norm": 2.178264617919922, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7389091849327087, + "num_tokens": 430697521.0, + "step": 17237 + }, + { + "epoch": 1.893037557654294, + "grad_norm": 2.258293390274048, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7113267183303833, + "num_tokens": 430722432.0, + "step": 17238 + }, + { + "epoch": 1.8931473753569077, + "grad_norm": 2.338711738586426, + "learning_rate": 1e-06, + "loss": 0.8766, + "mean_token_accuracy": 0.7281862497329712, + "num_tokens": 430746141.0, + "step": 17239 + }, + { + "epoch": 1.8932571930595212, + "grad_norm": 2.0152883529663086, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.700725793838501, + "num_tokens": 430776349.0, + "step": 17240 + }, + { + "epoch": 1.8933670107621348, + "grad_norm": 
2.301229476928711, + "learning_rate": 1e-06, + "loss": 0.8263, + "mean_token_accuracy": 0.7382665872573853, + "num_tokens": 430798528.0, + "step": 17241 + }, + { + "epoch": 1.8934768284647485, + "grad_norm": 2.217538595199585, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7071289420127869, + "num_tokens": 430827119.0, + "step": 17242 + }, + { + "epoch": 1.8935866461673623, + "grad_norm": 2.4289307594299316, + "learning_rate": 1e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7354363799095154, + "num_tokens": 430850072.0, + "step": 17243 + }, + { + "epoch": 1.8936964638699758, + "grad_norm": 2.3011293411254883, + "learning_rate": 1e-06, + "loss": 0.8573, + "mean_token_accuracy": 0.7272214889526367, + "num_tokens": 430872420.0, + "step": 17244 + }, + { + "epoch": 1.8938062815725893, + "grad_norm": 2.362255573272705, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.71526700258255, + "num_tokens": 430896198.0, + "step": 17245 + }, + { + "epoch": 1.893916099275203, + "grad_norm": 2.383519411087036, + "learning_rate": 1e-06, + "loss": 0.8492, + "mean_token_accuracy": 0.7347888946533203, + "num_tokens": 430919262.0, + "step": 17246 + }, + { + "epoch": 1.8940259169778169, + "grad_norm": 2.1421287059783936, + "learning_rate": 1e-06, + "loss": 1.006, + "mean_token_accuracy": 0.7011901140213013, + "num_tokens": 430947969.0, + "step": 17247 + }, + { + "epoch": 1.8941357346804306, + "grad_norm": 2.214463233947754, + "learning_rate": 1e-06, + "loss": 0.9226, + "mean_token_accuracy": 0.7131557464599609, + "num_tokens": 430975659.0, + "step": 17248 + }, + { + "epoch": 1.8942455523830442, + "grad_norm": 2.38371205329895, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7353371381759644, + "num_tokens": 430998674.0, + "step": 17249 + }, + { + "epoch": 1.8943553700856577, + "grad_norm": 2.3138628005981445, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7254406213760376, + "num_tokens": 
431021023.0, + "step": 17250 + }, + { + "epoch": 1.8944651877882714, + "grad_norm": 2.0513038635253906, + "learning_rate": 1e-06, + "loss": 0.9082, + "mean_token_accuracy": 0.7201334238052368, + "num_tokens": 431050882.0, + "step": 17251 + }, + { + "epoch": 1.8945750054908852, + "grad_norm": 2.132427930831909, + "learning_rate": 1e-06, + "loss": 0.9583, + "mean_token_accuracy": 0.6993533372879028, + "num_tokens": 431079649.0, + "step": 17252 + }, + { + "epoch": 1.894684823193499, + "grad_norm": 2.1741538047790527, + "learning_rate": 1e-06, + "loss": 0.8921, + "mean_token_accuracy": 0.7283948659896851, + "num_tokens": 431106870.0, + "step": 17253 + }, + { + "epoch": 1.8947946408961125, + "grad_norm": 2.5299010276794434, + "learning_rate": 1e-06, + "loss": 0.7422, + "mean_token_accuracy": 0.767525315284729, + "num_tokens": 431126247.0, + "step": 17254 + }, + { + "epoch": 1.894904458598726, + "grad_norm": 2.4688777923583984, + "learning_rate": 1e-06, + "loss": 0.9224, + "mean_token_accuracy": 0.7175724506378174, + "num_tokens": 431147360.0, + "step": 17255 + }, + { + "epoch": 1.8950142763013398, + "grad_norm": 2.0704967975616455, + "learning_rate": 1e-06, + "loss": 0.8822, + "mean_token_accuracy": 0.7228292226791382, + "num_tokens": 431176937.0, + "step": 17256 + }, + { + "epoch": 1.8951240940039535, + "grad_norm": 2.36972713470459, + "learning_rate": 1e-06, + "loss": 0.8952, + "mean_token_accuracy": 0.7170288562774658, + "num_tokens": 431199948.0, + "step": 17257 + }, + { + "epoch": 1.895233911706567, + "grad_norm": 2.1989448070526123, + "learning_rate": 1e-06, + "loss": 0.8879, + "mean_token_accuracy": 0.7222927808761597, + "num_tokens": 431225364.0, + "step": 17258 + }, + { + "epoch": 1.8953437294091806, + "grad_norm": 2.4045650959014893, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7225194573402405, + "num_tokens": 431247947.0, + "step": 17259 + }, + { + "epoch": 1.8954535471117944, + "grad_norm": 2.27189564704895, + "learning_rate": 
1e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7268283367156982, + "num_tokens": 431272197.0, + "step": 17260 + }, + { + "epoch": 1.8955633648144081, + "grad_norm": 2.3324382305145264, + "learning_rate": 1e-06, + "loss": 0.9445, + "mean_token_accuracy": 0.7095783948898315, + "num_tokens": 431300056.0, + "step": 17261 + }, + { + "epoch": 1.8956731825170219, + "grad_norm": 2.346062660217285, + "learning_rate": 1e-06, + "loss": 0.8173, + "mean_token_accuracy": 0.741331160068512, + "num_tokens": 431321063.0, + "step": 17262 + }, + { + "epoch": 1.8957830002196354, + "grad_norm": 2.2787024974823, + "learning_rate": 1e-06, + "loss": 1.0151, + "mean_token_accuracy": 0.6961177587509155, + "num_tokens": 431345618.0, + "step": 17263 + }, + { + "epoch": 1.895892817922249, + "grad_norm": 2.3533270359039307, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7136282920837402, + "num_tokens": 431369928.0, + "step": 17264 + }, + { + "epoch": 1.8960026356248627, + "grad_norm": 2.095454216003418, + "learning_rate": 1e-06, + "loss": 0.949, + "mean_token_accuracy": 0.7206065654754639, + "num_tokens": 431397655.0, + "step": 17265 + }, + { + "epoch": 1.8961124533274765, + "grad_norm": 2.024803876876831, + "learning_rate": 1e-06, + "loss": 0.9318, + "mean_token_accuracy": 0.7105039358139038, + "num_tokens": 431426563.0, + "step": 17266 + }, + { + "epoch": 1.89622227103009, + "grad_norm": 2.4896647930145264, + "learning_rate": 1e-06, + "loss": 0.8589, + "mean_token_accuracy": 0.7247185707092285, + "num_tokens": 431446326.0, + "step": 17267 + }, + { + "epoch": 1.8963320887327038, + "grad_norm": 2.2154386043548584, + "learning_rate": 1e-06, + "loss": 0.8791, + "mean_token_accuracy": 0.7219206690788269, + "num_tokens": 431471869.0, + "step": 17268 + }, + { + "epoch": 1.8964419064353173, + "grad_norm": 2.4188530445098877, + "learning_rate": 1e-06, + "loss": 0.9435, + "mean_token_accuracy": 0.7058025002479553, + "num_tokens": 431496508.0, + "step": 17269 + }, + { + 
"epoch": 1.896551724137931, + "grad_norm": 2.0766396522521973, + "learning_rate": 1e-06, + "loss": 0.8215, + "mean_token_accuracy": 0.7387383580207825, + "num_tokens": 431525196.0, + "step": 17270 + }, + { + "epoch": 1.8966615418405448, + "grad_norm": 2.466606616973877, + "learning_rate": 1e-06, + "loss": 0.9699, + "mean_token_accuracy": 0.7028042078018188, + "num_tokens": 431548145.0, + "step": 17271 + }, + { + "epoch": 1.8967713595431583, + "grad_norm": 2.3078956604003906, + "learning_rate": 1e-06, + "loss": 0.8652, + "mean_token_accuracy": 0.7284705638885498, + "num_tokens": 431572282.0, + "step": 17272 + }, + { + "epoch": 1.8968811772457719, + "grad_norm": 2.211483955383301, + "learning_rate": 1e-06, + "loss": 0.9077, + "mean_token_accuracy": 0.7299184799194336, + "num_tokens": 431597831.0, + "step": 17273 + }, + { + "epoch": 1.8969909949483856, + "grad_norm": 2.280268430709839, + "learning_rate": 1e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.730976402759552, + "num_tokens": 431621877.0, + "step": 17274 + }, + { + "epoch": 1.8971008126509994, + "grad_norm": 2.572768449783325, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.7208328247070312, + "num_tokens": 431643884.0, + "step": 17275 + }, + { + "epoch": 1.8972106303536131, + "grad_norm": 2.4476776123046875, + "learning_rate": 1e-06, + "loss": 0.9158, + "mean_token_accuracy": 0.7146042585372925, + "num_tokens": 431666361.0, + "step": 17276 + }, + { + "epoch": 1.8973204480562267, + "grad_norm": 2.6445934772491455, + "learning_rate": 1e-06, + "loss": 0.9317, + "mean_token_accuracy": 0.7144608497619629, + "num_tokens": 431686837.0, + "step": 17277 + }, + { + "epoch": 1.8974302657588402, + "grad_norm": 2.259169578552246, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.6992815732955933, + "num_tokens": 431716281.0, + "step": 17278 + }, + { + "epoch": 1.897540083461454, + "grad_norm": 2.2910850048065186, + "learning_rate": 1e-06, + "loss": 0.8319, + 
"mean_token_accuracy": 0.7376046180725098, + "num_tokens": 431740665.0, + "step": 17279 + }, + { + "epoch": 1.8976499011640677, + "grad_norm": 2.136359214782715, + "learning_rate": 1e-06, + "loss": 0.8058, + "mean_token_accuracy": 0.7446936368942261, + "num_tokens": 431767089.0, + "step": 17280 + }, + { + "epoch": 1.8977597188666813, + "grad_norm": 2.671254873275757, + "learning_rate": 1e-06, + "loss": 0.8342, + "mean_token_accuracy": 0.7349985837936401, + "num_tokens": 431785342.0, + "step": 17281 + }, + { + "epoch": 1.897869536569295, + "grad_norm": 2.2509539127349854, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7183657884597778, + "num_tokens": 431811096.0, + "step": 17282 + }, + { + "epoch": 1.8979793542719086, + "grad_norm": 2.407315254211426, + "learning_rate": 1e-06, + "loss": 0.9336, + "mean_token_accuracy": 0.7110778093338013, + "num_tokens": 431834310.0, + "step": 17283 + }, + { + "epoch": 1.8980891719745223, + "grad_norm": 2.2788145542144775, + "learning_rate": 1e-06, + "loss": 0.9315, + "mean_token_accuracy": 0.7102982997894287, + "num_tokens": 431862571.0, + "step": 17284 + }, + { + "epoch": 1.898198989677136, + "grad_norm": 2.293569326400757, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7159194350242615, + "num_tokens": 431887608.0, + "step": 17285 + }, + { + "epoch": 1.8983088073797496, + "grad_norm": 2.177777051925659, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7273749113082886, + "num_tokens": 431914656.0, + "step": 17286 + }, + { + "epoch": 1.8984186250823631, + "grad_norm": 2.0795235633850098, + "learning_rate": 1e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7326401472091675, + "num_tokens": 431943158.0, + "step": 17287 + }, + { + "epoch": 1.898528442784977, + "grad_norm": 2.305816650390625, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7210367918014526, + "num_tokens": 431968778.0, + "step": 17288 + }, + { + "epoch": 1.8986382604875907, 
+ "grad_norm": 2.0902037620544434, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7216978073120117, + "num_tokens": 432000040.0, + "step": 17289 + }, + { + "epoch": 1.8987480781902044, + "grad_norm": 2.1492998600006104, + "learning_rate": 1e-06, + "loss": 0.939, + "mean_token_accuracy": 0.7063925266265869, + "num_tokens": 432030379.0, + "step": 17290 + }, + { + "epoch": 1.898857895892818, + "grad_norm": 2.6157023906707764, + "learning_rate": 1e-06, + "loss": 0.7887, + "mean_token_accuracy": 0.7518377304077148, + "num_tokens": 432049813.0, + "step": 17291 + }, + { + "epoch": 1.8989677135954315, + "grad_norm": 2.5005271434783936, + "learning_rate": 1e-06, + "loss": 0.8216, + "mean_token_accuracy": 0.7372273206710815, + "num_tokens": 432070344.0, + "step": 17292 + }, + { + "epoch": 1.8990775312980452, + "grad_norm": 2.178642988204956, + "learning_rate": 1e-06, + "loss": 0.9351, + "mean_token_accuracy": 0.7108094096183777, + "num_tokens": 432098022.0, + "step": 17293 + }, + { + "epoch": 1.899187349000659, + "grad_norm": 2.810450792312622, + "learning_rate": 1e-06, + "loss": 0.859, + "mean_token_accuracy": 0.7315225601196289, + "num_tokens": 432116112.0, + "step": 17294 + }, + { + "epoch": 1.8992971667032725, + "grad_norm": 2.0724666118621826, + "learning_rate": 1e-06, + "loss": 0.9073, + "mean_token_accuracy": 0.7163276672363281, + "num_tokens": 432147285.0, + "step": 17295 + }, + { + "epoch": 1.899406984405886, + "grad_norm": 2.5321476459503174, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7220436930656433, + "num_tokens": 432168474.0, + "step": 17296 + }, + { + "epoch": 1.8995168021084998, + "grad_norm": 2.699544906616211, + "learning_rate": 1e-06, + "loss": 0.8225, + "mean_token_accuracy": 0.7408658266067505, + "num_tokens": 432186726.0, + "step": 17297 + }, + { + "epoch": 1.8996266198111136, + "grad_norm": 2.585880994796753, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7327591180801392, + 
"num_tokens": 432205596.0, + "step": 17298 + }, + { + "epoch": 1.8997364375137273, + "grad_norm": 2.8326361179351807, + "learning_rate": 1e-06, + "loss": 0.882, + "mean_token_accuracy": 0.7259591221809387, + "num_tokens": 432223575.0, + "step": 17299 + }, + { + "epoch": 1.8998462552163409, + "grad_norm": 2.0835700035095215, + "learning_rate": 1e-06, + "loss": 0.9773, + "mean_token_accuracy": 0.7001904249191284, + "num_tokens": 432253376.0, + "step": 17300 + }, + { + "epoch": 1.8999560729189544, + "grad_norm": 2.298097848892212, + "learning_rate": 1e-06, + "loss": 0.9068, + "mean_token_accuracy": 0.7182635068893433, + "num_tokens": 432277332.0, + "step": 17301 + }, + { + "epoch": 1.9000658906215682, + "grad_norm": 2.3298654556274414, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7110694050788879, + "num_tokens": 432301548.0, + "step": 17302 + }, + { + "epoch": 1.900175708324182, + "grad_norm": 2.153918743133545, + "learning_rate": 1e-06, + "loss": 0.8706, + "mean_token_accuracy": 0.722740650177002, + "num_tokens": 432329337.0, + "step": 17303 + }, + { + "epoch": 1.9002855260267957, + "grad_norm": 2.192687749862671, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7116162180900574, + "num_tokens": 432355775.0, + "step": 17304 + }, + { + "epoch": 1.9003953437294092, + "grad_norm": 2.3152294158935547, + "learning_rate": 1e-06, + "loss": 0.8748, + "mean_token_accuracy": 0.723762035369873, + "num_tokens": 432379871.0, + "step": 17305 + }, + { + "epoch": 1.9005051614320227, + "grad_norm": 2.35194993019104, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7103984951972961, + "num_tokens": 432403752.0, + "step": 17306 + }, + { + "epoch": 1.9006149791346365, + "grad_norm": 2.207529067993164, + "learning_rate": 1e-06, + "loss": 0.8023, + "mean_token_accuracy": 0.7479166984558105, + "num_tokens": 432427042.0, + "step": 17307 + }, + { + "epoch": 1.9007247968372503, + "grad_norm": 2.1531641483306885, + 
"learning_rate": 1e-06, + "loss": 0.8048, + "mean_token_accuracy": 0.7438960075378418, + "num_tokens": 432452012.0, + "step": 17308 + }, + { + "epoch": 1.9008346145398638, + "grad_norm": 2.2257211208343506, + "learning_rate": 1e-06, + "loss": 0.946, + "mean_token_accuracy": 0.7066396474838257, + "num_tokens": 432478910.0, + "step": 17309 + }, + { + "epoch": 1.9009444322424773, + "grad_norm": 2.2507805824279785, + "learning_rate": 1e-06, + "loss": 0.8769, + "mean_token_accuracy": 0.7321431636810303, + "num_tokens": 432503840.0, + "step": 17310 + }, + { + "epoch": 1.901054249945091, + "grad_norm": 2.1206445693969727, + "learning_rate": 1e-06, + "loss": 0.9377, + "mean_token_accuracy": 0.7083160877227783, + "num_tokens": 432532444.0, + "step": 17311 + }, + { + "epoch": 1.9011640676477048, + "grad_norm": 2.222184658050537, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7388409376144409, + "num_tokens": 432557718.0, + "step": 17312 + }, + { + "epoch": 1.9012738853503186, + "grad_norm": 2.3645215034484863, + "learning_rate": 1e-06, + "loss": 0.9356, + "mean_token_accuracy": 0.7189892530441284, + "num_tokens": 432580538.0, + "step": 17313 + }, + { + "epoch": 1.9013837030529321, + "grad_norm": 2.178354501724243, + "learning_rate": 1e-06, + "loss": 0.9021, + "mean_token_accuracy": 0.715322732925415, + "num_tokens": 432609060.0, + "step": 17314 + }, + { + "epoch": 1.9014935207555457, + "grad_norm": 2.4294111728668213, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7324379682540894, + "num_tokens": 432631082.0, + "step": 17315 + }, + { + "epoch": 1.9016033384581594, + "grad_norm": 2.823657274246216, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7558140754699707, + "num_tokens": 432646595.0, + "step": 17316 + }, + { + "epoch": 1.9017131561607732, + "grad_norm": 2.1616835594177246, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7170780897140503, + "num_tokens": 432674603.0, + 
"step": 17317 + }, + { + "epoch": 1.9018229738633867, + "grad_norm": 2.3330936431884766, + "learning_rate": 1e-06, + "loss": 0.9415, + "mean_token_accuracy": 0.7150272130966187, + "num_tokens": 432697909.0, + "step": 17318 + }, + { + "epoch": 1.9019327915660005, + "grad_norm": 2.446779489517212, + "learning_rate": 1e-06, + "loss": 0.8737, + "mean_token_accuracy": 0.7307262420654297, + "num_tokens": 432719469.0, + "step": 17319 + }, + { + "epoch": 1.902042609268614, + "grad_norm": 2.150822639465332, + "learning_rate": 1e-06, + "loss": 0.9241, + "mean_token_accuracy": 0.7199686765670776, + "num_tokens": 432746341.0, + "step": 17320 + }, + { + "epoch": 1.9021524269712278, + "grad_norm": 2.4095444679260254, + "learning_rate": 1e-06, + "loss": 0.9022, + "mean_token_accuracy": 0.726434588432312, + "num_tokens": 432769243.0, + "step": 17321 + }, + { + "epoch": 1.9022622446738415, + "grad_norm": 2.396453619003296, + "learning_rate": 1e-06, + "loss": 0.9329, + "mean_token_accuracy": 0.7200369238853455, + "num_tokens": 432792369.0, + "step": 17322 + }, + { + "epoch": 1.902372062376455, + "grad_norm": 2.0747275352478027, + "learning_rate": 1e-06, + "loss": 0.9875, + "mean_token_accuracy": 0.7006642818450928, + "num_tokens": 432823382.0, + "step": 17323 + }, + { + "epoch": 1.9024818800790686, + "grad_norm": 2.284620761871338, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7339822053909302, + "num_tokens": 432848188.0, + "step": 17324 + }, + { + "epoch": 1.9025916977816824, + "grad_norm": 2.230876922607422, + "learning_rate": 1e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7403838038444519, + "num_tokens": 432871050.0, + "step": 17325 + }, + { + "epoch": 1.9027015154842961, + "grad_norm": 2.355743408203125, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7254748344421387, + "num_tokens": 432894037.0, + "step": 17326 + }, + { + "epoch": 1.9028113331869099, + "grad_norm": 2.314469575881958, + "learning_rate": 1e-06, + "loss": 
0.9665, + "mean_token_accuracy": 0.7174665927886963, + "num_tokens": 432918566.0, + "step": 17327 + }, + { + "epoch": 1.9029211508895234, + "grad_norm": 2.157989025115967, + "learning_rate": 1e-06, + "loss": 0.9023, + "mean_token_accuracy": 0.7189613580703735, + "num_tokens": 432945325.0, + "step": 17328 + }, + { + "epoch": 1.903030968592137, + "grad_norm": 2.702103853225708, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7083264589309692, + "num_tokens": 432966190.0, + "step": 17329 + }, + { + "epoch": 1.9031407862947507, + "grad_norm": 2.1272432804107666, + "learning_rate": 1e-06, + "loss": 0.8837, + "mean_token_accuracy": 0.7317385077476501, + "num_tokens": 432991272.0, + "step": 17330 + }, + { + "epoch": 1.9032506039973645, + "grad_norm": 2.0225651264190674, + "learning_rate": 1e-06, + "loss": 0.9385, + "mean_token_accuracy": 0.7115117907524109, + "num_tokens": 433023821.0, + "step": 17331 + }, + { + "epoch": 1.903360421699978, + "grad_norm": 2.2519853115081787, + "learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7195990681648254, + "num_tokens": 433048851.0, + "step": 17332 + }, + { + "epoch": 1.9034702394025917, + "grad_norm": 2.1575567722320557, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.6981337070465088, + "num_tokens": 433076183.0, + "step": 17333 + }, + { + "epoch": 1.9035800571052053, + "grad_norm": 2.1405045986175537, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7259140610694885, + "num_tokens": 433105220.0, + "step": 17334 + }, + { + "epoch": 1.903689874807819, + "grad_norm": 2.2149527072906494, + "learning_rate": 1e-06, + "loss": 0.9184, + "mean_token_accuracy": 0.7200288772583008, + "num_tokens": 433129938.0, + "step": 17335 + }, + { + "epoch": 1.9037996925104328, + "grad_norm": 2.2014265060424805, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7038771510124207, + "num_tokens": 433155115.0, + "step": 17336 + }, + { + "epoch": 
1.9039095102130463, + "grad_norm": 2.2275047302246094, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7227879762649536, + "num_tokens": 433181116.0, + "step": 17337 + }, + { + "epoch": 1.9040193279156599, + "grad_norm": 2.5624513626098633, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.743504524230957, + "num_tokens": 433203076.0, + "step": 17338 + }, + { + "epoch": 1.9041291456182736, + "grad_norm": 1.8891786336898804, + "learning_rate": 1e-06, + "loss": 0.9535, + "mean_token_accuracy": 0.7075121998786926, + "num_tokens": 433238987.0, + "step": 17339 + }, + { + "epoch": 1.9042389633208874, + "grad_norm": 2.396810531616211, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7248290181159973, + "num_tokens": 433262334.0, + "step": 17340 + }, + { + "epoch": 1.9043487810235011, + "grad_norm": 2.1278445720672607, + "learning_rate": 1e-06, + "loss": 0.9248, + "mean_token_accuracy": 0.7090981006622314, + "num_tokens": 433290100.0, + "step": 17341 + }, + { + "epoch": 1.9044585987261147, + "grad_norm": 1.8728734254837036, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7375307083129883, + "num_tokens": 433325686.0, + "step": 17342 + }, + { + "epoch": 1.9045684164287282, + "grad_norm": 2.2513599395751953, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.7250134944915771, + "num_tokens": 433350134.0, + "step": 17343 + }, + { + "epoch": 1.904678234131342, + "grad_norm": 2.0708248615264893, + "learning_rate": 1e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7285733819007874, + "num_tokens": 433376019.0, + "step": 17344 + }, + { + "epoch": 1.9047880518339557, + "grad_norm": 2.1964848041534424, + "learning_rate": 1e-06, + "loss": 0.8445, + "mean_token_accuracy": 0.7333004474639893, + "num_tokens": 433401563.0, + "step": 17345 + }, + { + "epoch": 1.9048978695365693, + "grad_norm": 2.1463119983673096, + "learning_rate": 1e-06, + "loss": 0.9353, + 
"mean_token_accuracy": 0.7190384864807129, + "num_tokens": 433428771.0, + "step": 17346 + }, + { + "epoch": 1.9050076872391828, + "grad_norm": 2.7052998542785645, + "learning_rate": 1e-06, + "loss": 0.8119, + "mean_token_accuracy": 0.7333606481552124, + "num_tokens": 433446558.0, + "step": 17347 + }, + { + "epoch": 1.9051175049417965, + "grad_norm": 2.72100830078125, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7205363512039185, + "num_tokens": 433465900.0, + "step": 17348 + }, + { + "epoch": 1.9052273226444103, + "grad_norm": 2.254621744155884, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7189280986785889, + "num_tokens": 433490813.0, + "step": 17349 + }, + { + "epoch": 1.905337140347024, + "grad_norm": 2.2023427486419678, + "learning_rate": 1e-06, + "loss": 0.8947, + "mean_token_accuracy": 0.7199893593788147, + "num_tokens": 433515926.0, + "step": 17350 + }, + { + "epoch": 1.9054469580496376, + "grad_norm": 2.480757713317871, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 0.7303446531295776, + "num_tokens": 433536466.0, + "step": 17351 + }, + { + "epoch": 1.9055567757522511, + "grad_norm": 2.1720988750457764, + "learning_rate": 1e-06, + "loss": 0.9383, + "mean_token_accuracy": 0.7169386148452759, + "num_tokens": 433564975.0, + "step": 17352 + }, + { + "epoch": 1.9056665934548649, + "grad_norm": 2.288909912109375, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.721852719783783, + "num_tokens": 433591016.0, + "step": 17353 + }, + { + "epoch": 1.9057764111574786, + "grad_norm": 2.35975980758667, + "learning_rate": 1e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7267625331878662, + "num_tokens": 433613780.0, + "step": 17354 + }, + { + "epoch": 1.9058862288600924, + "grad_norm": 2.0330281257629395, + "learning_rate": 1e-06, + "loss": 1.0105, + "mean_token_accuracy": 0.6885459423065186, + "num_tokens": 433644220.0, + "step": 17355 + }, + { + "epoch": 1.905996046562706, 
+ "grad_norm": 2.5435516834259033, + "learning_rate": 1e-06, + "loss": 0.7847, + "mean_token_accuracy": 0.7486069202423096, + "num_tokens": 433664472.0, + "step": 17356 + }, + { + "epoch": 1.9061058642653195, + "grad_norm": 2.3940131664276123, + "learning_rate": 1e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.7392546534538269, + "num_tokens": 433685483.0, + "step": 17357 + }, + { + "epoch": 1.9062156819679332, + "grad_norm": 2.2308268547058105, + "learning_rate": 1e-06, + "loss": 0.9981, + "mean_token_accuracy": 0.7006782293319702, + "num_tokens": 433712852.0, + "step": 17358 + }, + { + "epoch": 1.906325499670547, + "grad_norm": 2.2170777320861816, + "learning_rate": 1e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7406584024429321, + "num_tokens": 433736514.0, + "step": 17359 + }, + { + "epoch": 1.9064353173731605, + "grad_norm": 2.178668260574341, + "learning_rate": 1e-06, + "loss": 0.8995, + "mean_token_accuracy": 0.727678656578064, + "num_tokens": 433763171.0, + "step": 17360 + }, + { + "epoch": 1.906545135075774, + "grad_norm": 2.1969876289367676, + "learning_rate": 1e-06, + "loss": 0.8137, + "mean_token_accuracy": 0.7402279376983643, + "num_tokens": 433789041.0, + "step": 17361 + }, + { + "epoch": 1.9066549527783878, + "grad_norm": 2.329524517059326, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.7211799025535583, + "num_tokens": 433814708.0, + "step": 17362 + }, + { + "epoch": 1.9067647704810016, + "grad_norm": 2.594860315322876, + "learning_rate": 1e-06, + "loss": 0.9387, + "mean_token_accuracy": 0.7228988409042358, + "num_tokens": 433835584.0, + "step": 17363 + }, + { + "epoch": 1.9068745881836153, + "grad_norm": 2.3928842544555664, + "learning_rate": 1e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7331517934799194, + "num_tokens": 433857548.0, + "step": 17364 + }, + { + "epoch": 1.9069844058862289, + "grad_norm": 2.4404821395874023, + "learning_rate": 1e-06, + "loss": 0.9244, + "mean_token_accuracy": 0.7138091325759888, 
+ "num_tokens": 433879917.0, + "step": 17365 + }, + { + "epoch": 1.9070942235888424, + "grad_norm": 2.172977924346924, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7482314705848694, + "num_tokens": 433904298.0, + "step": 17366 + }, + { + "epoch": 1.9072040412914562, + "grad_norm": 2.158482074737549, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7030834555625916, + "num_tokens": 433933756.0, + "step": 17367 + }, + { + "epoch": 1.90731385899407, + "grad_norm": 2.1879770755767822, + "learning_rate": 1e-06, + "loss": 0.8998, + "mean_token_accuracy": 0.7217719554901123, + "num_tokens": 433961092.0, + "step": 17368 + }, + { + "epoch": 1.9074236766966837, + "grad_norm": 2.2117724418640137, + "learning_rate": 1e-06, + "loss": 0.8092, + "mean_token_accuracy": 0.73897385597229, + "num_tokens": 433985323.0, + "step": 17369 + }, + { + "epoch": 1.9075334943992972, + "grad_norm": 2.1403536796569824, + "learning_rate": 1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.705432653427124, + "num_tokens": 434013188.0, + "step": 17370 + }, + { + "epoch": 1.9076433121019107, + "grad_norm": 1.8926876783370972, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7219630479812622, + "num_tokens": 434048678.0, + "step": 17371 + }, + { + "epoch": 1.9077531298045245, + "grad_norm": 2.096158742904663, + "learning_rate": 1e-06, + "loss": 1.0455, + "mean_token_accuracy": 0.6863389015197754, + "num_tokens": 434079250.0, + "step": 17372 + }, + { + "epoch": 1.9078629475071383, + "grad_norm": 2.1338250637054443, + "learning_rate": 1e-06, + "loss": 0.9647, + "mean_token_accuracy": 0.7004941701889038, + "num_tokens": 434106232.0, + "step": 17373 + }, + { + "epoch": 1.9079727652097518, + "grad_norm": 2.221679449081421, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7336894869804382, + "num_tokens": 434130261.0, + "step": 17374 + }, + { + "epoch": 1.9080825829123653, + "grad_norm": 2.4081411361694336, + 
"learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7185892462730408, + "num_tokens": 434153235.0, + "step": 17375 + }, + { + "epoch": 1.908192400614979, + "grad_norm": 2.1713192462921143, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7300892472267151, + "num_tokens": 434178429.0, + "step": 17376 + }, + { + "epoch": 1.9083022183175928, + "grad_norm": 2.2157175540924072, + "learning_rate": 1e-06, + "loss": 0.9191, + "mean_token_accuracy": 0.7161296606063843, + "num_tokens": 434204795.0, + "step": 17377 + }, + { + "epoch": 1.9084120360202066, + "grad_norm": 2.559520721435547, + "learning_rate": 1e-06, + "loss": 0.9263, + "mean_token_accuracy": 0.7289190292358398, + "num_tokens": 434225284.0, + "step": 17378 + }, + { + "epoch": 1.9085218537228201, + "grad_norm": 2.6758880615234375, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7214190363883972, + "num_tokens": 434244365.0, + "step": 17379 + }, + { + "epoch": 1.9086316714254337, + "grad_norm": 2.292701482772827, + "learning_rate": 1e-06, + "loss": 0.9524, + "mean_token_accuracy": 0.7191834449768066, + "num_tokens": 434269229.0, + "step": 17380 + }, + { + "epoch": 1.9087414891280474, + "grad_norm": 2.4687232971191406, + "learning_rate": 1e-06, + "loss": 0.9156, + "mean_token_accuracy": 0.7166217565536499, + "num_tokens": 434290471.0, + "step": 17381 + }, + { + "epoch": 1.9088513068306612, + "grad_norm": 2.4971063137054443, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7206629514694214, + "num_tokens": 434312344.0, + "step": 17382 + }, + { + "epoch": 1.9089611245332747, + "grad_norm": 2.2083423137664795, + "learning_rate": 1e-06, + "loss": 0.8673, + "mean_token_accuracy": 0.7350160479545593, + "num_tokens": 434336901.0, + "step": 17383 + }, + { + "epoch": 1.9090709422358885, + "grad_norm": 2.6097328662872314, + "learning_rate": 1e-06, + "loss": 0.798, + "mean_token_accuracy": 0.7431178689002991, + "num_tokens": 434354715.0, + 
"step": 17384 + }, + { + "epoch": 1.909180759938502, + "grad_norm": 2.2854113578796387, + "learning_rate": 1e-06, + "loss": 0.8754, + "mean_token_accuracy": 0.7207178473472595, + "num_tokens": 434377355.0, + "step": 17385 + }, + { + "epoch": 1.9092905776411158, + "grad_norm": 2.10748553276062, + "learning_rate": 1e-06, + "loss": 0.9669, + "mean_token_accuracy": 0.7020072937011719, + "num_tokens": 434407753.0, + "step": 17386 + }, + { + "epoch": 1.9094003953437295, + "grad_norm": 2.3029966354370117, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7227813005447388, + "num_tokens": 434432821.0, + "step": 17387 + }, + { + "epoch": 1.909510213046343, + "grad_norm": 2.3637921810150146, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7159619927406311, + "num_tokens": 434456783.0, + "step": 17388 + }, + { + "epoch": 1.9096200307489566, + "grad_norm": 2.206881523132324, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7161456346511841, + "num_tokens": 434483243.0, + "step": 17389 + }, + { + "epoch": 1.9097298484515703, + "grad_norm": 2.492175817489624, + "learning_rate": 1e-06, + "loss": 0.8512, + "mean_token_accuracy": 0.736944317817688, + "num_tokens": 434504726.0, + "step": 17390 + }, + { + "epoch": 1.909839666154184, + "grad_norm": 2.470014810562134, + "learning_rate": 1e-06, + "loss": 0.8979, + "mean_token_accuracy": 0.7203551530838013, + "num_tokens": 434527730.0, + "step": 17391 + }, + { + "epoch": 1.9099494838567979, + "grad_norm": 2.2122137546539307, + "learning_rate": 1e-06, + "loss": 0.9, + "mean_token_accuracy": 0.7125539779663086, + "num_tokens": 434553450.0, + "step": 17392 + }, + { + "epoch": 1.9100593015594114, + "grad_norm": 2.6266908645629883, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.71506667137146, + "num_tokens": 434572529.0, + "step": 17393 + }, + { + "epoch": 1.910169119262025, + "grad_norm": 2.3797974586486816, + "learning_rate": 1e-06, + "loss": 0.9378, + 
"mean_token_accuracy": 0.7117959856987, + "num_tokens": 434599331.0, + "step": 17394 + }, + { + "epoch": 1.9102789369646387, + "grad_norm": 2.182931900024414, + "learning_rate": 1e-06, + "loss": 0.8528, + "mean_token_accuracy": 0.731313943862915, + "num_tokens": 434624303.0, + "step": 17395 + }, + { + "epoch": 1.9103887546672524, + "grad_norm": 2.0489909648895264, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7141737341880798, + "num_tokens": 434654267.0, + "step": 17396 + }, + { + "epoch": 1.910498572369866, + "grad_norm": 2.5583012104034424, + "learning_rate": 1e-06, + "loss": 0.7497, + "mean_token_accuracy": 0.7604368329048157, + "num_tokens": 434672731.0, + "step": 17397 + }, + { + "epoch": 1.9106083900724797, + "grad_norm": 2.0784599781036377, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7089723348617554, + "num_tokens": 434702673.0, + "step": 17398 + }, + { + "epoch": 1.9107182077750933, + "grad_norm": 2.352778434753418, + "learning_rate": 1e-06, + "loss": 0.8271, + "mean_token_accuracy": 0.7384488582611084, + "num_tokens": 434727091.0, + "step": 17399 + }, + { + "epoch": 1.910828025477707, + "grad_norm": 2.169822931289673, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7364722490310669, + "num_tokens": 434751728.0, + "step": 17400 + }, + { + "epoch": 1.9109378431803208, + "grad_norm": 2.218813180923462, + "learning_rate": 1e-06, + "loss": 0.9259, + "mean_token_accuracy": 0.7172716856002808, + "num_tokens": 434777431.0, + "step": 17401 + }, + { + "epoch": 1.9110476608829343, + "grad_norm": 2.235339879989624, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7181367874145508, + "num_tokens": 434801743.0, + "step": 17402 + }, + { + "epoch": 1.9111574785855479, + "grad_norm": 2.6994822025299072, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7249264717102051, + "num_tokens": 434820338.0, + "step": 17403 + }, + { + "epoch": 1.9112672962881616, + 
"grad_norm": 2.1713480949401855, + "learning_rate": 1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7190375328063965, + "num_tokens": 434846768.0, + "step": 17404 + }, + { + "epoch": 1.9113771139907754, + "grad_norm": 2.585979700088501, + "learning_rate": 1e-06, + "loss": 0.8439, + "mean_token_accuracy": 0.7280707955360413, + "num_tokens": 434867243.0, + "step": 17405 + }, + { + "epoch": 1.9114869316933891, + "grad_norm": 2.4934616088867188, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7340877056121826, + "num_tokens": 434890080.0, + "step": 17406 + }, + { + "epoch": 1.9115967493960027, + "grad_norm": 2.4416255950927734, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7344318628311157, + "num_tokens": 434909891.0, + "step": 17407 + }, + { + "epoch": 1.9117065670986162, + "grad_norm": 2.356879234313965, + "learning_rate": 1e-06, + "loss": 0.8722, + "mean_token_accuracy": 0.72556471824646, + "num_tokens": 434932685.0, + "step": 17408 + }, + { + "epoch": 1.91181638480123, + "grad_norm": 2.2085723876953125, + "learning_rate": 1e-06, + "loss": 0.8936, + "mean_token_accuracy": 0.7207964658737183, + "num_tokens": 434961296.0, + "step": 17409 + }, + { + "epoch": 1.9119262025038437, + "grad_norm": 2.2929439544677734, + "learning_rate": 1e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7242318391799927, + "num_tokens": 434985063.0, + "step": 17410 + }, + { + "epoch": 1.9120360202064572, + "grad_norm": 2.3745217323303223, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7087187170982361, + "num_tokens": 435007435.0, + "step": 17411 + }, + { + "epoch": 1.9121458379090708, + "grad_norm": 2.216060161590576, + "learning_rate": 1e-06, + "loss": 0.8583, + "mean_token_accuracy": 0.7335216403007507, + "num_tokens": 435033341.0, + "step": 17412 + }, + { + "epoch": 1.9122556556116845, + "grad_norm": 2.0822126865386963, + "learning_rate": 1e-06, + "loss": 0.825, + "mean_token_accuracy": 0.7449674010276794, + 
"num_tokens": 435059530.0, + "step": 17413 + }, + { + "epoch": 1.9123654733142983, + "grad_norm": 2.3623299598693848, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7342562675476074, + "num_tokens": 435083281.0, + "step": 17414 + }, + { + "epoch": 1.912475291016912, + "grad_norm": 2.23675537109375, + "learning_rate": 1e-06, + "loss": 0.984, + "mean_token_accuracy": 0.6996074318885803, + "num_tokens": 435109810.0, + "step": 17415 + }, + { + "epoch": 1.9125851087195256, + "grad_norm": 2.339139699935913, + "learning_rate": 1e-06, + "loss": 0.9029, + "mean_token_accuracy": 0.7238538265228271, + "num_tokens": 435134029.0, + "step": 17416 + }, + { + "epoch": 1.9126949264221391, + "grad_norm": 2.149388313293457, + "learning_rate": 1e-06, + "loss": 0.8131, + "mean_token_accuracy": 0.7395280599594116, + "num_tokens": 435160412.0, + "step": 17417 + }, + { + "epoch": 1.9128047441247529, + "grad_norm": 2.328317880630493, + "learning_rate": 1e-06, + "loss": 0.8486, + "mean_token_accuracy": 0.7342904806137085, + "num_tokens": 435184106.0, + "step": 17418 + }, + { + "epoch": 1.9129145618273666, + "grad_norm": 2.154332399368286, + "learning_rate": 1e-06, + "loss": 0.8398, + "mean_token_accuracy": 0.7426236867904663, + "num_tokens": 435209504.0, + "step": 17419 + }, + { + "epoch": 1.9130243795299804, + "grad_norm": 2.6235949993133545, + "learning_rate": 1e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7299391031265259, + "num_tokens": 435230056.0, + "step": 17420 + }, + { + "epoch": 1.913134197232594, + "grad_norm": 2.4942433834075928, + "learning_rate": 1e-06, + "loss": 0.8444, + "mean_token_accuracy": 0.7358449697494507, + "num_tokens": 435251598.0, + "step": 17421 + }, + { + "epoch": 1.9132440149352075, + "grad_norm": 2.285724401473999, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7162262797355652, + "num_tokens": 435278849.0, + "step": 17422 + }, + { + "epoch": 1.9133538326378212, + "grad_norm": 2.465162754058838, + 
"learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7102885246276855, + "num_tokens": 435303740.0, + "step": 17423 + }, + { + "epoch": 1.913463650340435, + "grad_norm": 2.4418795108795166, + "learning_rate": 1e-06, + "loss": 0.898, + "mean_token_accuracy": 0.7209830284118652, + "num_tokens": 435327930.0, + "step": 17424 + }, + { + "epoch": 1.9135734680430485, + "grad_norm": 2.2019402980804443, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7196030616760254, + "num_tokens": 435353960.0, + "step": 17425 + }, + { + "epoch": 1.913683285745662, + "grad_norm": 2.32476544380188, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7285517454147339, + "num_tokens": 435377078.0, + "step": 17426 + }, + { + "epoch": 1.9137931034482758, + "grad_norm": 2.343149423599243, + "learning_rate": 1e-06, + "loss": 0.9013, + "mean_token_accuracy": 0.719111442565918, + "num_tokens": 435400625.0, + "step": 17427 + }, + { + "epoch": 1.9139029211508896, + "grad_norm": 1.8471771478652954, + "learning_rate": 1e-06, + "loss": 0.9476, + "mean_token_accuracy": 0.7144526839256287, + "num_tokens": 435435760.0, + "step": 17428 + }, + { + "epoch": 1.9140127388535033, + "grad_norm": 2.518519639968872, + "learning_rate": 1e-06, + "loss": 0.9143, + "mean_token_accuracy": 0.7219593524932861, + "num_tokens": 435456952.0, + "step": 17429 + }, + { + "epoch": 1.9141225565561168, + "grad_norm": 2.2155776023864746, + "learning_rate": 1e-06, + "loss": 0.9254, + "mean_token_accuracy": 0.7097010612487793, + "num_tokens": 435484088.0, + "step": 17430 + }, + { + "epoch": 1.9142323742587304, + "grad_norm": 2.512246608734131, + "learning_rate": 1e-06, + "loss": 0.932, + "mean_token_accuracy": 0.7144085764884949, + "num_tokens": 435505232.0, + "step": 17431 + }, + { + "epoch": 1.9143421919613441, + "grad_norm": 2.2281503677368164, + "learning_rate": 1e-06, + "loss": 0.9037, + "mean_token_accuracy": 0.7178815007209778, + "num_tokens": 435530134.0, + "step": 
17432 + }, + { + "epoch": 1.914452009663958, + "grad_norm": 2.193472146987915, + "learning_rate": 1e-06, + "loss": 0.8227, + "mean_token_accuracy": 0.7465142011642456, + "num_tokens": 435555342.0, + "step": 17433 + }, + { + "epoch": 1.9145618273665717, + "grad_norm": 2.4073283672332764, + "learning_rate": 1e-06, + "loss": 0.8471, + "mean_token_accuracy": 0.7360839247703552, + "num_tokens": 435577181.0, + "step": 17434 + }, + { + "epoch": 1.9146716450691852, + "grad_norm": 2.4405086040496826, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7339568138122559, + "num_tokens": 435598329.0, + "step": 17435 + }, + { + "epoch": 1.9147814627717987, + "grad_norm": 2.297794818878174, + "learning_rate": 1e-06, + "loss": 0.9527, + "mean_token_accuracy": 0.7037280797958374, + "num_tokens": 435623400.0, + "step": 17436 + }, + { + "epoch": 1.9148912804744125, + "grad_norm": 2.6665093898773193, + "learning_rate": 1e-06, + "loss": 0.9866, + "mean_token_accuracy": 0.6983262300491333, + "num_tokens": 435645608.0, + "step": 17437 + }, + { + "epoch": 1.9150010981770262, + "grad_norm": 2.2932918071746826, + "learning_rate": 1e-06, + "loss": 0.9355, + "mean_token_accuracy": 0.7059269547462463, + "num_tokens": 435671872.0, + "step": 17438 + }, + { + "epoch": 1.9151109158796398, + "grad_norm": 2.324176788330078, + "learning_rate": 1e-06, + "loss": 0.9817, + "mean_token_accuracy": 0.7103928327560425, + "num_tokens": 435697475.0, + "step": 17439 + }, + { + "epoch": 1.9152207335822533, + "grad_norm": 2.4778287410736084, + "learning_rate": 1e-06, + "loss": 0.8666, + "mean_token_accuracy": 0.7310132384300232, + "num_tokens": 435719794.0, + "step": 17440 + }, + { + "epoch": 1.915330551284867, + "grad_norm": 2.184377670288086, + "learning_rate": 1e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.739093005657196, + "num_tokens": 435745185.0, + "step": 17441 + }, + { + "epoch": 1.9154403689874808, + "grad_norm": 2.4039347171783447, + "learning_rate": 1e-06, + "loss": 0.8014, 
+ "mean_token_accuracy": 0.7513030171394348, + "num_tokens": 435768506.0, + "step": 17442 + }, + { + "epoch": 1.9155501866900946, + "grad_norm": 2.0122909545898438, + "learning_rate": 1e-06, + "loss": 0.9477, + "mean_token_accuracy": 0.7067418098449707, + "num_tokens": 435800315.0, + "step": 17443 + }, + { + "epoch": 1.9156600043927081, + "grad_norm": 2.5280210971832275, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7263596057891846, + "num_tokens": 435821869.0, + "step": 17444 + }, + { + "epoch": 1.9157698220953217, + "grad_norm": 2.23331356048584, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7158084511756897, + "num_tokens": 435847996.0, + "step": 17445 + }, + { + "epoch": 1.9158796397979354, + "grad_norm": 2.433905601501465, + "learning_rate": 1e-06, + "loss": 0.8466, + "mean_token_accuracy": 0.7358626127243042, + "num_tokens": 435869469.0, + "step": 17446 + }, + { + "epoch": 1.9159894575005492, + "grad_norm": 2.223078727722168, + "learning_rate": 1e-06, + "loss": 0.9443, + "mean_token_accuracy": 0.7026616930961609, + "num_tokens": 435896392.0, + "step": 17447 + }, + { + "epoch": 1.9160992752031627, + "grad_norm": 2.053731918334961, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7045109272003174, + "num_tokens": 435926045.0, + "step": 17448 + }, + { + "epoch": 1.9162090929057765, + "grad_norm": 2.0783560276031494, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7292199730873108, + "num_tokens": 435955318.0, + "step": 17449 + }, + { + "epoch": 1.91631891060839, + "grad_norm": 2.0575406551361084, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7127503156661987, + "num_tokens": 435984609.0, + "step": 17450 + }, + { + "epoch": 1.9164287283110037, + "grad_norm": 2.058861494064331, + "learning_rate": 1e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.7249006628990173, + "num_tokens": 436013178.0, + "step": 17451 + }, + { + "epoch": 
1.9165385460136175, + "grad_norm": 2.314967632293701, + "learning_rate": 1e-06, + "loss": 0.9578, + "mean_token_accuracy": 0.6979384422302246, + "num_tokens": 436039229.0, + "step": 17452 + }, + { + "epoch": 1.916648363716231, + "grad_norm": 2.2320001125335693, + "learning_rate": 1e-06, + "loss": 0.9218, + "mean_token_accuracy": 0.7150384783744812, + "num_tokens": 436063869.0, + "step": 17453 + }, + { + "epoch": 1.9167581814188446, + "grad_norm": 2.289247989654541, + "learning_rate": 1e-06, + "loss": 0.8794, + "mean_token_accuracy": 0.728323757648468, + "num_tokens": 436088343.0, + "step": 17454 + }, + { + "epoch": 1.9168679991214583, + "grad_norm": 2.2221662998199463, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.724609375, + "num_tokens": 436114217.0, + "step": 17455 + }, + { + "epoch": 1.916977816824072, + "grad_norm": 2.1842947006225586, + "learning_rate": 1e-06, + "loss": 0.9201, + "mean_token_accuracy": 0.7121331095695496, + "num_tokens": 436139701.0, + "step": 17456 + }, + { + "epoch": 1.9170876345266858, + "grad_norm": 2.3858203887939453, + "learning_rate": 1e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.7383348345756531, + "num_tokens": 436161675.0, + "step": 17457 + }, + { + "epoch": 1.9171974522292994, + "grad_norm": 2.467006206512451, + "learning_rate": 1e-06, + "loss": 0.9627, + "mean_token_accuracy": 0.702825665473938, + "num_tokens": 436187037.0, + "step": 17458 + }, + { + "epoch": 1.917307269931913, + "grad_norm": 2.1321449279785156, + "learning_rate": 1e-06, + "loss": 0.9171, + "mean_token_accuracy": 0.7156335711479187, + "num_tokens": 436214712.0, + "step": 17459 + }, + { + "epoch": 1.9174170876345267, + "grad_norm": 2.071760892868042, + "learning_rate": 1e-06, + "loss": 0.962, + "mean_token_accuracy": 0.7106665372848511, + "num_tokens": 436243537.0, + "step": 17460 + }, + { + "epoch": 1.9175269053371404, + "grad_norm": 2.574326992034912, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 
0.7418900728225708, + "num_tokens": 436264073.0, + "step": 17461 + }, + { + "epoch": 1.917636723039754, + "grad_norm": 2.1316990852355957, + "learning_rate": 1e-06, + "loss": 0.8788, + "mean_token_accuracy": 0.7234156131744385, + "num_tokens": 436291458.0, + "step": 17462 + }, + { + "epoch": 1.9177465407423677, + "grad_norm": 2.2160544395446777, + "learning_rate": 1e-06, + "loss": 0.8857, + "mean_token_accuracy": 0.7378385066986084, + "num_tokens": 436315437.0, + "step": 17463 + }, + { + "epoch": 1.9178563584449813, + "grad_norm": 2.452822685241699, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7079854011535645, + "num_tokens": 436337739.0, + "step": 17464 + }, + { + "epoch": 1.917966176147595, + "grad_norm": 2.1577272415161133, + "learning_rate": 1e-06, + "loss": 0.8683, + "mean_token_accuracy": 0.7385714054107666, + "num_tokens": 436362797.0, + "step": 17465 + }, + { + "epoch": 1.9180759938502088, + "grad_norm": 2.1143698692321777, + "learning_rate": 1e-06, + "loss": 0.9917, + "mean_token_accuracy": 0.6961141228675842, + "num_tokens": 436392263.0, + "step": 17466 + }, + { + "epoch": 1.9181858115528223, + "grad_norm": 2.13400936126709, + "learning_rate": 1e-06, + "loss": 0.8851, + "mean_token_accuracy": 0.7266606688499451, + "num_tokens": 436419295.0, + "step": 17467 + }, + { + "epoch": 1.9182956292554358, + "grad_norm": 2.339405059814453, + "learning_rate": 1e-06, + "loss": 0.9279, + "mean_token_accuracy": 0.7094118595123291, + "num_tokens": 436444392.0, + "step": 17468 + }, + { + "epoch": 1.9184054469580496, + "grad_norm": 2.373715877532959, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7208402156829834, + "num_tokens": 436466691.0, + "step": 17469 + }, + { + "epoch": 1.9185152646606634, + "grad_norm": 2.2764360904693604, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7133829593658447, + "num_tokens": 436490789.0, + "step": 17470 + }, + { + "epoch": 1.9186250823632771, + "grad_norm": 
2.213792085647583, + "learning_rate": 1e-06, + "loss": 0.9715, + "mean_token_accuracy": 0.695960283279419, + "num_tokens": 436517299.0, + "step": 17471 + }, + { + "epoch": 1.9187349000658906, + "grad_norm": 2.59116268157959, + "learning_rate": 1e-06, + "loss": 0.9193, + "mean_token_accuracy": 0.7077323198318481, + "num_tokens": 436539056.0, + "step": 17472 + }, + { + "epoch": 1.9188447177685042, + "grad_norm": 2.259765863418579, + "learning_rate": 1e-06, + "loss": 0.7443, + "mean_token_accuracy": 0.7631865739822388, + "num_tokens": 436561265.0, + "step": 17473 + }, + { + "epoch": 1.918954535471118, + "grad_norm": 2.1792964935302734, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.7187337279319763, + "num_tokens": 436587956.0, + "step": 17474 + }, + { + "epoch": 1.9190643531737317, + "grad_norm": 2.0656659603118896, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7357486486434937, + "num_tokens": 436615568.0, + "step": 17475 + }, + { + "epoch": 1.9191741708763452, + "grad_norm": 1.8981701135635376, + "learning_rate": 1e-06, + "loss": 0.9017, + "mean_token_accuracy": 0.7161778807640076, + "num_tokens": 436647923.0, + "step": 17476 + }, + { + "epoch": 1.9192839885789588, + "grad_norm": 2.3748083114624023, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7371833324432373, + "num_tokens": 436669583.0, + "step": 17477 + }, + { + "epoch": 1.9193938062815725, + "grad_norm": 2.0074546337127686, + "learning_rate": 1e-06, + "loss": 0.9874, + "mean_token_accuracy": 0.6995309591293335, + "num_tokens": 436700267.0, + "step": 17478 + }, + { + "epoch": 1.9195036239841863, + "grad_norm": 1.9120070934295654, + "learning_rate": 1e-06, + "loss": 0.913, + "mean_token_accuracy": 0.721786618232727, + "num_tokens": 436730884.0, + "step": 17479 + }, + { + "epoch": 1.9196134416868, + "grad_norm": 2.628190040588379, + "learning_rate": 1e-06, + "loss": 0.8288, + "mean_token_accuracy": 0.7314431071281433, + "num_tokens": 
436749587.0, + "step": 17480 + }, + { + "epoch": 1.9197232593894136, + "grad_norm": 2.795742988586426, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7446225881576538, + "num_tokens": 436766564.0, + "step": 17481 + }, + { + "epoch": 1.919833077092027, + "grad_norm": 2.08520245552063, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7117276191711426, + "num_tokens": 436795849.0, + "step": 17482 + }, + { + "epoch": 1.9199428947946409, + "grad_norm": 2.34582257270813, + "learning_rate": 1e-06, + "loss": 0.8208, + "mean_token_accuracy": 0.7414565682411194, + "num_tokens": 436817569.0, + "step": 17483 + }, + { + "epoch": 1.9200527124972546, + "grad_norm": 2.127232551574707, + "learning_rate": 1e-06, + "loss": 0.9595, + "mean_token_accuracy": 0.7044092416763306, + "num_tokens": 436846082.0, + "step": 17484 + }, + { + "epoch": 1.9201625301998684, + "grad_norm": 2.1128790378570557, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7024397850036621, + "num_tokens": 436873436.0, + "step": 17485 + }, + { + "epoch": 1.920272347902482, + "grad_norm": 2.14585280418396, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.7055897116661072, + "num_tokens": 436902201.0, + "step": 17486 + }, + { + "epoch": 1.9203821656050954, + "grad_norm": 2.5012075901031494, + "learning_rate": 1e-06, + "loss": 0.7551, + "mean_token_accuracy": 0.755604088306427, + "num_tokens": 436920777.0, + "step": 17487 + }, + { + "epoch": 1.9204919833077092, + "grad_norm": 2.327084541320801, + "learning_rate": 1e-06, + "loss": 0.9529, + "mean_token_accuracy": 0.7021387815475464, + "num_tokens": 436943918.0, + "step": 17488 + }, + { + "epoch": 1.920601801010323, + "grad_norm": 2.0238828659057617, + "learning_rate": 1e-06, + "loss": 0.8817, + "mean_token_accuracy": 0.7225706577301025, + "num_tokens": 436976406.0, + "step": 17489 + }, + { + "epoch": 1.9207116187129365, + "grad_norm": 2.529480457305908, + "learning_rate": 1e-06, + 
"loss": 0.9304, + "mean_token_accuracy": 0.715977668762207, + "num_tokens": 437000678.0, + "step": 17490 + }, + { + "epoch": 1.92082143641555, + "grad_norm": 2.4470181465148926, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7079645395278931, + "num_tokens": 437025723.0, + "step": 17491 + }, + { + "epoch": 1.9209312541181638, + "grad_norm": 2.6706008911132812, + "learning_rate": 1e-06, + "loss": 0.8926, + "mean_token_accuracy": 0.7331228256225586, + "num_tokens": 437043517.0, + "step": 17492 + }, + { + "epoch": 1.9210410718207775, + "grad_norm": 2.147897481918335, + "learning_rate": 1e-06, + "loss": 0.955, + "mean_token_accuracy": 0.717933177947998, + "num_tokens": 437073472.0, + "step": 17493 + }, + { + "epoch": 1.9211508895233913, + "grad_norm": 2.628668785095215, + "learning_rate": 1e-06, + "loss": 0.8034, + "mean_token_accuracy": 0.7453328967094421, + "num_tokens": 437093967.0, + "step": 17494 + }, + { + "epoch": 1.9212607072260048, + "grad_norm": 2.206660032272339, + "learning_rate": 1e-06, + "loss": 0.8734, + "mean_token_accuracy": 0.7304722666740417, + "num_tokens": 437119265.0, + "step": 17495 + }, + { + "epoch": 1.9213705249286184, + "grad_norm": 2.1570932865142822, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7220126390457153, + "num_tokens": 437150815.0, + "step": 17496 + }, + { + "epoch": 1.9214803426312321, + "grad_norm": 2.080413579940796, + "learning_rate": 1e-06, + "loss": 0.7908, + "mean_token_accuracy": 0.7492333650588989, + "num_tokens": 437178572.0, + "step": 17497 + }, + { + "epoch": 1.9215901603338459, + "grad_norm": 2.0641181468963623, + "learning_rate": 1e-06, + "loss": 0.9048, + "mean_token_accuracy": 0.7211841344833374, + "num_tokens": 437207815.0, + "step": 17498 + }, + { + "epoch": 1.9216999780364594, + "grad_norm": 2.220466375350952, + "learning_rate": 1e-06, + "loss": 0.9505, + "mean_token_accuracy": 0.7079954743385315, + "num_tokens": 437235087.0, + "step": 17499 + }, + { + "epoch": 
1.9218097957390732, + "grad_norm": 2.2390685081481934, + "learning_rate": 1e-06, + "loss": 0.9621, + "mean_token_accuracy": 0.6962875723838806, + "num_tokens": 437262269.0, + "step": 17500 + }, + { + "epoch": 1.9219196134416867, + "grad_norm": 2.2958972454071045, + "learning_rate": 1e-06, + "loss": 0.9576, + "mean_token_accuracy": 0.7024340629577637, + "num_tokens": 437287567.0, + "step": 17501 + }, + { + "epoch": 1.9220294311443005, + "grad_norm": 2.604656219482422, + "learning_rate": 1e-06, + "loss": 0.9465, + "mean_token_accuracy": 0.7111895084381104, + "num_tokens": 437307438.0, + "step": 17502 + }, + { + "epoch": 1.9221392488469142, + "grad_norm": 2.4036028385162354, + "learning_rate": 1e-06, + "loss": 0.8502, + "mean_token_accuracy": 0.7357745170593262, + "num_tokens": 437330091.0, + "step": 17503 + }, + { + "epoch": 1.9222490665495278, + "grad_norm": 2.290358543395996, + "learning_rate": 1e-06, + "loss": 0.9876, + "mean_token_accuracy": 0.7120223045349121, + "num_tokens": 437355853.0, + "step": 17504 + }, + { + "epoch": 1.9223588842521413, + "grad_norm": 2.47774338722229, + "learning_rate": 1e-06, + "loss": 0.836, + "mean_token_accuracy": 0.7388026714324951, + "num_tokens": 437375916.0, + "step": 17505 + }, + { + "epoch": 1.922468701954755, + "grad_norm": 2.073983907699585, + "learning_rate": 1e-06, + "loss": 0.9985, + "mean_token_accuracy": 0.6899130940437317, + "num_tokens": 437406432.0, + "step": 17506 + }, + { + "epoch": 1.9225785196573688, + "grad_norm": 2.5374813079833984, + "learning_rate": 1e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7257341146469116, + "num_tokens": 437425899.0, + "step": 17507 + }, + { + "epoch": 1.9226883373599826, + "grad_norm": 2.5800514221191406, + "learning_rate": 1e-06, + "loss": 0.7785, + "mean_token_accuracy": 0.7580370903015137, + "num_tokens": 437444500.0, + "step": 17508 + }, + { + "epoch": 1.922798155062596, + "grad_norm": 2.5846378803253174, + "learning_rate": 1e-06, + "loss": 0.8483, + "mean_token_accuracy": 
0.7249930500984192, + "num_tokens": 437464663.0, + "step": 17509 + }, + { + "epoch": 1.9229079727652096, + "grad_norm": 2.16984224319458, + "learning_rate": 1e-06, + "loss": 0.8826, + "mean_token_accuracy": 0.7213922739028931, + "num_tokens": 437492110.0, + "step": 17510 + }, + { + "epoch": 1.9230177904678234, + "grad_norm": 2.5653631687164307, + "learning_rate": 1e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7435372471809387, + "num_tokens": 437513287.0, + "step": 17511 + }, + { + "epoch": 1.9231276081704372, + "grad_norm": 1.9574341773986816, + "learning_rate": 1e-06, + "loss": 0.978, + "mean_token_accuracy": 0.7037503719329834, + "num_tokens": 437546780.0, + "step": 17512 + }, + { + "epoch": 1.9232374258730507, + "grad_norm": 2.0633137226104736, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7121567130088806, + "num_tokens": 437576013.0, + "step": 17513 + }, + { + "epoch": 1.9233472435756644, + "grad_norm": 2.451418161392212, + "learning_rate": 1e-06, + "loss": 0.9106, + "mean_token_accuracy": 0.7114720940589905, + "num_tokens": 437598144.0, + "step": 17514 + }, + { + "epoch": 1.923457061278278, + "grad_norm": 2.2826385498046875, + "learning_rate": 1e-06, + "loss": 1.0262, + "mean_token_accuracy": 0.6877466440200806, + "num_tokens": 437624992.0, + "step": 17515 + }, + { + "epoch": 1.9235668789808917, + "grad_norm": 2.2795560359954834, + "learning_rate": 1e-06, + "loss": 0.8246, + "mean_token_accuracy": 0.7323660850524902, + "num_tokens": 437648169.0, + "step": 17516 + }, + { + "epoch": 1.9236766966835055, + "grad_norm": 2.0630669593811035, + "learning_rate": 1e-06, + "loss": 0.8637, + "mean_token_accuracy": 0.7291440367698669, + "num_tokens": 437676674.0, + "step": 17517 + }, + { + "epoch": 1.923786514386119, + "grad_norm": 2.7874550819396973, + "learning_rate": 1e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7225518226623535, + "num_tokens": 437694371.0, + "step": 17518 + }, + { + "epoch": 1.9238963320887326, + "grad_norm": 
2.34702467918396, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7073972821235657, + "num_tokens": 437717419.0, + "step": 17519 + }, + { + "epoch": 1.9240061497913463, + "grad_norm": 2.5460832118988037, + "learning_rate": 1e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7301335334777832, + "num_tokens": 437737962.0, + "step": 17520 + }, + { + "epoch": 1.92411596749396, + "grad_norm": 2.240931987762451, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.709522008895874, + "num_tokens": 437763721.0, + "step": 17521 + }, + { + "epoch": 1.9242257851965738, + "grad_norm": 2.2315714359283447, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7042304277420044, + "num_tokens": 437791678.0, + "step": 17522 + }, + { + "epoch": 1.9243356028991874, + "grad_norm": 2.319498062133789, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7204986214637756, + "num_tokens": 437815816.0, + "step": 17523 + }, + { + "epoch": 1.924445420601801, + "grad_norm": 2.292257308959961, + "learning_rate": 1e-06, + "loss": 0.9411, + "mean_token_accuracy": 0.7084673643112183, + "num_tokens": 437841510.0, + "step": 17524 + }, + { + "epoch": 1.9245552383044147, + "grad_norm": 2.2514238357543945, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7228551506996155, + "num_tokens": 437866470.0, + "step": 17525 + }, + { + "epoch": 1.9246650560070284, + "grad_norm": 2.3593811988830566, + "learning_rate": 1e-06, + "loss": 0.8818, + "mean_token_accuracy": 0.7350836992263794, + "num_tokens": 437889961.0, + "step": 17526 + }, + { + "epoch": 1.924774873709642, + "grad_norm": 2.6261179447174072, + "learning_rate": 1e-06, + "loss": 0.9289, + "mean_token_accuracy": 0.719318151473999, + "num_tokens": 437909396.0, + "step": 17527 + }, + { + "epoch": 1.9248846914122557, + "grad_norm": 2.500304698944092, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.729214072227478, + "num_tokens": 
437931656.0, + "step": 17528 + }, + { + "epoch": 1.9249945091148692, + "grad_norm": 2.3363964557647705, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7235280275344849, + "num_tokens": 437956982.0, + "step": 17529 + }, + { + "epoch": 1.925104326817483, + "grad_norm": 2.626854658126831, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7143145799636841, + "num_tokens": 437977121.0, + "step": 17530 + }, + { + "epoch": 1.9252141445200968, + "grad_norm": 2.4172303676605225, + "learning_rate": 1e-06, + "loss": 0.8868, + "mean_token_accuracy": 0.7210192680358887, + "num_tokens": 438001607.0, + "step": 17531 + }, + { + "epoch": 1.9253239622227103, + "grad_norm": 2.282881259918213, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.750746488571167, + "num_tokens": 438027097.0, + "step": 17532 + }, + { + "epoch": 1.9254337799253238, + "grad_norm": 1.9526368379592896, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.6987476944923401, + "num_tokens": 438057893.0, + "step": 17533 + }, + { + "epoch": 1.9255435976279376, + "grad_norm": 2.423624277114868, + "learning_rate": 1e-06, + "loss": 0.8111, + "mean_token_accuracy": 0.7363139986991882, + "num_tokens": 438078847.0, + "step": 17534 + }, + { + "epoch": 1.9256534153305513, + "grad_norm": 2.3477261066436768, + "learning_rate": 1e-06, + "loss": 0.9295, + "mean_token_accuracy": 0.7062498331069946, + "num_tokens": 438103815.0, + "step": 17535 + }, + { + "epoch": 1.925763233033165, + "grad_norm": 2.1988677978515625, + "learning_rate": 1e-06, + "loss": 0.9486, + "mean_token_accuracy": 0.7047803401947021, + "num_tokens": 438130444.0, + "step": 17536 + }, + { + "epoch": 1.9258730507357786, + "grad_norm": 2.172884464263916, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7222031950950623, + "num_tokens": 438158255.0, + "step": 17537 + }, + { + "epoch": 1.9259828684383922, + "grad_norm": 2.0997626781463623, + "learning_rate": 
1e-06, + "loss": 0.9653, + "mean_token_accuracy": 0.7010233402252197, + "num_tokens": 438188012.0, + "step": 17538 + }, + { + "epoch": 1.926092686141006, + "grad_norm": 2.169003963470459, + "learning_rate": 1e-06, + "loss": 0.8287, + "mean_token_accuracy": 0.7359499335289001, + "num_tokens": 438214024.0, + "step": 17539 + }, + { + "epoch": 1.9262025038436197, + "grad_norm": 2.256404161453247, + "learning_rate": 1e-06, + "loss": 0.8625, + "mean_token_accuracy": 0.7283242344856262, + "num_tokens": 438240519.0, + "step": 17540 + }, + { + "epoch": 1.9263123215462332, + "grad_norm": 2.355412721633911, + "learning_rate": 1e-06, + "loss": 0.871, + "mean_token_accuracy": 0.7216274738311768, + "num_tokens": 438263116.0, + "step": 17541 + }, + { + "epoch": 1.9264221392488468, + "grad_norm": 2.359388828277588, + "learning_rate": 1e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7397735118865967, + "num_tokens": 438288617.0, + "step": 17542 + }, + { + "epoch": 1.9265319569514605, + "grad_norm": 2.3027632236480713, + "learning_rate": 1e-06, + "loss": 0.9899, + "mean_token_accuracy": 0.6975507140159607, + "num_tokens": 438316704.0, + "step": 17543 + }, + { + "epoch": 1.9266417746540743, + "grad_norm": 2.4735050201416016, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.714610755443573, + "num_tokens": 438338753.0, + "step": 17544 + }, + { + "epoch": 1.926751592356688, + "grad_norm": 2.4487290382385254, + "learning_rate": 1e-06, + "loss": 0.8223, + "mean_token_accuracy": 0.7356192469596863, + "num_tokens": 438360157.0, + "step": 17545 + }, + { + "epoch": 1.9268614100593016, + "grad_norm": 2.038809299468994, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7128957509994507, + "num_tokens": 438390024.0, + "step": 17546 + }, + { + "epoch": 1.926971227761915, + "grad_norm": 2.4619126319885254, + "learning_rate": 1e-06, + "loss": 0.8593, + "mean_token_accuracy": 0.7382598519325256, + "num_tokens": 438412616.0, + "step": 17547 + }, + { + 
"epoch": 1.9270810454645289, + "grad_norm": 2.0565407276153564, + "learning_rate": 1e-06, + "loss": 1.0145, + "mean_token_accuracy": 0.6906407475471497, + "num_tokens": 438443246.0, + "step": 17548 + }, + { + "epoch": 1.9271908631671426, + "grad_norm": 2.253434658050537, + "learning_rate": 1e-06, + "loss": 0.9521, + "mean_token_accuracy": 0.7017068862915039, + "num_tokens": 438469692.0, + "step": 17549 + }, + { + "epoch": 1.9273006808697564, + "grad_norm": 2.344238042831421, + "learning_rate": 1e-06, + "loss": 0.8865, + "mean_token_accuracy": 0.7208216786384583, + "num_tokens": 438493868.0, + "step": 17550 + }, + { + "epoch": 1.92741049857237, + "grad_norm": 2.0234873294830322, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7227455377578735, + "num_tokens": 438525197.0, + "step": 17551 + }, + { + "epoch": 1.9275203162749834, + "grad_norm": 2.260345697402954, + "learning_rate": 1e-06, + "loss": 0.988, + "mean_token_accuracy": 0.691412091255188, + "num_tokens": 438551170.0, + "step": 17552 + }, + { + "epoch": 1.9276301339775972, + "grad_norm": 2.2487142086029053, + "learning_rate": 1e-06, + "loss": 0.8963, + "mean_token_accuracy": 0.7188898921012878, + "num_tokens": 438577049.0, + "step": 17553 + }, + { + "epoch": 1.927739951680211, + "grad_norm": 2.3857898712158203, + "learning_rate": 1e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.728585422039032, + "num_tokens": 438598715.0, + "step": 17554 + }, + { + "epoch": 1.9278497693828245, + "grad_norm": 2.1897671222686768, + "learning_rate": 1e-06, + "loss": 0.9951, + "mean_token_accuracy": 0.6987924575805664, + "num_tokens": 438624192.0, + "step": 17555 + }, + { + "epoch": 1.927959587085438, + "grad_norm": 2.3415279388427734, + "learning_rate": 1e-06, + "loss": 0.893, + "mean_token_accuracy": 0.7191013097763062, + "num_tokens": 438648708.0, + "step": 17556 + }, + { + "epoch": 1.9280694047880518, + "grad_norm": 2.21134877204895, + "learning_rate": 1e-06, + "loss": 0.9749, + 
"mean_token_accuracy": 0.7051085233688354, + "num_tokens": 438674646.0, + "step": 17557 + }, + { + "epoch": 1.9281792224906655, + "grad_norm": 2.26285982131958, + "learning_rate": 1e-06, + "loss": 0.8617, + "mean_token_accuracy": 0.7304343581199646, + "num_tokens": 438700878.0, + "step": 17558 + }, + { + "epoch": 1.9282890401932793, + "grad_norm": 2.1950035095214844, + "learning_rate": 1e-06, + "loss": 0.9413, + "mean_token_accuracy": 0.7088778018951416, + "num_tokens": 438727837.0, + "step": 17559 + }, + { + "epoch": 1.9283988578958928, + "grad_norm": 2.6217777729034424, + "learning_rate": 1e-06, + "loss": 0.8442, + "mean_token_accuracy": 0.7380913496017456, + "num_tokens": 438747275.0, + "step": 17560 + }, + { + "epoch": 1.9285086755985064, + "grad_norm": 2.065338373184204, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.6984142065048218, + "num_tokens": 438779283.0, + "step": 17561 + }, + { + "epoch": 1.9286184933011201, + "grad_norm": 2.0046324729919434, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6884419918060303, + "num_tokens": 438809067.0, + "step": 17562 + }, + { + "epoch": 1.9287283110037339, + "grad_norm": 2.235471487045288, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7103855609893799, + "num_tokens": 438836469.0, + "step": 17563 + }, + { + "epoch": 1.9288381287063474, + "grad_norm": 2.490722894668579, + "learning_rate": 1e-06, + "loss": 0.9451, + "mean_token_accuracy": 0.7216089963912964, + "num_tokens": 438860540.0, + "step": 17564 + }, + { + "epoch": 1.9289479464089612, + "grad_norm": 2.100623607635498, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7334436178207397, + "num_tokens": 438887841.0, + "step": 17565 + }, + { + "epoch": 1.9290577641115747, + "grad_norm": 2.227030038833618, + "learning_rate": 1e-06, + "loss": 0.8804, + "mean_token_accuracy": 0.7156224846839905, + "num_tokens": 438911188.0, + "step": 17566 + }, + { + "epoch": 
1.9291675818141885, + "grad_norm": 2.2399868965148926, + "learning_rate": 1e-06, + "loss": 0.969, + "mean_token_accuracy": 0.6987518072128296, + "num_tokens": 438937427.0, + "step": 17567 + }, + { + "epoch": 1.9292773995168022, + "grad_norm": 2.3837063312530518, + "learning_rate": 1e-06, + "loss": 0.9004, + "mean_token_accuracy": 0.7169198989868164, + "num_tokens": 438958618.0, + "step": 17568 + }, + { + "epoch": 1.9293872172194158, + "grad_norm": 2.3972625732421875, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7174468040466309, + "num_tokens": 438980173.0, + "step": 17569 + }, + { + "epoch": 1.9294970349220293, + "grad_norm": 2.34409236907959, + "learning_rate": 1e-06, + "loss": 0.9811, + "mean_token_accuracy": 0.7049437761306763, + "num_tokens": 439005426.0, + "step": 17570 + }, + { + "epoch": 1.929606852624643, + "grad_norm": 2.364603281021118, + "learning_rate": 1e-06, + "loss": 0.883, + "mean_token_accuracy": 0.7226226925849915, + "num_tokens": 439029788.0, + "step": 17571 + }, + { + "epoch": 1.9297166703272568, + "grad_norm": 2.3172991275787354, + "learning_rate": 1e-06, + "loss": 0.976, + "mean_token_accuracy": 0.7024494409561157, + "num_tokens": 439055711.0, + "step": 17572 + }, + { + "epoch": 1.9298264880298706, + "grad_norm": 2.4122869968414307, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7283197641372681, + "num_tokens": 439077804.0, + "step": 17573 + }, + { + "epoch": 1.929936305732484, + "grad_norm": 2.7863192558288574, + "learning_rate": 1e-06, + "loss": 0.8154, + "mean_token_accuracy": 0.7391690015792847, + "num_tokens": 439095417.0, + "step": 17574 + }, + { + "epoch": 1.9300461234350976, + "grad_norm": 2.699289321899414, + "learning_rate": 1e-06, + "loss": 0.9718, + "mean_token_accuracy": 0.7053203582763672, + "num_tokens": 439114071.0, + "step": 17575 + }, + { + "epoch": 1.9301559411377114, + "grad_norm": 2.4534788131713867, + "learning_rate": 1e-06, + "loss": 0.8497, + "mean_token_accuracy": 
0.7288620471954346, + "num_tokens": 439134962.0, + "step": 17576 + }, + { + "epoch": 1.9302657588403251, + "grad_norm": 2.592573881149292, + "learning_rate": 1e-06, + "loss": 0.894, + "mean_token_accuracy": 0.7245751023292542, + "num_tokens": 439155846.0, + "step": 17577 + }, + { + "epoch": 1.9303755765429387, + "grad_norm": 2.2244462966918945, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7333120107650757, + "num_tokens": 439181901.0, + "step": 17578 + }, + { + "epoch": 1.9304853942455524, + "grad_norm": 2.193577766418457, + "learning_rate": 1e-06, + "loss": 0.9153, + "mean_token_accuracy": 0.7145540714263916, + "num_tokens": 439207083.0, + "step": 17579 + }, + { + "epoch": 1.930595211948166, + "grad_norm": 2.6744067668914795, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7192473411560059, + "num_tokens": 439226623.0, + "step": 17580 + }, + { + "epoch": 1.9307050296507797, + "grad_norm": 2.1458232402801514, + "learning_rate": 1e-06, + "loss": 0.9302, + "mean_token_accuracy": 0.7069659233093262, + "num_tokens": 439254933.0, + "step": 17581 + }, + { + "epoch": 1.9308148473533935, + "grad_norm": 2.363748788833618, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7274153828620911, + "num_tokens": 439279101.0, + "step": 17582 + }, + { + "epoch": 1.930924665056007, + "grad_norm": 2.3471596240997314, + "learning_rate": 1e-06, + "loss": 0.9353, + "mean_token_accuracy": 0.7049147486686707, + "num_tokens": 439302956.0, + "step": 17583 + }, + { + "epoch": 1.9310344827586206, + "grad_norm": 2.558413505554199, + "learning_rate": 1e-06, + "loss": 0.878, + "mean_token_accuracy": 0.7258313298225403, + "num_tokens": 439323215.0, + "step": 17584 + }, + { + "epoch": 1.9311443004612343, + "grad_norm": 2.374652147293091, + "learning_rate": 1e-06, + "loss": 1.0006, + "mean_token_accuracy": 0.6867987513542175, + "num_tokens": 439347328.0, + "step": 17585 + }, + { + "epoch": 1.931254118163848, + "grad_norm": 
2.163188934326172, + "learning_rate": 1e-06, + "loss": 0.8253, + "mean_token_accuracy": 0.7402619123458862, + "num_tokens": 439373435.0, + "step": 17586 + }, + { + "epoch": 1.9313639358664618, + "grad_norm": 2.1920392513275146, + "learning_rate": 1e-06, + "loss": 0.9065, + "mean_token_accuracy": 0.7239102125167847, + "num_tokens": 439397823.0, + "step": 17587 + }, + { + "epoch": 1.9314737535690754, + "grad_norm": 2.366144895553589, + "learning_rate": 1e-06, + "loss": 0.9062, + "mean_token_accuracy": 0.719939112663269, + "num_tokens": 439421880.0, + "step": 17588 + }, + { + "epoch": 1.931583571271689, + "grad_norm": 2.564866065979004, + "learning_rate": 1e-06, + "loss": 0.8693, + "mean_token_accuracy": 0.7320733666419983, + "num_tokens": 439442465.0, + "step": 17589 + }, + { + "epoch": 1.9316933889743026, + "grad_norm": 2.3111019134521484, + "learning_rate": 1e-06, + "loss": 0.8796, + "mean_token_accuracy": 0.7256512641906738, + "num_tokens": 439467288.0, + "step": 17590 + }, + { + "epoch": 1.9318032066769164, + "grad_norm": 2.260481119155884, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7139054536819458, + "num_tokens": 439493148.0, + "step": 17591 + }, + { + "epoch": 1.93191302437953, + "grad_norm": 2.055041551589966, + "learning_rate": 1e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7255936861038208, + "num_tokens": 439523842.0, + "step": 17592 + }, + { + "epoch": 1.9320228420821435, + "grad_norm": 2.1470940113067627, + "learning_rate": 1e-06, + "loss": 0.8755, + "mean_token_accuracy": 0.7237362861633301, + "num_tokens": 439550175.0, + "step": 17593 + }, + { + "epoch": 1.9321326597847572, + "grad_norm": 2.72052001953125, + "learning_rate": 1e-06, + "loss": 0.8872, + "mean_token_accuracy": 0.7245661020278931, + "num_tokens": 439568703.0, + "step": 17594 + }, + { + "epoch": 1.932242477487371, + "grad_norm": 2.1727287769317627, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7154258489608765, + "num_tokens": 
439593934.0, + "step": 17595 + }, + { + "epoch": 1.9323522951899847, + "grad_norm": 1.9916423559188843, + "learning_rate": 1e-06, + "loss": 0.7989, + "mean_token_accuracy": 0.7450453042984009, + "num_tokens": 439623325.0, + "step": 17596 + }, + { + "epoch": 1.9324621128925983, + "grad_norm": 2.239635467529297, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.735362708568573, + "num_tokens": 439650183.0, + "step": 17597 + }, + { + "epoch": 1.9325719305952118, + "grad_norm": 2.3819878101348877, + "learning_rate": 1e-06, + "loss": 0.8036, + "mean_token_accuracy": 0.7478065490722656, + "num_tokens": 439671571.0, + "step": 17598 + }, + { + "epoch": 1.9326817482978256, + "grad_norm": 1.9789083003997803, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7144091129302979, + "num_tokens": 439704279.0, + "step": 17599 + }, + { + "epoch": 1.9327915660004393, + "grad_norm": 2.028498888015747, + "learning_rate": 1e-06, + "loss": 0.881, + "mean_token_accuracy": 0.7181284427642822, + "num_tokens": 439733191.0, + "step": 17600 + }, + { + "epoch": 1.932901383703053, + "grad_norm": 2.5171334743499756, + "learning_rate": 1e-06, + "loss": 0.9007, + "mean_token_accuracy": 0.7283740043640137, + "num_tokens": 439754279.0, + "step": 17601 + }, + { + "epoch": 1.9330112014056666, + "grad_norm": 2.249875545501709, + "learning_rate": 1e-06, + "loss": 0.8839, + "mean_token_accuracy": 0.7273654341697693, + "num_tokens": 439779701.0, + "step": 17602 + }, + { + "epoch": 1.9331210191082802, + "grad_norm": 2.1416921615600586, + "learning_rate": 1e-06, + "loss": 0.8381, + "mean_token_accuracy": 0.7341176271438599, + "num_tokens": 439808178.0, + "step": 17603 + }, + { + "epoch": 1.933230836810894, + "grad_norm": 2.4005463123321533, + "learning_rate": 1e-06, + "loss": 0.9474, + "mean_token_accuracy": 0.7132937908172607, + "num_tokens": 439832212.0, + "step": 17604 + }, + { + "epoch": 1.9333406545135077, + "grad_norm": 1.970229148864746, + "learning_rate": 
1e-06, + "loss": 0.9498, + "mean_token_accuracy": 0.7056882381439209, + "num_tokens": 439864990.0, + "step": 17605 + }, + { + "epoch": 1.9334504722161212, + "grad_norm": 2.372831344604492, + "learning_rate": 1e-06, + "loss": 0.9437, + "mean_token_accuracy": 0.7042936086654663, + "num_tokens": 439889188.0, + "step": 17606 + }, + { + "epoch": 1.9335602899187347, + "grad_norm": 2.2340571880340576, + "learning_rate": 1e-06, + "loss": 0.9619, + "mean_token_accuracy": 0.6992522478103638, + "num_tokens": 439917434.0, + "step": 17607 + }, + { + "epoch": 1.9336701076213485, + "grad_norm": 2.6195273399353027, + "learning_rate": 1e-06, + "loss": 0.8425, + "mean_token_accuracy": 0.734261691570282, + "num_tokens": 439937810.0, + "step": 17608 + }, + { + "epoch": 1.9337799253239623, + "grad_norm": 2.203758955001831, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.714745819568634, + "num_tokens": 439965033.0, + "step": 17609 + }, + { + "epoch": 1.933889743026576, + "grad_norm": 2.1294100284576416, + "learning_rate": 1e-06, + "loss": 0.926, + "mean_token_accuracy": 0.7097012996673584, + "num_tokens": 439994936.0, + "step": 17610 + }, + { + "epoch": 1.9339995607291895, + "grad_norm": 2.259748697280884, + "learning_rate": 1e-06, + "loss": 0.9116, + "mean_token_accuracy": 0.7102405428886414, + "num_tokens": 440022068.0, + "step": 17611 + }, + { + "epoch": 1.934109378431803, + "grad_norm": 2.397428274154663, + "learning_rate": 1e-06, + "loss": 0.8761, + "mean_token_accuracy": 0.725051999092102, + "num_tokens": 440043638.0, + "step": 17612 + }, + { + "epoch": 1.9342191961344168, + "grad_norm": 2.3557960987091064, + "learning_rate": 1e-06, + "loss": 0.8514, + "mean_token_accuracy": 0.7269631624221802, + "num_tokens": 440065002.0, + "step": 17613 + }, + { + "epoch": 1.9343290138370306, + "grad_norm": 2.3132312297821045, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7385843992233276, + "num_tokens": 440088158.0, + "step": 17614 + }, + { + 
"epoch": 1.9344388315396444, + "grad_norm": 2.1694793701171875, + "learning_rate": 1e-06, + "loss": 0.7963, + "mean_token_accuracy": 0.749280571937561, + "num_tokens": 440113224.0, + "step": 17615 + }, + { + "epoch": 1.934548649242258, + "grad_norm": 2.389087438583374, + "learning_rate": 1e-06, + "loss": 0.8328, + "mean_token_accuracy": 0.739021897315979, + "num_tokens": 440135102.0, + "step": 17616 + }, + { + "epoch": 1.9346584669448714, + "grad_norm": 2.1115622520446777, + "learning_rate": 1e-06, + "loss": 0.9426, + "mean_token_accuracy": 0.7110260725021362, + "num_tokens": 440163155.0, + "step": 17617 + }, + { + "epoch": 1.9347682846474852, + "grad_norm": 2.2593131065368652, + "learning_rate": 1e-06, + "loss": 0.9322, + "mean_token_accuracy": 0.713295578956604, + "num_tokens": 440190073.0, + "step": 17618 + }, + { + "epoch": 1.934878102350099, + "grad_norm": 2.2156121730804443, + "learning_rate": 1e-06, + "loss": 0.927, + "mean_token_accuracy": 0.7189416885375977, + "num_tokens": 440216824.0, + "step": 17619 + }, + { + "epoch": 1.9349879200527125, + "grad_norm": 2.4518067836761475, + "learning_rate": 1e-06, + "loss": 0.8295, + "mean_token_accuracy": 0.7326747179031372, + "num_tokens": 440237817.0, + "step": 17620 + }, + { + "epoch": 1.935097737755326, + "grad_norm": 2.289762496948242, + "learning_rate": 1e-06, + "loss": 0.9424, + "mean_token_accuracy": 0.7110105752944946, + "num_tokens": 440264390.0, + "step": 17621 + }, + { + "epoch": 1.9352075554579398, + "grad_norm": 2.1476266384124756, + "learning_rate": 1e-06, + "loss": 0.9959, + "mean_token_accuracy": 0.7025116682052612, + "num_tokens": 440292801.0, + "step": 17622 + }, + { + "epoch": 1.9353173731605535, + "grad_norm": 2.460052728652954, + "learning_rate": 1e-06, + "loss": 0.8665, + "mean_token_accuracy": 0.7215596437454224, + "num_tokens": 440313458.0, + "step": 17623 + }, + { + "epoch": 1.9354271908631673, + "grad_norm": 2.5986766815185547, + "learning_rate": 1e-06, + "loss": 0.8401, + 
"mean_token_accuracy": 0.7328019142150879, + "num_tokens": 440333241.0, + "step": 17624 + }, + { + "epoch": 1.9355370085657808, + "grad_norm": 2.148146152496338, + "learning_rate": 1e-06, + "loss": 0.9342, + "mean_token_accuracy": 0.7116072773933411, + "num_tokens": 440360216.0, + "step": 17625 + }, + { + "epoch": 1.9356468262683943, + "grad_norm": 2.2607572078704834, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7345702648162842, + "num_tokens": 440384391.0, + "step": 17626 + }, + { + "epoch": 1.935756643971008, + "grad_norm": 2.2395410537719727, + "learning_rate": 1e-06, + "loss": 0.9648, + "mean_token_accuracy": 0.697169303894043, + "num_tokens": 440410290.0, + "step": 17627 + }, + { + "epoch": 1.9358664616736219, + "grad_norm": 2.1767592430114746, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7130275964736938, + "num_tokens": 440436758.0, + "step": 17628 + }, + { + "epoch": 1.9359762793762354, + "grad_norm": 2.1932947635650635, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.726356565952301, + "num_tokens": 440461626.0, + "step": 17629 + }, + { + "epoch": 1.9360860970788492, + "grad_norm": 2.3759939670562744, + "learning_rate": 1e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7275400757789612, + "num_tokens": 440485750.0, + "step": 17630 + }, + { + "epoch": 1.9361959147814627, + "grad_norm": 2.000232219696045, + "learning_rate": 1e-06, + "loss": 0.9644, + "mean_token_accuracy": 0.6991612911224365, + "num_tokens": 440517650.0, + "step": 17631 + }, + { + "epoch": 1.9363057324840764, + "grad_norm": 2.578211545944214, + "learning_rate": 1e-06, + "loss": 0.9429, + "mean_token_accuracy": 0.7111613750457764, + "num_tokens": 440539704.0, + "step": 17632 + }, + { + "epoch": 1.9364155501866902, + "grad_norm": 2.27715802192688, + "learning_rate": 1e-06, + "loss": 0.7789, + "mean_token_accuracy": 0.7538779377937317, + "num_tokens": 440561545.0, + "step": 17633 + }, + { + "epoch": 
1.9365253678893037, + "grad_norm": 2.0079922676086426, + "learning_rate": 1e-06, + "loss": 0.8789, + "mean_token_accuracy": 0.7235552668571472, + "num_tokens": 440594346.0, + "step": 17634 + }, + { + "epoch": 1.9366351855919173, + "grad_norm": 1.8814613819122314, + "learning_rate": 1e-06, + "loss": 0.9015, + "mean_token_accuracy": 0.7126844525337219, + "num_tokens": 440626087.0, + "step": 17635 + }, + { + "epoch": 1.936745003294531, + "grad_norm": 2.1026456356048584, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7177980542182922, + "num_tokens": 440652486.0, + "step": 17636 + }, + { + "epoch": 1.9368548209971448, + "grad_norm": 2.4188730716705322, + "learning_rate": 1e-06, + "loss": 0.8915, + "mean_token_accuracy": 0.7202160358428955, + "num_tokens": 440673828.0, + "step": 17637 + }, + { + "epoch": 1.9369646386997585, + "grad_norm": 2.5491015911102295, + "learning_rate": 1e-06, + "loss": 0.7961, + "mean_token_accuracy": 0.7469748854637146, + "num_tokens": 440692881.0, + "step": 17638 + }, + { + "epoch": 1.937074456402372, + "grad_norm": 2.045644998550415, + "learning_rate": 1e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7382770776748657, + "num_tokens": 440718872.0, + "step": 17639 + }, + { + "epoch": 1.9371842741049856, + "grad_norm": 2.377164125442505, + "learning_rate": 1e-06, + "loss": 0.9713, + "mean_token_accuracy": 0.7008660435676575, + "num_tokens": 440744388.0, + "step": 17640 + }, + { + "epoch": 1.9372940918075994, + "grad_norm": 2.2716360092163086, + "learning_rate": 1e-06, + "loss": 1.0019, + "mean_token_accuracy": 0.6966572999954224, + "num_tokens": 440771666.0, + "step": 17641 + }, + { + "epoch": 1.9374039095102131, + "grad_norm": 2.1614696979522705, + "learning_rate": 1e-06, + "loss": 0.9738, + "mean_token_accuracy": 0.706241250038147, + "num_tokens": 440797228.0, + "step": 17642 + }, + { + "epoch": 1.9375137272128267, + "grad_norm": 2.4063050746917725, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 
0.6992943286895752, + "num_tokens": 440820502.0, + "step": 17643 + }, + { + "epoch": 1.9376235449154404, + "grad_norm": 2.267324209213257, + "learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7217962741851807, + "num_tokens": 440844999.0, + "step": 17644 + }, + { + "epoch": 1.937733362618054, + "grad_norm": 2.233903646469116, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7296366691589355, + "num_tokens": 440869605.0, + "step": 17645 + }, + { + "epoch": 1.9378431803206677, + "grad_norm": 1.8949284553527832, + "learning_rate": 1e-06, + "loss": 0.9074, + "mean_token_accuracy": 0.7129192352294922, + "num_tokens": 440902711.0, + "step": 17646 + }, + { + "epoch": 1.9379529980232815, + "grad_norm": 2.1597635746002197, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7314241528511047, + "num_tokens": 440929561.0, + "step": 17647 + }, + { + "epoch": 1.938062815725895, + "grad_norm": 2.255190134048462, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7192705869674683, + "num_tokens": 440955137.0, + "step": 17648 + }, + { + "epoch": 1.9381726334285085, + "grad_norm": 2.315598964691162, + "learning_rate": 1e-06, + "loss": 0.8971, + "mean_token_accuracy": 0.7208136320114136, + "num_tokens": 440978443.0, + "step": 17649 + }, + { + "epoch": 1.9382824511311223, + "grad_norm": 2.272144079208374, + "learning_rate": 1e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7360673546791077, + "num_tokens": 441002490.0, + "step": 17650 + }, + { + "epoch": 1.938392268833736, + "grad_norm": 2.3498215675354004, + "learning_rate": 1e-06, + "loss": 0.8993, + "mean_token_accuracy": 0.7197938561439514, + "num_tokens": 441023845.0, + "step": 17651 + }, + { + "epoch": 1.9385020865363498, + "grad_norm": 2.451486825942993, + "learning_rate": 1e-06, + "loss": 0.9145, + "mean_token_accuracy": 0.7252976298332214, + "num_tokens": 441047263.0, + "step": 17652 + }, + { + "epoch": 1.9386119042389633, + "grad_norm": 
2.3871140480041504, + "learning_rate": 1e-06, + "loss": 0.9333, + "mean_token_accuracy": 0.71490877866745, + "num_tokens": 441070408.0, + "step": 17653 + }, + { + "epoch": 1.9387217219415769, + "grad_norm": 2.248997926712036, + "learning_rate": 1e-06, + "loss": 0.8521, + "mean_token_accuracy": 0.7337634563446045, + "num_tokens": 441094598.0, + "step": 17654 + }, + { + "epoch": 1.9388315396441906, + "grad_norm": 2.204364776611328, + "learning_rate": 1e-06, + "loss": 0.9808, + "mean_token_accuracy": 0.702042818069458, + "num_tokens": 441121696.0, + "step": 17655 + }, + { + "epoch": 1.9389413573468044, + "grad_norm": 2.594923496246338, + "learning_rate": 1e-06, + "loss": 0.8261, + "mean_token_accuracy": 0.7396910190582275, + "num_tokens": 441141746.0, + "step": 17656 + }, + { + "epoch": 1.939051175049418, + "grad_norm": 2.293311595916748, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.7020449638366699, + "num_tokens": 441166143.0, + "step": 17657 + }, + { + "epoch": 1.9391609927520315, + "grad_norm": 2.511136531829834, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7109895944595337, + "num_tokens": 441187487.0, + "step": 17658 + }, + { + "epoch": 1.9392708104546452, + "grad_norm": 1.8668452501296997, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7228105068206787, + "num_tokens": 441219450.0, + "step": 17659 + }, + { + "epoch": 1.939380628157259, + "grad_norm": 2.1110329627990723, + "learning_rate": 1e-06, + "loss": 1.0208, + "mean_token_accuracy": 0.6978632807731628, + "num_tokens": 441248369.0, + "step": 17660 + }, + { + "epoch": 1.9394904458598727, + "grad_norm": 2.3078670501708984, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7423100471496582, + "num_tokens": 441271104.0, + "step": 17661 + }, + { + "epoch": 1.9396002635624863, + "grad_norm": 2.51338529586792, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7217836976051331, + "num_tokens": 
441291880.0, + "step": 17662 + }, + { + "epoch": 1.9397100812650998, + "grad_norm": 2.480753183364868, + "learning_rate": 1e-06, + "loss": 0.7092, + "mean_token_accuracy": 0.7697346806526184, + "num_tokens": 441310736.0, + "step": 17663 + }, + { + "epoch": 1.9398198989677136, + "grad_norm": 2.496311902999878, + "learning_rate": 1e-06, + "loss": 0.7166, + "mean_token_accuracy": 0.7629003524780273, + "num_tokens": 441329189.0, + "step": 17664 + }, + { + "epoch": 1.9399297166703273, + "grad_norm": 2.342489719390869, + "learning_rate": 1e-06, + "loss": 0.9487, + "mean_token_accuracy": 0.7035632133483887, + "num_tokens": 441353199.0, + "step": 17665 + }, + { + "epoch": 1.940039534372941, + "grad_norm": 2.085178852081299, + "learning_rate": 1e-06, + "loss": 0.877, + "mean_token_accuracy": 0.7359962463378906, + "num_tokens": 441380965.0, + "step": 17666 + }, + { + "epoch": 1.9401493520755546, + "grad_norm": 2.280104160308838, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7133708596229553, + "num_tokens": 441406236.0, + "step": 17667 + }, + { + "epoch": 1.9402591697781681, + "grad_norm": 2.461801767349243, + "learning_rate": 1e-06, + "loss": 0.9126, + "mean_token_accuracy": 0.7224553227424622, + "num_tokens": 441428680.0, + "step": 17668 + }, + { + "epoch": 1.940368987480782, + "grad_norm": 2.4957172870635986, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7427353858947754, + "num_tokens": 441450859.0, + "step": 17669 + }, + { + "epoch": 1.9404788051833957, + "grad_norm": 2.842261791229248, + "learning_rate": 1e-06, + "loss": 0.8729, + "mean_token_accuracy": 0.7244951725006104, + "num_tokens": 441470444.0, + "step": 17670 + }, + { + "epoch": 1.9405886228860092, + "grad_norm": 2.168684959411621, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7069183588027954, + "num_tokens": 441497269.0, + "step": 17671 + }, + { + "epoch": 1.9406984405886227, + "grad_norm": 2.3467066287994385, + "learning_rate": 1e-06, 
+ "loss": 0.8808, + "mean_token_accuracy": 0.7199589610099792, + "num_tokens": 441519599.0, + "step": 17672 + }, + { + "epoch": 1.9408082582912365, + "grad_norm": 2.27559757232666, + "learning_rate": 1e-06, + "loss": 0.9236, + "mean_token_accuracy": 0.7093192934989929, + "num_tokens": 441543891.0, + "step": 17673 + }, + { + "epoch": 1.9409180759938502, + "grad_norm": 2.172952890396118, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7073776721954346, + "num_tokens": 441570644.0, + "step": 17674 + }, + { + "epoch": 1.941027893696464, + "grad_norm": 2.1267282962799072, + "learning_rate": 1e-06, + "loss": 0.9502, + "mean_token_accuracy": 0.7039796113967896, + "num_tokens": 441598835.0, + "step": 17675 + }, + { + "epoch": 1.9411377113990775, + "grad_norm": 2.3643696308135986, + "learning_rate": 1e-06, + "loss": 0.8687, + "mean_token_accuracy": 0.7264021635055542, + "num_tokens": 441620471.0, + "step": 17676 + }, + { + "epoch": 1.941247529101691, + "grad_norm": 2.1767544746398926, + "learning_rate": 1e-06, + "loss": 0.8918, + "mean_token_accuracy": 0.7221999168395996, + "num_tokens": 441646480.0, + "step": 17677 + }, + { + "epoch": 1.9413573468043048, + "grad_norm": 2.1842899322509766, + "learning_rate": 1e-06, + "loss": 0.9912, + "mean_token_accuracy": 0.6923825740814209, + "num_tokens": 441675282.0, + "step": 17678 + }, + { + "epoch": 1.9414671645069186, + "grad_norm": 2.595787286758423, + "learning_rate": 1e-06, + "loss": 0.7558, + "mean_token_accuracy": 0.7556664347648621, + "num_tokens": 441692413.0, + "step": 17679 + }, + { + "epoch": 1.9415769822095323, + "grad_norm": 2.1963326930999756, + "learning_rate": 1e-06, + "loss": 0.9142, + "mean_token_accuracy": 0.7292921543121338, + "num_tokens": 441717689.0, + "step": 17680 + }, + { + "epoch": 1.9416867999121459, + "grad_norm": 2.493543863296509, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.728168249130249, + "num_tokens": 441738367.0, + "step": 17681 + }, + { + "epoch": 
1.9417966176147594, + "grad_norm": 2.4165256023406982, + "learning_rate": 1e-06, + "loss": 0.8949, + "mean_token_accuracy": 0.7331607341766357, + "num_tokens": 441758962.0, + "step": 17682 + }, + { + "epoch": 1.9419064353173732, + "grad_norm": 2.425816297531128, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7158071994781494, + "num_tokens": 441788716.0, + "step": 17683 + }, + { + "epoch": 1.942016253019987, + "grad_norm": 2.3752613067626953, + "learning_rate": 1e-06, + "loss": 0.8723, + "mean_token_accuracy": 0.7303537130355835, + "num_tokens": 441811213.0, + "step": 17684 + }, + { + "epoch": 1.9421260707226005, + "grad_norm": 2.196849822998047, + "learning_rate": 1e-06, + "loss": 0.8072, + "mean_token_accuracy": 0.7507531642913818, + "num_tokens": 441836225.0, + "step": 17685 + }, + { + "epoch": 1.942235888425214, + "grad_norm": 2.246626615524292, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7306596040725708, + "num_tokens": 441863551.0, + "step": 17686 + }, + { + "epoch": 1.9423457061278278, + "grad_norm": 2.439866542816162, + "learning_rate": 1e-06, + "loss": 0.895, + "mean_token_accuracy": 0.7175767421722412, + "num_tokens": 441886789.0, + "step": 17687 + }, + { + "epoch": 1.9424555238304415, + "grad_norm": 2.035212278366089, + "learning_rate": 1e-06, + "loss": 0.9288, + "mean_token_accuracy": 0.709754467010498, + "num_tokens": 441916358.0, + "step": 17688 + }, + { + "epoch": 1.9425653415330553, + "grad_norm": 2.648033380508423, + "learning_rate": 1e-06, + "loss": 0.8776, + "mean_token_accuracy": 0.7285280227661133, + "num_tokens": 441935764.0, + "step": 17689 + }, + { + "epoch": 1.9426751592356688, + "grad_norm": 2.0565481185913086, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7232905030250549, + "num_tokens": 441965763.0, + "step": 17690 + }, + { + "epoch": 1.9427849769382823, + "grad_norm": 2.528003215789795, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 
0.733985185623169, + "num_tokens": 441988347.0, + "step": 17691 + }, + { + "epoch": 1.942894794640896, + "grad_norm": 2.019150733947754, + "learning_rate": 1e-06, + "loss": 0.8884, + "mean_token_accuracy": 0.7223860621452332, + "num_tokens": 442019723.0, + "step": 17692 + }, + { + "epoch": 1.9430046123435099, + "grad_norm": 2.0881361961364746, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7106107473373413, + "num_tokens": 442049007.0, + "step": 17693 + }, + { + "epoch": 1.9431144300461234, + "grad_norm": 1.9392387866973877, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7272087335586548, + "num_tokens": 442080906.0, + "step": 17694 + }, + { + "epoch": 1.9432242477487371, + "grad_norm": 2.3668460845947266, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7304166555404663, + "num_tokens": 442101891.0, + "step": 17695 + }, + { + "epoch": 1.9433340654513507, + "grad_norm": 2.794757127761841, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.719099223613739, + "num_tokens": 442119203.0, + "step": 17696 + }, + { + "epoch": 1.9434438831539644, + "grad_norm": 2.338881492614746, + "learning_rate": 1e-06, + "loss": 0.8864, + "mean_token_accuracy": 0.7223737239837646, + "num_tokens": 442143580.0, + "step": 17697 + }, + { + "epoch": 1.9435537008565782, + "grad_norm": 2.277095079421997, + "learning_rate": 1e-06, + "loss": 0.9093, + "mean_token_accuracy": 0.7135220766067505, + "num_tokens": 442169609.0, + "step": 17698 + }, + { + "epoch": 1.9436635185591917, + "grad_norm": 2.2035269737243652, + "learning_rate": 1e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.742660641670227, + "num_tokens": 442193156.0, + "step": 17699 + }, + { + "epoch": 1.9437733362618053, + "grad_norm": 2.065369129180908, + "learning_rate": 1e-06, + "loss": 0.9409, + "mean_token_accuracy": 0.704927384853363, + "num_tokens": 442223385.0, + "step": 17700 + }, + { + "epoch": 1.943883153964419, + "grad_norm": 
2.3296782970428467, + "learning_rate": 1e-06, + "loss": 0.9152, + "mean_token_accuracy": 0.720130443572998, + "num_tokens": 442246062.0, + "step": 17701 + }, + { + "epoch": 1.9439929716670328, + "grad_norm": 2.0740582942962646, + "learning_rate": 1e-06, + "loss": 0.7741, + "mean_token_accuracy": 0.7500540018081665, + "num_tokens": 442272453.0, + "step": 17702 + }, + { + "epoch": 1.9441027893696465, + "grad_norm": 2.1031172275543213, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7091377973556519, + "num_tokens": 442300923.0, + "step": 17703 + }, + { + "epoch": 1.94421260707226, + "grad_norm": 2.28409481048584, + "learning_rate": 1e-06, + "loss": 0.9464, + "mean_token_accuracy": 0.7074976563453674, + "num_tokens": 442327609.0, + "step": 17704 + }, + { + "epoch": 1.9443224247748736, + "grad_norm": 2.2139084339141846, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7198309898376465, + "num_tokens": 442352424.0, + "step": 17705 + }, + { + "epoch": 1.9444322424774874, + "grad_norm": 2.581258773803711, + "learning_rate": 1e-06, + "loss": 0.8464, + "mean_token_accuracy": 0.7330939173698425, + "num_tokens": 442371412.0, + "step": 17706 + }, + { + "epoch": 1.9445420601801011, + "grad_norm": 2.670966148376465, + "learning_rate": 1e-06, + "loss": 0.8454, + "mean_token_accuracy": 0.7318928241729736, + "num_tokens": 442390862.0, + "step": 17707 + }, + { + "epoch": 1.9446518778827147, + "grad_norm": 2.2399027347564697, + "learning_rate": 1e-06, + "loss": 0.9727, + "mean_token_accuracy": 0.7053169012069702, + "num_tokens": 442418515.0, + "step": 17708 + }, + { + "epoch": 1.9447616955853284, + "grad_norm": 2.4174964427948, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7330334186553955, + "num_tokens": 442439200.0, + "step": 17709 + }, + { + "epoch": 1.944871513287942, + "grad_norm": 2.245479106903076, + "learning_rate": 1e-06, + "loss": 0.9543, + "mean_token_accuracy": 0.7110147476196289, + "num_tokens": 
442464531.0, + "step": 17710 + }, + { + "epoch": 1.9449813309905557, + "grad_norm": 2.2693934440612793, + "learning_rate": 1e-06, + "loss": 0.9519, + "mean_token_accuracy": 0.7201317548751831, + "num_tokens": 442489193.0, + "step": 17711 + }, + { + "epoch": 1.9450911486931695, + "grad_norm": 2.2942848205566406, + "learning_rate": 1e-06, + "loss": 0.9148, + "mean_token_accuracy": 0.7198499441146851, + "num_tokens": 442513978.0, + "step": 17712 + }, + { + "epoch": 1.945200966395783, + "grad_norm": 2.2788658142089844, + "learning_rate": 1e-06, + "loss": 0.9769, + "mean_token_accuracy": 0.7052149772644043, + "num_tokens": 442539370.0, + "step": 17713 + }, + { + "epoch": 1.9453107840983965, + "grad_norm": 1.914717435836792, + "learning_rate": 1e-06, + "loss": 0.9345, + "mean_token_accuracy": 0.7163134813308716, + "num_tokens": 442570939.0, + "step": 17714 + }, + { + "epoch": 1.9454206018010103, + "grad_norm": 2.2731456756591797, + "learning_rate": 1e-06, + "loss": 0.9407, + "mean_token_accuracy": 0.7143386602401733, + "num_tokens": 442594418.0, + "step": 17715 + }, + { + "epoch": 1.945530419503624, + "grad_norm": 2.5370380878448486, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7203371524810791, + "num_tokens": 442615744.0, + "step": 17716 + }, + { + "epoch": 1.9456402372062378, + "grad_norm": 2.0251872539520264, + "learning_rate": 1e-06, + "loss": 0.9432, + "mean_token_accuracy": 0.7079832553863525, + "num_tokens": 442646675.0, + "step": 17717 + }, + { + "epoch": 1.9457500549088513, + "grad_norm": 2.6837711334228516, + "learning_rate": 1e-06, + "loss": 0.8292, + "mean_token_accuracy": 0.7343896627426147, + "num_tokens": 442665732.0, + "step": 17718 + }, + { + "epoch": 1.9458598726114649, + "grad_norm": 2.0676491260528564, + "learning_rate": 1e-06, + "loss": 0.9301, + "mean_token_accuracy": 0.711183488368988, + "num_tokens": 442694875.0, + "step": 17719 + }, + { + "epoch": 1.9459696903140786, + "grad_norm": 2.638497829437256, + "learning_rate": 
1e-06, + "loss": 0.8399, + "mean_token_accuracy": 0.731547474861145, + "num_tokens": 442713595.0, + "step": 17720 + }, + { + "epoch": 1.9460795080166924, + "grad_norm": 2.4273836612701416, + "learning_rate": 1e-06, + "loss": 0.9063, + "mean_token_accuracy": 0.7136194109916687, + "num_tokens": 442736979.0, + "step": 17721 + }, + { + "epoch": 1.946189325719306, + "grad_norm": 1.9542028903961182, + "learning_rate": 1e-06, + "loss": 0.8933, + "mean_token_accuracy": 0.7206101417541504, + "num_tokens": 442767485.0, + "step": 17722 + }, + { + "epoch": 1.9462991434219195, + "grad_norm": 2.7371199131011963, + "learning_rate": 1e-06, + "loss": 0.8316, + "mean_token_accuracy": 0.7340713739395142, + "num_tokens": 442785647.0, + "step": 17723 + }, + { + "epoch": 1.9464089611245332, + "grad_norm": 2.1948401927948, + "learning_rate": 1e-06, + "loss": 0.8189, + "mean_token_accuracy": 0.7421150207519531, + "num_tokens": 442812168.0, + "step": 17724 + }, + { + "epoch": 1.946518778827147, + "grad_norm": 2.509521007537842, + "learning_rate": 1e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7378072738647461, + "num_tokens": 442833576.0, + "step": 17725 + }, + { + "epoch": 1.9466285965297607, + "grad_norm": 2.4718525409698486, + "learning_rate": 1e-06, + "loss": 0.8825, + "mean_token_accuracy": 0.7231032252311707, + "num_tokens": 442854917.0, + "step": 17726 + }, + { + "epoch": 1.9467384142323743, + "grad_norm": 2.1481971740722656, + "learning_rate": 1e-06, + "loss": 0.8013, + "mean_token_accuracy": 0.7430130243301392, + "num_tokens": 442879787.0, + "step": 17727 + }, + { + "epoch": 1.9468482319349878, + "grad_norm": 2.3098385334014893, + "learning_rate": 1e-06, + "loss": 0.8808, + "mean_token_accuracy": 0.7168205380439758, + "num_tokens": 442903821.0, + "step": 17728 + }, + { + "epoch": 1.9469580496376016, + "grad_norm": 2.2199363708496094, + "learning_rate": 1e-06, + "loss": 0.9118, + "mean_token_accuracy": 0.735848069190979, + "num_tokens": 442928840.0, + "step": 17729 + }, + { + 
"epoch": 1.9470678673402153, + "grad_norm": 2.964202880859375, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7333552837371826, + "num_tokens": 442945023.0, + "step": 17730 + }, + { + "epoch": 1.947177685042829, + "grad_norm": 2.4073150157928467, + "learning_rate": 1e-06, + "loss": 0.9416, + "mean_token_accuracy": 0.7039631605148315, + "num_tokens": 442969431.0, + "step": 17731 + }, + { + "epoch": 1.9472875027454426, + "grad_norm": 2.477208137512207, + "learning_rate": 1e-06, + "loss": 0.8515, + "mean_token_accuracy": 0.7283525466918945, + "num_tokens": 442991098.0, + "step": 17732 + }, + { + "epoch": 1.9473973204480561, + "grad_norm": 2.324648141860962, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7232018113136292, + "num_tokens": 443016156.0, + "step": 17733 + }, + { + "epoch": 1.94750713815067, + "grad_norm": 2.436229705810547, + "learning_rate": 1e-06, + "loss": 0.7245, + "mean_token_accuracy": 0.7675032615661621, + "num_tokens": 443036930.0, + "step": 17734 + }, + { + "epoch": 1.9476169558532836, + "grad_norm": 2.5889077186584473, + "learning_rate": 1e-06, + "loss": 0.7313, + "mean_token_accuracy": 0.7611562013626099, + "num_tokens": 443054895.0, + "step": 17735 + }, + { + "epoch": 1.9477267735558972, + "grad_norm": 2.2618987560272217, + "learning_rate": 1e-06, + "loss": 0.916, + "mean_token_accuracy": 0.7135782241821289, + "num_tokens": 443079699.0, + "step": 17736 + }, + { + "epoch": 1.9478365912585107, + "grad_norm": 2.642195701599121, + "learning_rate": 1e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7258968353271484, + "num_tokens": 443099641.0, + "step": 17737 + }, + { + "epoch": 1.9479464089611245, + "grad_norm": 2.125661849975586, + "learning_rate": 1e-06, + "loss": 0.9704, + "mean_token_accuracy": 0.7052533626556396, + "num_tokens": 443126871.0, + "step": 17738 + }, + { + "epoch": 1.9480562266637382, + "grad_norm": 2.328758716583252, + "learning_rate": 1e-06, + "loss": 0.9045, + 
"mean_token_accuracy": 0.7232764959335327, + "num_tokens": 443151835.0, + "step": 17739 + }, + { + "epoch": 1.948166044366352, + "grad_norm": 2.4741108417510986, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7102071046829224, + "num_tokens": 443173831.0, + "step": 17740 + }, + { + "epoch": 1.9482758620689655, + "grad_norm": 2.1568076610565186, + "learning_rate": 1e-06, + "loss": 0.8472, + "mean_token_accuracy": 0.7343435287475586, + "num_tokens": 443198620.0, + "step": 17741 + }, + { + "epoch": 1.948385679771579, + "grad_norm": 2.1399312019348145, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.718257486820221, + "num_tokens": 443224508.0, + "step": 17742 + }, + { + "epoch": 1.9484954974741928, + "grad_norm": 2.4122440814971924, + "learning_rate": 1e-06, + "loss": 0.8298, + "mean_token_accuracy": 0.7399609684944153, + "num_tokens": 443246480.0, + "step": 17743 + }, + { + "epoch": 1.9486053151768066, + "grad_norm": 2.3211653232574463, + "learning_rate": 1e-06, + "loss": 0.8285, + "mean_token_accuracy": 0.7377579212188721, + "num_tokens": 443270496.0, + "step": 17744 + }, + { + "epoch": 1.94871513287942, + "grad_norm": 2.2085020542144775, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.737419843673706, + "num_tokens": 443294600.0, + "step": 17745 + }, + { + "epoch": 1.9488249505820339, + "grad_norm": 2.1433653831481934, + "learning_rate": 1e-06, + "loss": 0.8889, + "mean_token_accuracy": 0.7279859781265259, + "num_tokens": 443321889.0, + "step": 17746 + }, + { + "epoch": 1.9489347682846474, + "grad_norm": 2.114480972290039, + "learning_rate": 1e-06, + "loss": 0.8773, + "mean_token_accuracy": 0.7279041409492493, + "num_tokens": 443349030.0, + "step": 17747 + }, + { + "epoch": 1.9490445859872612, + "grad_norm": 2.3112590312957764, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7230544090270996, + "num_tokens": 443372853.0, + "step": 17748 + }, + { + "epoch": 1.949154403689875, 
+ "grad_norm": 2.5999865531921387, + "learning_rate": 1e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7347330451011658, + "num_tokens": 443392327.0, + "step": 17749 + }, + { + "epoch": 1.9492642213924884, + "grad_norm": 1.9084917306900024, + "learning_rate": 1e-06, + "loss": 0.8379, + "mean_token_accuracy": 0.7366262674331665, + "num_tokens": 443424865.0, + "step": 17750 + }, + { + "epoch": 1.949374039095102, + "grad_norm": 2.2244701385498047, + "learning_rate": 1e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7232370376586914, + "num_tokens": 443452264.0, + "step": 17751 + }, + { + "epoch": 1.9494838567977157, + "grad_norm": 2.651958465576172, + "learning_rate": 1e-06, + "loss": 0.805, + "mean_token_accuracy": 0.7379469275474548, + "num_tokens": 443470906.0, + "step": 17752 + }, + { + "epoch": 1.9495936745003295, + "grad_norm": 2.290278911590576, + "learning_rate": 1e-06, + "loss": 0.9597, + "mean_token_accuracy": 0.6997879147529602, + "num_tokens": 443497031.0, + "step": 17753 + }, + { + "epoch": 1.9497034922029433, + "grad_norm": 2.4506888389587402, + "learning_rate": 1e-06, + "loss": 0.8155, + "mean_token_accuracy": 0.7416646480560303, + "num_tokens": 443517149.0, + "step": 17754 + }, + { + "epoch": 1.9498133099055568, + "grad_norm": 2.163362979888916, + "learning_rate": 1e-06, + "loss": 0.9177, + "mean_token_accuracy": 0.7164928317070007, + "num_tokens": 443542186.0, + "step": 17755 + }, + { + "epoch": 1.9499231276081703, + "grad_norm": 1.9462517499923706, + "learning_rate": 1e-06, + "loss": 0.9307, + "mean_token_accuracy": 0.7090846300125122, + "num_tokens": 443574648.0, + "step": 17756 + }, + { + "epoch": 1.950032945310784, + "grad_norm": 2.547210216522217, + "learning_rate": 1e-06, + "loss": 0.8083, + "mean_token_accuracy": 0.7449656128883362, + "num_tokens": 443593556.0, + "step": 17757 + }, + { + "epoch": 1.9501427630133978, + "grad_norm": 2.043659210205078, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.725448727607727, + 
"num_tokens": 443623405.0, + "step": 17758 + }, + { + "epoch": 1.9502525807160114, + "grad_norm": 2.3342626094818115, + "learning_rate": 1e-06, + "loss": 0.9341, + "mean_token_accuracy": 0.7082923650741577, + "num_tokens": 443646240.0, + "step": 17759 + }, + { + "epoch": 1.9503623984186251, + "grad_norm": 2.2874372005462646, + "learning_rate": 1e-06, + "loss": 0.9986, + "mean_token_accuracy": 0.6903805732727051, + "num_tokens": 443672391.0, + "step": 17760 + }, + { + "epoch": 1.9504722161212387, + "grad_norm": 2.4611642360687256, + "learning_rate": 1e-06, + "loss": 0.8148, + "mean_token_accuracy": 0.7366498708724976, + "num_tokens": 443693859.0, + "step": 17761 + }, + { + "epoch": 1.9505820338238524, + "grad_norm": 2.2464194297790527, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7041809558868408, + "num_tokens": 443718741.0, + "step": 17762 + }, + { + "epoch": 1.9506918515264662, + "grad_norm": 1.937145471572876, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7067834734916687, + "num_tokens": 443751204.0, + "step": 17763 + }, + { + "epoch": 1.9508016692290797, + "grad_norm": 2.157438039779663, + "learning_rate": 1e-06, + "loss": 0.8934, + "mean_token_accuracy": 0.7198318839073181, + "num_tokens": 443778924.0, + "step": 17764 + }, + { + "epoch": 1.9509114869316933, + "grad_norm": 2.0967061519622803, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7089298367500305, + "num_tokens": 443806912.0, + "step": 17765 + }, + { + "epoch": 1.951021304634307, + "grad_norm": 2.2323267459869385, + "learning_rate": 1e-06, + "loss": 0.8179, + "mean_token_accuracy": 0.7402442097663879, + "num_tokens": 443832047.0, + "step": 17766 + }, + { + "epoch": 1.9511311223369208, + "grad_norm": 2.3800950050354004, + "learning_rate": 1e-06, + "loss": 0.787, + "mean_token_accuracy": 0.7406487464904785, + "num_tokens": 443855021.0, + "step": 17767 + }, + { + "epoch": 1.9512409400395345, + "grad_norm": 2.6306605339050293, + 
"learning_rate": 1e-06, + "loss": 0.8828, + "mean_token_accuracy": 0.7181366682052612, + "num_tokens": 443875631.0, + "step": 17768 + }, + { + "epoch": 1.951350757742148, + "grad_norm": 2.6024181842803955, + "learning_rate": 1e-06, + "loss": 0.8176, + "mean_token_accuracy": 0.7345575094223022, + "num_tokens": 443898515.0, + "step": 17769 + }, + { + "epoch": 1.9514605754447616, + "grad_norm": 2.1925597190856934, + "learning_rate": 1e-06, + "loss": 0.8948, + "mean_token_accuracy": 0.7189062237739563, + "num_tokens": 443926193.0, + "step": 17770 + }, + { + "epoch": 1.9515703931473753, + "grad_norm": 2.3391008377075195, + "learning_rate": 1e-06, + "loss": 0.8832, + "mean_token_accuracy": 0.7231736183166504, + "num_tokens": 443950361.0, + "step": 17771 + }, + { + "epoch": 1.951680210849989, + "grad_norm": 1.988033413887024, + "learning_rate": 1e-06, + "loss": 0.9086, + "mean_token_accuracy": 0.7251092195510864, + "num_tokens": 443982761.0, + "step": 17772 + }, + { + "epoch": 1.9517900285526026, + "grad_norm": 2.4018173217773438, + "learning_rate": 1e-06, + "loss": 0.745, + "mean_token_accuracy": 0.7573661804199219, + "num_tokens": 444002754.0, + "step": 17773 + }, + { + "epoch": 1.9518998462552162, + "grad_norm": 2.018782615661621, + "learning_rate": 1e-06, + "loss": 0.843, + "mean_token_accuracy": 0.7425174713134766, + "num_tokens": 444031125.0, + "step": 17774 + }, + { + "epoch": 1.95200966395783, + "grad_norm": 2.2377660274505615, + "learning_rate": 1e-06, + "loss": 0.9124, + "mean_token_accuracy": 0.727804958820343, + "num_tokens": 444056339.0, + "step": 17775 + }, + { + "epoch": 1.9521194816604437, + "grad_norm": 1.9801580905914307, + "learning_rate": 1e-06, + "loss": 1.0223, + "mean_token_accuracy": 0.6882550716400146, + "num_tokens": 444088584.0, + "step": 17776 + }, + { + "epoch": 1.9522292993630574, + "grad_norm": 2.860407590866089, + "learning_rate": 1e-06, + "loss": 0.8077, + "mean_token_accuracy": 0.7380113005638123, + "num_tokens": 444104782.0, + "step": 
17777 + }, + { + "epoch": 1.952339117065671, + "grad_norm": 2.1160459518432617, + "learning_rate": 1e-06, + "loss": 0.8117, + "mean_token_accuracy": 0.7410760521888733, + "num_tokens": 444132573.0, + "step": 17778 + }, + { + "epoch": 1.9524489347682845, + "grad_norm": 2.091583728790283, + "learning_rate": 1e-06, + "loss": 0.8935, + "mean_token_accuracy": 0.7230708599090576, + "num_tokens": 444160142.0, + "step": 17779 + }, + { + "epoch": 1.9525587524708983, + "grad_norm": 2.227135181427002, + "learning_rate": 1e-06, + "loss": 0.8845, + "mean_token_accuracy": 0.7220680713653564, + "num_tokens": 444185269.0, + "step": 17780 + }, + { + "epoch": 1.952668570173512, + "grad_norm": 2.6128365993499756, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7305190563201904, + "num_tokens": 444203997.0, + "step": 17781 + }, + { + "epoch": 1.9527783878761258, + "grad_norm": 2.1755692958831787, + "learning_rate": 1e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7375088334083557, + "num_tokens": 444230508.0, + "step": 17782 + }, + { + "epoch": 1.9528882055787393, + "grad_norm": 2.6352944374084473, + "learning_rate": 1e-06, + "loss": 0.8259, + "mean_token_accuracy": 0.7419742345809937, + "num_tokens": 444251114.0, + "step": 17783 + }, + { + "epoch": 1.9529980232813529, + "grad_norm": 2.070358991622925, + "learning_rate": 1e-06, + "loss": 0.9181, + "mean_token_accuracy": 0.7148293256759644, + "num_tokens": 444282331.0, + "step": 17784 + }, + { + "epoch": 1.9531078409839666, + "grad_norm": 2.517697334289551, + "learning_rate": 1e-06, + "loss": 0.7986, + "mean_token_accuracy": 0.7537226676940918, + "num_tokens": 444302590.0, + "step": 17785 + }, + { + "epoch": 1.9532176586865804, + "grad_norm": 2.3413543701171875, + "learning_rate": 1e-06, + "loss": 0.9539, + "mean_token_accuracy": 0.7053464651107788, + "num_tokens": 444328109.0, + "step": 17786 + }, + { + "epoch": 1.953327476389194, + "grad_norm": 2.2326407432556152, + "learning_rate": 1e-06, + "loss": 0.8525, 
+ "mean_token_accuracy": 0.7405824661254883, + "num_tokens": 444353999.0, + "step": 17787 + }, + { + "epoch": 1.9534372940918074, + "grad_norm": 2.211935520172119, + "learning_rate": 1e-06, + "loss": 0.8866, + "mean_token_accuracy": 0.719598650932312, + "num_tokens": 444380288.0, + "step": 17788 + }, + { + "epoch": 1.9535471117944212, + "grad_norm": 1.9931669235229492, + "learning_rate": 1e-06, + "loss": 0.923, + "mean_token_accuracy": 0.7116037607192993, + "num_tokens": 444410494.0, + "step": 17789 + }, + { + "epoch": 1.953656929497035, + "grad_norm": 2.036760091781616, + "learning_rate": 1e-06, + "loss": 0.9654, + "mean_token_accuracy": 0.7044975757598877, + "num_tokens": 444439929.0, + "step": 17790 + }, + { + "epoch": 1.9537667471996487, + "grad_norm": 2.5059359073638916, + "learning_rate": 1e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7397047281265259, + "num_tokens": 444462502.0, + "step": 17791 + }, + { + "epoch": 1.9538765649022622, + "grad_norm": 2.1889989376068115, + "learning_rate": 1e-06, + "loss": 0.9506, + "mean_token_accuracy": 0.699266791343689, + "num_tokens": 444491087.0, + "step": 17792 + }, + { + "epoch": 1.9539863826048758, + "grad_norm": 2.1571872234344482, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7206884622573853, + "num_tokens": 444518704.0, + "step": 17793 + }, + { + "epoch": 1.9540962003074895, + "grad_norm": 2.29717755317688, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7406344413757324, + "num_tokens": 444539960.0, + "step": 17794 + }, + { + "epoch": 1.9542060180101033, + "grad_norm": 2.348266124725342, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7387046813964844, + "num_tokens": 444563643.0, + "step": 17795 + }, + { + "epoch": 1.954315835712717, + "grad_norm": 1.9920669794082642, + "learning_rate": 1e-06, + "loss": 0.9174, + "mean_token_accuracy": 0.7222663164138794, + "num_tokens": 444595269.0, + "step": 17796 + }, + { + "epoch": 
1.9544256534153306, + "grad_norm": 2.158031463623047, + "learning_rate": 1e-06, + "loss": 0.9957, + "mean_token_accuracy": 0.7035214900970459, + "num_tokens": 444623630.0, + "step": 17797 + }, + { + "epoch": 1.9545354711179441, + "grad_norm": 2.443356513977051, + "learning_rate": 1e-06, + "loss": 0.8364, + "mean_token_accuracy": 0.7382057309150696, + "num_tokens": 444645877.0, + "step": 17798 + }, + { + "epoch": 1.9546452888205579, + "grad_norm": 2.306687355041504, + "learning_rate": 1e-06, + "loss": 0.8848, + "mean_token_accuracy": 0.7232356071472168, + "num_tokens": 444669711.0, + "step": 17799 + }, + { + "epoch": 1.9547551065231716, + "grad_norm": 2.0944011211395264, + "learning_rate": 1e-06, + "loss": 0.9993, + "mean_token_accuracy": 0.6889505386352539, + "num_tokens": 444699287.0, + "step": 17800 + }, + { + "epoch": 1.9548649242257852, + "grad_norm": 2.2487406730651855, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7118538618087769, + "num_tokens": 444726276.0, + "step": 17801 + }, + { + "epoch": 1.9549747419283987, + "grad_norm": 2.4005532264709473, + "learning_rate": 1e-06, + "loss": 0.964, + "mean_token_accuracy": 0.7007744908332825, + "num_tokens": 444751027.0, + "step": 17802 + }, + { + "epoch": 1.9550845596310125, + "grad_norm": 2.333725690841675, + "learning_rate": 1e-06, + "loss": 0.8142, + "mean_token_accuracy": 0.7393088340759277, + "num_tokens": 444774659.0, + "step": 17803 + }, + { + "epoch": 1.9551943773336262, + "grad_norm": 2.3473739624023438, + "learning_rate": 1e-06, + "loss": 0.9071, + "mean_token_accuracy": 0.7157208919525146, + "num_tokens": 444798520.0, + "step": 17804 + }, + { + "epoch": 1.95530419503624, + "grad_norm": 2.2789604663848877, + "learning_rate": 1e-06, + "loss": 0.9247, + "mean_token_accuracy": 0.7156654596328735, + "num_tokens": 444825690.0, + "step": 17805 + }, + { + "epoch": 1.9554140127388535, + "grad_norm": 2.348245620727539, + "learning_rate": 1e-06, + "loss": 0.9526, + "mean_token_accuracy": 
0.7003599405288696, + "num_tokens": 444849418.0, + "step": 17806 + }, + { + "epoch": 1.955523830441467, + "grad_norm": 2.0924439430236816, + "learning_rate": 1e-06, + "loss": 0.9573, + "mean_token_accuracy": 0.697708785533905, + "num_tokens": 444879695.0, + "step": 17807 + }, + { + "epoch": 1.9556336481440808, + "grad_norm": 2.4498183727264404, + "learning_rate": 1e-06, + "loss": 0.8752, + "mean_token_accuracy": 0.7261279821395874, + "num_tokens": 444902217.0, + "step": 17808 + }, + { + "epoch": 1.9557434658466946, + "grad_norm": 2.2210912704467773, + "learning_rate": 1e-06, + "loss": 0.8721, + "mean_token_accuracy": 0.7321062684059143, + "num_tokens": 444927506.0, + "step": 17809 + }, + { + "epoch": 1.955853283549308, + "grad_norm": 2.2049667835235596, + "learning_rate": 1e-06, + "loss": 0.9466, + "mean_token_accuracy": 0.7061799764633179, + "num_tokens": 444955045.0, + "step": 17810 + }, + { + "epoch": 1.9559631012519219, + "grad_norm": 2.6767523288726807, + "learning_rate": 1e-06, + "loss": 0.9203, + "mean_token_accuracy": 0.7202513217926025, + "num_tokens": 444973381.0, + "step": 17811 + }, + { + "epoch": 1.9560729189545354, + "grad_norm": 2.0983355045318604, + "learning_rate": 1e-06, + "loss": 0.9255, + "mean_token_accuracy": 0.7100434303283691, + "num_tokens": 445001687.0, + "step": 17812 + }, + { + "epoch": 1.9561827366571491, + "grad_norm": 2.2223739624023438, + "learning_rate": 1e-06, + "loss": 0.9856, + "mean_token_accuracy": 0.6966407299041748, + "num_tokens": 445029747.0, + "step": 17813 + }, + { + "epoch": 1.956292554359763, + "grad_norm": 2.183037281036377, + "learning_rate": 1e-06, + "loss": 0.8989, + "mean_token_accuracy": 0.7139061689376831, + "num_tokens": 445055555.0, + "step": 17814 + }, + { + "epoch": 1.9564023720623764, + "grad_norm": 2.376943588256836, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7235316038131714, + "num_tokens": 445078282.0, + "step": 17815 + }, + { + "epoch": 1.95651218976499, + "grad_norm": 
2.279006004333496, + "learning_rate": 1e-06, + "loss": 0.9767, + "mean_token_accuracy": 0.6979156136512756, + "num_tokens": 445104064.0, + "step": 17816 + }, + { + "epoch": 1.9566220074676037, + "grad_norm": 2.198251962661743, + "learning_rate": 1e-06, + "loss": 0.9002, + "mean_token_accuracy": 0.723464846611023, + "num_tokens": 445129391.0, + "step": 17817 + }, + { + "epoch": 1.9567318251702175, + "grad_norm": 2.3578131198883057, + "learning_rate": 1e-06, + "loss": 0.8536, + "mean_token_accuracy": 0.7302131056785583, + "num_tokens": 445153106.0, + "step": 17818 + }, + { + "epoch": 1.9568416428728312, + "grad_norm": 2.2603323459625244, + "learning_rate": 1e-06, + "loss": 0.8827, + "mean_token_accuracy": 0.7237644195556641, + "num_tokens": 445177246.0, + "step": 17819 + }, + { + "epoch": 1.9569514605754448, + "grad_norm": 2.127068281173706, + "learning_rate": 1e-06, + "loss": 0.9223, + "mean_token_accuracy": 0.7091779708862305, + "num_tokens": 445207975.0, + "step": 17820 + }, + { + "epoch": 1.9570612782780583, + "grad_norm": 2.5840842723846436, + "learning_rate": 1e-06, + "loss": 0.7708, + "mean_token_accuracy": 0.7527340054512024, + "num_tokens": 445227487.0, + "step": 17821 + }, + { + "epoch": 1.957171095980672, + "grad_norm": 2.3509769439697266, + "learning_rate": 1e-06, + "loss": 0.8749, + "mean_token_accuracy": 0.721666157245636, + "num_tokens": 445249817.0, + "step": 17822 + }, + { + "epoch": 1.9572809136832858, + "grad_norm": 2.54780650138855, + "learning_rate": 1e-06, + "loss": 0.7613, + "mean_token_accuracy": 0.7485135793685913, + "num_tokens": 445269112.0, + "step": 17823 + }, + { + "epoch": 1.9573907313858994, + "grad_norm": 2.4091193675994873, + "learning_rate": 1e-06, + "loss": 0.8101, + "mean_token_accuracy": 0.7450888156890869, + "num_tokens": 445290406.0, + "step": 17824 + }, + { + "epoch": 1.9575005490885131, + "grad_norm": 2.4701952934265137, + "learning_rate": 1e-06, + "loss": 0.9053, + "mean_token_accuracy": 0.7206308841705322, + "num_tokens": 
445311314.0, + "step": 17825 + }, + { + "epoch": 1.9576103667911267, + "grad_norm": 2.5250730514526367, + "learning_rate": 1e-06, + "loss": 0.8234, + "mean_token_accuracy": 0.7422856092453003, + "num_tokens": 445331595.0, + "step": 17826 + }, + { + "epoch": 1.9577201844937404, + "grad_norm": 2.1295995712280273, + "learning_rate": 1e-06, + "loss": 0.8338, + "mean_token_accuracy": 0.7361661195755005, + "num_tokens": 445357597.0, + "step": 17827 + }, + { + "epoch": 1.9578300021963542, + "grad_norm": 2.5280425548553467, + "learning_rate": 1e-06, + "loss": 0.9675, + "mean_token_accuracy": 0.6957528591156006, + "num_tokens": 445379559.0, + "step": 17828 + }, + { + "epoch": 1.9579398198989677, + "grad_norm": 2.082691192626953, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.7062965631484985, + "num_tokens": 445408036.0, + "step": 17829 + }, + { + "epoch": 1.9580496376015812, + "grad_norm": 2.399172306060791, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7356910705566406, + "num_tokens": 445430625.0, + "step": 17830 + }, + { + "epoch": 1.958159455304195, + "grad_norm": 2.2938828468322754, + "learning_rate": 1e-06, + "loss": 0.9757, + "mean_token_accuracy": 0.7031694650650024, + "num_tokens": 445456732.0, + "step": 17831 + }, + { + "epoch": 1.9582692730068088, + "grad_norm": 2.361419439315796, + "learning_rate": 1e-06, + "loss": 0.8753, + "mean_token_accuracy": 0.7319709062576294, + "num_tokens": 445478362.0, + "step": 17832 + }, + { + "epoch": 1.9583790907094225, + "grad_norm": 1.9388771057128906, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7208936214447021, + "num_tokens": 445511882.0, + "step": 17833 + }, + { + "epoch": 1.958488908412036, + "grad_norm": 2.1621289253234863, + "learning_rate": 1e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.7337651252746582, + "num_tokens": 445538383.0, + "step": 17834 + }, + { + "epoch": 1.9585987261146496, + "grad_norm": 2.1220438480377197, + "learning_rate": 
1e-06, + "loss": 0.8999, + "mean_token_accuracy": 0.7172006368637085, + "num_tokens": 445565646.0, + "step": 17835 + }, + { + "epoch": 1.9587085438172633, + "grad_norm": 2.331740140914917, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7253342866897583, + "num_tokens": 445590133.0, + "step": 17836 + }, + { + "epoch": 1.958818361519877, + "grad_norm": 2.0696189403533936, + "learning_rate": 1e-06, + "loss": 0.9311, + "mean_token_accuracy": 0.7127285003662109, + "num_tokens": 445620090.0, + "step": 17837 + }, + { + "epoch": 1.9589281792224906, + "grad_norm": 2.1786887645721436, + "learning_rate": 1e-06, + "loss": 0.9035, + "mean_token_accuracy": 0.7135410308837891, + "num_tokens": 445646612.0, + "step": 17838 + }, + { + "epoch": 1.9590379969251042, + "grad_norm": 2.196336030960083, + "learning_rate": 1e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7264772653579712, + "num_tokens": 445672297.0, + "step": 17839 + }, + { + "epoch": 1.959147814627718, + "grad_norm": 2.101283311843872, + "learning_rate": 1e-06, + "loss": 0.8767, + "mean_token_accuracy": 0.7282410860061646, + "num_tokens": 445697605.0, + "step": 17840 + }, + { + "epoch": 1.9592576323303317, + "grad_norm": 2.072375774383545, + "learning_rate": 1e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7406097054481506, + "num_tokens": 445725208.0, + "step": 17841 + }, + { + "epoch": 1.9593674500329454, + "grad_norm": 2.2734391689300537, + "learning_rate": 1e-06, + "loss": 0.9545, + "mean_token_accuracy": 0.714651346206665, + "num_tokens": 445752027.0, + "step": 17842 + }, + { + "epoch": 1.959477267735559, + "grad_norm": 2.2801194190979004, + "learning_rate": 1e-06, + "loss": 0.9112, + "mean_token_accuracy": 0.7170252203941345, + "num_tokens": 445777299.0, + "step": 17843 + }, + { + "epoch": 1.9595870854381725, + "grad_norm": 2.2545087337493896, + "learning_rate": 1e-06, + "loss": 0.9276, + "mean_token_accuracy": 0.7105058431625366, + "num_tokens": 445803692.0, + "step": 17844 + }, + { + 
"epoch": 1.9596969031407863, + "grad_norm": 2.081169366836548, + "learning_rate": 1e-06, + "loss": 0.9421, + "mean_token_accuracy": 0.7087127566337585, + "num_tokens": 445831770.0, + "step": 17845 + }, + { + "epoch": 1.9598067208434, + "grad_norm": 2.394658088684082, + "learning_rate": 1e-06, + "loss": 0.9305, + "mean_token_accuracy": 0.7073596715927124, + "num_tokens": 445854498.0, + "step": 17846 + }, + { + "epoch": 1.9599165385460138, + "grad_norm": 2.6463189125061035, + "learning_rate": 1e-06, + "loss": 0.8632, + "mean_token_accuracy": 0.7245875597000122, + "num_tokens": 445873563.0, + "step": 17847 + }, + { + "epoch": 1.9600263562486273, + "grad_norm": 2.546563148498535, + "learning_rate": 1e-06, + "loss": 0.8902, + "mean_token_accuracy": 0.7182198166847229, + "num_tokens": 445894533.0, + "step": 17848 + }, + { + "epoch": 1.9601361739512408, + "grad_norm": 2.336475133895874, + "learning_rate": 1e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.7294235825538635, + "num_tokens": 445918319.0, + "step": 17849 + }, + { + "epoch": 1.9602459916538546, + "grad_norm": 2.6125993728637695, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7351876497268677, + "num_tokens": 445937689.0, + "step": 17850 + }, + { + "epoch": 1.9603558093564684, + "grad_norm": 2.328941822052002, + "learning_rate": 1e-06, + "loss": 0.8567, + "mean_token_accuracy": 0.725208044052124, + "num_tokens": 445961444.0, + "step": 17851 + }, + { + "epoch": 1.960465627059082, + "grad_norm": 2.06178879737854, + "learning_rate": 1e-06, + "loss": 1.0383, + "mean_token_accuracy": 0.6863548159599304, + "num_tokens": 445991969.0, + "step": 17852 + }, + { + "epoch": 1.9605754447616954, + "grad_norm": 2.435711622238159, + "learning_rate": 1e-06, + "loss": 0.914, + "mean_token_accuracy": 0.7121347188949585, + "num_tokens": 446014077.0, + "step": 17853 + }, + { + "epoch": 1.9606852624643092, + "grad_norm": 2.308516263961792, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 
0.7194991707801819, + "num_tokens": 446038209.0, + "step": 17854 + }, + { + "epoch": 1.960795080166923, + "grad_norm": 2.2973082065582275, + "learning_rate": 1e-06, + "loss": 0.9183, + "mean_token_accuracy": 0.7144067287445068, + "num_tokens": 446063416.0, + "step": 17855 + }, + { + "epoch": 1.9609048978695367, + "grad_norm": 2.2391462326049805, + "learning_rate": 1e-06, + "loss": 0.9165, + "mean_token_accuracy": 0.717986524105072, + "num_tokens": 446088677.0, + "step": 17856 + }, + { + "epoch": 1.9610147155721502, + "grad_norm": 2.048229932785034, + "learning_rate": 1e-06, + "loss": 0.9778, + "mean_token_accuracy": 0.7022278308868408, + "num_tokens": 446119442.0, + "step": 17857 + }, + { + "epoch": 1.9611245332747638, + "grad_norm": 2.2275259494781494, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7405225038528442, + "num_tokens": 446145771.0, + "step": 17858 + }, + { + "epoch": 1.9612343509773775, + "grad_norm": 2.393782377243042, + "learning_rate": 1e-06, + "loss": 0.9457, + "mean_token_accuracy": 0.709753155708313, + "num_tokens": 446171745.0, + "step": 17859 + }, + { + "epoch": 1.9613441686799913, + "grad_norm": 2.5794448852539062, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.71946120262146, + "num_tokens": 446194423.0, + "step": 17860 + }, + { + "epoch": 1.961453986382605, + "grad_norm": 2.3156914710998535, + "learning_rate": 1e-06, + "loss": 0.9001, + "mean_token_accuracy": 0.7247999906539917, + "num_tokens": 446218322.0, + "step": 17861 + }, + { + "epoch": 1.9615638040852186, + "grad_norm": 2.0782320499420166, + "learning_rate": 1e-06, + "loss": 0.9321, + "mean_token_accuracy": 0.7123664617538452, + "num_tokens": 446246639.0, + "step": 17862 + }, + { + "epoch": 1.9616736217878321, + "grad_norm": 2.2682271003723145, + "learning_rate": 1e-06, + "loss": 0.9121, + "mean_token_accuracy": 0.7185262441635132, + "num_tokens": 446272837.0, + "step": 17863 + }, + { + "epoch": 1.9617834394904459, + "grad_norm": 
2.1397576332092285, + "learning_rate": 1e-06, + "loss": 0.9463, + "mean_token_accuracy": 0.7022186517715454, + "num_tokens": 446301698.0, + "step": 17864 + }, + { + "epoch": 1.9618932571930596, + "grad_norm": 2.4190213680267334, + "learning_rate": 1e-06, + "loss": 0.9726, + "mean_token_accuracy": 0.7052450776100159, + "num_tokens": 446325544.0, + "step": 17865 + }, + { + "epoch": 1.9620030748956732, + "grad_norm": 2.2798807621002197, + "learning_rate": 1e-06, + "loss": 0.9394, + "mean_token_accuracy": 0.7032637000083923, + "num_tokens": 446352002.0, + "step": 17866 + }, + { + "epoch": 1.9621128925982867, + "grad_norm": 2.1131091117858887, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7213389277458191, + "num_tokens": 446377970.0, + "step": 17867 + }, + { + "epoch": 1.9622227103009005, + "grad_norm": 2.136765956878662, + "learning_rate": 1e-06, + "loss": 0.9564, + "mean_token_accuracy": 0.7008151412010193, + "num_tokens": 446404979.0, + "step": 17868 + }, + { + "epoch": 1.9623325280035142, + "grad_norm": 2.5509469509124756, + "learning_rate": 1e-06, + "loss": 0.7864, + "mean_token_accuracy": 0.7462433576583862, + "num_tokens": 446424936.0, + "step": 17869 + }, + { + "epoch": 1.962442345706128, + "grad_norm": 1.894129991531372, + "learning_rate": 1e-06, + "loss": 0.8965, + "mean_token_accuracy": 0.7171062231063843, + "num_tokens": 446457941.0, + "step": 17870 + }, + { + "epoch": 1.9625521634087415, + "grad_norm": 2.4118807315826416, + "learning_rate": 1e-06, + "loss": 0.9129, + "mean_token_accuracy": 0.7242146730422974, + "num_tokens": 446480829.0, + "step": 17871 + }, + { + "epoch": 1.962661981111355, + "grad_norm": 2.158463716506958, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7287715673446655, + "num_tokens": 446507815.0, + "step": 17872 + }, + { + "epoch": 1.9627717988139688, + "grad_norm": 2.1463608741760254, + "learning_rate": 1e-06, + "loss": 0.8893, + "mean_token_accuracy": 0.7277618646621704, + 
"num_tokens": 446536984.0, + "step": 17873 + }, + { + "epoch": 1.9628816165165826, + "grad_norm": 2.243467330932617, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7065386772155762, + "num_tokens": 446563485.0, + "step": 17874 + }, + { + "epoch": 1.962991434219196, + "grad_norm": 2.337066650390625, + "learning_rate": 1e-06, + "loss": 0.9132, + "mean_token_accuracy": 0.7158396244049072, + "num_tokens": 446589265.0, + "step": 17875 + }, + { + "epoch": 1.9631012519218098, + "grad_norm": 2.9327104091644287, + "learning_rate": 1e-06, + "loss": 0.8654, + "mean_token_accuracy": 0.7282994985580444, + "num_tokens": 446607031.0, + "step": 17876 + }, + { + "epoch": 1.9632110696244234, + "grad_norm": 2.6478464603424072, + "learning_rate": 1e-06, + "loss": 0.7916, + "mean_token_accuracy": 0.7509973645210266, + "num_tokens": 446627741.0, + "step": 17877 + }, + { + "epoch": 1.9633208873270371, + "grad_norm": 2.4955151081085205, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7061117887496948, + "num_tokens": 446651788.0, + "step": 17878 + }, + { + "epoch": 1.963430705029651, + "grad_norm": 2.6802492141723633, + "learning_rate": 1e-06, + "loss": 0.8446, + "mean_token_accuracy": 0.7308099269866943, + "num_tokens": 446670219.0, + "step": 17879 + }, + { + "epoch": 1.9635405227322644, + "grad_norm": 2.5473437309265137, + "learning_rate": 1e-06, + "loss": 0.8694, + "mean_token_accuracy": 0.7196816205978394, + "num_tokens": 446691684.0, + "step": 17880 + }, + { + "epoch": 1.963650340434878, + "grad_norm": 2.403531789779663, + "learning_rate": 1e-06, + "loss": 0.851, + "mean_token_accuracy": 0.7317882776260376, + "num_tokens": 446715155.0, + "step": 17881 + }, + { + "epoch": 1.9637601581374917, + "grad_norm": 2.138162136077881, + "learning_rate": 1e-06, + "loss": 0.902, + "mean_token_accuracy": 0.72151118516922, + "num_tokens": 446743867.0, + "step": 17882 + }, + { + "epoch": 1.9638699758401055, + "grad_norm": 2.1149394512176514, + 
"learning_rate": 1e-06, + "loss": 0.8809, + "mean_token_accuracy": 0.7267588376998901, + "num_tokens": 446771590.0, + "step": 17883 + }, + { + "epoch": 1.9639797935427192, + "grad_norm": 2.17561411857605, + "learning_rate": 1e-06, + "loss": 0.8361, + "mean_token_accuracy": 0.7319269180297852, + "num_tokens": 446796503.0, + "step": 17884 + }, + { + "epoch": 1.9640896112453328, + "grad_norm": 2.2112629413604736, + "learning_rate": 1e-06, + "loss": 1.0357, + "mean_token_accuracy": 0.6856439113616943, + "num_tokens": 446823432.0, + "step": 17885 + }, + { + "epoch": 1.9641994289479463, + "grad_norm": 2.5028181076049805, + "learning_rate": 1e-06, + "loss": 0.8774, + "mean_token_accuracy": 0.7248368859291077, + "num_tokens": 446844358.0, + "step": 17886 + }, + { + "epoch": 1.96430924665056, + "grad_norm": 2.582329750061035, + "learning_rate": 1e-06, + "loss": 0.9546, + "mean_token_accuracy": 0.7071221470832825, + "num_tokens": 446865138.0, + "step": 17887 + }, + { + "epoch": 1.9644190643531738, + "grad_norm": 2.4851338863372803, + "learning_rate": 1e-06, + "loss": 0.79, + "mean_token_accuracy": 0.7493950128555298, + "num_tokens": 446885243.0, + "step": 17888 + }, + { + "epoch": 1.9645288820557874, + "grad_norm": 2.290350914001465, + "learning_rate": 1e-06, + "loss": 0.9207, + "mean_token_accuracy": 0.7144651412963867, + "num_tokens": 446908799.0, + "step": 17889 + }, + { + "epoch": 1.964638699758401, + "grad_norm": 2.4686362743377686, + "learning_rate": 1e-06, + "loss": 0.8917, + "mean_token_accuracy": 0.7262276411056519, + "num_tokens": 446929504.0, + "step": 17890 + }, + { + "epoch": 1.9647485174610146, + "grad_norm": 2.5873429775238037, + "learning_rate": 1e-06, + "loss": 0.86, + "mean_token_accuracy": 0.7235552072525024, + "num_tokens": 446951043.0, + "step": 17891 + }, + { + "epoch": 1.9648583351636284, + "grad_norm": 2.366637706756592, + "learning_rate": 1e-06, + "loss": 0.8708, + "mean_token_accuracy": 0.7233219146728516, + "num_tokens": 446974259.0, + "step": 
17892 + }, + { + "epoch": 1.9649681528662422, + "grad_norm": 2.370872735977173, + "learning_rate": 1e-06, + "loss": 0.9309, + "mean_token_accuracy": 0.7090383172035217, + "num_tokens": 446998389.0, + "step": 17893 + }, + { + "epoch": 1.9650779705688557, + "grad_norm": 2.335130214691162, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7146672606468201, + "num_tokens": 447020388.0, + "step": 17894 + }, + { + "epoch": 1.9651877882714692, + "grad_norm": 2.1879539489746094, + "learning_rate": 1e-06, + "loss": 0.917, + "mean_token_accuracy": 0.7175335884094238, + "num_tokens": 447045400.0, + "step": 17895 + }, + { + "epoch": 1.965297605974083, + "grad_norm": 2.450409173965454, + "learning_rate": 1e-06, + "loss": 0.8891, + "mean_token_accuracy": 0.7255988121032715, + "num_tokens": 447068242.0, + "step": 17896 + }, + { + "epoch": 1.9654074236766967, + "grad_norm": 2.3542094230651855, + "learning_rate": 1e-06, + "loss": 0.8242, + "mean_token_accuracy": 0.7350808382034302, + "num_tokens": 447092683.0, + "step": 17897 + }, + { + "epoch": 1.9655172413793105, + "grad_norm": 2.2332990169525146, + "learning_rate": 1e-06, + "loss": 0.9227, + "mean_token_accuracy": 0.7128521203994751, + "num_tokens": 447117294.0, + "step": 17898 + }, + { + "epoch": 1.965627059081924, + "grad_norm": 2.4269702434539795, + "learning_rate": 1e-06, + "loss": 0.7919, + "mean_token_accuracy": 0.747113823890686, + "num_tokens": 447138847.0, + "step": 17899 + }, + { + "epoch": 1.9657368767845376, + "grad_norm": 2.4721360206604004, + "learning_rate": 1e-06, + "loss": 0.9402, + "mean_token_accuracy": 0.7063504457473755, + "num_tokens": 447161009.0, + "step": 17900 + }, + { + "epoch": 1.9658466944871513, + "grad_norm": 2.1963608264923096, + "learning_rate": 1e-06, + "loss": 0.915, + "mean_token_accuracy": 0.7244613170623779, + "num_tokens": 447188121.0, + "step": 17901 + }, + { + "epoch": 1.965956512189765, + "grad_norm": 2.223398208618164, + "learning_rate": 1e-06, + "loss": 0.8509, + 
"mean_token_accuracy": 0.7349262237548828, + "num_tokens": 447213689.0, + "step": 17902 + }, + { + "epoch": 1.9660663298923786, + "grad_norm": 2.321706533432007, + "learning_rate": 1e-06, + "loss": 0.8895, + "mean_token_accuracy": 0.718257486820221, + "num_tokens": 447237583.0, + "step": 17903 + }, + { + "epoch": 1.9661761475949922, + "grad_norm": 2.580205202102661, + "learning_rate": 1e-06, + "loss": 0.9352, + "mean_token_accuracy": 0.7138526439666748, + "num_tokens": 447258984.0, + "step": 17904 + }, + { + "epoch": 1.966285965297606, + "grad_norm": 1.8563321828842163, + "learning_rate": 1e-06, + "loss": 1.0082, + "mean_token_accuracy": 0.6899584531784058, + "num_tokens": 447296247.0, + "step": 17905 + }, + { + "epoch": 1.9663957830002197, + "grad_norm": 2.3745317459106445, + "learning_rate": 1e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7539967894554138, + "num_tokens": 447317908.0, + "step": 17906 + }, + { + "epoch": 1.9665056007028334, + "grad_norm": 2.3338584899902344, + "learning_rate": 1e-06, + "loss": 0.8301, + "mean_token_accuracy": 0.7401597499847412, + "num_tokens": 447340942.0, + "step": 17907 + }, + { + "epoch": 1.966615418405447, + "grad_norm": 2.2783145904541016, + "learning_rate": 1e-06, + "loss": 0.9098, + "mean_token_accuracy": 0.7208645343780518, + "num_tokens": 447365657.0, + "step": 17908 + }, + { + "epoch": 1.9667252361080605, + "grad_norm": 2.0763137340545654, + "learning_rate": 1e-06, + "loss": 0.8905, + "mean_token_accuracy": 0.7149078845977783, + "num_tokens": 447393463.0, + "step": 17909 + }, + { + "epoch": 1.9668350538106742, + "grad_norm": 1.9463720321655273, + "learning_rate": 1e-06, + "loss": 1.032, + "mean_token_accuracy": 0.6859219074249268, + "num_tokens": 447428436.0, + "step": 17910 + }, + { + "epoch": 1.966944871513288, + "grad_norm": 2.185839891433716, + "learning_rate": 1e-06, + "loss": 0.8494, + "mean_token_accuracy": 0.7435925006866455, + "num_tokens": 447454159.0, + "step": 17911 + }, + { + "epoch": 
1.9670546892159018, + "grad_norm": 2.420238971710205, + "learning_rate": 1e-06, + "loss": 0.8793, + "mean_token_accuracy": 0.7275949716567993, + "num_tokens": 447476470.0, + "step": 17912 + }, + { + "epoch": 1.9671645069185153, + "grad_norm": 2.0493714809417725, + "learning_rate": 1e-06, + "loss": 0.7246, + "mean_token_accuracy": 0.764485239982605, + "num_tokens": 447501051.0, + "step": 17913 + }, + { + "epoch": 1.9672743246211288, + "grad_norm": 2.270259380340576, + "learning_rate": 1e-06, + "loss": 0.885, + "mean_token_accuracy": 0.7346795201301575, + "num_tokens": 447526267.0, + "step": 17914 + }, + { + "epoch": 1.9673841423237426, + "grad_norm": 2.315505266189575, + "learning_rate": 1e-06, + "loss": 0.9665, + "mean_token_accuracy": 0.70191490650177, + "num_tokens": 447550296.0, + "step": 17915 + }, + { + "epoch": 1.9674939600263563, + "grad_norm": 2.200509548187256, + "learning_rate": 1e-06, + "loss": 0.897, + "mean_token_accuracy": 0.7211042642593384, + "num_tokens": 447576223.0, + "step": 17916 + }, + { + "epoch": 1.9676037777289699, + "grad_norm": 2.3306663036346436, + "learning_rate": 1e-06, + "loss": 0.8592, + "mean_token_accuracy": 0.7255129814147949, + "num_tokens": 447600531.0, + "step": 17917 + }, + { + "epoch": 1.9677135954315834, + "grad_norm": 2.1792361736297607, + "learning_rate": 1e-06, + "loss": 0.9185, + "mean_token_accuracy": 0.7045853137969971, + "num_tokens": 447628178.0, + "step": 17918 + }, + { + "epoch": 1.9678234131341972, + "grad_norm": 2.2589190006256104, + "learning_rate": 1e-06, + "loss": 0.8456, + "mean_token_accuracy": 0.7361173033714294, + "num_tokens": 447651054.0, + "step": 17919 + }, + { + "epoch": 1.967933230836811, + "grad_norm": 2.538405418395996, + "learning_rate": 1e-06, + "loss": 0.7879, + "mean_token_accuracy": 0.7472343444824219, + "num_tokens": 447671339.0, + "step": 17920 + }, + { + "epoch": 1.9680430485394247, + "grad_norm": 2.0838239192962646, + "learning_rate": 1e-06, + "loss": 0.7399, + "mean_token_accuracy": 
0.7622766494750977, + "num_tokens": 447696514.0, + "step": 17921 + }, + { + "epoch": 1.9681528662420382, + "grad_norm": 2.567077875137329, + "learning_rate": 1e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7461737990379333, + "num_tokens": 447715474.0, + "step": 17922 + }, + { + "epoch": 1.9682626839446518, + "grad_norm": 2.021153450012207, + "learning_rate": 1e-06, + "loss": 0.9214, + "mean_token_accuracy": 0.7166525721549988, + "num_tokens": 447744573.0, + "step": 17923 + }, + { + "epoch": 1.9683725016472655, + "grad_norm": 2.2065412998199463, + "learning_rate": 1e-06, + "loss": 0.904, + "mean_token_accuracy": 0.7258371114730835, + "num_tokens": 447772650.0, + "step": 17924 + }, + { + "epoch": 1.9684823193498793, + "grad_norm": 2.4176015853881836, + "learning_rate": 1e-06, + "loss": 0.8704, + "mean_token_accuracy": 0.7473500370979309, + "num_tokens": 447793169.0, + "step": 17925 + }, + { + "epoch": 1.9685921370524928, + "grad_norm": 2.2948131561279297, + "learning_rate": 1e-06, + "loss": 0.8019, + "mean_token_accuracy": 0.7441396713256836, + "num_tokens": 447816483.0, + "step": 17926 + }, + { + "epoch": 1.9687019547551066, + "grad_norm": 2.190293550491333, + "learning_rate": 1e-06, + "loss": 1.0326, + "mean_token_accuracy": 0.6880152821540833, + "num_tokens": 447845452.0, + "step": 17927 + }, + { + "epoch": 1.96881177245772, + "grad_norm": 2.288421392440796, + "learning_rate": 1e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7258191704750061, + "num_tokens": 447870407.0, + "step": 17928 + }, + { + "epoch": 1.9689215901603339, + "grad_norm": 1.9637293815612793, + "learning_rate": 1e-06, + "loss": 0.9751, + "mean_token_accuracy": 0.7032275199890137, + "num_tokens": 447902462.0, + "step": 17929 + }, + { + "epoch": 1.9690314078629476, + "grad_norm": 2.552786111831665, + "learning_rate": 1e-06, + "loss": 0.7589, + "mean_token_accuracy": 0.758804202079773, + "num_tokens": 447922738.0, + "step": 17930 + }, + { + "epoch": 1.9691412255655611, + "grad_norm": 
2.2603919506073, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7201125621795654, + "num_tokens": 447947880.0, + "step": 17931 + }, + { + "epoch": 1.9692510432681747, + "grad_norm": 2.2983546257019043, + "learning_rate": 1e-06, + "loss": 0.9495, + "mean_token_accuracy": 0.7033824920654297, + "num_tokens": 447973916.0, + "step": 17932 + }, + { + "epoch": 1.9693608609707884, + "grad_norm": 2.713439464569092, + "learning_rate": 1e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7430182695388794, + "num_tokens": 447994095.0, + "step": 17933 + }, + { + "epoch": 1.9694706786734022, + "grad_norm": 2.093223810195923, + "learning_rate": 1e-06, + "loss": 0.9192, + "mean_token_accuracy": 0.71365886926651, + "num_tokens": 448020214.0, + "step": 17934 + }, + { + "epoch": 1.969580496376016, + "grad_norm": 2.2650413513183594, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7097830176353455, + "num_tokens": 448045492.0, + "step": 17935 + }, + { + "epoch": 1.9696903140786295, + "grad_norm": 2.0747251510620117, + "learning_rate": 1e-06, + "loss": 0.9091, + "mean_token_accuracy": 0.7196937799453735, + "num_tokens": 448074289.0, + "step": 17936 + }, + { + "epoch": 1.969800131781243, + "grad_norm": 2.1056840419769287, + "learning_rate": 1e-06, + "loss": 0.8957, + "mean_token_accuracy": 0.7224075198173523, + "num_tokens": 448101497.0, + "step": 17937 + }, + { + "epoch": 1.9699099494838568, + "grad_norm": 2.361326217651367, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7222224473953247, + "num_tokens": 448124487.0, + "step": 17938 + }, + { + "epoch": 1.9700197671864705, + "grad_norm": 2.6915266513824463, + "learning_rate": 1e-06, + "loss": 0.8406, + "mean_token_accuracy": 0.7359222173690796, + "num_tokens": 448143020.0, + "step": 17939 + }, + { + "epoch": 1.970129584889084, + "grad_norm": 2.372030258178711, + "learning_rate": 1e-06, + "loss": 0.8981, + "mean_token_accuracy": 0.7179408669471741, + "num_tokens": 
448166162.0, + "step": 17940 + }, + { + "epoch": 1.9702394025916978, + "grad_norm": 2.149080753326416, + "learning_rate": 1e-06, + "loss": 0.8746, + "mean_token_accuracy": 0.7278167009353638, + "num_tokens": 448193192.0, + "step": 17941 + }, + { + "epoch": 1.9703492202943114, + "grad_norm": 2.336038589477539, + "learning_rate": 1e-06, + "loss": 0.8011, + "mean_token_accuracy": 0.7491068840026855, + "num_tokens": 448214815.0, + "step": 17942 + }, + { + "epoch": 1.9704590379969251, + "grad_norm": 2.325263261795044, + "learning_rate": 1e-06, + "loss": 0.807, + "mean_token_accuracy": 0.7433744668960571, + "num_tokens": 448237507.0, + "step": 17943 + }, + { + "epoch": 1.9705688556995389, + "grad_norm": 2.3538215160369873, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7240012288093567, + "num_tokens": 448261890.0, + "step": 17944 + }, + { + "epoch": 1.9706786734021524, + "grad_norm": 2.113346576690674, + "learning_rate": 1e-06, + "loss": 0.9059, + "mean_token_accuracy": 0.7217186689376831, + "num_tokens": 448290769.0, + "step": 17945 + }, + { + "epoch": 1.970788491104766, + "grad_norm": 2.4494173526763916, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.716855525970459, + "num_tokens": 448313619.0, + "step": 17946 + }, + { + "epoch": 1.9708983088073797, + "grad_norm": 2.1389951705932617, + "learning_rate": 1e-06, + "loss": 0.9728, + "mean_token_accuracy": 0.7074007987976074, + "num_tokens": 448342137.0, + "step": 17947 + }, + { + "epoch": 1.9710081265099935, + "grad_norm": 2.2245869636535645, + "learning_rate": 1e-06, + "loss": 0.8959, + "mean_token_accuracy": 0.7190111875534058, + "num_tokens": 448367870.0, + "step": 17948 + }, + { + "epoch": 1.9711179442126072, + "grad_norm": 1.967316746711731, + "learning_rate": 1e-06, + "loss": 0.9602, + "mean_token_accuracy": 0.7034469842910767, + "num_tokens": 448399283.0, + "step": 17949 + }, + { + "epoch": 1.9712277619152208, + "grad_norm": 2.1443166732788086, + "learning_rate": 
1e-06, + "loss": 0.9296, + "mean_token_accuracy": 0.7101484537124634, + "num_tokens": 448425964.0, + "step": 17950 + }, + { + "epoch": 1.9713375796178343, + "grad_norm": 2.421703338623047, + "learning_rate": 1e-06, + "loss": 0.8525, + "mean_token_accuracy": 0.7351769208908081, + "num_tokens": 448447104.0, + "step": 17951 + }, + { + "epoch": 1.971447397320448, + "grad_norm": 2.1982014179229736, + "learning_rate": 1e-06, + "loss": 0.9047, + "mean_token_accuracy": 0.7202143669128418, + "num_tokens": 448472122.0, + "step": 17952 + }, + { + "epoch": 1.9715572150230618, + "grad_norm": 2.165468692779541, + "learning_rate": 1e-06, + "loss": 0.924, + "mean_token_accuracy": 0.7133694887161255, + "num_tokens": 448498933.0, + "step": 17953 + }, + { + "epoch": 1.9716670327256753, + "grad_norm": 2.022822618484497, + "learning_rate": 1e-06, + "loss": 0.8991, + "mean_token_accuracy": 0.7221070528030396, + "num_tokens": 448528672.0, + "step": 17954 + }, + { + "epoch": 1.9717768504282889, + "grad_norm": 2.2226600646972656, + "learning_rate": 1e-06, + "loss": 0.929, + "mean_token_accuracy": 0.7229092717170715, + "num_tokens": 448554370.0, + "step": 17955 + }, + { + "epoch": 1.9718866681309026, + "grad_norm": 2.1465492248535156, + "learning_rate": 1e-06, + "loss": 0.9141, + "mean_token_accuracy": 0.7153260111808777, + "num_tokens": 448583761.0, + "step": 17956 + }, + { + "epoch": 1.9719964858335164, + "grad_norm": 2.140528917312622, + "learning_rate": 1e-06, + "loss": 0.9634, + "mean_token_accuracy": 0.7104780077934265, + "num_tokens": 448609088.0, + "step": 17957 + }, + { + "epoch": 1.9721063035361301, + "grad_norm": 2.1558356285095215, + "learning_rate": 1e-06, + "loss": 0.9754, + "mean_token_accuracy": 0.7057523131370544, + "num_tokens": 448635958.0, + "step": 17958 + }, + { + "epoch": 1.9722161212387437, + "grad_norm": 2.274897813796997, + "learning_rate": 1e-06, + "loss": 0.9069, + "mean_token_accuracy": 0.7139266729354858, + "num_tokens": 448661217.0, + "step": 17959 + }, + { + 
"epoch": 1.9723259389413572, + "grad_norm": 2.4165875911712646, + "learning_rate": 1e-06, + "loss": 1.0012, + "mean_token_accuracy": 0.6920177340507507, + "num_tokens": 448685922.0, + "step": 17960 + }, + { + "epoch": 1.972435756643971, + "grad_norm": 2.220141887664795, + "learning_rate": 1e-06, + "loss": 0.8676, + "mean_token_accuracy": 0.7239338159561157, + "num_tokens": 448712902.0, + "step": 17961 + }, + { + "epoch": 1.9725455743465847, + "grad_norm": 2.1242868900299072, + "learning_rate": 1e-06, + "loss": 0.9448, + "mean_token_accuracy": 0.7118979692459106, + "num_tokens": 448741492.0, + "step": 17962 + }, + { + "epoch": 1.9726553920491985, + "grad_norm": 2.146697998046875, + "learning_rate": 1e-06, + "loss": 0.8599, + "mean_token_accuracy": 0.7271445393562317, + "num_tokens": 448768268.0, + "step": 17963 + }, + { + "epoch": 1.972765209751812, + "grad_norm": 2.209345817565918, + "learning_rate": 1e-06, + "loss": 0.8862, + "mean_token_accuracy": 0.7139283418655396, + "num_tokens": 448792648.0, + "step": 17964 + }, + { + "epoch": 1.9728750274544256, + "grad_norm": 2.272494316101074, + "learning_rate": 1e-06, + "loss": 0.9493, + "mean_token_accuracy": 0.71157306432724, + "num_tokens": 448818853.0, + "step": 17965 + }, + { + "epoch": 1.9729848451570393, + "grad_norm": 2.0817813873291016, + "learning_rate": 1e-06, + "loss": 0.9472, + "mean_token_accuracy": 0.7076743841171265, + "num_tokens": 448847019.0, + "step": 17966 + }, + { + "epoch": 1.973094662859653, + "grad_norm": 2.4949681758880615, + "learning_rate": 1e-06, + "loss": 0.9428, + "mean_token_accuracy": 0.7165489196777344, + "num_tokens": 448869203.0, + "step": 17967 + }, + { + "epoch": 1.9732044805622666, + "grad_norm": 1.9463926553726196, + "learning_rate": 1e-06, + "loss": 0.9038, + "mean_token_accuracy": 0.7210296392440796, + "num_tokens": 448900456.0, + "step": 17968 + }, + { + "epoch": 1.9733142982648801, + "grad_norm": 2.315890312194824, + "learning_rate": 1e-06, + "loss": 0.8495, + 
"mean_token_accuracy": 0.7296745181083679, + "num_tokens": 448924595.0, + "step": 17969 + }, + { + "epoch": 1.973424115967494, + "grad_norm": 2.5133705139160156, + "learning_rate": 1e-06, + "loss": 0.9213, + "mean_token_accuracy": 0.7194781303405762, + "num_tokens": 448945750.0, + "step": 17970 + }, + { + "epoch": 1.9735339336701077, + "grad_norm": 2.1848254203796387, + "learning_rate": 1e-06, + "loss": 0.8358, + "mean_token_accuracy": 0.741011381149292, + "num_tokens": 448972842.0, + "step": 17971 + }, + { + "epoch": 1.9736437513727214, + "grad_norm": 2.317718505859375, + "learning_rate": 1e-06, + "loss": 0.9163, + "mean_token_accuracy": 0.722581148147583, + "num_tokens": 448997075.0, + "step": 17972 + }, + { + "epoch": 1.973753569075335, + "grad_norm": 2.50956130027771, + "learning_rate": 1e-06, + "loss": 0.9018, + "mean_token_accuracy": 0.7179625034332275, + "num_tokens": 449017061.0, + "step": 17973 + }, + { + "epoch": 1.9738633867779485, + "grad_norm": 2.0409538745880127, + "learning_rate": 1e-06, + "loss": 0.8385, + "mean_token_accuracy": 0.7425220608711243, + "num_tokens": 449043856.0, + "step": 17974 + }, + { + "epoch": 1.9739732044805622, + "grad_norm": 2.3253631591796875, + "learning_rate": 1e-06, + "loss": 0.8051, + "mean_token_accuracy": 0.7452180981636047, + "num_tokens": 449066947.0, + "step": 17975 + }, + { + "epoch": 1.974083022183176, + "grad_norm": 2.2207021713256836, + "learning_rate": 1e-06, + "loss": 0.912, + "mean_token_accuracy": 0.7196382284164429, + "num_tokens": 449092117.0, + "step": 17976 + }, + { + "epoch": 1.9741928398857898, + "grad_norm": 2.5816333293914795, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7071183919906616, + "num_tokens": 449112973.0, + "step": 17977 + }, + { + "epoch": 1.9743026575884033, + "grad_norm": 2.4041731357574463, + "learning_rate": 1e-06, + "loss": 0.8248, + "mean_token_accuracy": 0.74004727602005, + "num_tokens": 449135203.0, + "step": 17978 + }, + { + "epoch": 1.9744124752910168, + 
"grad_norm": 2.471951484680176, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7520711421966553, + "num_tokens": 449155273.0, + "step": 17979 + }, + { + "epoch": 1.9745222929936306, + "grad_norm": 2.1783666610717773, + "learning_rate": 1e-06, + "loss": 0.9166, + "mean_token_accuracy": 0.7129852771759033, + "num_tokens": 449181882.0, + "step": 17980 + }, + { + "epoch": 1.9746321106962443, + "grad_norm": 2.2375190258026123, + "learning_rate": 1e-06, + "loss": 0.9154, + "mean_token_accuracy": 0.7101652026176453, + "num_tokens": 449207762.0, + "step": 17981 + }, + { + "epoch": 1.9747419283988579, + "grad_norm": 2.443873643875122, + "learning_rate": 1e-06, + "loss": 0.8392, + "mean_token_accuracy": 0.7372294664382935, + "num_tokens": 449229647.0, + "step": 17982 + }, + { + "epoch": 1.9748517461014714, + "grad_norm": 2.3747951984405518, + "learning_rate": 1e-06, + "loss": 0.8164, + "mean_token_accuracy": 0.7406054139137268, + "num_tokens": 449251403.0, + "step": 17983 + }, + { + "epoch": 1.9749615638040852, + "grad_norm": 2.2133169174194336, + "learning_rate": 1e-06, + "loss": 0.9864, + "mean_token_accuracy": 0.6945993900299072, + "num_tokens": 449277647.0, + "step": 17984 + }, + { + "epoch": 1.975071381506699, + "grad_norm": 2.138362169265747, + "learning_rate": 1e-06, + "loss": 0.8578, + "mean_token_accuracy": 0.7267360687255859, + "num_tokens": 449304055.0, + "step": 17985 + }, + { + "epoch": 1.9751811992093127, + "grad_norm": 2.2953457832336426, + "learning_rate": 1e-06, + "loss": 0.9219, + "mean_token_accuracy": 0.7151762843132019, + "num_tokens": 449328126.0, + "step": 17986 + }, + { + "epoch": 1.9752910169119262, + "grad_norm": 2.033270835876465, + "learning_rate": 1e-06, + "loss": 0.9606, + "mean_token_accuracy": 0.6995669603347778, + "num_tokens": 449356730.0, + "step": 17987 + }, + { + "epoch": 1.9754008346145397, + "grad_norm": 2.125393867492676, + "learning_rate": 1e-06, + "loss": 0.8628, + "mean_token_accuracy": 0.7244404554367065, + 
"num_tokens": 449382316.0, + "step": 17988 + }, + { + "epoch": 1.9755106523171535, + "grad_norm": 2.3968591690063477, + "learning_rate": 1e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7443355321884155, + "num_tokens": 449402945.0, + "step": 17989 + }, + { + "epoch": 1.9756204700197673, + "grad_norm": 2.579451322555542, + "learning_rate": 1e-06, + "loss": 0.8331, + "mean_token_accuracy": 0.7319478988647461, + "num_tokens": 449422719.0, + "step": 17990 + }, + { + "epoch": 1.9757302877223808, + "grad_norm": 2.513139009475708, + "learning_rate": 1e-06, + "loss": 0.8701, + "mean_token_accuracy": 0.7185442447662354, + "num_tokens": 449444247.0, + "step": 17991 + }, + { + "epoch": 1.9758401054249946, + "grad_norm": 2.4548022747039795, + "learning_rate": 1e-06, + "loss": 0.9008, + "mean_token_accuracy": 0.7274007797241211, + "num_tokens": 449466188.0, + "step": 17992 + }, + { + "epoch": 1.975949923127608, + "grad_norm": 2.5386595726013184, + "learning_rate": 1e-06, + "loss": 0.8477, + "mean_token_accuracy": 0.7342709898948669, + "num_tokens": 449486115.0, + "step": 17993 + }, + { + "epoch": 1.9760597408302218, + "grad_norm": 2.242144823074341, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 0.7052333950996399, + "num_tokens": 449512557.0, + "step": 17994 + }, + { + "epoch": 1.9761695585328356, + "grad_norm": 2.428481101989746, + "learning_rate": 1e-06, + "loss": 0.9102, + "mean_token_accuracy": 0.7123523354530334, + "num_tokens": 449535485.0, + "step": 17995 + }, + { + "epoch": 1.9762793762354491, + "grad_norm": 2.07340669631958, + "learning_rate": 1e-06, + "loss": 0.8463, + "mean_token_accuracy": 0.7340385913848877, + "num_tokens": 449563511.0, + "step": 17996 + }, + { + "epoch": 1.9763891939380627, + "grad_norm": 2.267317533493042, + "learning_rate": 1e-06, + "loss": 0.8691, + "mean_token_accuracy": 0.7301700711250305, + "num_tokens": 449586992.0, + "step": 17997 + }, + { + "epoch": 1.9764990116406764, + "grad_norm": 2.1168694496154785, + 
"learning_rate": 1e-06, + "loss": 0.9367, + "mean_token_accuracy": 0.704116702079773, + "num_tokens": 449615567.0, + "step": 17998 + }, + { + "epoch": 1.9766088293432902, + "grad_norm": 2.18965744972229, + "learning_rate": 1e-06, + "loss": 0.8347, + "mean_token_accuracy": 0.7330286502838135, + "num_tokens": 449641818.0, + "step": 17999 + }, + { + "epoch": 1.976718647045904, + "grad_norm": 2.572305202484131, + "learning_rate": 1e-06, + "loss": 0.873, + "mean_token_accuracy": 0.7243505120277405, + "num_tokens": 449661584.0, + "step": 18000 + }, + { + "epoch": 1.9768284647485175, + "grad_norm": 2.4191994667053223, + "learning_rate": 1e-06, + "loss": 0.9508, + "mean_token_accuracy": 0.7067195177078247, + "num_tokens": 449685704.0, + "step": 18001 + }, + { + "epoch": 1.976938282451131, + "grad_norm": 2.251912832260132, + "learning_rate": 1e-06, + "loss": 0.8764, + "mean_token_accuracy": 0.7295719981193542, + "num_tokens": 449709290.0, + "step": 18002 + }, + { + "epoch": 1.9770481001537448, + "grad_norm": 2.2770910263061523, + "learning_rate": 1e-06, + "loss": 0.9548, + "mean_token_accuracy": 0.7112337350845337, + "num_tokens": 449731962.0, + "step": 18003 + }, + { + "epoch": 1.9771579178563585, + "grad_norm": 2.2471890449523926, + "learning_rate": 1e-06, + "loss": 0.9819, + "mean_token_accuracy": 0.7031596899032593, + "num_tokens": 449758747.0, + "step": 18004 + }, + { + "epoch": 1.977267735558972, + "grad_norm": 2.5346450805664062, + "learning_rate": 1e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7415684461593628, + "num_tokens": 449781081.0, + "step": 18005 + }, + { + "epoch": 1.9773775532615858, + "grad_norm": 2.8019495010375977, + "learning_rate": 1e-06, + "loss": 0.8533, + "mean_token_accuracy": 0.7276303768157959, + "num_tokens": 449799808.0, + "step": 18006 + }, + { + "epoch": 1.9774873709641994, + "grad_norm": 2.4437317848205566, + "learning_rate": 1e-06, + "loss": 0.8698, + "mean_token_accuracy": 0.7288511991500854, + "num_tokens": 449823014.0, + "step": 
18007 + }, + { + "epoch": 1.977597188666813, + "grad_norm": 2.0631682872772217, + "learning_rate": 1e-06, + "loss": 1.0691, + "mean_token_accuracy": 0.6872965097427368, + "num_tokens": 449852499.0, + "step": 18008 + }, + { + "epoch": 1.9777070063694269, + "grad_norm": 2.1296606063842773, + "learning_rate": 1e-06, + "loss": 1.0259, + "mean_token_accuracy": 0.6885823011398315, + "num_tokens": 449882397.0, + "step": 18009 + }, + { + "epoch": 1.9778168240720404, + "grad_norm": 2.4066812992095947, + "learning_rate": 1e-06, + "loss": 0.9051, + "mean_token_accuracy": 0.7210517525672913, + "num_tokens": 449905067.0, + "step": 18010 + }, + { + "epoch": 1.977926641774654, + "grad_norm": 2.2369377613067627, + "learning_rate": 1e-06, + "loss": 0.9408, + "mean_token_accuracy": 0.7134791612625122, + "num_tokens": 449930706.0, + "step": 18011 + }, + { + "epoch": 1.9780364594772677, + "grad_norm": 2.393054723739624, + "learning_rate": 1e-06, + "loss": 0.8834, + "mean_token_accuracy": 0.7206057906150818, + "num_tokens": 449953673.0, + "step": 18012 + }, + { + "epoch": 1.9781462771798815, + "grad_norm": 2.2584478855133057, + "learning_rate": 1e-06, + "loss": 0.8326, + "mean_token_accuracy": 0.7309589385986328, + "num_tokens": 449976744.0, + "step": 18013 + }, + { + "epoch": 1.9782560948824952, + "grad_norm": 2.354620933532715, + "learning_rate": 1e-06, + "loss": 0.9272, + "mean_token_accuracy": 0.7132678031921387, + "num_tokens": 449998290.0, + "step": 18014 + }, + { + "epoch": 1.9783659125851087, + "grad_norm": 2.2820754051208496, + "learning_rate": 1e-06, + "loss": 0.9284, + "mean_token_accuracy": 0.7099747657775879, + "num_tokens": 450024324.0, + "step": 18015 + }, + { + "epoch": 1.9784757302877223, + "grad_norm": 2.615018367767334, + "learning_rate": 1e-06, + "loss": 0.8765, + "mean_token_accuracy": 0.7185295820236206, + "num_tokens": 450043361.0, + "step": 18016 + }, + { + "epoch": 1.978585547990336, + "grad_norm": 2.354642152786255, + "learning_rate": 1e-06, + "loss": 0.9458, 
+ "mean_token_accuracy": 0.7077480554580688, + "num_tokens": 450069029.0, + "step": 18017 + }, + { + "epoch": 1.9786953656929498, + "grad_norm": 2.0229921340942383, + "learning_rate": 1e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7265046834945679, + "num_tokens": 450096998.0, + "step": 18018 + }, + { + "epoch": 1.9788051833955633, + "grad_norm": 2.141098737716675, + "learning_rate": 1e-06, + "loss": 0.9433, + "mean_token_accuracy": 0.7080405950546265, + "num_tokens": 450125040.0, + "step": 18019 + }, + { + "epoch": 1.9789150010981769, + "grad_norm": 2.419910430908203, + "learning_rate": 1e-06, + "loss": 0.7987, + "mean_token_accuracy": 0.7517731189727783, + "num_tokens": 450146180.0, + "step": 18020 + }, + { + "epoch": 1.9790248188007906, + "grad_norm": 2.085524559020996, + "learning_rate": 1e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7361257672309875, + "num_tokens": 450172666.0, + "step": 18021 + }, + { + "epoch": 1.9791346365034044, + "grad_norm": 2.1781668663024902, + "learning_rate": 1e-06, + "loss": 0.9175, + "mean_token_accuracy": 0.7124441862106323, + "num_tokens": 450199871.0, + "step": 18022 + }, + { + "epoch": 1.9792444542060181, + "grad_norm": 2.2259321212768555, + "learning_rate": 1e-06, + "loss": 0.8996, + "mean_token_accuracy": 0.7223920822143555, + "num_tokens": 450223842.0, + "step": 18023 + }, + { + "epoch": 1.9793542719086317, + "grad_norm": 2.770531177520752, + "learning_rate": 1e-06, + "loss": 0.961, + "mean_token_accuracy": 0.7181892395019531, + "num_tokens": 450243053.0, + "step": 18024 + }, + { + "epoch": 1.9794640896112452, + "grad_norm": 2.3441355228424072, + "learning_rate": 1e-06, + "loss": 0.8792, + "mean_token_accuracy": 0.7207199335098267, + "num_tokens": 450266546.0, + "step": 18025 + }, + { + "epoch": 1.979573907313859, + "grad_norm": 2.2850773334503174, + "learning_rate": 1e-06, + "loss": 0.9902, + "mean_token_accuracy": 0.6980918645858765, + "num_tokens": 450291596.0, + "step": 18026 + }, + { + "epoch": 
1.9796837250164727, + "grad_norm": 2.4580957889556885, + "learning_rate": 1e-06, + "loss": 0.8527, + "mean_token_accuracy": 0.7295119166374207, + "num_tokens": 450314258.0, + "step": 18027 + }, + { + "epoch": 1.9797935427190865, + "grad_norm": 2.1855318546295166, + "learning_rate": 1e-06, + "loss": 0.9215, + "mean_token_accuracy": 0.7213177680969238, + "num_tokens": 450340533.0, + "step": 18028 + }, + { + "epoch": 1.9799033604217, + "grad_norm": 2.4529917240142822, + "learning_rate": 1e-06, + "loss": 0.7457, + "mean_token_accuracy": 0.7628264427185059, + "num_tokens": 450360904.0, + "step": 18029 + }, + { + "epoch": 1.9800131781243135, + "grad_norm": 2.16973614692688, + "learning_rate": 1e-06, + "loss": 0.7688, + "mean_token_accuracy": 0.750679075717926, + "num_tokens": 450386916.0, + "step": 18030 + }, + { + "epoch": 1.9801229958269273, + "grad_norm": 2.6148593425750732, + "learning_rate": 1e-06, + "loss": 0.8504, + "mean_token_accuracy": 0.737452507019043, + "num_tokens": 450408287.0, + "step": 18031 + }, + { + "epoch": 1.980232813529541, + "grad_norm": 2.2139079570770264, + "learning_rate": 1e-06, + "loss": 0.9057, + "mean_token_accuracy": 0.7197750806808472, + "num_tokens": 450433589.0, + "step": 18032 + }, + { + "epoch": 1.9803426312321546, + "grad_norm": 2.3344168663024902, + "learning_rate": 1e-06, + "loss": 0.8739, + "mean_token_accuracy": 0.7265402674674988, + "num_tokens": 450456510.0, + "step": 18033 + }, + { + "epoch": 1.9804524489347681, + "grad_norm": 2.3367116451263428, + "learning_rate": 1e-06, + "loss": 0.8539, + "mean_token_accuracy": 0.7388172745704651, + "num_tokens": 450479610.0, + "step": 18034 + }, + { + "epoch": 1.9805622666373819, + "grad_norm": 2.488020420074463, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.7029731273651123, + "num_tokens": 450500894.0, + "step": 18035 + }, + { + "epoch": 1.9806720843399956, + "grad_norm": 2.3224618434906006, + "learning_rate": 1e-06, + "loss": 0.9401, + "mean_token_accuracy": 
0.7045685052871704, + "num_tokens": 450527649.0, + "step": 18036 + }, + { + "epoch": 1.9807819020426094, + "grad_norm": 2.379162549972534, + "learning_rate": 1e-06, + "loss": 0.9199, + "mean_token_accuracy": 0.7151233553886414, + "num_tokens": 450551499.0, + "step": 18037 + }, + { + "epoch": 1.980891719745223, + "grad_norm": 2.502314805984497, + "learning_rate": 1e-06, + "loss": 0.9565, + "mean_token_accuracy": 0.7088419795036316, + "num_tokens": 450575646.0, + "step": 18038 + }, + { + "epoch": 1.9810015374478365, + "grad_norm": 2.375647783279419, + "learning_rate": 1e-06, + "loss": 0.8885, + "mean_token_accuracy": 0.7208243608474731, + "num_tokens": 450598349.0, + "step": 18039 + }, + { + "epoch": 1.9811113551504502, + "grad_norm": 2.402132034301758, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.71882164478302, + "num_tokens": 450622020.0, + "step": 18040 + }, + { + "epoch": 1.981221172853064, + "grad_norm": 2.1592202186584473, + "learning_rate": 1e-06, + "loss": 0.8925, + "mean_token_accuracy": 0.717332661151886, + "num_tokens": 450650067.0, + "step": 18041 + }, + { + "epoch": 1.9813309905556777, + "grad_norm": 2.3809518814086914, + "learning_rate": 1e-06, + "loss": 0.7964, + "mean_token_accuracy": 0.7459912896156311, + "num_tokens": 450672245.0, + "step": 18042 + }, + { + "epoch": 1.9814408082582913, + "grad_norm": 2.3396923542022705, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7361642718315125, + "num_tokens": 450696352.0, + "step": 18043 + }, + { + "epoch": 1.9815506259609048, + "grad_norm": 2.6012723445892334, + "learning_rate": 1e-06, + "loss": 0.9269, + "mean_token_accuracy": 0.7145041227340698, + "num_tokens": 450717424.0, + "step": 18044 + }, + { + "epoch": 1.9816604436635186, + "grad_norm": 2.292372703552246, + "learning_rate": 1e-06, + "loss": 0.9044, + "mean_token_accuracy": 0.713516891002655, + "num_tokens": 450741816.0, + "step": 18045 + }, + { + "epoch": 1.9817702613661323, + "grad_norm": 
2.134019613265991, + "learning_rate": 1e-06, + "loss": 0.9155, + "mean_token_accuracy": 0.7126387357711792, + "num_tokens": 450768773.0, + "step": 18046 + }, + { + "epoch": 1.9818800790687459, + "grad_norm": 2.1943111419677734, + "learning_rate": 1e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.747128427028656, + "num_tokens": 450793611.0, + "step": 18047 + }, + { + "epoch": 1.9819898967713594, + "grad_norm": 2.1899218559265137, + "learning_rate": 1e-06, + "loss": 0.903, + "mean_token_accuracy": 0.7219668030738831, + "num_tokens": 450818499.0, + "step": 18048 + }, + { + "epoch": 1.9820997144739732, + "grad_norm": 2.040318012237549, + "learning_rate": 1e-06, + "loss": 1.0032, + "mean_token_accuracy": 0.6949058175086975, + "num_tokens": 450849392.0, + "step": 18049 + }, + { + "epoch": 1.982209532176587, + "grad_norm": 2.378354072570801, + "learning_rate": 1e-06, + "loss": 0.8262, + "mean_token_accuracy": 0.7368961572647095, + "num_tokens": 450872290.0, + "step": 18050 + }, + { + "epoch": 1.9823193498792007, + "grad_norm": 2.077482223510742, + "learning_rate": 1e-06, + "loss": 0.934, + "mean_token_accuracy": 0.7104148864746094, + "num_tokens": 450901327.0, + "step": 18051 + }, + { + "epoch": 1.9824291675818142, + "grad_norm": 2.4218053817749023, + "learning_rate": 1e-06, + "loss": 0.918, + "mean_token_accuracy": 0.7152367830276489, + "num_tokens": 450924196.0, + "step": 18052 + }, + { + "epoch": 1.9825389852844277, + "grad_norm": 2.382530450820923, + "learning_rate": 1e-06, + "loss": 0.8571, + "mean_token_accuracy": 0.7302117347717285, + "num_tokens": 450946637.0, + "step": 18053 + }, + { + "epoch": 1.9826488029870415, + "grad_norm": 2.2594404220581055, + "learning_rate": 1e-06, + "loss": 0.8389, + "mean_token_accuracy": 0.734561562538147, + "num_tokens": 450970577.0, + "step": 18054 + }, + { + "epoch": 1.9827586206896552, + "grad_norm": 2.0713443756103516, + "learning_rate": 1e-06, + "loss": 0.9537, + "mean_token_accuracy": 0.7069272994995117, + "num_tokens": 
450999673.0, + "step": 18055 + }, + { + "epoch": 1.9828684383922688, + "grad_norm": 2.1893560886383057, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7205343246459961, + "num_tokens": 451026019.0, + "step": 18056 + }, + { + "epoch": 1.9829782560948825, + "grad_norm": 2.359067678451538, + "learning_rate": 1e-06, + "loss": 0.9708, + "mean_token_accuracy": 0.7054437398910522, + "num_tokens": 451050659.0, + "step": 18057 + }, + { + "epoch": 1.983088073797496, + "grad_norm": 2.182403087615967, + "learning_rate": 1e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.7366165518760681, + "num_tokens": 451076353.0, + "step": 18058 + }, + { + "epoch": 1.9831978915001098, + "grad_norm": 2.260162115097046, + "learning_rate": 1e-06, + "loss": 0.9774, + "mean_token_accuracy": 0.7032591104507446, + "num_tokens": 451103547.0, + "step": 18059 + }, + { + "epoch": 1.9833077092027236, + "grad_norm": 2.4311132431030273, + "learning_rate": 1e-06, + "loss": 0.8931, + "mean_token_accuracy": 0.7175432443618774, + "num_tokens": 451125607.0, + "step": 18060 + }, + { + "epoch": 1.9834175269053371, + "grad_norm": 1.961639404296875, + "learning_rate": 1e-06, + "loss": 0.9151, + "mean_token_accuracy": 0.7145664095878601, + "num_tokens": 451158862.0, + "step": 18061 + }, + { + "epoch": 1.9835273446079507, + "grad_norm": 2.018589973449707, + "learning_rate": 1e-06, + "loss": 0.9194, + "mean_token_accuracy": 0.721052885055542, + "num_tokens": 451190779.0, + "step": 18062 + }, + { + "epoch": 1.9836371623105644, + "grad_norm": 2.7026069164276123, + "learning_rate": 1e-06, + "loss": 0.9456, + "mean_token_accuracy": 0.7064494490623474, + "num_tokens": 451211288.0, + "step": 18063 + }, + { + "epoch": 1.9837469800131782, + "grad_norm": 2.311793327331543, + "learning_rate": 1e-06, + "loss": 0.8213, + "mean_token_accuracy": 0.7333669662475586, + "num_tokens": 451234786.0, + "step": 18064 + }, + { + "epoch": 1.983856797715792, + "grad_norm": 2.1181719303131104, + "learning_rate": 
1e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7210239171981812, + "num_tokens": 451261391.0, + "step": 18065 + }, + { + "epoch": 1.9839666154184055, + "grad_norm": 2.434718370437622, + "learning_rate": 1e-06, + "loss": 0.8305, + "mean_token_accuracy": 0.7362347841262817, + "num_tokens": 451282308.0, + "step": 18066 + }, + { + "epoch": 1.984076433121019, + "grad_norm": 2.2269325256347656, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7232037782669067, + "num_tokens": 451306710.0, + "step": 18067 + }, + { + "epoch": 1.9841862508236328, + "grad_norm": 2.156856060028076, + "learning_rate": 1e-06, + "loss": 0.7591, + "mean_token_accuracy": 0.7643075585365295, + "num_tokens": 451329802.0, + "step": 18068 + }, + { + "epoch": 1.9842960685262465, + "grad_norm": 2.7503750324249268, + "learning_rate": 1e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7362845540046692, + "num_tokens": 451348270.0, + "step": 18069 + }, + { + "epoch": 1.98440588622886, + "grad_norm": 2.186206579208374, + "learning_rate": 1e-06, + "loss": 0.83, + "mean_token_accuracy": 0.7416973114013672, + "num_tokens": 451372710.0, + "step": 18070 + }, + { + "epoch": 1.9845157039314738, + "grad_norm": 2.332719326019287, + "learning_rate": 1e-06, + "loss": 1.0098, + "mean_token_accuracy": 0.7054173946380615, + "num_tokens": 451398285.0, + "step": 18071 + }, + { + "epoch": 1.9846255216340873, + "grad_norm": 1.8735800981521606, + "learning_rate": 1e-06, + "loss": 0.9717, + "mean_token_accuracy": 0.6998288631439209, + "num_tokens": 451432512.0, + "step": 18072 + }, + { + "epoch": 1.984735339336701, + "grad_norm": 2.5165185928344727, + "learning_rate": 1e-06, + "loss": 0.8909, + "mean_token_accuracy": 0.7219079732894897, + "num_tokens": 451455744.0, + "step": 18073 + }, + { + "epoch": 1.9848451570393149, + "grad_norm": 2.1762444972991943, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.7217763662338257, + "num_tokens": 451482192.0, + "step": 18074 + }, + { + 
"epoch": 1.9849549747419284, + "grad_norm": 2.6090943813323975, + "learning_rate": 1e-06, + "loss": 0.8495, + "mean_token_accuracy": 0.7325645685195923, + "num_tokens": 451501192.0, + "step": 18075 + }, + { + "epoch": 1.985064792444542, + "grad_norm": 2.860591411590576, + "learning_rate": 1e-06, + "loss": 0.7689, + "mean_token_accuracy": 0.7616273760795593, + "num_tokens": 451517497.0, + "step": 18076 + }, + { + "epoch": 1.9851746101471557, + "grad_norm": 2.2680749893188477, + "learning_rate": 1e-06, + "loss": 0.8419, + "mean_token_accuracy": 0.7307870388031006, + "num_tokens": 451542421.0, + "step": 18077 + }, + { + "epoch": 1.9852844278497694, + "grad_norm": 2.3203542232513428, + "learning_rate": 1e-06, + "loss": 0.8892, + "mean_token_accuracy": 0.7204615473747253, + "num_tokens": 451566956.0, + "step": 18078 + }, + { + "epoch": 1.9853942455523832, + "grad_norm": 2.344639539718628, + "learning_rate": 1e-06, + "loss": 0.9164, + "mean_token_accuracy": 0.7174062728881836, + "num_tokens": 451590846.0, + "step": 18079 + }, + { + "epoch": 1.9855040632549967, + "grad_norm": 2.32069993019104, + "learning_rate": 1e-06, + "loss": 0.8116, + "mean_token_accuracy": 0.7416188716888428, + "num_tokens": 451614523.0, + "step": 18080 + }, + { + "epoch": 1.9856138809576103, + "grad_norm": 2.3394968509674072, + "learning_rate": 1e-06, + "loss": 0.9033, + "mean_token_accuracy": 0.7145849466323853, + "num_tokens": 451642042.0, + "step": 18081 + }, + { + "epoch": 1.985723698660224, + "grad_norm": 2.4950191974639893, + "learning_rate": 1e-06, + "loss": 0.9616, + "mean_token_accuracy": 0.7087969779968262, + "num_tokens": 451665323.0, + "step": 18082 + }, + { + "epoch": 1.9858335163628378, + "grad_norm": 2.141075611114502, + "learning_rate": 1e-06, + "loss": 0.9094, + "mean_token_accuracy": 0.7132082581520081, + "num_tokens": 451695584.0, + "step": 18083 + }, + { + "epoch": 1.9859433340654513, + "grad_norm": 2.2619287967681885, + "learning_rate": 1e-06, + "loss": 0.8375, + 
"mean_token_accuracy": 0.7375500798225403, + "num_tokens": 451719492.0, + "step": 18084 + }, + { + "epoch": 1.9860531517680649, + "grad_norm": 2.075442314147949, + "learning_rate": 1e-06, + "loss": 0.9379, + "mean_token_accuracy": 0.7080754637718201, + "num_tokens": 451748705.0, + "step": 18085 + }, + { + "epoch": 1.9861629694706786, + "grad_norm": 2.2668933868408203, + "learning_rate": 1e-06, + "loss": 0.8562, + "mean_token_accuracy": 0.7356320023536682, + "num_tokens": 451772802.0, + "step": 18086 + }, + { + "epoch": 1.9862727871732924, + "grad_norm": 2.3942272663116455, + "learning_rate": 1e-06, + "loss": 0.8018, + "mean_token_accuracy": 0.7467974424362183, + "num_tokens": 451796361.0, + "step": 18087 + }, + { + "epoch": 1.9863826048759061, + "grad_norm": 2.2210888862609863, + "learning_rate": 1e-06, + "loss": 0.9911, + "mean_token_accuracy": 0.692070484161377, + "num_tokens": 451821589.0, + "step": 18088 + }, + { + "epoch": 1.9864924225785197, + "grad_norm": 2.366136074066162, + "learning_rate": 1e-06, + "loss": 0.8629, + "mean_token_accuracy": 0.7273010611534119, + "num_tokens": 451844521.0, + "step": 18089 + }, + { + "epoch": 1.9866022402811332, + "grad_norm": 2.4612269401550293, + "learning_rate": 1e-06, + "loss": 0.8852, + "mean_token_accuracy": 0.7221084833145142, + "num_tokens": 451866284.0, + "step": 18090 + }, + { + "epoch": 1.986712057983747, + "grad_norm": 1.9208450317382812, + "learning_rate": 1e-06, + "loss": 0.9909, + "mean_token_accuracy": 0.6931543946266174, + "num_tokens": 451898459.0, + "step": 18091 + }, + { + "epoch": 1.9868218756863607, + "grad_norm": 2.2493913173675537, + "learning_rate": 1e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7408920526504517, + "num_tokens": 451922895.0, + "step": 18092 + }, + { + "epoch": 1.9869316933889745, + "grad_norm": 2.263343095779419, + "learning_rate": 1e-06, + "loss": 0.9489, + "mean_token_accuracy": 0.7172287702560425, + "num_tokens": 451948601.0, + "step": 18093 + }, + { + "epoch": 
1.987041511091588, + "grad_norm": 2.0123937129974365, + "learning_rate": 1e-06, + "loss": 0.9417, + "mean_token_accuracy": 0.7137125730514526, + "num_tokens": 451979980.0, + "step": 18094 + }, + { + "epoch": 1.9871513287942015, + "grad_norm": 2.307739734649658, + "learning_rate": 1e-06, + "loss": 0.9078, + "mean_token_accuracy": 0.7180616855621338, + "num_tokens": 452004984.0, + "step": 18095 + }, + { + "epoch": 1.9872611464968153, + "grad_norm": 2.405089855194092, + "learning_rate": 1e-06, + "loss": 0.9454, + "mean_token_accuracy": 0.7077319622039795, + "num_tokens": 452026663.0, + "step": 18096 + }, + { + "epoch": 1.987370964199429, + "grad_norm": 2.442856550216675, + "learning_rate": 1e-06, + "loss": 0.8854, + "mean_token_accuracy": 0.725760281085968, + "num_tokens": 452049901.0, + "step": 18097 + }, + { + "epoch": 1.9874807819020426, + "grad_norm": 2.4244801998138428, + "learning_rate": 1e-06, + "loss": 0.9577, + "mean_token_accuracy": 0.707422137260437, + "num_tokens": 452072087.0, + "step": 18098 + }, + { + "epoch": 1.9875905996046561, + "grad_norm": 2.200655221939087, + "learning_rate": 1e-06, + "loss": 0.8659, + "mean_token_accuracy": 0.7220684885978699, + "num_tokens": 452096640.0, + "step": 18099 + }, + { + "epoch": 1.9877004173072699, + "grad_norm": 2.249349594116211, + "learning_rate": 1e-06, + "loss": 0.9504, + "mean_token_accuracy": 0.7137120366096497, + "num_tokens": 452124085.0, + "step": 18100 + }, + { + "epoch": 1.9878102350098836, + "grad_norm": 2.1243646144866943, + "learning_rate": 1e-06, + "loss": 0.8992, + "mean_token_accuracy": 0.7188295722007751, + "num_tokens": 452152791.0, + "step": 18101 + }, + { + "epoch": 1.9879200527124974, + "grad_norm": 2.341843366622925, + "learning_rate": 1e-06, + "loss": 0.7153, + "mean_token_accuracy": 0.7624316215515137, + "num_tokens": 452174514.0, + "step": 18102 + }, + { + "epoch": 1.988029870415111, + "grad_norm": 2.0955820083618164, + "learning_rate": 1e-06, + "loss": 0.9412, + "mean_token_accuracy": 
0.7175334692001343, + "num_tokens": 452204872.0, + "step": 18103 + }, + { + "epoch": 1.9881396881177245, + "grad_norm": 2.260993003845215, + "learning_rate": 1e-06, + "loss": 0.832, + "mean_token_accuracy": 0.735568642616272, + "num_tokens": 452229484.0, + "step": 18104 + }, + { + "epoch": 1.9882495058203382, + "grad_norm": 2.235785961151123, + "learning_rate": 1e-06, + "loss": 0.7784, + "mean_token_accuracy": 0.7578807473182678, + "num_tokens": 452254123.0, + "step": 18105 + }, + { + "epoch": 1.988359323522952, + "grad_norm": 2.435626983642578, + "learning_rate": 1e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7483150959014893, + "num_tokens": 452275449.0, + "step": 18106 + }, + { + "epoch": 1.9884691412255655, + "grad_norm": 2.601548194885254, + "learning_rate": 1e-06, + "loss": 0.9422, + "mean_token_accuracy": 0.7084447145462036, + "num_tokens": 452296827.0, + "step": 18107 + }, + { + "epoch": 1.9885789589281793, + "grad_norm": 2.1991915702819824, + "learning_rate": 1e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7335840463638306, + "num_tokens": 452321746.0, + "step": 18108 + }, + { + "epoch": 1.9886887766307928, + "grad_norm": 2.830767869949341, + "learning_rate": 1e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7353335022926331, + "num_tokens": 452336922.0, + "step": 18109 + }, + { + "epoch": 1.9887985943334066, + "grad_norm": 2.1647660732269287, + "learning_rate": 1e-06, + "loss": 0.9344, + "mean_token_accuracy": 0.7233242392539978, + "num_tokens": 452367892.0, + "step": 18110 + }, + { + "epoch": 1.9889084120360203, + "grad_norm": 2.4338717460632324, + "learning_rate": 1e-06, + "loss": 0.7548, + "mean_token_accuracy": 0.751523494720459, + "num_tokens": 452387676.0, + "step": 18111 + }, + { + "epoch": 1.9890182297386338, + "grad_norm": 2.151209592819214, + "learning_rate": 1e-06, + "loss": 0.9741, + "mean_token_accuracy": 0.705178439617157, + "num_tokens": 452413898.0, + "step": 18112 + }, + { + "epoch": 1.9891280474412474, + "grad_norm": 
2.0386457443237305, + "learning_rate": 1e-06, + "loss": 0.8824, + "mean_token_accuracy": 0.7254788875579834, + "num_tokens": 452443384.0, + "step": 18113 + }, + { + "epoch": 1.9892378651438611, + "grad_norm": 2.275467872619629, + "learning_rate": 1e-06, + "loss": 0.9144, + "mean_token_accuracy": 0.7093846201896667, + "num_tokens": 452466212.0, + "step": 18114 + }, + { + "epoch": 1.989347682846475, + "grad_norm": 2.5544795989990234, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.7188488245010376, + "num_tokens": 452486377.0, + "step": 18115 + }, + { + "epoch": 1.9894575005490887, + "grad_norm": 2.4845328330993652, + "learning_rate": 1e-06, + "loss": 0.8651, + "mean_token_accuracy": 0.725365400314331, + "num_tokens": 452508529.0, + "step": 18116 + }, + { + "epoch": 1.9895673182517022, + "grad_norm": 1.8854141235351562, + "learning_rate": 1e-06, + "loss": 0.947, + "mean_token_accuracy": 0.7023032903671265, + "num_tokens": 452542761.0, + "step": 18117 + }, + { + "epoch": 1.9896771359543157, + "grad_norm": 2.752995252609253, + "learning_rate": 1e-06, + "loss": 0.8396, + "mean_token_accuracy": 0.7306544780731201, + "num_tokens": 452560407.0, + "step": 18118 + }, + { + "epoch": 1.9897869536569295, + "grad_norm": 2.2349302768707275, + "learning_rate": 1e-06, + "loss": 0.8415, + "mean_token_accuracy": 0.7339158058166504, + "num_tokens": 452585051.0, + "step": 18119 + }, + { + "epoch": 1.9898967713595432, + "grad_norm": 2.0141782760620117, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.701036274433136, + "num_tokens": 452614817.0, + "step": 18120 + }, + { + "epoch": 1.9900065890621568, + "grad_norm": 2.4355385303497314, + "learning_rate": 1e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7285875082015991, + "num_tokens": 452634492.0, + "step": 18121 + }, + { + "epoch": 1.9901164067647705, + "grad_norm": 2.158869981765747, + "learning_rate": 1e-06, + "loss": 0.9125, + "mean_token_accuracy": 0.7179784178733826, + "num_tokens": 
452661339.0, + "step": 18122 + }, + { + "epoch": 1.990226224467384, + "grad_norm": 2.1876161098480225, + "learning_rate": 1e-06, + "loss": 0.8552, + "mean_token_accuracy": 0.7346204519271851, + "num_tokens": 452687424.0, + "step": 18123 + }, + { + "epoch": 1.9903360421699978, + "grad_norm": 1.9431425333023071, + "learning_rate": 1e-06, + "loss": 0.9009, + "mean_token_accuracy": 0.7237284183502197, + "num_tokens": 452718327.0, + "step": 18124 + }, + { + "epoch": 1.9904458598726116, + "grad_norm": 2.3856382369995117, + "learning_rate": 1e-06, + "loss": 0.837, + "mean_token_accuracy": 0.7374546527862549, + "num_tokens": 452740329.0, + "step": 18125 + }, + { + "epoch": 1.9905556775752251, + "grad_norm": 2.2720327377319336, + "learning_rate": 1e-06, + "loss": 0.8145, + "mean_token_accuracy": 0.745032548904419, + "num_tokens": 452763710.0, + "step": 18126 + }, + { + "epoch": 1.9906654952778386, + "grad_norm": 2.2285051345825195, + "learning_rate": 1e-06, + "loss": 0.952, + "mean_token_accuracy": 0.713405966758728, + "num_tokens": 452789825.0, + "step": 18127 + }, + { + "epoch": 1.9907753129804524, + "grad_norm": 2.3082680702209473, + "learning_rate": 1e-06, + "loss": 0.907, + "mean_token_accuracy": 0.71500164270401, + "num_tokens": 452813660.0, + "step": 18128 + }, + { + "epoch": 1.9908851306830662, + "grad_norm": 2.1580963134765625, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7127012014389038, + "num_tokens": 452842227.0, + "step": 18129 + }, + { + "epoch": 1.99099494838568, + "grad_norm": 2.3541758060455322, + "learning_rate": 1e-06, + "loss": 0.905, + "mean_token_accuracy": 0.7125744819641113, + "num_tokens": 452866990.0, + "step": 18130 + }, + { + "epoch": 1.9911047660882935, + "grad_norm": 2.3917579650878906, + "learning_rate": 1e-06, + "loss": 0.8841, + "mean_token_accuracy": 0.7235038876533508, + "num_tokens": 452889544.0, + "step": 18131 + }, + { + "epoch": 1.991214583790907, + "grad_norm": 2.235178232192993, + "learning_rate": 1e-06, + 
"loss": 0.9311, + "mean_token_accuracy": 0.7083489894866943, + "num_tokens": 452914836.0, + "step": 18132 + }, + { + "epoch": 1.9913244014935207, + "grad_norm": 2.755188226699829, + "learning_rate": 1e-06, + "loss": 0.727, + "mean_token_accuracy": 0.7612385153770447, + "num_tokens": 452931151.0, + "step": 18133 + }, + { + "epoch": 1.9914342191961345, + "grad_norm": 2.2420735359191895, + "learning_rate": 1e-06, + "loss": 0.8066, + "mean_token_accuracy": 0.7414031624794006, + "num_tokens": 452954870.0, + "step": 18134 + }, + { + "epoch": 1.991544036898748, + "grad_norm": 2.6191422939300537, + "learning_rate": 1e-06, + "loss": 0.9393, + "mean_token_accuracy": 0.7148079872131348, + "num_tokens": 452974566.0, + "step": 18135 + }, + { + "epoch": 1.9916538546013618, + "grad_norm": 2.5707242488861084, + "learning_rate": 1e-06, + "loss": 0.7799, + "mean_token_accuracy": 0.7513209581375122, + "num_tokens": 452996322.0, + "step": 18136 + }, + { + "epoch": 1.9917636723039753, + "grad_norm": 2.347640037536621, + "learning_rate": 1e-06, + "loss": 0.9952, + "mean_token_accuracy": 0.690930962562561, + "num_tokens": 453021495.0, + "step": 18137 + }, + { + "epoch": 1.991873490006589, + "grad_norm": 2.3162624835968018, + "learning_rate": 1e-06, + "loss": 0.8365, + "mean_token_accuracy": 0.7383533120155334, + "num_tokens": 453044204.0, + "step": 18138 + }, + { + "epoch": 1.9919833077092028, + "grad_norm": 2.095607280731201, + "learning_rate": 1e-06, + "loss": 0.9293, + "mean_token_accuracy": 0.7100393772125244, + "num_tokens": 453073095.0, + "step": 18139 + }, + { + "epoch": 1.9920931254118164, + "grad_norm": 2.146902084350586, + "learning_rate": 1e-06, + "loss": 0.9391, + "mean_token_accuracy": 0.711205244064331, + "num_tokens": 453101852.0, + "step": 18140 + }, + { + "epoch": 1.99220294311443, + "grad_norm": 1.9802472591400146, + "learning_rate": 1e-06, + "loss": 1.0195, + "mean_token_accuracy": 0.6969314813613892, + "num_tokens": 453133768.0, + "step": 18141 + }, + { + "epoch": 
1.9923127608170437, + "grad_norm": 2.086095094680786, + "learning_rate": 1e-06, + "loss": 0.9994, + "mean_token_accuracy": 0.6921091079711914, + "num_tokens": 453162860.0, + "step": 18142 + }, + { + "epoch": 1.9924225785196574, + "grad_norm": 2.2928881645202637, + "learning_rate": 1e-06, + "loss": 0.8418, + "mean_token_accuracy": 0.7377116680145264, + "num_tokens": 453185830.0, + "step": 18143 + }, + { + "epoch": 1.9925323962222712, + "grad_norm": 2.1832070350646973, + "learning_rate": 1e-06, + "loss": 0.8823, + "mean_token_accuracy": 0.7337022423744202, + "num_tokens": 453210917.0, + "step": 18144 + }, + { + "epoch": 1.9926422139248847, + "grad_norm": 2.193324565887451, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7290420532226562, + "num_tokens": 453236671.0, + "step": 18145 + }, + { + "epoch": 1.9927520316274983, + "grad_norm": 2.07145619392395, + "learning_rate": 1e-06, + "loss": 0.9955, + "mean_token_accuracy": 0.7000857591629028, + "num_tokens": 453267965.0, + "step": 18146 + }, + { + "epoch": 1.992861849330112, + "grad_norm": 2.5962116718292236, + "learning_rate": 1e-06, + "loss": 0.9099, + "mean_token_accuracy": 0.7205275297164917, + "num_tokens": 453288337.0, + "step": 18147 + }, + { + "epoch": 1.9929716670327258, + "grad_norm": 2.290285587310791, + "learning_rate": 1e-06, + "loss": 0.9282, + "mean_token_accuracy": 0.7061386108398438, + "num_tokens": 453312992.0, + "step": 18148 + }, + { + "epoch": 1.9930814847353393, + "grad_norm": 2.111997604370117, + "learning_rate": 1e-06, + "loss": 0.938, + "mean_token_accuracy": 0.7068018913269043, + "num_tokens": 453342681.0, + "step": 18149 + }, + { + "epoch": 1.9931913024379528, + "grad_norm": 2.095735788345337, + "learning_rate": 1e-06, + "loss": 0.9603, + "mean_token_accuracy": 0.7111733555793762, + "num_tokens": 453371030.0, + "step": 18150 + }, + { + "epoch": 1.9933011201405666, + "grad_norm": 2.1640772819519043, + "learning_rate": 1e-06, + "loss": 0.8478, + "mean_token_accuracy": 
0.7380403876304626, + "num_tokens": 453395774.0, + "step": 18151 + }, + { + "epoch": 1.9934109378431804, + "grad_norm": 2.1455883979797363, + "learning_rate": 1e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7270418405532837, + "num_tokens": 453420487.0, + "step": 18152 + }, + { + "epoch": 1.993520755545794, + "grad_norm": 1.946728229522705, + "learning_rate": 1e-06, + "loss": 0.9147, + "mean_token_accuracy": 0.7095670700073242, + "num_tokens": 453452201.0, + "step": 18153 + }, + { + "epoch": 1.9936305732484076, + "grad_norm": 2.0191762447357178, + "learning_rate": 1e-06, + "loss": 1.0075, + "mean_token_accuracy": 0.6956005096435547, + "num_tokens": 453482571.0, + "step": 18154 + }, + { + "epoch": 1.9937403909510212, + "grad_norm": 2.4150187969207764, + "learning_rate": 1e-06, + "loss": 0.9005, + "mean_token_accuracy": 0.7200618982315063, + "num_tokens": 453506297.0, + "step": 18155 + }, + { + "epoch": 1.993850208653635, + "grad_norm": 2.2847044467926025, + "learning_rate": 1e-06, + "loss": 0.7735, + "mean_token_accuracy": 0.7504856586456299, + "num_tokens": 453528994.0, + "step": 18156 + }, + { + "epoch": 1.9939600263562487, + "grad_norm": 2.186011552810669, + "learning_rate": 1e-06, + "loss": 0.9373, + "mean_token_accuracy": 0.7117455005645752, + "num_tokens": 453557826.0, + "step": 18157 + }, + { + "epoch": 1.9940698440588625, + "grad_norm": 2.5046656131744385, + "learning_rate": 1e-06, + "loss": 0.8482, + "mean_token_accuracy": 0.7309707403182983, + "num_tokens": 453578633.0, + "step": 18158 + }, + { + "epoch": 1.994179661761476, + "grad_norm": 2.1979706287384033, + "learning_rate": 1e-06, + "loss": 0.8436, + "mean_token_accuracy": 0.7353821396827698, + "num_tokens": 453602711.0, + "step": 18159 + }, + { + "epoch": 1.9942894794640895, + "grad_norm": 2.137086868286133, + "learning_rate": 1e-06, + "loss": 0.9054, + "mean_token_accuracy": 0.7220149636268616, + "num_tokens": 453629151.0, + "step": 18160 + }, + { + "epoch": 1.9943992971667033, + "grad_norm": 
2.4894425868988037, + "learning_rate": 1e-06, + "loss": 0.9843, + "mean_token_accuracy": 0.7126860022544861, + "num_tokens": 453652018.0, + "step": 18161 + }, + { + "epoch": 1.994509114869317, + "grad_norm": 2.5598247051239014, + "learning_rate": 1e-06, + "loss": 0.8089, + "mean_token_accuracy": 0.7391083240509033, + "num_tokens": 453671251.0, + "step": 18162 + }, + { + "epoch": 1.9946189325719306, + "grad_norm": 2.253067970275879, + "learning_rate": 1e-06, + "loss": 0.8976, + "mean_token_accuracy": 0.7230788469314575, + "num_tokens": 453695660.0, + "step": 18163 + }, + { + "epoch": 1.994728750274544, + "grad_norm": 2.4522855281829834, + "learning_rate": 1e-06, + "loss": 0.8929, + "mean_token_accuracy": 0.7316535115242004, + "num_tokens": 453718342.0, + "step": 18164 + }, + { + "epoch": 1.9948385679771579, + "grad_norm": 2.5125081539154053, + "learning_rate": 1e-06, + "loss": 0.8264, + "mean_token_accuracy": 0.7397398948669434, + "num_tokens": 453739442.0, + "step": 18165 + }, + { + "epoch": 1.9949483856797716, + "grad_norm": 2.483710527420044, + "learning_rate": 1e-06, + "loss": 0.925, + "mean_token_accuracy": 0.7120128870010376, + "num_tokens": 453760860.0, + "step": 18166 + }, + { + "epoch": 1.9950582033823854, + "grad_norm": 2.357053756713867, + "learning_rate": 1e-06, + "loss": 0.9216, + "mean_token_accuracy": 0.7177608609199524, + "num_tokens": 453787271.0, + "step": 18167 + }, + { + "epoch": 1.995168021084999, + "grad_norm": 2.1733291149139404, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7174355387687683, + "num_tokens": 453814288.0, + "step": 18168 + }, + { + "epoch": 1.9952778387876124, + "grad_norm": 2.382371664047241, + "learning_rate": 1e-06, + "loss": 0.958, + "mean_token_accuracy": 0.7097209692001343, + "num_tokens": 453838971.0, + "step": 18169 + }, + { + "epoch": 1.9953876564902262, + "grad_norm": 1.968234896659851, + "learning_rate": 1e-06, + "loss": 0.9405, + "mean_token_accuracy": 0.7017683982849121, + "num_tokens": 
453870748.0, + "step": 18170 + }, + { + "epoch": 1.99549747419284, + "grad_norm": 2.2024850845336914, + "learning_rate": 1e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7369016408920288, + "num_tokens": 453897373.0, + "step": 18171 + }, + { + "epoch": 1.9956072918954535, + "grad_norm": 2.4179158210754395, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7349027395248413, + "num_tokens": 453918450.0, + "step": 18172 + }, + { + "epoch": 1.9957171095980673, + "grad_norm": 2.552567958831787, + "learning_rate": 1e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7358663082122803, + "num_tokens": 453939767.0, + "step": 18173 + }, + { + "epoch": 1.9958269273006808, + "grad_norm": 2.1149210929870605, + "learning_rate": 1e-06, + "loss": 0.9028, + "mean_token_accuracy": 0.7172960042953491, + "num_tokens": 453967244.0, + "step": 18174 + }, + { + "epoch": 1.9959367450032945, + "grad_norm": 2.487842082977295, + "learning_rate": 1e-06, + "loss": 0.9039, + "mean_token_accuracy": 0.7183408141136169, + "num_tokens": 453993158.0, + "step": 18175 + }, + { + "epoch": 1.9960465627059083, + "grad_norm": 2.097506046295166, + "learning_rate": 1e-06, + "loss": 0.9327, + "mean_token_accuracy": 0.7155422568321228, + "num_tokens": 454024572.0, + "step": 18176 + }, + { + "epoch": 1.9961563804085218, + "grad_norm": 2.424146890640259, + "learning_rate": 1e-06, + "loss": 0.8218, + "mean_token_accuracy": 0.7389194369316101, + "num_tokens": 454045512.0, + "step": 18177 + }, + { + "epoch": 1.9962661981111354, + "grad_norm": 2.199467182159424, + "learning_rate": 1e-06, + "loss": 0.8795, + "mean_token_accuracy": 0.7227288484573364, + "num_tokens": 454069435.0, + "step": 18178 + }, + { + "epoch": 1.9963760158137491, + "grad_norm": 2.0637426376342773, + "learning_rate": 1e-06, + "loss": 1.0029, + "mean_token_accuracy": 0.6917147636413574, + "num_tokens": 454098799.0, + "step": 18179 + }, + { + "epoch": 1.9964858335163629, + "grad_norm": 2.1896519660949707, + "learning_rate": 
1e-06, + "loss": 0.9324, + "mean_token_accuracy": 0.7124062776565552, + "num_tokens": 454124271.0, + "step": 18180 + }, + { + "epoch": 1.9965956512189766, + "grad_norm": 2.1122353076934814, + "learning_rate": 1e-06, + "loss": 0.9605, + "mean_token_accuracy": 0.6996726989746094, + "num_tokens": 454152152.0, + "step": 18181 + }, + { + "epoch": 1.9967054689215902, + "grad_norm": 2.1645500659942627, + "learning_rate": 1e-06, + "loss": 0.9231, + "mean_token_accuracy": 0.7263701558113098, + "num_tokens": 454177238.0, + "step": 18182 + }, + { + "epoch": 1.9968152866242037, + "grad_norm": 2.0200037956237793, + "learning_rate": 1e-06, + "loss": 0.8863, + "mean_token_accuracy": 0.7315096259117126, + "num_tokens": 454207151.0, + "step": 18183 + }, + { + "epoch": 1.9969251043268175, + "grad_norm": 2.381695032119751, + "learning_rate": 1e-06, + "loss": 0.8897, + "mean_token_accuracy": 0.7185583710670471, + "num_tokens": 454228725.0, + "step": 18184 + }, + { + "epoch": 1.9970349220294312, + "grad_norm": 2.4835305213928223, + "learning_rate": 1e-06, + "loss": 0.911, + "mean_token_accuracy": 0.7134818434715271, + "num_tokens": 454250390.0, + "step": 18185 + }, + { + "epoch": 1.9971447397320448, + "grad_norm": 2.1091675758361816, + "learning_rate": 1e-06, + "loss": 0.9419, + "mean_token_accuracy": 0.709619402885437, + "num_tokens": 454278047.0, + "step": 18186 + }, + { + "epoch": 1.9972545574346585, + "grad_norm": 2.065307855606079, + "learning_rate": 1e-06, + "loss": 0.9006, + "mean_token_accuracy": 0.720937967300415, + "num_tokens": 454309003.0, + "step": 18187 + }, + { + "epoch": 1.997364375137272, + "grad_norm": 2.6608824729919434, + "learning_rate": 1e-06, + "loss": 0.87, + "mean_token_accuracy": 0.7317038178443909, + "num_tokens": 454329323.0, + "step": 18188 + }, + { + "epoch": 1.9974741928398858, + "grad_norm": 2.3828699588775635, + "learning_rate": 1e-06, + "loss": 0.862, + "mean_token_accuracy": 0.7320540547370911, + "num_tokens": 454353095.0, + "step": 18189 + }, + { + 
"epoch": 1.9975840105424996, + "grad_norm": 2.273406982421875, + "learning_rate": 1e-06, + "loss": 0.8775, + "mean_token_accuracy": 0.7265541553497314, + "num_tokens": 454379188.0, + "step": 18190 + }, + { + "epoch": 1.997693828245113, + "grad_norm": 2.093996286392212, + "learning_rate": 1e-06, + "loss": 0.9752, + "mean_token_accuracy": 0.6985757946968079, + "num_tokens": 454408424.0, + "step": 18191 + }, + { + "epoch": 1.9978036459477266, + "grad_norm": 2.560744524002075, + "learning_rate": 1e-06, + "loss": 0.9308, + "mean_token_accuracy": 0.7181001901626587, + "num_tokens": 454428279.0, + "step": 18192 + }, + { + "epoch": 1.9979134636503404, + "grad_norm": 2.1246323585510254, + "learning_rate": 1e-06, + "loss": 0.9556, + "mean_token_accuracy": 0.7040365934371948, + "num_tokens": 454456830.0, + "step": 18193 + }, + { + "epoch": 1.9980232813529542, + "grad_norm": 2.1194369792938232, + "learning_rate": 1e-06, + "loss": 0.9828, + "mean_token_accuracy": 0.6920590400695801, + "num_tokens": 454486443.0, + "step": 18194 + }, + { + "epoch": 1.998133099055568, + "grad_norm": 2.2355854511260986, + "learning_rate": 1e-06, + "loss": 0.8672, + "mean_token_accuracy": 0.7367851138114929, + "num_tokens": 454511233.0, + "step": 18195 + }, + { + "epoch": 1.9982429167581814, + "grad_norm": 2.044863224029541, + "learning_rate": 1e-06, + "loss": 0.9635, + "mean_token_accuracy": 0.7096841931343079, + "num_tokens": 454541295.0, + "step": 18196 + }, + { + "epoch": 1.998352734460795, + "grad_norm": 2.4674904346466064, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7099915742874146, + "num_tokens": 454564483.0, + "step": 18197 + }, + { + "epoch": 1.9984625521634087, + "grad_norm": 2.37844181060791, + "learning_rate": 1e-06, + "loss": 0.7787, + "mean_token_accuracy": 0.7462105751037598, + "num_tokens": 454587046.0, + "step": 18198 + }, + { + "epoch": 1.9985723698660225, + "grad_norm": 2.3345210552215576, + "learning_rate": 1e-06, + "loss": 0.8548, + 
"mean_token_accuracy": 0.7334123253822327, + "num_tokens": 454610456.0, + "step": 18199 + }, + { + "epoch": 1.998682187568636, + "grad_norm": 2.548877000808716, + "learning_rate": 1e-06, + "loss": 0.9243, + "mean_token_accuracy": 0.7136272192001343, + "num_tokens": 454631615.0, + "step": 18200 + }, + { + "epoch": 1.9987920052712496, + "grad_norm": 2.269287109375, + "learning_rate": 1e-06, + "loss": 0.826, + "mean_token_accuracy": 0.7403301000595093, + "num_tokens": 454655017.0, + "step": 18201 + }, + { + "epoch": 1.9989018229738633, + "grad_norm": 2.1709706783294678, + "learning_rate": 1e-06, + "loss": 0.982, + "mean_token_accuracy": 0.7019439339637756, + "num_tokens": 454684699.0, + "step": 18202 + }, + { + "epoch": 1.999011640676477, + "grad_norm": 2.4080278873443604, + "learning_rate": 1e-06, + "loss": 0.761, + "mean_token_accuracy": 0.7550314664840698, + "num_tokens": 454705997.0, + "step": 18203 + }, + { + "epoch": 1.9991214583790908, + "grad_norm": 2.497459650039673, + "learning_rate": 1e-06, + "loss": 0.7924, + "mean_token_accuracy": 0.7470940947532654, + "num_tokens": 454726716.0, + "step": 18204 + }, + { + "epoch": 1.9992312760817044, + "grad_norm": 2.2681102752685547, + "learning_rate": 1e-06, + "loss": 0.9368, + "mean_token_accuracy": 0.7240121364593506, + "num_tokens": 454751051.0, + "step": 18205 + }, + { + "epoch": 1.999341093784318, + "grad_norm": 2.2533674240112305, + "learning_rate": 1e-06, + "loss": 0.9944, + "mean_token_accuracy": 0.7031902074813843, + "num_tokens": 454777976.0, + "step": 18206 + }, + { + "epoch": 1.9994509114869317, + "grad_norm": 2.4207870960235596, + "learning_rate": 1e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7340006828308105, + "num_tokens": 454801217.0, + "step": 18207 + }, + { + "epoch": 1.9995607291895454, + "grad_norm": 2.2077975273132324, + "learning_rate": 1e-06, + "loss": 0.7997, + "mean_token_accuracy": 0.7485467195510864, + "num_tokens": 454825890.0, + "step": 18208 + }, + { + "epoch": 1.9996705468921592, + 
"grad_norm": 1.871306300163269, + "learning_rate": 1e-06, + "loss": 0.9108, + "mean_token_accuracy": 0.7152973413467407, + "num_tokens": 454859879.0, + "step": 18209 + }, + { + "epoch": 1.9997803645947727, + "grad_norm": 2.4710330963134766, + "learning_rate": 1e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7330666780471802, + "num_tokens": 454881399.0, + "step": 18210 + }, + { + "epoch": 1.9998901822973862, + "grad_norm": 2.288555860519409, + "learning_rate": 1e-06, + "loss": 0.887, + "mean_token_accuracy": 0.7225549221038818, + "num_tokens": 454907610.0, + "step": 18211 + }, + { + "epoch": 2.0, + "grad_norm": 2.2059690952301025, + "learning_rate": 1e-06, + "loss": 0.7898, + "mean_token_accuracy": 0.7481889724731445, + "num_tokens": 454931464.0, + "step": 18212 + }, + { + "epoch": 2.0, + "step": 18212, + "total_flos": 2.048535820913292e+19, + "train_loss": 0.9530186036766531, + "train_runtime": 21116.7041, + "train_samples_per_second": 13.799, + "train_steps_per_second": 0.862 + } + ], + "logging_steps": 1, + "max_steps": 18212, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 9106, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.048535820913292e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..88051a1 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49ce122279436e5f11e172584abf18d249269ed969a2a196687f826be15b4111 +size 13329